Regular Expression (RegEx)#

Programmable Regular Expression (PRegEx)#

Installation#

pip install pregex

[1]:

from pregex import *
import re

RegEx#

Introduction#

Any character except the newline character .
Digit \d
Non-digit \D
Letter [a-zA-Z]
Non-letter [^a-zA-Z]
Punctuation [{-~!-\/\[-`:-@]
Non-punctuation [^{-~!-\/\[-`:-@]
Whitespace \s
Non-whitespace \S
Neither alphanumeric nor underscore [^a-zA-Z_\d]

Repetitions#

Zero or more repetitions *: regex_1*
One or more repetitions +: regex_1+
At least x and at most y repetitions {x,y}: regex_1{x,y}
At least x repetitions {x,}: regex_1{x,}
At most y repetitions {,y}: regex_1{,y}
Exactly x repetitions {x}: regex_1{x}
Starting items ^: ^regex_1
Ending items $: regex_1$

[2]:

# Anchored pattern: two or more digits first, then any number of lowercase
# letters, then any number of uppercase letters up to the end.
test_string = '14'
regex_pattern = r'^\d{2,}[a-z]*[A-Z]*$'
regex_match = re.compile(regex_pattern).findall(test_string)

# Assemble the equivalent PRegEx pattern from named pieces.
leading_digits = asr.MatchAtStart(qu.AtLeast(cl.AnyDigit(), 2))
lowercase_run = qu.Indefinite(cl.AnyLowercaseLetter())
trailing_uppercase = asr.MatchAtEnd(qu.Indefinite(cl.AnyUppercaseLetter()))
pre = leading_digits + lowercase_run + trailing_uppercase
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

\A\d{2,}[a-z]*[A-Z]*\Z

[2]:

['14']

Grouping and Capturing#

Word boundaries \b: \bregex_1\b
Capturing group ()
Alternative matching |

[3]:

# Find every word that starts with a vowel (either case) and consists only of
# lowercase/uppercase letters.
test_string = 'Found any animal?'
regex_pattern = r'\b[aeiouAEIOU][a-zA-Z]*\b'
regex_match = re.compile(regex_pattern).findall(test_string)

# Assemble the equivalent PRegEx pattern from named pieces.
vowels = ('a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U')
word_start = asr.WordBoundary()
first_letter = cl.AnyFrom(*vowels)
remaining_letters = qu.Indefinite(cl.AnyLetter())
word_end = asr.WordBoundary()
pre = word_start + first_letter + remaining_letters + word_end
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

\b[euoOEiUaAI][A-Za-z]*\b

[3]:

['any', 'animal']

[4]:

# Accept a string made of one of the titles Mr, Mrs, Ms, Dr or Er, a literal
# dot, and then one or more letters (either case) up to the end.
test_string = 'Mr.DOSHI'
regex_pattern = r'^(Mr|Mrs|Ms|Dr|Er)\.[a-zA-Z]+$'
regex_match = re.compile(regex_pattern).findall(test_string)

# Assemble the equivalent PRegEx pattern from named pieces.
titles = ('Mr', 'Mrs', 'Ms', 'Dr', 'Er')
title_group = gr.Capture(asr.MatchAtStart(op.Either(*titles)))
name_letters = asr.MatchAtEnd(qu.AtLeast(cl.AnyLetter(), 1))
pre = title_group + '.' + name_letters
pregex_match = pre.get_captures(test_string)
pregex_pattern = pre.get_pattern()

# Compare the captured title from each approach (first capture of the first match).
assert regex_match[0] == pregex_match[0][0]
print(pregex_pattern)
regex_match

(\A(?:Mr|Mrs|Ms|Dr|Er))\.[A-Za-z]+\Z

[4]:

['Mr']

Backreferences#

[5]:

# Ten single-character capturing groups followed by backreferences \1 to \10:
# the ten captured characters must then repeat in the same order.
test_string = 'ab #1?AZa$ab #1?AZa$'
regex_pattern = r'([a-z])(\w)(\s)(\W)(\d)(\D)([A-Z])([a-zA-Z])([aeiouAEIOU])(\S)\1\2\3\4\5\6\7\8\9\10'
regex_match = re.compile(regex_pattern).findall(test_string)
regex_match

[5]:

[('a', 'b', ' ', '#', '1', '?', 'A', 'Z', 'a', '$')]

Assertions#

Positive lookahead (?=): regex_1(?=regex_2)

[6]:

# Find each 'o' that has 'oo' directly after it (positive lookahead).
test_string = 'gooooo!'
regex_pattern = r'o(?=oo)'
regex_match = re.compile(regex_pattern).findall(test_string)

# Equivalent PRegEx pattern: the matched part, then the part that must follow.
matched_part = 'o'
must_follow = 'oo'
pre = asr.FollowedBy(matched_part, must_follow)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

o(?=oo)

[6]:

['o', 'o', 'o']

Negative lookahead (?!): regex_1(?!regex_2)

[7]:

# Find each character whose next character is not the same character
# (negative lookahead on a backreference).
test_string = 'gooooo'
regex_pattern = r'(.)(?!\1)'
regex_match = re.compile(regex_pattern).findall(test_string)

# Equivalent PRegEx pattern: a named capture, and a backreference to it that
# must not follow.
captured_char = gr.Capture(cl.Any(), name='g1')
same_char_again = gr.Backreference(name='g1')
pre = asr.NotFollowedBy(captured_char, same_char_again)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

(?P<g1>.)(?!(?P=g1))

[7]:

['g', 'o']

Positive lookbehind (?<=): (?<=regex_2)regex_1

[8]:

# Find every digit that has an odd digit directly before it
# (positive lookbehind).
test_string = '123Go!'
regex_pattern = r'(?<=[13579])\d'
regex_match = re.compile(regex_pattern).findall(test_string)

# Equivalent PRegEx pattern: the matched digit, then what must precede it.
any_digit = cl.AnyDigit()
odd_digit = cl.AnyFrom('1', '3', '5', '7', '9')
pre = asr.PrecededBy(any_digit, odd_digit)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

(?<=[35917])\d

[8]:

['2']

Negative lookbehind (?<!): (?<!regex_2)regex_1

[9]:

# Find every character that does not have a vowel directly before it
# (negative lookbehind).
test_string = '1o1s'
regex_pattern = r'(?<![aeiouAEIOU]).'
regex_match = re.compile(regex_pattern).findall(test_string)

# Equivalent PRegEx pattern: the matched character, then what must not
# precede it.
any_char = cl.Any()
vowels = ('a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U')
vowel = cl.AnyFrom(*vowels)
pre = asr.NotPrecededBy(any_char, vowel)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# The hand-written and the generated pattern must yield the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match

(?<![euoOEiUaAI]).

[9]:

['1', 'o', 's']