Regular Expression (RegEx)#
Programmable Regular Expression (PRegEx)#
Installation#
pip install pregex
[1]:
from pregex import *
import re
RegEx#
Introduction#
Any character except a newline (a newline is matched too only when re.DOTALL is set): .
Digit: \d
Non-digit: \D
Alphabetic character: [a-zA-Z]
Non-alphabetic character: [^a-zA-Z]
Punctuation: [{-~!-\/\[-`:-@]
Non-punctuation: [^{-~!-\/\[-`:-@]
Whitespace: \s
Non-whitespace: \S
Neither alphanumeric nor underscore: [^a-zA-Z_\d]
Repetitions#
Zero or more repetitions (*): regex_1*
One or more repetitions (+): regex_1+
At least x and at most y repetitions ({x,y}): regex_1{x,y}
At least x repetitions ({x,}): regex_1{x,}
At most y repetitions ({,y}): regex_1{,y}
Exactly x repetitions ({x}): regex_1{x}
Match at the start of the string (^): ^regex_1
Match at the end of the string ($): regex_1$
[2]:
# Goal: the whole string is two or more digits, then any number of
# lowercase letters, then any number of uppercase letters.
test_string = '14'

# Plain `re` version.
regex_pattern = r'^\d{2,}[a-z]*[A-Z]*$'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version, assembled piece by piece in the same left-to-right
# order as the hand-written pattern above.
pre = (
    asr.MatchAtStart(qu.AtLeast(cl.AnyDigit(), 2))
    + qu.Indefinite(cl.AnyLowercaseLetter())
    + asr.MatchAtEnd(qu.Indefinite(cl.AnyUppercaseLetter()))
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same matches on this input.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
\A\d{2,}[a-z]*[A-Z]*\Z
[2]:
['14']
Grouping and Capturing#
Word boundaries (\b): \bregex_1\b
Capturing group: ()
Alternative matching: |
[3]:
# Goal: find every word that begins with a vowel (either case) and is made
# up of ASCII letters only.
test_string = 'Found any animal?'

# Plain `re` version: \b on both sides pins the match to whole words.
regex_pattern = r'\b[aeiouAEIOU][a-zA-Z]*\b'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: boundary, one vowel, any further letters, boundary.
pre = (
    asr.WordBoundary()
    + cl.AnyFrom(
        'a', 'e', 'i', 'o', 'u',
        'A', 'E', 'I', 'O', 'U',
    )
    + qu.Indefinite(cl.AnyLetter())
    + asr.WordBoundary()
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same words.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
\b[euoOEiUaAI][A-Za-z]*\b
[3]:
['any', 'animal']
[4]:
# Goal: the string starts with one of the titles Mr., Mrs., Ms., Dr. or Er.
# and the remainder is one or more English letters (either case).
test_string = 'Mr.DOSHI'

# Plain `re` version. The pattern has exactly one capturing group, so
# findall returns the text of that group (the title) for each match.
regex_pattern = r'^(Mr|Mrs|Ms|Dr|Er)\.[a-zA-Z]+$'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: captured title alternatives anchored at the start,
# a literal dot, then letters anchored at the end.
pre = (
    gr.Capture(asr.MatchAtStart(op.Either('Mr', 'Mrs', 'Ms', 'Dr', 'Er')))
    + '.'
    + asr.MatchAtEnd(qu.AtLeast(cl.AnyLetter(), 1))
)
pregex_match = pre.get_captures(test_string)
pregex_pattern = pre.get_pattern()

# Compare the title found by `re` with the first capture of the first
# PRegEx match.
assert regex_match[0] == pregex_match[0][0]
print(pregex_pattern)
regex_match
(\A(?:Mr|Mrs|Ms|Dr|Er))\.[A-Za-z]+\Z
[4]:
['Mr']
Backreferences#
[5]:
# Ten single-character capturing groups followed by the backreferences
# \1 through \10: the second half of the string must repeat exactly what
# each group captured in the first half.
test_string = 'ab #1?AZa$ab #1?AZa$'
regex_pattern = r'([a-z])(\w)(\s)(\W)(\d)(\D)([A-Z])([a-zA-Z])([aeiouAEIOU])(\S)\1\2\3\4\5\6\7\8\9\10'

# With several groups in the pattern, findall yields one tuple of group
# contents per match.
regex_match = re.compile(regex_pattern).findall(test_string)
regex_match
[5]:
[('a', 'b', ' ', '#', '1', '?', 'A', 'Z', 'a', '$')]
Assertions#
Positive lookahead
(?=):regex_1(?=regex_2)
[6]:
# Goal: find every 'o' that has 'oo' immediately after it. The lookahead
# only checks what follows; it is not part of the returned match.
test_string = 'gooooo!'

# Plain `re` version.
regex_pattern = r'o(?=oo)'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: first argument is what to match, second is what must
# follow it.
pre = asr.FollowedBy('o', 'oo')
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same matches.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
o(?=oo)
[6]:
['o', 'o', 'o']
Negative lookahead
(?!):regex_1(?!regex_2)
[7]:
# Goal: find every character that is NOT immediately followed by another
# copy of itself.
test_string = 'gooooo'

# Plain `re` version: capture one character, then refuse the match if the
# same character (backreference \1) comes next. With a single group,
# findall returns the captured character for each match.
regex_pattern = r'(.)(?!\1)'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: a named capture plus a backreference to that name.
pre = asr.NotFollowedBy(
    gr.Capture(cl.Any(), name='g1'),
    gr.Backreference(name='g1'),
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same characters.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
(?P<g1>.)(?!(?P=g1))
[7]:
['g', 'o']
Positive lookbehind
(?<=):(?<=regex_2)regex_1
[8]:
# Goal: find every digit that is immediately preceded by an odd digit.
test_string = '123Go!'

# Plain `re` version: the lookbehind checks the previous character
# without including it in the match.
regex_pattern = r'(?<=[13579])\d'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: first argument is what to match, second is what must
# come right before it.
pre = asr.PrecededBy(
    cl.AnyDigit(),
    cl.AnyFrom('1', '3', '5', '7', '9'),
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same digits.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
(?<=[35917])\d
[8]:
['2']
Negative lookbehind
(?<!):(?<!regex_2)regex_1
[9]:
# Goal: find every character that is NOT immediately preceded by a vowel
# (either case).
test_string = '1o1s'

# Plain `re` version: the negative lookbehind rejects a position whose
# previous character is a vowel.
regex_pattern = r'(?<![aeiouAEIOU]).'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx version: first argument is what to match, second is what must
# not come right before it.
pre = asr.NotPrecededBy(
    cl.Any(),
    cl.AnyFrom(
        'a', 'e', 'i', 'o', 'u',
        'A', 'E', 'I', 'O', 'U',
    ),
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches must report the same characters.
assert regex_match == pregex_match
print(pregex_pattern)
regex_match
(?<![euoOEiUaAI]).
[9]:
['1', 'o', 's']