Regular Expression (RegEx)#

Programmable Regular Expression (PRegEx)#

Installation#

pip install pregex
[1]:
from pregex import *
import re

RegEx#

Introduction#

  • Any character except the newline character (unless the re.DOTALL flag is set) .

  • Digit \d

  • Non-digit \D

  • Letter [a-zA-Z]

  • Non-letter [^a-zA-Z]

  • Punctuation [{-~!-\/\[-`:-@]

  • Non-punctuation [^{-~!-\/\[-`:-@]

  • Whitespace \s

  • Non-whitespace \S

  • Neither alphanumeric nor underscore [^a-zA-Z_\d]

Repetitions#

  • Zero or more repetitions *: regex_1*

  • One or more repetitions +: regex_1+

  • At least x and at most y repetitions {x,y} (no space after the comma): regex_1{x,y}

  • At least x repetitions {x,}: regex_1{x,}

  • At most y repetitions {,y}: regex_1{,y}

  • Exactly x repetitions {x}: regex_1{x}

  • Start-of-string anchor ^: ^regex_1

  • End-of-string anchor $: regex_1$

[2]:
# Goal: the whole string must start with two or more digits, continue with
# any number of lowercase letters, and finish with any number of uppercase
# letters.
test_string = "14"

# Hand-written regular expression, evaluated with the standard `re` module.
regex_pattern = r'^\d{2,}[a-z]*[A-Z]*$'
regex_match = re.compile(regex_pattern).findall(test_string)

# The same pattern assembled piece by piece from PRegEx building blocks.
pre = (
    asr.MatchAtStart(qu.AtLeast(cl.AnyDigit(), 2))              # leading digits
    + qu.Indefinite(cl.AnyLowercaseLetter())                    # lowercase run
    + asr.MatchAtEnd(qu.Indefinite(cl.AnyUppercaseLetter()))    # trailing uppercase run
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same matches.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
\A\d{2,}[a-z]*[A-Z]*\Z
[2]:
['14']

Grouping and Capturing#

  • Word boundaries \b: \bregex_1\b

  • Capturing group ()

  • Alternative matching |

[3]:
# Goal: find every word that begins with a vowel (either case) and is made
# up of letters only.
test_string = "Found any animal?"

# Hand-written regular expression.
regex_pattern = r'\b[aeiouAEIOU][a-zA-Z]*\b'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx equivalent: word boundary, one vowel, any further letters, word boundary.
pre = (
    asr.WordBoundary()
    + cl.AnyFrom(*'aeiouAEIOU')          # each vowel passed as its own argument
    + qu.Indefinite(cl.AnyLetter())
    + asr.WordBoundary()
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same words.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
\b[euoOEiUaAI][A-Za-z]*\b
[3]:
['any', 'animal']
[4]:
# Goal: the string must start with one of the honorifics Mr., Mrs., Ms., Dr.
# or Er., and the remainder must consist of one or more English letters
# (upper- or lowercase). The honorific itself (without the dot) is captured.
test_string = 'Mr.DOSHI'

# Hand-written regular expression. With a single capturing group,
# re.findall returns a list of the captured strings.
regex_pattern = r'^(Mr|Mrs|Ms|Dr|Er)\.[a-zA-Z]+$'
regex_match = re.findall(regex_pattern, test_string)

# PRegEx equivalent: captured honorific anchored at the start, a literal dot,
# then one or more letters anchored at the end.
pre = (
    gr.Capture(asr.MatchAtStart(op.Either('Mr', 'Mrs', 'Ms', 'Dr', 'Er')))
    + '.'
    + asr.MatchAtEnd(qu.AtLeast(cl.AnyLetter(), 1))
)
# get_captures yields one tuple of captured groups per match.
pregex_match = pre.get_captures(test_string)
pregex_pattern = pre.get_pattern()

# Compare the first (and only) capture of every match rather than indexing
# element 0 directly: this checks all matches and does not raise IndexError
# when the test string does not match at all.
assert regex_match == [captures[0] for captures in pregex_match]

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
(\A(?:Mr|Mrs|Ms|Dr|Er))\.[A-Za-z]+\Z
[4]:
['Mr']

Backreferences#

[5]:
# Ten single-character capturing groups followed by back-references \1..\10:
# the second half of the string must repeat the first half exactly.
test_string = "ab #1?AZa$ab #1?AZa$"
regex_pattern = (
    r'([a-z])(\w)(\s)(\W)(\d)(\D)([A-Z])([a-zA-Z])([aeiouAEIOU])(\S)'  # groups 1-10
    r'\1\2\3\4\5\6\7\8\9\10'                                           # back-references
)
# With several groups, findall returns one tuple of group contents per match.
regex_match = re.compile(regex_pattern).findall(test_string)
regex_match
[5]:
[('a', 'b', ' ', '#', '1', '?', 'A', 'Z', 'a', '$')]

Assertions#

  • Positive lookahead (?=): regex_1(?=regex_2)

[6]:
# Goal: find every "o" that is immediately followed by "oo".
test_string = "gooooo!"

# Hand-written regular expression using a positive lookahead.
regex_pattern = r'o(?=oo)'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx equivalent: first argument is the match, second is what must follow.
pre = asr.FollowedBy(
    'o',
    'oo',
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same matches.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
o(?=oo)
[6]:
['o', 'o', 'o']
  • Negative lookahead (?!): regex_1(?!regex_2)

[7]:
# Goal: find every character that is not immediately followed by a copy of
# itself.
test_string = "gooooo"

# Hand-written regular expression: capture one character, then assert that
# the same character does not come next. With one capturing group,
# re.findall returns the captured characters.
regex_pattern = r'(.)(?!\1)'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx equivalent, using a named group and a back-reference to it.
pre = asr.NotFollowedBy(
    gr.Capture(cl.Any(), name='g1'),
    gr.Backreference(name='g1'),
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same matches.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
(?P<g1>.)(?!(?P=g1))
[7]:
['g', 'o']
  • Positive lookbehind (?<=): (?<=regex_2)regex_1

[8]:
# Goal: find every digit that is immediately preceded by an odd digit.
test_string = "123Go!"

# Hand-written regular expression using a positive lookbehind.
regex_pattern = r'(?<=[13579])\d'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx equivalent: first argument is the match, second is what must precede it.
pre = asr.PrecededBy(
    cl.AnyDigit(),
    cl.AnyFrom(*'13579'),    # each odd digit passed as its own argument
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same matches.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
(?<=[35917])\d
[8]:
['2']
  • Negative lookbehind (?<!): (?<!regex_2)regex_1

[9]:
# Goal: find every character that is not immediately preceded by a vowel
# (either case).
test_string = "1o1s"

# Hand-written regular expression using a negative lookbehind.
regex_pattern = r'(?<![aeiouAEIOU]).'
regex_match = re.compile(regex_pattern).findall(test_string)

# PRegEx equivalent: first argument is the match, second is what must not precede it.
pre = asr.NotPrecededBy(
    cl.Any(),
    cl.AnyFrom(*'aeiouAEIOU'),    # each vowel passed as its own argument
)
pregex_match = pre.get_matches(test_string)
pregex_pattern = pre.get_pattern()

# Both approaches are expected to find exactly the same matches.
assert regex_match == pregex_match

# Show the pattern PRegEx generated, then display the matches as cell output.
print(pregex_pattern)
regex_match
(?<![euoOEiUaAI]).
[9]:
['1', 'o', 's']