This notebook mirrors the Google Python Course: Regular Expressions
Regular expressions comprise a pattern matching language. In the formal sense they also describe the regular languages, which are a proper subset of the context-free languages. In addition, formal regular expressions are provably equivalent to deterministic finite state automata, aka deterministic finite state acceptors or DFAs.
The functions defined in this notebook are found in retest.py.
Python implements regular expression pattern matching in the re module.
import re
dir(re)
['A', 'ASCII', 'DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'Match', 'Pattern', 'RegexFlag', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', '_cache', '_compile', '_compile_repl', '_expand', '_locale', '_pickle', '_special_chars_map', '_subx', 'compile', 'copyreg', 'enum', 'error', 'escape', 'findall', 'finditer', 'fullmatch', 'functools', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'template']
A pattern is a string containing ordinary characters, meta-characters, or both.
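The re function search(pattern, string) performs a pattern match: it scans the string for the first place where the pattern matches and returns a Match object, or None if there is no match.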
pat = 'xxx'
r = re.search(pat, ' x ')
r ### there is no match
r2 = re.search(pat, 'xxxyyyxxx')
r2 ### there is a match!
<re.Match object; span=(0, 3), match='xxx'>
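The power of regular expressions is that they can specify patterns, not just fixed characters. Here are the most basic patterns which match single characters: ordinary characters (a, X, 9) match themselves; . matches any single character except newline; \w matches a word character (letter, digit or underscore) and \W a non-word character; \b matches the boundary between a word and a non-word character; \s matches a whitespace character and \S a non-whitespace character; \d matches a decimal digit and \D a non-digit; ^ and $ match the start and the end of the string; \ removes the special meaning of the next character, so \. matches a literal period.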
re.search('ab.','abxddd')
<re.Match object; span=(0, 3), match='abx'>
re.search('ab..', 'abc') ## no match
re.search(r'\w\w\w', 'ab345')  # raw string; \w = one word character (letter, digit or underscore)
<re.Match object; span=(0, 3), match='ab3'>
re.search(r'\w\w\w', 'ab dc')  ## no match
re.search(r'\w\w\W\w', 'ab dc')  # \W = one non-word character (here the space)
<re.Match object; span=(0, 4), match='ab d'>
re.search(r'\w\w\b\w\w','ab dc')
re.search(r'\b\w\w\b','ab dc')
<re.Match object; span=(0, 2), match='ab'>
re.search(r'\bfoo\b', 'foo')
<re.Match object; span=(0, 3), match='foo'>
re.search(r'\bfoo\b', 'foo.')
<re.Match object; span=(0, 3), match='foo'>
re.search(r'\bfoo\b', '(foo)')
<re.Match object; span=(1, 4), match='foo'>
re.search(r'\bfoo\b', 'bar foo baz')
<re.Match object; span=(4, 7), match='foo'>
re.search(r'\bfoo\b', 'foobar')
re.search(r'\bfoo\b', 'foo3')
re.search(r'\w\w\s\w\w', 'ab dc')  # \s = one whitespace character
<re.Match object; span=(0, 5), match='ab dc'>
re.search(r'\S\S\s\S\S', 'ab dc')  # \S = one non-whitespace character
<re.Match object; span=(0, 5), match='ab dc'>
re.search(r'\d\d\d', '12345')  # \d = one decimal digit
<re.Match object; span=(0, 3), match='123'>
re.search(r'\d\d\d\D\d\d\d', '123 456')  # \D = one non-digit character
<re.Match object; span=(0, 7), match='123 456'>
re.search(r'\d\d\d', 'x123456')  # search() scans forward, so the match starts at index 1
<re.Match object; span=(1, 4), match='123'>
re.search(r'^\d\d\d', 'x123456')  # no match
re.search(r'^\d\d\d', '123456')  # ^ anchors the match at the start of the string
<re.Match object; span=(0, 3), match='123'>
re.search(r'\d\d\d$', '123456')  # $ anchors the match at the end of the string
<re.Match object; span=(3, 6), match='456'>
re.search(r'\d\d\d$', '123456 ')  ## no match
re.search(r'\d\d\d$', '123456 '.strip())  # strip() removes the trailing space, so $ can match
<re.Match object; span=(3, 6), match='456'>
re.search(r'ab\.', 'abc')  ## no match
re.search(r'ab\.', 'ab...')  # \. matches a literal period
<re.Match object; span=(0, 3), match='ab.'>
re.search('(cat|dog)','my cat')
<re.Match object; span=(3, 6), match='cat'>
re.search('(cat|dog)', 'your dog')
<re.Match object; span=(5, 8), match='dog'>
re.search('^(a|e|i|o|u)123', 'e123')
<re.Match object; span=(0, 4), match='e123'>
re.search('^[aeiou]123', 'o123456')
<re.Match object; span=(0, 4), match='o123'>
re.search('^[^aeiou]123','x123456')
<re.Match object; span=(0, 4), match='x123'>
re.search('^[a-z]123','x123456')
<re.Match object; span=(0, 4), match='x123'>
re.search('^[^a-z]123','X123456')
<re.Match object; span=(0, 4), match='X123'>
Things get more interesting when you use + and * to specify repetition in the pattern
First the search finds the leftmost match for the pattern, and second it tries to use up as much of the string as possible -- i.e. + and * go as far as possible (the + and * are said to be "greedy").
re.match('pi+g', 'piiiig') # one or more i's, as many as possible
<re.Match object; span=(0, 6), match='piiiig'>
Finds the first/leftmost solution, and within it drives the + as far as possible (aka 'leftmost and largest').
In this example, note that it does not get to the second set of i's.
match = re.search(r'i+', 'piigiiii')
match
<re.Match object; span=(1, 3), match='ii'>
\s* = zero or more whitespace chars
Here look for 3 digits, possibly separated by whitespace.
re.search(r'\d\s*\d\s*\d', 'xx1 2 3xx')  # raw string, like the two examples that follow
<re.Match object; span=(2, 7), match='1 2 3'>
re.search(r'\d\s*\d\s*\d', 'xx12 3xx')
<re.Match object; span=(2, 6), match='12 3'>
re.search(r'\d\s*\d\s*\d', 'xx12        3xx')
<re.Match object; span=(2, 13), match='12        3'>
^ = matches the start of string, so the first case fails:
re.search(r'^b\w+', 'foobar')
re.search(r'b\w+', 'foobar')
<re.Match object; span=(3, 6), match='bar'>
Square brackets indicate a character class. e.g. [aeiou] matches any vowel
re.match('^[aeiou]+$','aaaaeee')
<re.Match object; span=(0, 7), match='aaaaeee'>
re.match('^[aeiou]+$','aaaaxxxeee')
def retest(str='an example word:cat!!'):
    """Search for ``word:`` followed by three word characters and report it.

    Prints ``found: <matched text>`` when the pattern occurs anywhere in the
    text, otherwise ``Did not find <pattern> in <text>``.

    Args:
        str: The text to search. The name shadows the built-in ``str`` type;
            it is kept unchanged so keyword callers (``retest(str=...)``)
            continue to work.

    Returns:
        None. The result is only printed.
    """
    # Raw string: '\w' is not a valid Python string escape, so the non-raw
    # spelling triggers an invalid-escape-sequence warning.
    pat = r'word:\w\w\w'
    match = re.search(pat, str)
    # re.search() returns None when nothing matches, so the truthiness of
    # the result tells us whether the search succeeded.
    if match:
        print('found: {}'.format(match.group()))
    else:
        print('Did not find {} in {}'.format(pat, str))
retest()
found: word:cat
retest('hello world')
Did not find word:\w\w\w in hello world
retest('this is word:123456')
found: word:123
# Sample patterns used as the default pattern list of retest2() and retest4().
# Every entry is a raw string so that regex escapes such as \w, \d, \s and \A
# are not interpreted as (invalid) Python string escapes.
patterns = [
    r'aaa',                        # contains aaa
    r'abc',                        # contains abc
    r'...',                        # contains any three consecutive characters
    r'^...$',                      # consists of exactly three characters
    r'\.\.\.',                     # contains three consecutive periods
    r'abd',                        # contains abd
    r'^[aeiou]*$',                 # contains only lower case vowels (or is empty)
    r'^[^aeiou]*$',                # contains no lower case vowels (or is empty)
    r'\w\W\w',                     # two word characters separated by a non-word char
    r'\w\w\w',                     # three consecutive word characters
    r'^\d+$',                      # contains only decimal digits
    r'^[0-7]+$',                   # contains only octal digits
    r'^[0-9A-Fa-f]+$',             # contains only hexadecimal digits
    r'^[a-z]*$',                   # contains only lower case letters (or is empty)
    r'^\s+',                       # starts with a whitespace char
    r'^\d\s?',                     # starts with a digit followed by zero or one whitespace char
    r'\w+@\w+',                    # simplistic email shape: word chars, '@', word chars
    r'(b)*(a)*(c)',                # any b's, then any a's, then one c, anywhere in the string
    r'^b*a*c$',                    # the whole string is any b's, then any a's, then one c
    r'^(b|c)*(a|b)*$',             # the whole string is b/c characters followed by a/b characters
    # '|' has the lowest precedence, so this is "^bb*(ab|ba)*" OR "(bbc|cbc)*$";
    # the second branch can match the empty string at the end of any input.
    r'^bb*(ab|ba)*|(bbc|cbc)*$',
    r'^(ab|ba|cb|bc|ca|ac)*$',     # the whole string is a sequence of the listed two-letter pairs
    r'(bc|bcc)(bac|cba)(cba|aa)',  # one alternative from each of the three groups, in order
    r'\AThe',                      # \A is beginning of string
]
def retest2(patlist=patterns):
    """Interactively test every pattern in *patlist* against typed-in strings.

    Repeatedly prompts for a string and, for each pattern, prints either the
    text of the first match found by ``re.search()`` or a "Did not find"
    line. The loop ends when the user enters ``quit`` or input is exhausted.

    Args:
        patlist: Iterable of regular-expression strings; defaults to the
            module-level ``patterns`` list.

    Returns:
        None. Results are only printed.
    """
    while True:
        try:
            text = input("\nEnter string: ")
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe) is treated like
            # "quit" instead of ending the session with a traceback.
            break
        if text == "quit":
            break
        for pat in patlist:
            # re.search() scans the whole string; re.match() would only
            # accept a match that begins at the first character.
            match = re.search(pat, text)
            if match:
                print('Matched: {} with: {}'.format(pat, match.group(0)))
            else:
                print('Did not find {} in {}'.format(pat, text))
retest2()
Enter string: abc Did not find aaa in abc Matched: abc with: abc Matched: ... with: abc Matched: ^...$ with: abc Did not find \.\.\. in abc Did not find abd in abc Did not find ^[aeiou]*$ in abc Did not find ^[^aeiou]*$ in abc Did not find \w\W\w in abc Matched: \w\w\w with: abc Did not find ^\d+$ in abc Did not find ^[0-7]+$ in abc Matched: ^[0-9A-Fa-f]+$ with: abc Matched: ^[a-z]*$ with: abc Did not find ^\s+ in abc Did not find ^\d\s? in abc Did not find \w+@\w+ in abc Matched: (b)*(a)*(c) with: bc Did not find ^b*a*c$ in abc Did not find ^(b|c)*(a|b)*$ in abc Matched: ^bb*(ab|ba)*|(bbc|cbc)*$ with: Did not find ^(ab|ba|cb|bc|ca|ac)*$ in abc Did not find (bc|bcc)(bac|cba)(cba|aa) in abc Did not find \AThe in abc Enter string: joe@yale Did not find aaa in joe@yale Did not find abc in joe@yale Matched: ... with: joe Did not find ^...$ in joe@yale Did not find \.\.\. in joe@yale Did not find abd in joe@yale Did not find ^[aeiou]*$ in joe@yale Did not find ^[^aeiou]*$ in joe@yale Matched: \w\W\w with: e@y Matched: \w\w\w with: joe Did not find ^\d+$ in joe@yale Did not find ^[0-7]+$ in joe@yale Did not find ^[0-9A-Fa-f]+$ in joe@yale Did not find ^[a-z]*$ in joe@yale Did not find ^\s+ in joe@yale Did not find ^\d\s? in joe@yale Matched: \w+@\w+ with: joe@yale Did not find (b)*(a)*(c) in joe@yale Did not find ^b*a*c$ in joe@yale Did not find ^(b|c)*(a|b)*$ in joe@yale Matched: ^bb*(ab|ba)*|(bbc|cbc)*$ with: Did not find ^(ab|ba|cb|bc|ca|ac)*$ in joe@yale Did not find (bc|bcc)(bac|cba)(cba|aa) in joe@yale Did not find \AThe in joe@yale
def retest3(patlist=(r'(\w+)@(\w+)', r'^(\d\d\d).(\d\d\d)')):
    """Interactively show the capture groups of each pattern in *patlist*.

    Repeatedly prompts for a string and, for each pattern, prints the pattern
    followed by every capture group of the first match (``group 1: ...
    group 2: ...``), or a "Did not find" line. The loop ends when the user
    enters ``quit`` or input is exhausted.

    Args:
        patlist: Iterable of regular-expression strings. The default is an
            immutable tuple of two raw-string patterns, each with two groups.

    Returns:
        None. Results are only printed.
    """
    while True:
        try:
            text = input("\nEnter string: ")
        except EOFError:
            # End of input is treated like "quit" rather than a crash.
            break
        if text == "quit":
            break
        for pat in patlist:
            match = re.search(pat, text)
            if match:
                # Report however many groups the pattern defines; asking for
                # group(2) unconditionally raises IndexError on patterns
                # with fewer than two groups.
                fields = ['Matched: {}'.format(pat)]
                fields.extend(
                    'group {}: {}'.format(number, value)
                    for number, value in enumerate(match.groups(), start=1)
                )
                print(' '.join(fields))
            else:
                print('Did not find {} in {}'.format(pat, text))
retest3()
def retest4(patlist=patterns):
    """Interactively count the matches of every pattern in *patlist*.

    Repeatedly prompts for a string and, for each pattern, prints how many
    results ``re.findall()`` returned, or a "Did not find" line when it
    returned none. The loop ends when the user enters ``quit`` or input is
    exhausted.

    Args:
        patlist: Iterable of regular-expression strings; defaults to the
            module-level ``patterns`` list.

    Returns:
        None. Results are only printed.
    """
    while True:
        try:
            text = input("\nEnter string: ")
        except EOFError:
            # End of input is treated like "quit" rather than a crash.
            break
        if text == "quit":
            break
        for pat in patlist:
            # findall() returns a list of non-overlapping matches, so
            # 'aaa' is counted twice in 'aaaaaa'. Empty matches are part of
            # the list and therefore counted as well.
            found = re.findall(pat, text)
            if found:
                print('Total matches for {}: {}'.format(pat, len(found)))
            else:
                print('Did not find {} in {}'.format(pat, text))
retest4()
Enter string: aaaaaa Total matches for aaa: 2 Did not find abc in aaaaaa Total matches for ...: 2 Did not find ^...$ in aaaaaa Did not find \.\.\. in aaaaaa Did not find abd in aaaaaa Total matches for ^[aeiou]*$: 1 Did not find ^[^aeiou]*$ in aaaaaa Did not find \w\W\w in aaaaaa Total matches for \w\w\w: 2 Did not find ^\d+$ in aaaaaa Did not find ^[0-7]+$ in aaaaaa Total matches for ^[0-9A-Fa-f]+$: 1 Total matches for ^[a-z]*$: 1 Did not find ^\s+ in aaaaaa Did not find ^\d\s? in aaaaaa Did not find \w+@\w+ in aaaaaa Did not find (b)*(a)*(c) in aaaaaa Did not find ^b*a*c$ in aaaaaa Total matches for ^(b|c)*(a|b)*$: 1 Total matches for ^bb*(ab|ba)*|(bbc|cbc)*$: 1 Did not find ^(ab|ba|cb|bc|ca|ac)*$ in aaaaaa Did not find (bc|bcc)(bac|cba)(cba|aa) in aaaaaa Did not find \AThe in aaaaaa
End of regular expressions notebook.