理解完正则学会一半:
import re
def test_patterns(text, patterns=[]):
"""Given source text and a list of patterns, look for
matches for each pattern within the text and print
them to stdout.
"""
print
print ''.join(str(i/10 or ' ') for i in range(len(text)))
print ''.join(str(i%10) for i in range(len(text)))
print text
# Look for each pattern in the text and print the results
for pattern in patterns:
print
print 'Matching "%s"' % pattern
for match in re.finditer(pattern, text):
s = match.start()
e = match.end()
print ' %2d : %2d = "%s"' % \
(s, e-1, text[s:e])
return
# Demo driver: runs test_patterns over a series of sample texts, one group
# of patterns per regular-expression feature, with a row of asterisks
# printed before each group. The print(...) calls take a single argument so
# they behave identically under Python 2 and Python 3.
if __name__ == '__main__':
    print("*" * 50)
    # Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])

    print("*" * 50)
    # Repetition
    test_patterns('abbaaabbbbaaaaa',
                  ['ab*',      # a followed by zero or more b
                   'ab+',      # a followed by one or more b
                   'ab?',      # a followed by zero or one b
                   'ab{3}',    # a followed by three b
                   'ab{2,3}',  # a followed by two to three b
                   ])

    print("*" * 50)
    # Character Sets
    test_patterns('abbaaabbbbaaaaa',
                  ['[ab]',     # either a or b
                   'a[ab]+',   # a followed by one or more a or b
                   'a[ab]+?',  # a followed by one or more a or b, not greedy
                   ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.',
                  ['[^-. ]+',  # sequences without -, ., or space
                   ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.',
                  ['[a-z]+',       # sequences of lower case letters
                   '[A-Z]+',       # sequences of upper case letters
                   '[a-zA-Z]+',    # sequences of lower or upper case letters
                   '[A-Z][a-z]+',  # one upper case letter followed by
                                   # lower case letters
                   ])

    print("*" * 50)
    test_patterns('abbaaabbbbaaaaa',
                  ['a.',     # a followed by any one character
                   'b.',     # b followed by any one character
                   'a.*b',   # a followed by anything, ending in b (greedy)
                   'a.*?b',  # a followed by anything, ending in b, not greedy
                   ])

    print("*" * 50)
    # Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!',
                  [r'\d+',  # sequence of digits
                   r'\D+',  # sequence of non-digits
                   r'\s+',  # sequence of whitespace
                   r'\S+',  # sequence of non-whitespace
                   r'\w+',  # alphanumeric characters
                   r'\W+',  # non-alphanumeric
                   ])

    print("*" * 50)
    # The text here is itself a list of escape codes; each pattern escapes
    # the backslash and the plus sign so they are matched literally.
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+',
                  [r'\\d\+',
                   r'\\D\+',
                   r'\\s\+',
                   r'\\S\+',
                   r'\\w\+',
                   r'\\W\+',
                   ])

    # Anchoring
    # Code  Meaning
    # ^     start of string, or line
    # $     end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.',
                  [r'^\w+',       # word at start of string
                   r'\A\w+',      # word at start of string
                   r'\w+\S*$',    # word at end of string, with optional
                                  # punctuation
                   r'\w+\S*\Z',   # word at end of string, with optional
                                  # punctuation
                   r'\w*t\w*',    # word containing 't'
                   r'\bt\w+',     # 't' at start of word
                   r'\w+t\b',     # 't' at end of word
                   r'\Bt\B',      # 't', not start or end of word
                   ])
输出结果:
**************************************************
          11111
012345678901234
abbaaabbbbaaaaa
Matching "ab"
0 : 1 = "ab"
5 : 6 = "ab"
**************************************************
          11111
012345678901234
abbaaabbbbaaaaa
Matching "ab*"
0 : 2 = "abb"
3 : 3 = "a"
4 : 4 = "a"
5 : 9 = "abbbb"
10 : 10 = "a"
11 : 11 = "a"
12 : 12 = "a"
13 : 13 = "a"
14 : 14 = "a"
Matching "ab+"
0 : 2 = "abb"
5 : 9 = "abbbb"
Matching "ab?"
0 : 1 = "ab"
3 : 3 = "a"
4 : 4 = "a"
5 : 6 = "ab"
10 : 10 = "a"
11 : 11 = "a"
12 : 12 = "a"
13 : 13 = "a"
14 : 14 = "a"
Matching "ab{3}"
5 : 8 = "abbb"
Matching "ab{2,3}"
0 : 2 = "abb"
5 : 8 = "abbb"
**************************************************
          11111
012345678901234
abbaaabbbbaaaaa
Matching "[ab]"
0 : 0 = "a"
1 : 1 = "b"
2 : 2 = "b"
3 : 3 = "a"
4 : 4 = "a"
5 : 5 = "a"
6 : 6 = "b"
7 : 7 = "b"
8 : 8 = "b"
9 : 9 = "b"
10 : 10 = "a"
11 : 11 = "a"
12 : 12 = "a"
13 : 13 = "a"
14 : 14 = "a"
Matching "a[ab]+"
0 : 14 = "abbaaabbbbaaaaa"
Matching "a[ab]+?"
0 : 1 = "ab"
3 : 4 = "aa"
5 : 6 = "ab"
10 : 11 = "aa"
12 : 13 = "aa"
**************************************************
          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.
Matching "[^-. ]+"
0 : 3 = "This"
5 : 6 = "is"
8 : 11 = "some"
13 : 16 = "text"
21 : 24 = "with"
26 : 36 = "punctuation"
**************************************************
          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.
Matching "[a-z]+"
1 : 3 = "his"
5 : 6 = "is"
8 : 11 = "some"
13 : 16 = "text"
21 : 24 = "with"
26 : 36 = "punctuation"
Matching "[A-Z]+"
0 : 0 = "T"
Matching "[a-zA-Z]+"
0 : 3 = "This"
5 : 6 = "is"
8 : 11 = "some"
13 : 16 = "text"
21 : 24 = "with"
26 : 36 = "punctuation"
Matching "[A-Z][a-z]+"
0 : 3 = "This"
**************************************************
          11111
012345678901234
abbaaabbbbaaaaa
Matching "a."
0 : 1 = "ab"
3 : 4 = "aa"
5 : 6 = "ab"
10 : 11 = "aa"
12 : 13 = "aa"
Matching "b."
1 : 2 = "bb"
6 : 7 = "bb"
8 : 9 = "bb"
Matching "a.*b"
0 : 9 = "abbaaabbbb"
Matching "a.*?b"
0 : 1 = "ab"
3 : 6 = "aaab"
**************************************************
          11111111112222222
012345678901234567890123456
This is a prime #1 example!
Matching "\d+"
17 : 17 = "1"
Matching "\D+"
0 : 16 = "This is a prime #"
18 : 26 = " example!"
Matching "\s+"
4 : 4 = " "
7 : 7 = " "
9 : 9 = " "
15 : 15 = " "
18 : 18 = " "
Matching "\S+"
0 : 3 = "This"
5 : 6 = "is"
8 : 8 = "a"
10 : 14 = "prime"
16 : 17 = "#1"
19 : 26 = "example!"
Matching "\w+"
0 : 3 = "This"
5 : 6 = "is"
8 : 8 = "a"
10 : 14 = "prime"
17 : 17 = "1"
19 : 25 = "example"
Matching "\W+"
4 : 4 = " "
7 : 7 = " "
9 : 9 = " "
15 : 16 = " #"
18 : 18 = " "
26 : 26 = "!"
**************************************************
          1111111111222
01234567890123456789012
\d+ \D+ \s+ \S+ \w+ \W+
Matching "\\d\+"
0 : 2 = "\d+"
Matching "\\D\+"
4 : 6 = "\D+"
Matching "\\s\+"
8 : 10 = "\s+"
Matching "\\S\+"
12 : 14 = "\S+"
Matching "\\w\+"
16 : 18 = "\w+"
Matching "\\W\+"
20 : 22 = "\W+"
**************************************************
          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.
Matching "^\w+"
0 : 3 = "This"
Matching "\A\w+"
0 : 3 = "This"
Matching "\w+\S*$"
26 : 37 = "punctuation."
Matching "\w+\S*\Z"
26 : 37 = "punctuation."
Matching "\w*t\w*"
13 : 16 = "text"
21 : 24 = "with"
26 : 36 = "punctuation"
Matching "\bt\w+"
13 : 16 = "text"
Matching "\w+t\b"
13 : 16 = "text"
Matching "\Bt\B"
23 : 23 = "t"
30 : 30 = "t"
33 : 33 = "t"
待续...