re – Regular Expressions 理解完正则学会一半

最新推荐文章于 2024-02-07 02:47:19 发布

u010571535

最新推荐文章于 2024-02-07 02:47:19 发布

阅读量726

点赞数

分类专栏： Python 文章标签： Python re 正则

Python 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

理解完正则学会一半：

import re

def test_patterns(text, patterns=[]):
    """Given source text and a list of patterns, look for
    matches for each pattern within the text and print
    them to stdout.
    """
    print
    print ''.join(str(i/10 or ' ') for i in range(len(text)))
    print ''.join(str(i%10) for i in range(len(text)))
    print text

    # Look for each pattern in the text and print the results
    for pattern in patterns:
        print
        print 'Matching "%s"' % pattern
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            print '  %2d : %2d = "%s"' % \
                (s, e-1, text[s:e])
    return

if __name__ == '__main__':
    print "*"*50
    #Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])
    print "*"*50
    #Repetition
    test_patterns('abbaaabbbbaaaaa',
              [ 'ab*',     # a followed by zero or more b
                'ab+',     # a followed by one or more b
                'ab?',     # a followed by zero or one b
                'ab{3}',   # a followed by three b
                'ab{2,3}', # a followed by two to three b
                ])
    print "*"*50
    #Character Sets
    test_patterns('abbaaabbbbaaaaa',
              [ '[ab]',    # either a or b
                'a[ab]+',  # a followed by one or more a or b
                'a[ab]+?', # a followed by one or more a or b, not greedy
                ])
    print "*"*50
    test_patterns('This is some text -- with punctuation.',
              [ '[^-. ]+',  # sequences without -, ., or space
                ])
    print "*"*50
    test_patterns('This is some text -- with punctuation.',
              [ '[a-z]+',      # sequences of lower case letters
                '[A-Z]+',      # sequences of upper case letters
                '[a-zA-Z]+',   # sequences of lower or upper case letters
                '[A-Z][a-z]+', # one upper case letter followed by lower case letters
                ])
    print "*"*50
    test_patterns('abbaaabbbbaaaaa',
              [ 'a.',   # a followed by any one character
                'b.',   # b followed by any one character
                'a.*b', # a followed by anything, ending in b
                'a.*?b', # a followed by anything, ending in b
                ])
    print "*"*50
    #Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!',
              [ r'\d+', # sequence of digits
                r'\D+', # sequence of non-digits
                r'\s+', # sequence of whitespace
                r'\S+', # sequence of non-whitespace
                r'\w+', # alphanumeric characters
                r'\W+', # non-alphanumeric
                ])
    print "*"*50
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+',
              [ r'\\d\+',
                r'\\D\+',
                r'\\s\+',
                r'\\S\+',
                r'\\w\+',
                r'\\W\+',
                ])
    #Anchoring
    # Code  Meaning
    # ^ start of string, or line
    # $ end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print "*"*50
    test_patterns('This is some text -- with punctuation.',
              [ r'^\w+',     # word at start of string
                r'\A\w+',    # word at start of string
                r'\w+\S*$',  # word at end of string, with optional punctuation
                r'\w+\S*\Z', # word at end of string, with optional punctuation
                r'\w*t\w*',  # word containing 't'
                r'\bt\w+',   # 't' at start of word
                r'\w+t\b',   # 't' at end of word
                r'\Bt\B',    # 't', not start or end of word
                ])

输出结果：

**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "ab"
   0 :  1 = "ab"
   5 :  6 = "ab"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "ab*"
   0 :  2 = "abb"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  9 = "abbbb"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "ab+"
   0 :  2 = "abb"
   5 :  9 = "abbbb"

Matching "ab?"
   0 :  1 = "ab"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  6 = "ab"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "ab{3}"
   5 :  8 = "abbb"

Matching "ab{2,3}"
   0 :  2 = "abb"
   5 :  8 = "abbb"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "[ab]"
   0 :  0 = "a"
   1 :  1 = "b"
   2 :  2 = "b"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  5 = "a"
   6 :  6 = "b"
   7 :  7 = "b"
   8 :  8 = "b"
   9 :  9 = "b"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "a[ab]+"
   0 : 14 = "abbaaabbbbaaaaa"

Matching "a[ab]+?"
   0 :  1 = "ab"
   3 :  4 = "aa"
   5 :  6 = "ab"
  10 : 11 = "aa"
  12 : 13 = "aa"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "[^-. ]+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "[a-z]+"
   1 :  3 = "his"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "[A-Z]+"
   0 :  0 = "T"

Matching "[a-zA-Z]+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "[A-Z][a-z]+"
   0 :  3 = "This"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "a."
   0 :  1 = "ab"
   3 :  4 = "aa"
   5 :  6 = "ab"
  10 : 11 = "aa"
  12 : 13 = "aa"

Matching "b."
   1 :  2 = "bb"
   6 :  7 = "bb"
   8 :  9 = "bb"

Matching "a.*b"
   0 :  9 = "abbaaabbbb"

Matching "a.*?b"
   0 :  1 = "ab"
   3 :  6 = "aaab"
**************************************************

          11111111112222222
012345678901234567890123456
This is a prime #1 example!

Matching "\d+"
  17 : 17 = "1"

Matching "\D+"
   0 : 16 = "This is a prime #"
  18 : 26 = " example!"

Matching "\s+"
   4 :  4 = " "
   7 :  7 = " "
   9 :  9 = " "
  15 : 15 = " "
  18 : 18 = " "

Matching "\S+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 :  8 = "a"
  10 : 14 = "prime"
  16 : 17 = "#1"
  19 : 26 = "example!"

Matching "\w+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 :  8 = "a"
  10 : 14 = "prime"
  17 : 17 = "1"
  19 : 25 = "example"

Matching "\W+"
   4 :  4 = " "
   7 :  7 = " "
   9 :  9 = " "
  15 : 16 = " #"
  18 : 18 = " "
  26 : 26 = "!"
**************************************************

          1111111111222
01234567890123456789012
\d+ \D+ \s+ \S+ \w+ \W+

Matching "\\d\+"
   0 :  2 = "\d+"

Matching "\\D\+"
   4 :  6 = "\D+"

Matching "\\s\+"
   8 : 10 = "\s+"

Matching "\\S\+"
  12 : 14 = "\S+"

Matching "\\w\+"
  16 : 18 = "\w+"

Matching "\\W\+"
  20 : 22 = "\W+"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "^\w+"
   0 :  3 = "This"

Matching "\A\w+"
   0 :  3 = "This"

Matching "\w+\S*$"
  26 : 37 = "punctuation."

Matching "\w+\S*\Z"
  26 : 37 = "punctuation."

Matching "\w*t\w*"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "\bt\w+"
  13 : 16 = "text"

Matching "\w+t\b"
  13 : 16 = "text"

Matching "\Bt\B"
  23 : 23 = "t"
  30 : 30 = "t"
  33 : 33 = "t"

待续...