正则表达式
正则表达式简介
搜索(searching),即在字符串任意部分中搜索匹配的模式;
匹配(matching)是指判断一个字符串能否从起始处全部或者部分地匹配某个模式。
择一匹配的管道符号
(|),使用择一匹配符号匹配多个正则表达式模式
匹配任意单个字符
点号或者句点(.)符号匹配除了换行符\n 以外的任何字符(匹配句点需要转义)
从字符串起始或者结尾或者单词边界匹配
特殊字符\b 和\B 可以用来匹配字符边界。
创建字符集
限定范围和否定
使用闭包操作符实现存在性和频数匹配
表示字符集的特殊字符
使用圆括号指定分组
扩展表示法
正则表达式和Python 语言
re 模块来支持正则表达式(替换regex 模块和regsub 模块,若导入会触发ImportError 异常。)
re.compile()预编译来提升执行性能
匹配对象以及group()和groups()方法
group()要么返回整个匹配对象,要么根据要求返回特定子组。
groups()则仅返回一个包含唯一或者全部子组的元组。
如果没有子组的要求,那么当group()仍然返回整个匹配时,groups()返回一个空元组。
使用match()方法匹配字符串
从字符串的起始部分对模式进行匹配。如果匹配成功,就返回一个匹配对象;如果匹配失败,就返回None,匹配对象的group()方法能够用于显示那个成功的匹配。
>>> m = re.match('foo', 'foo') # 模式匹配字符串
>>> if m is not None: # 如果匹配成功,就输出匹配内容
... m.group()
...
'foo'
>>> m # 确认返回的匹配对象
<re.MatchObject instance at 80ebf48>
#失败的匹配示例,它返回None
>>> m = re.match('foo', 'bar')# 模式并不能匹配字符串
>>> if m is not None: m.group() # (实际中最好不要省略,单行版本的if 语句)
...
>> >
>>> m = re.match('foo', 'food on the table') # 匹配成功
>>> m.group()
'f oo'
#如果匹配失败,将会抛出AttributeError 异常。
>>> re.match('foo', 'food on the table').group()
'foo'
使用search()在一个字符串中查找模式(搜索与匹配的对比)
>>> m = re.search('foo', 'seafood') # 使用 search() 代替
>>> if m is not None: m.group()
...
'foo' # 搜索成功,但是匹配失败
>> >
匹配多个字符串
>>> bt = 'bat|bet|bit' # 正则表达式模式: bat、bet、bit
>>> m = re.match(bt, 'bat') # 'bat' 是一个匹配
>>> if m is not None: m.group()
...
'bat'
>>> m = re.match(bt, 'blt') # 对于 'blt' 没有匹配
>>> if m is not None: m.group()
...
>>> m = re.match(bt, 'He bit me!') # 不能匹配字符串
>>> if m is not None: m.group()
...
>>> m = re.search(bt, 'He bit me!') # 通过搜索查找 'bit'
>>> if m is not None: m.group()
...
'b it'
匹配任何单个字符
>>> anyend = '.end'
>>> m = re.match(anyend, 'bend') # 点号匹配 'b'
>>> if m is not None: m.group()
...
'bend'
>>> m = re.match(anyend, 'end') # 不匹配任何字符
>>> if m is not None: m.group()
...
>>> m = re.match(anyend, '\nend') # 除了 \n 之外的任何字符
>>> if m is not None: m.group()
...
>>> m = re.search('.end', 'The end.')# 在搜索中匹配 ' '
>>> if m is not None: m.group()
...
' end'
>>> patt314 = '3.14' # 表示正则表达式的点号
>>> pi_patt = '3\.14' # 表示字面量的点号 (dec. point)
>>> m = re.match(pi_patt, '3.14') # 精确匹配
>>> if m is not None: m.group()
...
'3.14'
>>> m = re.match(patt314, '3014') # 点号匹配'0'
>>> if m is not None: m.group()
...
'3014'
>>> m = re.match(patt314, '3.14') # 点号匹配 '.'
>>> if m is not None: m.group()
...
'3 .14'
创建字符集([ ])
>>> m = re.match('[cr][23][dp][o2]', 'c3po')# 匹配 'c3po'
>>> if m is not None: m.group()
...
'c3po'
>>> m = re.match('[cr][23][dp][o2]', 'c2do')# 匹配 'c2do'
>>> if m is not None: m.group()
...
'c2do'
>>> m = re.match('r2d2|c3po', 'c2do')# 不匹配 'c2do'
>>> if m is not None: m.group()
...
>>> m = re.match('r2d2|c3po', 'r2d2')# 匹配 'r2d2'
>>> if m is not None: m.group()
...
'r 2d2'
重复、特殊字符以及分组
>>> patt = '\w+@(\w+\.)?\w+\.com'
>>> re.match(patt, 'nobody@xxx.com').group()
'nobody@xxx.com'
>>> re.match(patt, 'nobody@www.xxx.com').group()
'n obody@www.xxx.com'
>>> patt = '\w+@(\w+\.)*\w+\.com'
>>> re.match(patt, 'nobody@www.xxx.yyy.zzz.com').group()
'n obody@www.xxx.yyy.zzz.com'
>>> m = re.match('\w\w\w-\d\d\d', 'abc-123')
>>> if m is not None: m.group()
...
'abc-123'
>>> m = re.match('\w\w\w-\d\d\d', 'abc-xyz')
>>> if m is not None: m.group()
...
>>>
group()通常用于以普通方式显示所有的匹配部分,但也能用于获取各个匹配的子组。
>>> m = re.match('(\w\w\w)-(\d\d\d)', 'abc-123')
>>> m.group() # 完整匹配
'abc-123'
>>> m.group(1) # 子组 1
'abc'
>>> m.group(2) # 子组 2
'123'
>>> m.groups() # 全部子组
(' abc', '123')
groups()方法来获取一个包含所有匹配子字符串的元组
>>> m = re.match('ab', 'ab') # 没有子组
>>> m.group() # 完整匹配
'ab'
>>> m.groups() # 所有子组
()
>>>
>>> m = re.match('(ab)', 'ab') # 一个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'ab'
>>> m.groups() # 全部子组
('ab',)
>>>
>>> m = re.match('(a)(b)', 'ab') # 两个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'a'
>>> m.group(2) # 子组 2
'b'
>>> m.groups() # 所有子组
匹配字符串的起始和结尾以及单词边界
>>> m = re.search('^The', 'The end.') # 匹配
>>> if m is not None: m.group()
...
'The'
>>> m = re.search('^The', 'end. The') # 不作为起始
>>> if m is not None: m.group()
...
>>> m = re.search(r'\bthe', 'bite the dog') # 在边界
>>> if m is not None: m.group()
...
'the'
>>> m = re.search(r'\bthe', 'bitethe dog') # 有边界
>>> if m is not None: m.group()
...
>>> m = re.search(r'\Bthe', 'bitethe dog') # 没有边界
>>> if m is not None: m.group()
...
't he'
使用findall()和finditer()查找每一次出现的位置
findall()查询字符串中某个正则表达式模式全部的非重复出现情况。
>>> re.findall('car', 'car')
['car']
>>> re.findall('car', 'scary')
['car']
>>> re.findall('car', 'carry the barcardi to the car')
[' car', 'car', 'car']
>>> s = 'This and that.'
>>> re.findall(r'(th\w+) and (th\w+)', s, re.I)
[('This', 'that')]
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().groups()
('This', 'that')
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(1)
'This'
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(2)
'that'
>>> [g.groups() for g in re.finditer(r'(th\w+) and (th\w+)',
... s, re.I)]
[( 'This', 'that')]
>>> re.findall(r'(th\w+)', s, re.I)
['This', 'that']
>>> it = re.finditer(r'(th\w+)', s, re.I)
>>> g = it.next()
>>> g.groups()
('This',)
>>> g.group(1)
'This'
>>> g = it.next()
>>> g.groups()
('that',)
>>> g.group(1)
'that'
>>> [g.group(1) for g in re.finditer(r'(th\w+)', s, re.I)]
[' This', 'that']
使用sub()和subn()搜索与替换
实现搜索和替换功能:sub()和subn()。
subn()和sub()一样,但subn()还返回一个表示替换的总数,替换后的字符串和表示替换总数的数字一起作为一个拥有两个
元素的元组返回。
>>> re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
'attn: Mr. Smith\012\012Dear Mr. Smith,\012'
>>>
>>> re.subn('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
('attn: Mr. Smith\012\012Dear Mr. Smith,\012', 2)
>>>
>>> print re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
attn: Mr. Smith
Dear Mr. Smith,
>>> re.sub('[ae]', 'X', 'abcdef')
'XbcdXf'
>>> re.subn('[ae]', 'X', 'abcdef')
(' XbcdXf', 2)
>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/91') # Yes, Python is...
'20/2/91'
>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/1991') # ... 20+ years old!
'20/2/1991'
在限定模式上使用split()分隔字符串
>>> re.split(':', 'str1:str2:str3')
[' str1', 'str2', 'str3']
>>> import re
>>> DATA = (
... 'Mountain View, CA 94040',
... 'Sunnyvale, CA',
... 'Los Altos, 94023',
... 'Cupertino 95014',
... 'Palo Alto CA',
... )
>>> for datum in DATA:
... print re.split(', |(?= (?:\d{5}|[A-Z]{2})) ', datum)
...
['Mountain View', 'CA', '94040']
['Sunnyvale', 'CA']
['Los Altos', '94023']
['Cupertino', '95014']
[' Palo Alto', 'CA']
扩展符号
>>> re.findall(r'(?i)yes', 'yes? Yes. YES!!')
['yes', 'Yes', 'YES']
>>> re.findall(r'(?i)th\w+', 'The quickest way is through this
tunnel.')
['The', 'through', 'this']
>>> re.findall(r'(?im)(^th[\w ]+)', """
... This line is the first,
... another line,
... that line, it's the best
... """)
[' This line is the first', 'that line']
>>> re.findall(r'th.+', '''
... The first line
... the second line
... the third line
... ''')
['the second line', 'the third line']
>>> re.findall(r'(?s)th.+', '''
... The first line
... the second line
... the third line
... ''')
[' the second line\nthe third line\n']
>>> re.search(r'''(?x)
... \((\d{3})\) # 区号
... [ ] # 空白符
... (\d{3}) # 前缀
... - # 横线
... (\d{4}) # 终点数字
... ''', '(800) 555-1212').groups()
(' 800', '555', '1212')
>>> re.findall(r'http://(?:\w+\.)*(\w+\.com)',
... 'http://google.com http://www.google.com http://
code.google.com')
['google.com', 'google.com', 'google.com']
>>> re.search(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... '(800) 555-1212').groupdict()
{' areacode': '800', 'prefix': '555'}
>>> re.sub(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... '(\g<areacode>) \g<prefix>-xxxx', '(800) 555-1212')
'( 800) 555-xxxx'
>>> bool(re.match(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-
(?P<number>\d{4}) (?P=areacode)-(?P=prefix)-(?P=number)
1(?P=areacode)(?P=prefix)(?P=number)',
... '(800) 555-1212 800-555-1212 18005551212'))
True
>>> bool(re.match(r'''(?x)
...
... # match (800) 555-1212, save areacode, prefix, no.
... \((?P<areacode>\d{3})\)[ ](?P<prefix>\d{3})-(?P<number>\d{4})
...
... # space
... [ ]
...
... # match 800-555-1212
... (?P=areacode)-(?P=prefix)-(?P=number)
...
... # space
... [ ]
...
... # match 18005551212
... 1(?P=areacode)(?P=prefix)(?P=number)
...
... ''', '(800) 555-1212 800-555-1212 18005551212'))
True
>>> re.findall(r'\w+(?= van Rossum)',
... '''
... Guido van Rossum
... Tim Peters
... Alex Martelli
... Just van Rossum
... Raymond Hettinger
... ''')
['Guido', 'Just']
>>> re.findall(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
... sales@phptr.com
... postmaster@phptr.com
... eng@phptr.com
... noreply@phptr.com
... admin@phptr.com
... ''')
['sales', 'eng', 'admin']
>>> ['%s@aw.com' % e.group(1) for e in \
re.finditer(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
... sales@phptr.com
... postmaster@phptr.com
... eng@phptr.com
... noreply@phptr.com
... admin@phptr.com
... ''')]
[' sales@aw.com', 'eng@aw.com', 'admin@aw.com']
>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xy'))
True
>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xx'))
False
正则表达式示例
#!/usr/bin/env python
import os
from distutils.log import warn as printf #兼容python2
import re
with os.popen('who','r') as f:
for eachLine in f:
print(re.split(r'\s\s+|\t',eachLine.strip()))
处理DOS 环境下tasklist 命令的输出(retasklist.py)
#!/usr/bin/env python
import os
import re
f = os.popen('tasklist /nh', 'r')
for eachLine in f:
print re.findall(
'([\w.]+(?: [\w.]+)*)\s\s+(\d+) \w+\s\s+\d+\s\s+([\d,]+ K)',
eachLine.rstrip())
f.close()
用于正则表达式练习的数据生成器(gendata.py)
#!/usr/bin/env python
from distutils.log import warn as printf
from random import randrange, choice
from string import ascii_lowercase as lc
from time import ctime
tlds = ( 'com', 'edu', 'net', 'org', 'gov' )
if hasattr(__builtins__, 'xrange'):
myrng = xrange
else:
myrng = range
for i in myrng(randrange(5, 11)):
dtint = randrange(2**32) # pick date
dtstr = ctime(dtint) # date string
llen = randrange(4, 7) # login is shorter
login = ''.join(choice(lc) for j in myrng(llen))
dlen = randrange(llen, 13) # domain is longer
dom = ''.join(choice(lc) for j in myrng(dlen))
printf('%s::%s@%s.%s::%d-%d-%d' % (dtstr, login,
dom, choice(tlds), dtint, llen, dlen))