# 2. 字符串和文本
# 2.1 针对任意多的分隔符拆分字符串
import re

# Sample record whose fields are separated by an inconsistent mix of
# semicolons, commas and whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'

# Split on any single delimiter character followed by optional whitespace.
print(re.compile(r'[;,\s]\s*').split(line))

# With a capture group, the matched delimiters are kept in the result list.
fileds = re.compile(r'(;|,|\s)\s*').split(line)
print(fileds)

# A non-capturing group (?:...) groups the alternatives without keeping
# the delimiters in the result.
print(re.compile(r'(?:,|;|\s)\s*').split(line))
# 2.2 在字符串的开头或结尾处作文本匹配
import re

# Prefix / suffix checks with str.startswith() and str.endswith().
filename = "spam.txt"
url = 'http://www.python.org'
print(
    filename.endswith('.txt'),
    filename.startswith('file'),
    url.startswith('http:'),
    sep='\n',
)

# To test against several candidates at once, pass startswith()/endswith()
# a tuple holding the possible options.

# The same kind of check expressed as a regular expression.
print(re.compile('http:|https:|ftp:').match(url))
# 2.3 利用Shell通配符做字符串匹配
'''
当工作在UNIX Shell下时, 想使用常见的通配符模式
'''
from fnmatch import fnmatch, fnmatchcase

# Shell-style wildcard checks against single file names.
print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))

# Keep only the names that match the wildcard pattern.
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
print(list(filter(lambda candidate: fnmatch(candidate, 'Dat*.csv'), names)))

# fnmatchcase() compares case-sensitively.
print(fnmatchcase('foo.txt', '*.TXT'))
# 2.4 文本模式的匹配和查找
# Literal matching needs only ==, startswith(), endswith() and find().
text = 'yeah, but no, but yeah, but no, but yeah'
print(
    text == 'yeah',
    text.startswith('yeah'),
    text.endswith('yeah'),
    text.find('no'),
    sep='\n',
)
# More complex matching: use regular expressions.
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'
import re

# One-off check through the module-level function; match() only succeeds
# when the pattern matches at the start of the string.
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')

# Compile the pattern into a pattern object so it can be reused.
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')
# 2.5 查找和替换文本
import re

# Literal text: str.replace() is enough.
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.replace('yeah', 'yep'))

# Patterned text: use re.sub().
text = 'Today is 11/27/2012. PyCon starts 3/13/2013'

# In the replacement, a backslash followed by a digit refers to the
# capture group with that number.
print(re.sub(pattern=r'(\d+)/(\d+)/(\d+)', repl=r'\3-\1-\2', string=text))

# The same substitution through a precompiled pattern object.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(repl=r'\3-\1-\2', string=text))
# More advanced substitutions: supply a callback as the replacement.
from calendar import month_abbr


def change_date(m):
    """Reformat a matched ``month/day/year`` date as ``'day Mon year'``.

    Intended as the replacement callback for ``re.sub``/``Pattern.sub``.

    Args:
        m: Match object with three groups; group 1 is the month number,
            and groups 2 and 3 are copied through unchanged.

    Returns:
        The string ``'<group 2> <abbreviated month name> <group 3>'``.

    Raises:
        ValueError: If group 1 is not an integer literal.
        IndexError: If the month number is outside ``month_abbr``'s range.
    """
    # month_abbr maps a month number to its abbreviated month name.
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))
# Let the callback decide the replacement text for every match.
print(datepat.sub(repl=change_date, string=text))

# subn() returns the new text together with the number of substitutions made.
(newtext, n) = datepat.subn(repl=r'\3-\1-\2', string=text)
print(n)
# 2.6 以不区分大小写的方式对文本做查找和替换
import re

text = 'UPPER PYTHON, lower python, Mixed Python'

# re.I is the short alias of re.IGNORECASE.
print(re.findall(pattern='python', string=text, flags=re.I))
print(re.sub(pattern='python', repl='snake', string=text, flags=re.I))
# Make the replacement text follow the case of the text it replaces.
# Implemented as a closure that remembers the replacement word.
def matchcase(word):
    """Build an ``re.sub`` callback that inserts *word* in the match's case.

    Args:
        word: Replacement word to insert.

    Returns:
        A function taking a match object and returning *word* upper-cased
        for an all-uppercase match, lower-cased for an all-lowercase match,
        capitalized when the match starts with an uppercase letter, and
        unchanged otherwise.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice rather than index so an empty match cannot raise IndexError.
        if matched[:1].isupper():
            return word.capitalize()
        return word
    return replace
# Substitute through the callback so each replacement copies the case of its match.
print(re.sub('python', matchcase('snake'), text, flags=re.I))
# 2.7 定义实现最短匹配的正则表达式
# Greedy versus non-greedy matching.
import re

text1 = 'Computer says "no."'
text2 = 'Computer says "no." Phone says "yes."'

# Greedy: .* takes the longest possible run between two quotes.
str_pat = re.compile(r'\"(.*)\"')
print(re.findall(str_pat, text1))
print(re.findall(str_pat, text2))

# Adding ? after * or + makes the match non-greedy (shortest possible).
str_pat1 = re.compile(r'\"(.*?)\"')
print(re.findall(str_pat1, text2))