正则表达式是文本处理的强大武器,在网页爬虫中有大量的应用。
1、简单的匹配
# Plain substring matching: the `in` operator is enough, no regex needed.
import re

pattern1 = 'cat'
pattern2 = 'bird'
string = "dog runs to cat"

found_first = pattern1 in string    # 'cat' occurs in the sentence
found_second = pattern2 in string   # 'bird' does not
print(found_first)
print(found_second)
用正则表达式匹配
# Regular-expression matching.
# re.search(pattern, string) scans `string` for the first location where
# `pattern` matches; it returns a Match object, or None when nothing matches.
import re

pattern1 = "cat"
pattern2 = "bird"
string = "dog runs to cat"

first_result = re.search(pattern1, string)
second_result = re.search(pattern2, string)
print(first_result)
print(second_result)
返回
<re.Match object; span=(12, 15), match='cat'>
None
2、灵活匹配
# Multiple alternatives for a single character ("run" or "ran").
import re

# The r prefix marks a raw string literal, so backslashes reach the regex
# engine untouched. [au] matches exactly one character: either 'a' or 'u'.
ptn = r"r[au]n"
print(re.search(ptn, "dog runs to cat"))
<re.Match object; span=(4, 7), match='run'>
import re

# A hyphen inside [] denotes a range of characters.
sentence = "dog runs to cat"
print(re.search(r"r[A-Z]n", sentence))            # uppercase letters only
print(re.search(r"r[a-z]n", sentence))            # lowercase letters only
print(re.search(r"r[0-9]n", "dog r2ns to cat"))   # digits only
print(re.search(r"r[0-9a-z]n", sentence))         # digits or lowercase letters
None
<re.Match object; span=(4, 7), match='run'>
<re.Match object; span=(4, 7), match='r2n'>
<re.Match object; span=(4, 7), match='run'>
3、按类型匹配
# Digits
##############
digit_sample = "run r4n"
# \d matches any decimal digit
print(re.search(r"r\dn", digit_sample))
# \D matches any character that is not a digit
print(re.search(r"r\Dn", digit_sample))
<re.Match object; span=(4, 7), match='r4n'>
<re.Match object; span=(0, 3), match='run'>
# Whitespace
##############
# The sample is deliberately NOT a raw string: "\n" here is a real newline.
whitespace_sample = "r\nn r4n"
# \s matches a single whitespace character, which includes [ \t\n\r\f\v]
print(re.search(r"r\sn", whitespace_sample))
# \S matches any character that is not whitespace
print(re.search(r"r\Sn", whitespace_sample))
<re.Match object; span=(0, 3), match='r\nn'>
<re.Match object; span=(4, 7), match='r4n'>
# All letters, digits and "_"
##############
# The sample is deliberately NOT a raw string: "\n" here is a real newline.
word_sample = "r\nn r4n"
# \w matches a word character: [a-zA-Z0-9_] (for str patterns also other
# Unicode alphanumerics)
print(re.search(r"r\wn", word_sample))
# \W matches any character that is not a word character
print(re.search(r"r\Wn", word_sample))
# Word boundaries
##############
boundary_sample = "dog runs to cat"
# \b matches the empty string at the beginning or end of a word
print(re.search(r"\bruns\b", boundary_sample))
# \B matches the empty string only where there is NO word boundary;
# "runs" has a space on each side, so both \B assertions fail here.
print(re.search(r"\Bruns\B", boundary_sample))
# The trailing \b sits between the space and the "t" of "to", which is a
# word boundary, so this pattern matches "runs " (including the space).
print(re.search(r"\bruns \b", boundary_sample))
<re.Match object; span=(4, 8), match='runs'>
None
<re.Match object; span=(4, 9), match='runs '>
# \\ matches a literal backslash
# .  matches any single character except \n
# Start and end of the string
############
anchor_sample = "dog runs to cat"
# ^ anchors the pattern at the start of the string
print(re.search(r"^dog", anchor_sample))
# $ anchors the pattern at the end of the string
print(re.search(r"cat$", anchor_sample))
<re.Match object; span=(0, 3), match='dog'>
<re.Match object; span=(12, 15), match='cat'>
# Optional parts
###############
# ? makes the preceding item optional: it may occur zero or one time,
# so "Mon(day)?" accepts both the long and the short form.
for day_name in ("Monday", "Mon"):
    print(re.search(r"Mon(day)?", day_name))
<re.Match object; span=(0, 6), match='Monday'>
<re.Match object; span=(0, 3), match='Mon'>
# Multi-line matching
string = """
dog runs to cat.
I run to dog.
"""
# By default ^ matches only at the very beginning of the string, and the
# string starts with a newline, so nothing is found.
print(re.search(r"^I", string))
# With re.MULTILINE (alias re.M) ^ also matches right after each newline,
# so the "I" that opens the second line is found.
print(re.search(r"^I", string, flags=re.MULTILINE))
None
<re.Match object; span=(18, 19), match='I'>
4、重复匹配
# +     : one or more repetitions
# *     : zero or more repetitions
# {n,m} : between n and m repetitions (inclusive)
a_then_bs = r"ab{2,10}"   # an "a" followed by 2 to 10 "b"s
print(re.search(a_then_bs, "a"))
print(re.search(a_then_bs, "abbbbb"))
None
<re.Match object; span=(0, 6), match='abbbbb'>
5、分组
# Groups: each pair of parentheses captures one part of the match.
match = re.search(r"(\d+),Date:(.+)", "ID:021523,Date:Feb/12/2017")
# Indexing a Match object is equivalent to calling .group():
# index 0 is the whole match, 1 and 2 are the captured groups.
print(match[0])
print(match[1])
print(match[2])
021523,Date:Feb/12/2017
021523
Feb/12/2017
# Named groups
# (?P<name>...) gives a group a name so it can be looked up by that name
# instead of by position.
match = re.search(r"(?P<id>\d+),Date:(?P<date>.+)", "ID:021523,Date:Feb/12/2017")
print(match['id'])
print(match['date'])
021523
Feb/12/2017
6、findall
# Find every match
# re.findall returns all non-overlapping matches of the pattern as a list.
findall_sample = "run ran ren"
char_class_hits = re.findall(r"r[ua]n", findall_sample)
print(char_class_hits)
# | means "or"
whole_word_hits = re.findall(r"(run|ran)", findall_sample)
print(whole_word_hits)
# When the pattern has one capturing group, findall returns the text of
# that group rather than the whole match, hence the single letters.
# The alternation must be (u|a); the earlier (u|r) could never match "ran".
vowel_group_hits = re.findall(r"r(u|a)n", findall_sample)
print(vowel_group_hits)
['run', 'ran']
['run', 'ran']
['u', 'a']
7、replace
# Replace
# re.sub(pattern, repl, string) returns a copy of `string` in which every
# match of `pattern` has been replaced by `repl`.
rewritten = re.sub(r"r[au]ns", "catches", "dog runs to cat")
print(rewritten)
dog catches to cat
8、split
# Split
# re.split cuts the string at every match of the pattern; here the
# separators are a comma, a semicolon or a literal dot.
pieces = re.split(r"[,;\.]", "a;b,c.d;e")
print(pieces)
['a', 'b', 'c', 'd', 'e']
9、compile
# Compile the pattern once into a reusable pattern object, then search
# with the object's own search() method.
compiled_re = re.compile(r"r[ua]n")
compiled_hit = compiled_re.search("dog runs to cat")
print(compiled_hit)
<re.Match object; span=(4, 7), match='run'>