学习爬虫前先了解一下正则表达式吧~
导入模块
import re #正则表达式模块
简单匹配
# Plain substring test: the `in` operator needs no regular expression.
pattern1 = "cat"
pattern2 = "bird"
string = "dog runs to cat"
for candidate in (pattern1, pattern2):
    # True when the candidate occurs anywhere inside the text.
    print(candidate in string)
True
False
用正则寻找配对
# Regular-expression search: re.search() returns a match object, or None
# when the pattern occurs nowhere in the string.
pattern1 = "cat"
pattern2 = "bird"
string = "dog runs to cat"
found = re.search(pattern1, string)
print(found)          # the match object itself
print(found.group())  # group() gives the matched substring
print(found.span())   # span() gives the (start, end) indices in the original string
print(re.search(pattern2, string))  # no match -> None
<_sre.SRE_Match object at 0x7fde38270b28>
cat
(12, 15)
None
匹配多种可能使用[]
# A character class [...] matches any one of the listed characters, so this
# pattern accepts both "run" and "ran".
run_or_ran = re.search(r'r[au]n', "dog runs to cat")
print(run_or_ran.group())
run
匹配更多种可能
# Ranges inside a character class: upper case, lower case, digits, or a mix.
for range_regex, range_text in (
    (r'r[A-Z]n', 'dog runs to cat'),      # upper case only -> no match
    (r'r[a-z]n', 'dog runs to cat'),
    (r'r[0-9]n', 'dog r2ns to cat'),
    (r'r[0-9a-z]n', 'dog runs to cat'),
):
    print(re.search(range_regex, range_text))
None
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>
特殊种类匹配
数字
digit_text = 'run r4n'
# \d : any decimal digit
print(re.search(r'r\dn', digit_text).group())
# \D : any character that is not a decimal digit
print(re.search(r'r\Dn', digit_text).group())
r4n
run
空白
spaced_text = 'r\nn r4n'
# \s : any whitespace character [ \t\n\r\f\v]
print(re.search(r'r\sn', spaced_text).group())
# \S : the opposite of \s, any non-whitespace character
print(re.search(r'r\Sn', spaced_text).group())
r
n
r4n
所有字母、数字和下划线"_"
word_text = 'r\nn r4n'
# \w : [a-zA-Z0-9_] any letter (upper or lower case), digit or underscore
print(re.search(r'r\wn', word_text).group())
# \W : the opposite of \w
print(re.search(r'r\Wn', word_text).group())
r4n
r
n
单词边界
# \b : empty string at a word boundary (only at the start or end of a word)
print(re.search(r'\bruns\b', 'dog runs to cat').group())
# \B : empty string that is NOT at a word boundary. A single space next to a
# letter is itself a boundary, so the sample text needs two spaces on each
# side of "runs" for the \B at either end of the pattern to succeed; with
# single spaces re.search() returns None and .group() raises AttributeError.
non_boundary_match = re.search(r'\B runs \B', 'dog  runs  to cat')
print(non_boundary_match.group())
runs
runs
特殊字符 任意字符
# \\ : match a literal backslash. The sample text is written as a raw string
# so its backslash is not read as the start of an (invalid) escape sequence,
# which newer Python versions warn about.
backslash_text = r'runs\ to me'
print(re.search(r'runs\\', backslash_text).group())
# . : match any single character (except \n)
print(re.search(r'r.n', 'r[ns to me]').group())
runs\
r[n
句首句尾
anchor_text = 'dog runs to cat'
# ^ : anchor the match at the beginning of the line
print(re.search(r'^dog', anchor_text).group())
# $ : anchor the match at the end of the line
print(re.search(r'cat$', anchor_text).group())
dog
None
cat
是否
# ? : the item just before it may or may not occur (zero or one time)
for day_text in ('Monday', 'Mon', 'Mond'):
    print(re.search(r'Mon(day)?', day_text).group())
Monday
Mon
Mon
多行匹配
# Multi-line matching: by default ^ anchors only at the very start of the
# string; with re.M (alias re.MULTILINE) it also anchors after each newline.
string = """
dog runs to cat.
I run to dog.
"""
print(re.search(r'^I', string))  # None: "I" is not at the start of the whole string
for multiline_flag in (re.M, re.MULTILINE):  # two spellings of the same flag
    # with the flag set, every line is handled on its own
    print(re.search(r'^I', string, flags=multiline_flag).group())
None
I
I
0或多次
# * : the item just before it occurs zero or more times
for star_text in ('a', 'abbb'):
    print(re.search(r'ab*', star_text).group())
a
abbb
1或多次
# + : the item just before it occurs one or more times
plus_regex = r'ab+'
print(re.search(plus_regex, 'a'))  # None: at least one "b" is required
print(re.search(plus_regex, 'abbb').group())
None
abbb
可选次数
# {n,m} : the item just before it occurs between n and m times
bounded_regex = r'ab{2,10}'
print(re.search(bounded_regex, 'a'))  # None: fewer than two "b" characters
print(re.search(bounded_regex, 'abbbb').group())
None
abbbb
group组
# Groups: parentheses capture parts of the match for later retrieval.
match = re.search(r'(\d+), Data: (.+)', 'ID: 20180317, Data: Mar/17/2018')
for group_index in range(3):
    # index 0 is the whole match; 1 and 2 are the captured groups
    print(match.group(group_index))
20180317, Data: Mar/17/2018
20180317
Mar/17/2018
# Named groups: (?P<name>...) lets a captured group be fetched by name.
match = re.search(
    r'(?P<id>\d+), Data: (?P<date>.+)',
    'ID: 20180317, Data: Mar/17/2018',
)
for group_name in ('id', 'date'):
    print(match.group(group_name))
20180317
Mar/17/2018
寻找所有匹配
# findall: return every non-overlapping match as a list
all_hits = re.findall(r'r[ua]n', 'run ran ren')
print(all_hits)
['run', 'ran']
# | : alternation -- match either the expression on its left or the one on its right
either_hits = re.findall(r'run|ran', 'run ran ren')
print(either_hits)
['run', 'ran']
替换
# re.sub(): replace every match of the pattern with the replacement text
for sub_regex, replacement, sub_text in (
    (r'r[au]ns', 'catches', 'dog runs to cat'),
    (r'I', 'You', 'I like apple'),
):
    print(re.sub(sub_regex, replacement, sub_text))
dog catches to cat
You like apple
分裂
# re.split(): cut the string at every comma, semicolon or period
separator_regex = r'[,;\.]'
print(re.split(separator_regex, 'a;b,c.d;e.f'))
['a', 'b', 'c', 'd', 'e', 'f']
compile
# compile: build the pattern object once, then call its methods directly
compiled_re = re.compile(r'r[au]n')
compiled_hit = compiled_re.search('dog ran to cat')
print(compiled_hit.group())
ran