网页爬虫1--正则表达式

这里写图片描述

教程来源: 莫烦Python

学习爬虫前先了解一下正则表达式吧~

导入模块

import re #正则表达式模块

简单匹配

# matching string
pattern1="cat"
pattern2="bird"
string="dog runs to cat"
print(pattern1 in string)
print(pattern2 in string)
True
False

用正则寻找配对

# regular expression
pattern1="cat"
pattern2="bird"
string="dog runs to cat"
print(re.search(pattern1,string)) #显示匹配到的对象
print(re.search(pattern1,string).group()) #加group()可以显示匹配子串
print(re.search(pattern1,string).span()) #加span()显示匹配到的子串在原字符串中的索引位置
print(re.search(pattern2,string))
<_sre.SRE_Match object at 0x7fde38270b28>
cat
(12, 15)
None

匹配多种可能使用[]

# multiple patterns ("run" or "ran")
print(re.search(r'r[au]n',"dog runs to cat").group())
run

匹配更多种可能

# continue
print(re.search(r'r[A-Z]n','dog runs to cat'))
print(re.search(r'r[a-z]n','dog runs to cat'))
print(re.search(r'r[0-9]n','dog r2ns to cat'))
print(re.search(r'r[0-9a-z]n','dog runs to cat'))
None
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>
<_sre.SRE_Match object at 0x7fde382ab1d0>

特殊种类匹配

数字

# \d: decimal digit 任何数字
print(re.search(r'r\dn','run r4n').group())
# \D: any non-digit character 任何非数字字符
print(re.search(r'r\Dn','run r4n').group())
r4n
run

空白

# \s : any whitespace character [ \t\n\r\f\v] 任何空白字符(whitespace)
print(re.search(r'r\sn','r\nn r4n').group())
# \S : opposite to \s, any non-whitespace character 任何非空白字符
print(re.search(r'r\Sn','r\nn r4n').group())
r
n
r4n

所有字母数字和"_"

# \w : [a-zA-Z0-9_] 任何大小写字母,数字和下划线
print(re.search(r'r\wn','r\nn r4n').group())
# \W: opposite to \w
print(re.search(r'r\Wn','r\nn r4n').group())
r4n
r
n

单词边界(空字符串)

# \b : empty string (only at the start or end of a word) 空字符串(只在某个单词的开头或结尾,即单词边界)
print(re.search(r'\bruns\b','dog runs to cat').group())
# \B : empty string (but not at the start or end of a word) 空字符串(不在某个单词的开头或结尾)
print(re.search(r'\B runs \B','dog  runs  to cat').group())
runs
 runs 

特殊字符 任意字符

# \\ : match \ 匹配\
print(re.search(r'runs\\',r'runs\ to me').group())
# . : match anything (except \n) 匹配任何字符(除了\n)
print(re.search(r'r.n','r[ns to me]').group())
runs\
r[n

句首句尾

# ^ : match line beginning
print(re.search(r'^dog','dog runs to cat').group())
# $ : match line ending
print(re.search(r'cat$','dog runs to cat').group())
dog
cat

是否

# ? : may or may not occur ?前面的字符(或分组)可有可无
print(re.search(r'Mon(day)?','Monday').group())
print(re.search(r'Mon(day)?','Mon').group())
print(re.search(r'Mon(day)?','Mond').group())
Monday
Mon
Mon

多行匹配

# multi-line
string="""
dog runs to cat.
I run to dog.
"""
print(re.search(r'^I',string))
print(re.search(r'^I',string,flags=re.M).group()) #加flags=re.M参数可以单独对每一行处理
print(re.search(r'^I',string,flags=re.MULTILINE).group())
None
I
I

0或多次

# * : occur 0 or more times
print(re.search(r'ab*','a').group())
print(re.search(r'ab*','abbb').group())
a
abbb

1或多次

# + : occur 1 or more times
print(re.search(r'ab+','a'))
print(re.search(r'ab+','abbb').group())
None
abbb

可选次数

# {n,m} : occur n to m times
print(re.search(r'ab{2,10}','a'))
print(re.search(r'ab{2,10}','abbbb').group())
None
abbbb

group组

# group
match=re.search(r'(\d+), Data: (.+)','ID: 20180317, Data: Mar/17/2018')
print(match.group())
print(match.group(1))
print(match.group(2))
20180317, Data: Mar/17/2018
20180317
Mar/17/2018
match=re.search(r'(?P<id>\d+), Data: (?P<date>.+)','ID: 20180317, Data: Mar/17/2018')
print(match.group('id'))
print(match.group('date'))
20180317
Mar/17/2018

寻找所有匹配

# findall
print(re.findall(r'r[ua]n','run ran ren'))
['run', 'ran']
# | : or #要么是前者,要么是后者
print(re.findall(r'run|ran','run ran ren'))
['run', 'ran']

替换

# re.sub() replace
print(re.sub(r'r[au]ns','catches','dog runs to cat'))
print(re.sub(r'I','You','I like apple'))
dog catches to cat
You like apple

分割

# re.split()
print(re.split(r'[,;\.]','a;b,c.d;e.f'))
['a', 'b', 'c', 'd', 'e', 'f']

compile

# compile
compiled_re=re.compile(r'r[au]n')
print(compiled_re.search('dog ran to cat').group())
ran
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值