正则表达式
参考崔庆才爬虫;图片来源脚本之家
re.match
re.match 尝试从字符串的起始位置匹配一个模式,如果起始位置匹配不成功,match() 就返回 None
re.match(pattern,string,flags=0)
最常规的匹配
import re
# Most conventional match: every part of the text is spelled out explicitly.
content = "Hello 123 4567 World_This is a Regex Demo"
# A raw string keeps \s, \d and \w as regex escapes; in a normal string they
# are invalid string escapes and trigger a warning on recent Python versions.
result = re.match(r'^Hello\s\d{3}\s\d{4}\s\w{10}.*Demo$', content)
print(result)
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
泛匹配
import re
# Wildcard match: anchor both ends and let ".*" absorb everything in between.
content = "Hello 123 4567 World_This is a Regex Demo"
result = re.compile("^Hello.*Demo$").match(content)
print(result)
matched_text = result.group()  # the text that was matched
matched_range = result.span()  # (start, end) offsets of the match
print(matched_text)
print(matched_range)
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
匹配目标
import re
# Capturing a target: parentheses store the text they match in a group.
content = "Hello 1234567 World_this is a Regex Demo"
# Raw string so that \s and \d reach the regex engine unchanged.
result = re.match(r"^Hello\s(\d+)\sWorld.*Demo$", content)
print(result)
# group(1) is the first parenthesised group, group(2) the second, and so on.
print(result.group(1))
print(result.span())
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_this is a Regex Demo'>
1234567
(0, 40)
贪婪匹配
import re
# Greedy matching: ".*" consumes as many characters as it can, left to right,
# leaving only the final digit for the (\d+) group.
content = "Hello 1234567 World_This is a Regex Demo"
# Raw string so that \d reaches the regex engine unchanged.
result = re.match(r"^He.*(\d+).*Demo$", content)
print(result)
print(result.group(1))  # prints 7
print(result.span())
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
7
(0, 40)
非贪婪匹配
import re
# Non-greedy matching: ".*?" consumes as few characters as possible, so the
# (\d+) group receives the whole number.
content = "Hello 1234567 World_This is a Regex Demo"
# Raw string so that \d reaches the regex engine unchanged.
result = re.match(r"He.*?(\d+).*Demo$", content)
print(result)
print(result.group(1))  # prints 1234567
print(result.span())
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)
匹配模式
import re
# Match flags: the text spans two lines. The line break (and the space before
# it, visible in the printed match) is written explicitly so it cannot be lost
# as invisible trailing whitespace.
content = "Hello 1234567 World_This \nis a Regex Demo"
# re.S makes "." match newlines too; without it ".*?" stops at the line break.
# Raw string so that \d reaches the regex engine unchanged.
result = re.match(r"^He.*?(\d+).*?Demo$", content, re.S)
print(result)
print(result.group(1))
print(result.span())
<_sre.SRE_Match object; span=(0, 41), match='Hello 1234567 World_This \nis a Regex Demo'>
1234567
(0, 41)
转义
import re
# Deliberately failing example: "$" has a special meaning in a regex (it is an
# anchor, not a literal dollar sign), so it must be escaped to match the text.
content = "price is $5.00"
unescaped = re.compile("^price is $5.00$")
result = unescaped.match(content)
print(result)
None
import re
# Escaping: "\$" matches a literal dollar sign and "\." a literal dot.
content = "price is $5.00"
# Raw string so the backslashes reach the regex engine; the dot is escaped as
# well, otherwise it would match any character (e.g. "$5x00").
result = re.match(r"^price is \$5\.00", content)
print(result)
print(result.group())
print(result.span())
<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>
price is $5.00
(0, 14)
总结:尽量使用泛匹配,使用括号得到匹配目标,尽量使用非贪婪模式,有换行符就用re.S
re.search
re.search扫描整个字符串并返回第一个成功的匹配
import re
# re.match only tries the start of the string, so the leading "Extra strings"
# makes this example return None on purpose.
content = "Extra strings Hello 1234567 World_This is a Regex Demo Extra strings"
# Raw string so that \d reaches the regex engine unchanged.
result = re.match(r"Hello.*?(\d+).*?Demo", content)
print(result)
None
import re
# re.search scans the whole string and returns the first match it finds.
content = "Extra strings Hello 1234567 World_This is a Regex Demo Extra strings"
# Raw string so that \d reaches the regex engine unchanged.
result = re.search(r"Hello.*?(\d+).*?Demo", content)
print(result)
print(result.group(1))
print(result.span())
<_sre.SRE_Match object; span=(14, 54), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(14, 54)
总结:为了匹配方便,能使用search就不用match
1.re.match() 从第一个字符开始找, 如果第一个字符就不匹配就返回None, 不继续匹配. 用于判断字符串开头是否匹配(若要求整个字符串都匹配,需在模式末尾加 $ 或改用 re.fullmatch),速度快.
2.re.search() 会整个字符串查找,直到找到第一个匹配。并且立即返回。
匹配练习
import re
html = """
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
<a href="/3.mp3" singer="陈慧琳">记事本</a>
</ll>
<ll data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</ll>
<ll data-view="6" class="active">
<a href="/3.mp3" singer="beyond">往事随风</a>
</ll>
</ul>
</div>
</body>
</html>
"""
# re.S lets "." cross line breaks, so the first <ll> entry is found even
# though its <a> tag sits on the following line.
song_pattern = re.compile('<ll.*?singer="(.*?)">(.*?)</a>', re.S)
result = song_pattern.search(html)
singer, title = result.group(1, 2)
print(singer, title)
陈慧琳 记事本
import re
html = """
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
<a href="/3.mp3" singer="陈慧琳">记事本</a>
</ll>
<ll data-view="4" class="active"><a href="/3.mp3" singer="齐秦">往事随风</a>
</ll>
<ll data-view="6" class="active">
<a href="/3.mp3" singer="beyond">往事随风</a>
</ll>
</ul>
</div>
</body>
</html>
"""
# Without re.S "." cannot match a newline, so the first hit is the entry whose
# <a> tag is on the same line as its <ll> tag.
song_pattern = re.compile('<ll.*?singer="(.*?)">(.*?)</a>')
result = song_pattern.search(html)
singer, title = result.group(1, 2)
print(singer, title)
齐秦 往事随风
re.findall
搜索字符串,以列表形式返回全部能匹配的子串;如果模式中含有分组 (),列表中的元素则是各分组的内容(多个分组时为元组)
import re
html = """
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
<a href="/3.mp3" singer="陈慧琳">记事本</a>
</ll>
<ll data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</ll>
<ll data-view="6" class="active">
<a href="/3.mp3" singer="beyond">往事随风</a>
</ll>
</ul>
</div>
</body>
</html>
"""
# With two groups in the pattern, findall returns one (singer, title) tuple
# per entry instead of the whole matched text.
result = re.findall('<ll.*?singer="(.*?)">(.*?)</a>', html, re.S)
print(result)
for info in result:
    print(info)
[('陈慧琳', '记事本'), ('齐秦', '往事随风'), ('beyond', '往事随风')]
('陈慧琳', '记事本')
('齐秦', '往事随风')
('beyond', '往事随风')
import re
html = """
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
<a singer="陈慧琳">记事本</a>
</ll>
<ll data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</ll>
<ll data-view="6" class="active">
<a href="/3.mp3" singer="beyond">光辉岁月</a>
</ll>
</ul>
</div>
</body>
</html>
"""
# Raw string so that \s and \w reach the regex engine unchanged.
# NOTE(review): group 1 is written "(a.*?>)" without a leading "<", so it never
# participates and comes back as ''; the opening tag is absorbed by "<ll.*?>".
# The pattern text is kept as-is so the printed result is unchanged.
result = re.findall(r'<ll.*?>\s*?(a.*?>)?(\w+)(</a>)?\s*?</ll?', html, re.S)
print(result)
[('', '记事本', '</a>'), ('', '往事随风', '</a>'), ('', '光辉岁月', '</a>')]
re.sub
替换字符串中每一个匹配的子串后返回替换后的字符串
import re
# Delete every run of digits by replacing it with the empty string.
content = "Extra strings Hello 1234567 World_This is a Regex Demo Extra strings"
# Raw string so that \d reaches the regex engine unchanged.
content = re.sub(r'\d+', "", content)
print(content)
Extra strings Hello World_This is a Regex Demo Extra strings
import re
content = "Extra strings Hello 1234567 World_This is a Regex Dmo Extra strings"
# \1 in the replacement refers to the text captured by the first group, so the
# number is kept and " 0000" is appended after it. Both pattern and
# replacement are raw strings so the backslashes are not string escapes.
content = re.sub(r'(\d+)', r"\1 0000", content)
print(content)
Extra strings Hello 1234567 0000 World_This is a Regex Dmo Extra strings
import re
html = """
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
<a href="/3.mp3" singer="陈慧琳">记事本</a>
</ll>
<ll data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</ll>
<ll data-view="6" class="active">
<a href="/3.mp3" singer="beyond">往事随风</a>
</ll>
</ul>
</div>
</body>
</html>
"""
# Strip the opening and closing <a> tags first, which leaves a simpler pattern
# for findall. The flag must be passed as flags=...: the fourth positional
# parameter of re.sub is `count`, so a positional re.S would be read as
# "replace at most 16 times" and the flag would never be applied.
content = re.sub("<a.*?>|</a>", '', html, flags=re.S)
result = re.findall('<ll.*?>(.*?)</ll>', content, re.S)
print(content)
print(result)
<html>
<head></head>
<body>
<div>
<ul>
<ll data-view="5" class="active">
记事本
</ll>
<ll data-view="4" class="active">
往事随风
</ll>
<ll data-view="6" class="active">
往事随风
</ll>
</ul>
</div>
</body>
</html>
['\n 记事本\n', '\n 往事随风\n', '\n 往事随风\n']
re.compile
将正则字符串编译成一个对象
import re
content = """Hello 1234567 World_This
is a Regex Demo"""
# Compile once into a reusable pattern object and match through it ...
pattern = re.compile('Hello.*Demo', flags=re.S)
result = pattern.match(content)
print(result)
# ... which gives the same result as passing the pattern text and flag directly.
result = re.match('Hello.*Demo', content, flags=re.S)
print(result)
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
练习
import re
import requests
def getHTML(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or None if the request fails or the server
        answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=10)
        # raise_for_status is a method; without the call parentheses it was a
        # no-op and HTTP error pages were returned as if they were valid.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the header guess.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer silently swallowed.
        print("Exception")
        return None
def parseHTML(html):
    """Extract and print (url, author, year) for each book entry in *html*.

    Args:
        html: Page source as a string; None or an empty string is accepted.

    Returns:
        A list of (url, author, year) tuples with surrounding whitespace
        removed; an empty list when there is no input or nothing matches.
    """
    if not html:
        return []
    # Each lazy group is closed by an explicit delimiter (the closing quote or
    # the next "<"); a lazy group followed directly by ".*?" can only ever
    # capture an empty string.
    # NOTE(review): the attribute and class names are taken from the original
    # pattern; confirm them against the live page markup.
    pattern = re.compile(
        r'<li.*?"cover">.*?href="(.*?)".*?class="author">(.*?)<'
        r'.*?year">(.*?)<.*?</li>',
        re.S,
    )
    # findall scans the whole document; re.match would only try position 0,
    # where a page begins with its doctype/<html> tag, and so always fail.
    ls = [
        tuple(field.strip() for field in info)
        for info in pattern.findall(html)
    ]
    for url, name, date in ls:
        print(url, name, date)
    return ls
if __name__ == "__main__":
    url = "https://book.douban.com/"
    html = getHTML(url)
    # getHTML returns None when the request fails; only parse a real page.
    if html is not None:
        parseHTML(html)
None