# The following notes are from the Luffy Xuecheng (路飞学城) video course on Bilibili.
import re
# re.findall returns every match at once as a list of strings. The course
# notes consider this inefficient and hard to read through when there are
# many matches, and use re.finditer instead, which yields Match objects one
# at a time.
text = " 我的电话号是10086,另外一个号码是10010"
# Named phone_numbers rather than `list` so the builtin list type is not shadowed.
phone_numbers = re.findall(r"\d+", text)
print(phone_numbers)
# Output:
# ['10086', '10010']

it = re.finditer(r"\d+", text)
for i in it:
    # Each item is a Match object; group() returns the matched text.
    print(i.group())
# Output:
# 10086
# 10010
# re.search scans the string and returns as soon as it finds one match. The
# return value is a Match object, so the text still has to be read out with
# group(); only the first occurrence is available this way.
s = re.compile(r"\d+").search(" 我的电话号是10086,另外一个号码是10010")
print(s.group(0))
# Output:
# 10086
# re.match only matches at the very beginning of the string. The text below
# starts with a space, not a digit, so nothing matches and re.match returns
# None. Calling m.group() directly on that None raises AttributeError, so the
# result is checked before it is used.
# (The original notes repeated this same example twice; once is enough.)
m = re.match(r"\d+", " 我的电话号是10086,另外一个号码是10010")
print(m.group() if m is not None else None)
# Output:
# None
s = """
<div class='jay'><span id='1'>常麒躺</span></div>
<div class='jj'><span id='2'>来铁</span></div>
<div cLass='jolin'><span id='3'>大聪明</span></div>
<div class='sylar'><span id='4'>范思管</span></div>
<div class='tory'><span id='5'>胡说八道</span></div>
"""
# (?P<group_name>pattern) is a named group: the text it matched can be read
# back with match.group("group_name"), which returns a str.
obj = re.compile(
    r"<div class='(?P<class>.*?)'><span id='\d'>(?P<agcdef>.*?)</span></div>",
    re.S,  # re.S lets "." match newline characters as well
)
ret = obj.finditer(s)
for i in ret:
    print(i.group("class"))
# Output:
# jay
# jj
# sylar
# tory
#
# "jolin" is absent: that div is written with `cLass`, and the pattern is
# case-sensitive (no re.I flag), so the line never matches.
# Simple Douban scrape: download the movie chart page, extract the fields of
# each entry with one regular expression, and write them to data.csv.
import re
import requests
import csv

url = "https://movie.douban.com/chart"
# Request headers carrying a desktop Firefox User-Agent string
# (presumably so the site serves the normal page to the script -- not verified here).
dic = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"
}
# The with-block closes the response even if reading the body raises.
with requests.get(url=url, headers=dic) as resp:
    content = resp.text

# Named groups: name, year, score, num. The doubled parentheses around `num`
# are regex groups, not literal brackets; any "(" / ")" that end up in the
# captured text are removed with strip("()") below.
obj = re.compile(
    r'<div class="pl2">.*?class="">(?P<name>.*?)/.*?<p class="pl">(?P<year>.*?)/.*?<span class="rating_nums">(?P<score>.*?)</span>.*?<span class="pl">((?P<num>.*?))</span>',
    re.S,  # re.S lets "." match newline characters as well
)

# newline="" is what the csv module asks for (otherwise extra blank rows can
# appear on Windows); an explicit UTF-8 encoding keeps the Chinese text
# writable regardless of the platform's default encoding.
with open("data.csv", "w", newline="", encoding="utf-8") as f:
    # One writer for the whole file instead of a new one per row.
    writer = csv.writer(f)
    for match in obj.finditer(content):
        # Column order follows the group order in the pattern.
        row = [match.group(field).strip() for field in ("name", "year", "score")]
        row.append(match.group("num").strip("()"))
        writer.writerow(row)
print("over!")