#!/usr/bin/env python3
"""Scrape the Douban Top 250 page and write each movie's name + year to work.csv.

Fetches the page HTML with requests, extracts fields with a compiled regex,
and saves the results as CSV rows.

NOTE(review): the original paste contained the entire script twice and had
unbalanced braces in the header dict ("header={ {...}"); this is the
deduplicated, corrected version.
"""
import csv
import re

import requests  # third-party; already used by the original script

URL = "https://movie.douban.com/top250"
# Douban rejects requests that do not carry a browser-like User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) "
        "Gecko/20100101 Firefox/96.0"
    ),
}

# re.S lets '.' match newlines so the pattern can span the multi-line <li>
# markup; .*? matches as little as possible.
# NOTE(review): the original pattern ended in a literal space, which made the
# year group capture only the newline right after <br>; Douban delimits the
# year with '&nbsp;' — fixed here, but confirm against the live markup.
MOVIE_RE = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp;',
    re.S,
)


def scrape(url=URL):
    """Fetch *url* and return a list of (name, year) tuples.

    Raises requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    try:
        resp.raise_for_status()  # fail loudly instead of parsing an error page
        return [
            (m.group("name"), m.group("year").strip())
            for m in MOVIE_RE.finditer(resp.text)
        ]
    finally:
        resp.close()  # release the connection back to the pool


def main():
    """Scrape the page, echo each movie, and write work.csv."""
    rows = scrape()
    # newline="" prevents blank lines on Windows; utf-8 keeps Chinese titles intact.
    with open("work.csv", mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for name, year in rows:
            print(name)
            print(year)
            writer.writerow((name, year))


if __name__ == "__main__":
    main()
学习爬虫第三天
最新推荐文章于 2024-07-12 16:42:46 发布