#导入需要的包
import requests
import re
# Douban Top 250 list endpoint; the `start` query parameter advances in
# steps of PAGE_SIZE (one page of results per step).
BASE_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25

# Browser-like User-Agent, sent to get past a simple anti-scraping check.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Compiled once at import time rather than once per page.
# The year sits on its own line after the first <br> and is followed by a
# non-breaking space, which appears in the raw HTML as the entity "&nbsp;"
# (a literal U+00A0 is accepted as well). Terminating the group on a plain
# space would stop at the indentation *before* the year and capture nothing.
ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<br>(?P<year>.*?)(?:&nbsp;|\xa0)'
    r'.*?property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def page_url(page_index):
    """Return the list URL for the zero-based page number *page_index*."""
    return f"{BASE_URL}?start={page_index * PAGE_SIZE}&filter="


def parse_page(html):
    """Extract the movies found in one page of HTML.

    Args:
        html: Raw HTML text of a Top 250 list page.

    Returns:
        A list of dicts with the string keys ``name``, ``year``, ``score``
        and ``num`` (number of ratings), in page order. Empty when nothing
        matches.
    """
    movies = []
    for match in ITEM_PATTERN.finditer(html):
        movie = match.groupdict()
        # Drop the newline and indentation that precede the year in the markup.
        movie['year'] = movie['year'].strip()
        movies.append(movie)
    return movies


def fetch_page(page_index):
    """Download one list page and return its HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (so a blocked request is not mistaken
            for an empty page).
    """
    resp = requests.get(page_url(page_index), headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return resp.text


def main():
    """Ask how many pages to crawl, then print one dict per movie."""
    num = int(input("请输入要爬取的页数"))
    for page_index in range(num):
        for movie in parse_page(fetch_page(page_index)):
            print(movie)


if __name__ == "__main__":
    main()
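# Source article: "python爬虫最简实现之豆瓣top250" (a minimal Python crawler for the Douban Top 250).
# Web-page residue: "最新推荐文章于 2024-10-03 09:00:56 发布" (latest recommended article published 2024-10-03 09:00:56).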