import requests
import re

# Scrape Douban's Top 250 movie chart (10 pages x 25 movies) and write one
# CSV row per movie: name, director, year, rating, vote count.
# NOTE(review): loop-body indentation was lost in the original paste; the
# structure below is reconstructed from the counter logic and comments.

# Douban rejects requests with the default `requests` User-Agent, so send a
# browser-like one.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

# The extraction pattern is identical for every page — compile it once,
# outside the loop.  re.S makes `.` match newlines so each pattern can span
# the multi-line HTML of a list item.
obj = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</sp'
    r'an>.*?导演: (?P<dao>.*?) .*?<br>'
    r'(?P<year>.*?) .*?<span class="rating_num" property="v:average">'
    r'(?P<score>.*?)</span>.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)

# `with` guarantees the CSV file is closed even if a request raises.
with open("豆瓣250电影.csv", mode="w", encoding="utf-8") as f:
    # Pagination is driven by the `start` query parameter: 0, 25, ..., 225.
    # (Replaces the original i = -25 / i + 25 / i = i + 25 counter dance.)
    for start in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start=" + str(start) + "&filter="
        # BUG FIX: the original issued a second, header-less GET first and
        # discarded its response — one authenticated request is enough.
        resp = requests.get(url, headers=headers)
        try:
            page_source = resp.text
            for item in obj.finditer(page_source):
                name = item.group("name")
                dao = item.group("dao")
                # The raw HTML puts whitespace before the year — strip it.
                year = item.group("year").strip()
                score = item.group("score")
                num = item.group("num")
                # Plain comma-joined row; switch to the csv module if any
                # field may itself contain a comma.
                f.write(f"{name},{dao},{year},{score},{num}\n")
        finally:
            # BUG FIX: close every response; the original closed only the
            # last one after the loop, leaking the other nine connections.
            resp.close()

print("豆瓣TOP250提取10页数据完毕.")
# Source: CSDN blog post "Python 循环爬虫DB" (loop-based Douban scraper);
# latest recommended article published 2024-11-14 08:53:59.