爬取豆瓣“热门电影”数据:评分,电影名,ID(与正则捕获顺序一致)
写入CSV文件
上代码:
import requests
import re
import csv
def getting(url):
    """Fetch *url* with the shared browser headers and return the Response.

    Relies on the module-level ``header`` dict defined in the
    ``__main__`` guard.
    """
    response = requests.get(url, headers=header)
    # Douban serves UTF-8; set it explicitly so .text decodes correctly.
    response.encoding = 'utf-8'
    return response
def info_r(resp):
    """Extract movie entries from the JSON-ish response body.

    Returns a list of (rating, title, id) string tuples — one per
    movie matched in ``resp.text``.
    """
    pattern = re.compile(
        r'rate":"(?P<sore>.*?)".*?title":"(?P<name>.*?)".*?id":"(?P<id>.*?)","cover_y',
        re.S,
    )
    # finditer + groups() yields the same tuples findall() would for a
    # multi-group pattern.
    return [match.groups() for match in pattern.finditer(resp.text)]
def writing(rows):
    """Append each row (an iterable of fields) to movie_list.csv.

    Fixes in this revision:
    - ``ef`` typo -> ``def`` (the original line was a syntax error);
    - parameter renamed so it no longer shadows the builtin ``list``;
    - the file is opened once for the whole batch instead of once per row;
    - ``newline=''`` stops the csv module from emitting a blank line
      between records on Windows (the "extra blank row" symptom).
    """
    with open("movie_list.csv", "a+", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)
def main():
    """Crawl Douban's "high score" movie listing and write it to CSV.

    The endpoint is the XHR URL found via the browser's F12 network
    panel; ``page_start`` walks 0, 20, 40, ... 100 (20 movies per page).
    """
    rows = []
    for start in range(0, 101, 20):
        # Adjacent string literals concatenate; the URL is unchanged.
        url = (
            "https://movie.douban.com/j/search_subjects?type=movie"
            "&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86"
            "&sort=recommend&page_limit=20&page_start=" + str(start)
        )
        resp = getting(url)
        # extend() replaces the manual append loop.
        rows.extend(info_r(resp))
    writing(rows)
if __name__ =='__main__':
    # Shared request headers: a browser User-Agent so Douban does not
    # reject requests coming from a plain script. Read as a global by
    # getting().
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
    }
    main()
计划把豆瓣电影的电影都爬一遍,怕把我的IP直接拉黑,以后没得玩了。除了用代理IP,放慢爬取速度(在请求之间加延时)也可以吗?
我的CSV文件不知道为什么都隔了一行,我还得再调调。