import pandas as pd
import requests
import re
# Global accumulator: one dict per scraped comment, appended by
# Spider_db.comment_db() and dumped to CSV at the bottom of the script.
total=[]
class Spider_db:
    """Scrape short reviews for one Douban movie (subject 10604086)."""

    def __init__(self):
        # {} is filled with the paging offset: 0, 20, 40, ...
        self.comment_url = "https://movie.douban.com/subject/10604086/comments?start={}&limit=20&status=P&sort=new_score"
        self.comment_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
            # Log in to Douban, press F12, capture a request, and paste your own
            # cookie value here — Douban limits comment pages for anonymous clients.
            "cookie": '####'}

    def get_comment_url(self, pages=30):
        """Return the list of comment-page URLs to crawl.

        pages -- number of 20-comment pages (default 30, i.e. the first 600
        comments); added as a backward-compatible parameter so other movies /
        depths can reuse this without editing the method.
        """
        return [self.comment_url.format(i * 20) for i in range(pages)]

    def comment_db(self, url):
        """Fetch one comment page and append parsed rows to the global `total`.

        NOTE(review): each field is scraped with an independent regex and the
        lists are zip()-ed together, so a comment missing one field (no star
        rating, no location, ...) silently shifts the columns out of alignment
        and zip truncates to the shortest list.  Fixing that needs a
        per-comment-block parse; kept as-is to preserve the original output.
        """
        resp = requests.get(url=url, headers=self.comment_headers)
        # Decode once instead of calling resp.content.decode() per field.
        html = resp.content.decode()
        comments = re.findall('<span class="short">(.*)</span>', html)
        # Renamed from `id`, which shadowed the builtin.
        user_ids = re.findall('<div.*?="avatar".*?<a.*?="(.*?)".*?</a>', html, re.S)
        votes = re.findall('<span class="votes vote-count">(.*)</span>', html)
        times = re.findall('<span class="comment-time " title="(.+?)">', html)
        ratings = re.findall('<span class="allstar[0-9]+ rating" title="(.+?)">', html)
        locations = re.findall('<span class="comment-location">(.*?)</span>', html, re.S)
        # Loop variables no longer shadow the lists being zipped (the original
        # reused the same names, clobbering them on the first iteration).
        for comment, vote, time_, rating, user_id, location in zip(
                comments, votes, times, ratings, user_ids, locations):
            total.append({"ID": user_id,
                          '评论': comment,
                          '有用指数': vote,
                          "时间": time_,
                          "推荐指数": rating,
                          "地址": location})

    def run(self):
        """Crawl every page produced by get_comment_url()."""
        # enumerate() replaces the original's dead `comment_list.index(url) + 1`
        # (computed and discarded on every iteration, at O(n) per lookup).
        for page_no, url in enumerate(self.get_comment_url(), start=1):
            self.comment_db(url)
            print("shue")  # original progress marker, kept verbatim
if __name__ == '__main__':
    # Crawl every comment page, then dump the accumulated rows to CSV.
    spider = Spider_db()
    spider.run()
    frame = pd.DataFrame(total)
    # utf_8_sig writes a BOM so the Chinese headers open correctly in Excel.
    frame.to_csv('封神第一部:朝歌风云 短评.csv', encoding='utf_8_sig')
# 自己摸索中,如有错误或者需要更改的地方欢迎指导,谢谢
# (Still figuring this out on my own — corrections and suggestions welcome, thanks.)
# 执行后截图 (screenshot after execution):