import pandas as pd
import requests
import re
# Global accumulator: one dict per scraped comment, appended by
# Spider_db.comment_db() and dumped to CSV at the bottom of the script.
total=[]
class Spider_db:
    """Scrape short reviews for one Douban movie (subject 10604086)."""

    def __init__(self):
        # {} is filled with the paging offset: 0, 20, 40, ...
        self.comment_url = "https://movie.douban.com/subject/10604086/comments?start={}&limit=20&status=P&sort=new_score"
        self.comment_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0",
            # Log in to Douban, press F12, capture a request, and paste your own
            # cookie value here — Douban limits comment pages for anonymous clients.
            "cookie": '####'}

    def get_comment_url(self, pages=30):
        """Return the list of comment-page URLs to crawl.

        pages -- number of 20-comment pages (default 30, i.e. the first 600
        comments); added as a backward-compatible parameter so other movies /
        depths can reuse this without editing the method.
        """
        return [self.comment_url.format(i * 20) for i in range(pages)]

    def comment_db(self, url):
        """Fetch one comment page and append parsed rows to the global `total`.

        NOTE(review): each field is scraped with an independent regex and the
        lists are zip()-ed together, so a comment missing one field (no star
        rating, no location, ...) silently shifts the columns out of alignment
        and zip truncates to the shortest list.  Fixing that needs a
        per-comment-block parse; kept as-is to preserve the original output.
        """
        resp = requests.get(url=url, headers=self.comment_headers)
        # Decode once instead of calling resp.content.decode() per field.
        html = resp.content.decode()
        comments = re.findall('<span class="short">(.*)</span>', html)
        # Renamed from `id`, which shadowed the builtin.
        user_ids = re.findall('<div.*?="avatar".*?<a.*?="(.*?)".*?</a>', html, re.S)
        votes = re.findall('<span class="votes vote-count">(.*)</span>', html)
        times = re.findall('<span class="comment-time " title="(.+?)">', html)
        ratings = re.findall('<span class="allstar[0-9]+ rating" title="(.+?)">', html)
        locations = re.findall('<span class="comment-location">(.*?)</span>', html, re.S)
        # Loop variables no longer shadow the lists being zipped (the original
        # reused the same names, clobbering them on the first iteration).
        for comment, vote, time_, rating, user_id, location in zip(
                comments, votes, times, ratings, user_ids, locations):
            total.append({"ID": user_id,
                          '评论': comment,
                          '有用指数': vote,
                          "时间": time_,
                          "推荐指数": rating,
                          "地址": location})

    def run(self):
        """Crawl every page produced by get_comment_url()."""
        # enumerate() replaces the original's dead `comment_list.index(url) + 1`
        # (computed and discarded on every iteration, at O(n) per lookup).
        for page_no, url in enumerate(self.get_comment_url(), start=1):
            self.comment_db(url)
            print("shue")  # original progress marker, kept verbatim
if __name__ == '__main__':
    # Crawl every comment page, then dump the accumulated rows to CSV.
    spider = Spider_db()
    spider.run()
    frame = pd.DataFrame(total)
    # utf_8_sig writes a BOM so the Chinese headers open correctly in Excel.
    frame.to_csv('封神第一部:朝歌风云 短评.csv', encoding='utf_8_sig')
# 自己摸索中,如有错误或者需要更改的地方欢迎指导,谢谢
# (Still figuring this out on my own — corrections and suggestions welcome, thanks.)
# 执行后截图 (screenshot after execution):