The regular expressions below are borrowed from another author's write-up. I'm skipping a full tutorial for now and just posting the code.
import requests
import random
import re
def get_html(url):
    # Pool of desktop User-Agent strings; one is picked at random for each request.
    user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ',
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
    ]
    headers = {
        # Douban session cookie copied from a logged-in browser; replace it with your own.
        'Cookie': 'll="108309"; bid=ELlOU2v6wzk; __yadk_uid=cra70mWYZDspH84ns7OQe5MlRXTYRcPQ; _vwo_uuid_v2=DC107E9CBF8E0A6699D04FC69E2982C81|48d6b514f6792d4dc1a14d5910891045; __gads=ID=2e9aa8e46ca22d82-227af46190c40025:T=1604664940:RT=1604664940:S=ALNI_MajmByuNuUd5eaZXqlcwPe6XdP25g; gr_user_id=412ad26e-7363-4fd6-bc57-68a7e7df2ec2; douban-fav-remind=1; __utmz=30149280.1623417103.11.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1623417111.8.8.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ap_v=0,6.0; _vwo_uuid_v2=DC107E9CBF8E0A6699D04FC69E2982C81|48d6b514f6792d4dc1a14d5910891045; __utma=30149280.1777458498.1604664845.1623417103.1623421591.12; __utmb=30149280.0.10.1623421591; __utmc=30149280; __utma=223695111.1738783349.1604664845.1623417111.1623421591.9; __utmb=223695111.0.10.1623421591; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1623421591%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=d5e147dfb6e78760.1604664845.9.1623421591.1623417824.; _pk_ses.100001.4cf6=*',
        'User-Agent': random.choice(user_agent),
        "Host": "movie.douban.com",
        "Connection": "keep-alive"
    }
    res = requests.get(url, headers=headers)
    return res
def get_comment_id_time_star(data):
    # Pull review IDs, publish dates and star ratings from a review-list page
    # with three independent regexes.
    id_pattern = re.compile('<h2><a href="https://movie.douban.com/review/(.*?)/', re.S)
    id = re.findall(id_pattern, data)
    time_pattern = re.compile('<span content=".*?" class=".*?">(.*?)</span>', re.S)
    time = re.findall(time_pattern, data)
    star_pattern = re.compile('<span class="allstar(.*?) main-title-rating"', re.S)
    star = re.findall(star_pattern, data)
    # The three lists are assumed to line up one-to-one per review; a review
    # without a star rating would leave `star` shorter than `id`.
    id_time_star = [id, time, star]
    return id_time_star
def parse_comment(data):
    # The review body sits between the data-original attribute and the
    # main-author block in the response of the review "full" endpoint.
    content_pattern = re.compile('data-original(.*?)main-author', re.S)
    content = re.findall(content_pattern, data)
    # Keep only Chinese characters and the listed full-width punctuation;
    # tags, escapes and other ASCII are discarded.
    text_pattern = re.compile('[\u4e00-\u9fa5|,、“”‘’:!~@#¥【】*()——+。;?]+', re.S)
    text = re.findall(text_pattern, content[0])
    text = ''.join(text)
    return text
if __name__ == '__main__':
    # 48 list pages, 20 reviews per page (start = i * 20).
    for i in range(0, 48):
        print("正在爬取第{}页.......".format(i + 1))
        index_url = "https://movie.douban.com/subject/30228394/reviews?start={}".format(i * 20)
        index_res = get_html(index_url).text
        id_time_star = get_comment_id_time_star(index_res)
        id = id_time_star[0]
        time = id_time_star[1]
        star = id_time_star[2]
        print("正在处理评论内容........")
        for j in range(len(id)):
            print("正在爬取第{}条评论.......".format(j + 1))
            # Each review's full text is fetched from the /j/review/<id>/full endpoint.
            comment_url = "https://movie.douban.com/j/review/{}/full".format(id[j])
            # comment_url = "https://movie.douban.com/j/review/13300536/full"
            comment_res = get_html(comment_url).text
            content = parse_comment(comment_res)
            print("将评论写入文件......")
            with open("content.txt", "a", encoding="utf-8") as f:
                f.write(content + "\n")
        print("评论爬取完成........")
        # Append this page's star ratings to star.txt
        print("将评分和日期写入文件........")
        with open("star.txt", "a") as f:
            for _ in star:
                f.write(_ + "\n")
        # Append this page's publish dates to date.txt
        with open("date.txt", "a") as f:
            for _ in time:
                f.write(_ + "\n")
        print("第{}页爬取完成".format(i + 1))