# Scraping the Top 250 Best Movie Reviews on Douban

1. Introduction and Task Objectives

1.1 Introduction

This project was tested and completed in February 2025.

1.2 Task Objectives

Scrape the Top 250 best movie reviews on Douban:
1. Scrape the Douban reviews.
2. Lay each record out as follows (a verbatim sample of the scraped output):
远山 2025-01-04 23:16:21
#第51届柏林电影节儿童部门最佳影片奖# #横滨电影节特别奖# #粉红巨匠小沼胜时隔12年电影界复归之作# 进入新世纪,小沼胜用30年来在粉红电影中积累的经验,拍出了他本人导演生涯中唯一一部非QS向电影。影片以洗练而纯净的叙事,诗意且优美的镜头,讲述了上世纪60年代生活在片瀬
3. Store these records one per row in a CSV file. The information captured is the fully expanded review text, not the truncated preview.

2. Code Structure and File Roles

2.1 Code Structure

The shaded parts are earlier iterations that were discarded; they can be ignored.
[Screenshot: project file structure]

2.2 File Roles

douban.py is the scraping script;
requ.py is the helper that requests proxy IPs from Kuaidaili (快代理).
Scraping Douban reviews repeatedly will get your local IP banned, so it is best to route requests through proxy IPs.
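
The script in section 3.1 does not wire proxies in. As a minimal sketch, assuming a placeholder proxy address (not a real, working endpoint), routing requests through a proxy with the requests library looks like this:

```python
import requests

# Hypothetical proxy address, e.g. one returned by a provider such as Kuaidaili;
# replace it with a real, unexpired proxy before use.
PROXY = "http://user:password@123.45.67.89:8000"

proxies = {
    "http": PROXY,
    "https": PROXY,
}

# proxies=... routes the request through the proxy instead of the local IP,
# so Douban sees the proxy's address rather than yours.
response = requests.get(
    "https://movie.douban.com/review/best/",
    headers={"User-Agent": "Mozilla/5.0"},
    proxies=proxies,
    timeout=10,
)
print(response.status_code)
```

To apply this in douban.py, the same proxies dict would be passed to the two requests.get calls in askURL and get_full_review.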

3. Code Examples

3.1 douban.py

```python
import csv
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import random
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Base URL of the pages to scrape
BASE_URL = "https://movie.douban.com/review/best/"
# Path of the output CSV file
SAVE_PATH = r"C:\Users\矢志不渝\Desktop\douban\豆瓣电影影评最佳.csv"
# Number of list pages to scrape (20 reviews per page)
PAGE_COUNT = 5
# Shared request headers: the Cookie keeps the logged-in Douban session.
# Note: deliberately no User-Agent key here, because askURL() and
# get_full_review() call headers.update(HEADERS) after setting a random
# User-Agent, and a fixed 'user-agent' entry would silently override it.
HEADERS = {
    'Cookie': 'll="32144"; bid=3RohyNWVHj0; _pk_id.100001.4cf6=23907e7ebb54271c.1686100051.; __yadk_uid=OGVnwlTfotLSH5XdI6JXtqA5kQsN0x89; _vwo_uuid_v2=D1E8222BE3569A942B9687BFC0D7D966C|59c360096de363c25bd7c2994af63c64; __gads=ID=0902d3edd87ac6ff-222ef5a967e2003d:T=1688109672:RT=1688109672:S=ALNI_MaZQ_lI9mzmEI5AhRb_5LLQVCGQBA; __gpi=UID=00000c7cae331650:T=1688109672:RT=1688109672:S=ALNI_MaqXeTvGSxmShelBp8foNJqyq8o4g; __utmv=30149280.25330; __utmc=30149280; __utmc=223695111; dbcl2="253305871:w8B2qq3cOOA"; ck=PU0Q; __utmz=30149280.1705207739.16._pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1705215531%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.522421068.1686100051.1705207739.1705215531.17; __utmb=30149280.0.10.1705215531; __utma=223695111.711123561.1686100051.1705207739.1705215531.14; __utmb=223695111.0.10.1705215531'
}

# Pick a random User-Agent to vary the request fingerprint
def get_random_user_agent():
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15"
    ]
    return random.choice(user_agents)

def main():
    try:
        datalist = getData()
        saveData(datalist, SAVE_PATH)
        logging.info("爬取完毕!")
    except Exception as e:
        logging.error(f"程序执行出错: {e}")

# Scrape the review list pages
def getData():
    datalist = []
    for i in range(0, PAGE_COUNT):
        url = BASE_URL + f"?start={i * 20}"
        try:
            html = askURL(url)
            if html:
                tree = etree.HTML(html)
                review_list = tree.xpath('//div[@class="review-list chart "]//div[@class="main review-item"]')
                for review in review_list:
                    data = extract_review_info(review)
                    datalist.append(data)
            time.sleep(random.uniform(2, 5))
        except requests.RequestException as e:
            logging.error(f"Network error while processing page {url}: {e}")
        except Exception as e:
            logging.error(f"Error while processing page {url}: {e}")
    return datalist

# Fetch the HTML content of a given URL
def askURL(url):
    headers = {
        "User-Agent": get_random_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "https://movie.douban.com/"
    }
    headers.update(HEADERS)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        elif response.status_code == 403:
            logging.error(f"Access forbidden (HTTP 403) when requesting {url}")
        else:
            logging.error(f"Error requesting {url}: HTTP {response.status_code}")
    except requests.RequestException as e:
        logging.error(f"Error requesting {url}: {e}")
    return None

# Extract the fields of a single review
def extract_review_info(item):
    # Review author
    author_tag = item.xpath('.//a[@class="name"]')
    author = author_tag[0].text.strip() if author_tag else ''

    # Review date
    date_tag = item.xpath('.//span[@class="main-meta"]')
    date = date_tag[0].text.strip() if date_tag else ''

    # Review title
    title_tag = item.xpath('.//a[@class="title-link"]')
    title = title_tag[0].text.strip() if title_tag else ''

    # Match the review's data-rid: the full text is loaded asynchronously when "expand" is clicked, so it has to be fetched from a separate URL
    data_id = item.xpath('.//div[@class="main-bd"]/div[1]/@data-rid')
    if data_id:
        full_review_url = f'https://movie.douban.com/j/review/{data_id[0]}/full'
        full_content = get_full_review(full_review_url)
    else:
        full_content = ''

    # Assemble the full record
    full_info = f"{author}  {date}\n{title}\n{full_content}"
    return full_info

# Fetch the full review text from the asynchronous JSON endpoint
def get_full_review(url):
    headers = {
        "User-Agent": get_random_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "https://movie.douban.com/"
    }
    headers.update(HEADERS)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = response.apparent_encoding
        json_data = response.json()
        html_content = json_data.get('html', '')
        soup = BeautifulSoup(html_content, 'lxml')
        p_list = soup.find_all('p')
        full_content = ''.join([p.get_text().strip() for p in p_list])
        return full_content
    except requests.RequestException as e:
        logging.error(f"Network error while requesting full review {url}: {e}")
    except ValueError as e:
        logging.error(f"Error parsing the JSON data: {e}")
    return ''

# Save the data to a CSV file
def saveData(datalist, savepath):
    logging.info("开始保存数据...")
    try:
        with open(savepath, 'w', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
            for data in datalist:
                # Strip newlines so each review occupies a single CSV row
                clean_data = data.replace('\n', ' ')
                writer.writerow([clean_data])
        logging.info(f"文件已保存到: {savepath}")
    except Exception as e:
        logging.error(f"保存文件时出错: {e}")

if __name__ == "__main__":
    main()
```
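
To run the script, replace the Cookie in HEADERS with one taken from your own logged-in Douban session (the one above is the author's and will have expired), set SAVE_PATH to a writable location, and execute `python douban.py`.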

3.2 requ.py

That code comes from Kuaidaili's official documentation, and my proxy IPs have long since expired, so it is not shown here.
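
For reference, such a helper usually just calls the provider's order API and returns an "ip:port" string. The sketch below only shows the general shape; the endpoint, parameters, and response format are hypothetical placeholders, not Kuaidaili's actual API:

```python
import requests

# Hypothetical provider endpoint and order key (placeholders, not Kuaidaili's real API).
API_URL = "https://proxy-provider.example.com/api/getproxy"
ORDER_KEY = "your-order-key"

def get_proxy() -> str:
    """Fetch one proxy address in 'ip:port' form from the provider's API."""
    resp = requests.get(API_URL, params={"key": ORDER_KEY, "num": 1}, timeout=10)
    resp.raise_for_status()
    # Assume the API responds with plain text such as "123.45.67.89:8000".
    return resp.text.strip()

if __name__ == "__main__":
    print(get_proxy())
```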

3.3 Gitee Repository

The complete code, including the scraped review files, is in the Gitee repository: https://gitee.com/zpyszby/doubanyingpingpaqv.git

4. Results

[Screenshot: the scraped reviews saved in the CSV file]
