# 爬取豆瓣最佳电影影评Top250
文章目录
一、介绍及任务目标
1.1 介绍
本项目为2025年2月测试完成。
1.2 任务目标
爬取豆瓣最佳电影影评Top250
1、爬取豆瓣影评
2、信息展示格式如下: 远山 2025-01-04 23:16:21
#第51届柏林电影节儿童部门最佳影片奖# #横滨电影节特别奖# #粉红巨匠小沼胜时隔12年电影界复归之作# 进入新世纪,小沼胜用30年来在粉红电影中积累的经验,拍出了他本人导演生涯中唯一一部非QS向电影。影片以洗练而纯净的叙事,诗意且优美的镜头,讲述了上世纪60年代生活在片瀬
3、将这些信息逐行存放在csv文件中,上面所拿到的信息为展开后的完整信息,不是缩略
二、代码结构及含义
2.1 代码结构
阴影部分为迭代淘汰部分,无需在意。
2.2 含义
douban.py文件是爬取的代码文件;
requ.py文件是快代理的代理IP请求文件。
多次爬取豆瓣影评会被封禁本地IP,所以最好使用代理IP。
三、代码示例
3.1 douban.py
import csv
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import random
import logging
# Logging configuration: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Listing page of Douban's "best reviews" section; paginated via ?start=N.
BASE_URL = "https://movie.douban.com/review/best/"
# Destination CSV file for the scraped reviews.
SAVE_PATH = r"C:\Users\矢志不渝\Desktop\douban\豆瓣电影影评最佳.csv"
# Number of listing pages to crawl (each page holds 20 reviews).
PAGE_COUNT = 5
# Shared request headers merged into every request.
# NOTE(review): the Cookie below carries a personal login session -- it will
# expire over time and should not be published in a public repository; confirm.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Cookie': 'll="32144"; bid=3RohyNWVHj0; _pk_id.100001.4cf6=23907e7ebb54271c.1686100051.; __yadk_uid=OGVnwlTfotLSH5XdI6JXtqA5kQsN0x89; _vwo_uuid_v2=D1E8222BE3569A942B9687BFC0D7D966C|59c360096de363c25bd7c2994af63c64; __gads=ID=0902d3edd87ac6ff-222ef5a967e2003d:T=1688109672:RT=1688109672:S=ALNI_MaZQ_lI9mzmEI5AhRb_5LLQVCGQBA; __gpi=UID=00000c7cae331650:T=1688109672:RT=1688109672:S=ALNI_MaqXeTvGSxmShelBp8foNJqyq8o4g; __utmv=30149280.25330; __utmc=30149280; __utmc=223695111; dbcl2="253305871:w8B2qq3cOOA"; ck=PU0Q; __utmz=30149280.1705207739.16._pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1705215531%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.522421068.1686100051.1705207739.1705215531.17; __utmb=30149280.0.10.1705215531; __utma=223695111.711123561.1686100051.1705207739.1705215531.14; __utmb=223695111.0.10.1705215531'
}
# Helper that rotates the browser identity between requests.
def get_random_user_agent():
    """Return one desktop-browser User-Agent string chosen at random."""
    candidates = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    )
    return random.choice(candidates)
def main():
    """Crawl all review pages, save them to CSV, and log the outcome.

    Top-level boundary: any unexpected exception is logged instead of
    crashing the script.
    """
    try:
        reviews = getData()
        saveData(reviews, SAVE_PATH)
        logging.info("爬取完毕!")
    except Exception as e:
        logging.error(f"程序执行出错: {e}")
# Crawl the listing pages.
def getData():
    """Collect the formatted review strings from every listing page.

    Walks PAGE_COUNT pages (20 reviews per page via the ?start= offset),
    extracts each review node, and sleeps a random 2-5 seconds between
    pages to reduce the chance of an IP ban. Per-page failures are
    logged and skipped so one bad page does not abort the crawl.
    """
    collected = []
    for page in range(PAGE_COUNT):
        url = BASE_URL + f"?start={page * 20}"
        try:
            html = askURL(url)
            if html:
                root = etree.HTML(html)
                nodes = root.xpath('//div[@class="review-list chart "]//div[@class="main review-item"]')
                collected.extend(extract_review_info(node) for node in nodes)
                # Polite random delay between successive page requests.
                time.sleep(random.uniform(2, 5))
        except requests.RequestException as e:
            logging.error(f"处理页面 {url} 时出现网络错误: {e}")
        except Exception as e:
            logging.error(f"处理页面 {url} 时出错: {e}")
    return collected
# Fetch the raw HTML of one listing page.
def askURL(url):
    """Return the HTML text of *url*, or None on any failure.

    The shared HEADERS (login cookie etc.) are merged first and the
    randomized User-Agent is applied last so it actually takes effect.
    Non-200 responses and network errors are logged and swallowed so the
    caller can simply skip the page.
    """
    # FIX: the original did headers.update(HEADERS) *after* setting the
    # randomized "User-Agent", so HEADERS' lowercase 'user-agent' key ended
    # up alongside it and could override the rotation. Merge HEADERS first
    # and drop its UA key so the randomized one wins.
    headers = dict(HEADERS)
    headers.pop('user-agent', None)
    headers.update({
        "User-Agent": get_random_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "https://movie.douban.com/",
    })
    try:
        # FIX: a timeout prevents the crawler from hanging forever on a
        # stalled connection (requests has no default timeout).
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 200:
            return response.text
        elif response.status_code == 403:
            logging.error(f"请求 URL {url} 时被禁止访问: {response.status_code}")
        else:
            logging.error(f"请求 URL {url} 时出错: {response.status_code}")
    except requests.RequestException as e:
        logging.error(f"请求 URL {url} 时出错: {e}")
    return None
# Extract one review's displayable fields.
def extract_review_info(item):
    """Build the "author date\\ntitle\\nfull text" string for one review.

    *item* is an lxml element node for one ``div.main.review-item``. The
    complete review body is fetched from a separate JSON endpoint because
    the on-page text is truncated (the "展开" expansion is loaded
    asynchronously).
    """
    def _first_text(nodes):
        # FIX: lxml's .text is None when the matched tag has no direct
        # text child; the original .text.strip() raised AttributeError
        # in that case. Fall back to '' instead.
        if nodes and nodes[0].text:
            return nodes[0].text.strip()
        return ''

    # Review author, date, and title.
    author = _first_text(item.xpath('.//a[@class="name"]'))
    date = _first_text(item.xpath('.//span[@class="main-meta"]'))
    title = _first_text(item.xpath('.//a[@class="title-link"]'))
    # The data-rid attribute identifies the review; the untruncated body
    # must be requested separately from its JSON "full" endpoint.
    data_id = item.xpath('.//div[@class="main-bd"]/div[1]/@data-rid')
    if data_id:
        full_review_url = f'https://movie.douban.com/j/review/{data_id[0]}/full'
        full_content = get_full_review(full_review_url)
    else:
        full_content = ''
    # One review -> one newline-separated record.
    return f"{author} {date}\n{title}\n{full_content}"
# Fetch the untruncated body of one review.
def get_full_review(url):
    """Return the plain text of a full review, or '' on any failure.

    Douban serves the complete review body as JSON (an ``html`` field);
    that HTML fragment is reduced to the concatenated, stripped text of
    its <p> tags.
    """
    # FIX: same header-merge bug as askURL -- merge HEADERS first and drop
    # its lowercase 'user-agent' so the randomized User-Agent takes effect.
    headers = dict(HEADERS)
    headers.pop('user-agent', None)
    headers.update({
        "User-Agent": get_random_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "https://movie.douban.com/",
    })
    try:
        # FIX: timeout keeps one stalled request from blocking the crawl.
        response = requests.get(url, headers=headers, timeout=15)
        response.encoding = response.apparent_encoding
        json_data = response.json()
        html_content = json_data.get('html', '')
        soup = BeautifulSoup(html_content, 'lxml')
        # Join the stripped text of every paragraph into one string.
        full_content = ''.join(p.get_text().strip() for p in soup.find_all('p'))
        return full_content
    except requests.RequestException as e:
        logging.error(f"请求完整评论 {url} 时出现网络错误: {e}")
    except ValueError as e:
        # response.json() raises ValueError on non-JSON bodies.
        logging.error(f"解析 JSON 数据时出现错误: {e}")
    return ''
# 保存数据到 CSV 文件
def saveData(datalist, savepath):
logging.info("开始保存数据...")
try:
with open(savepath, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
for data in datalist:
# 去除数据中的换行符,确保数据在 CSV 中占一行
clean_data = data.replace('\n', ' ')
writer.writerow([clean_data])
logging.info(f"文件已保存到: {savepath}")
except Exception as e:
logging.error(f"保存文件时出错: {e}")
# Script entry point: crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()
3.2 requ.py
该代码为快代理官方提供,我的代理IP也早就过期,便不展示了。
3.3 gitee仓库地址
完整代码,包括爬取的评论文件在gitee仓库:https://gitee.com/zpyszby/doubanyingpingpaqv.git