## Import the required packages
import requests
import random
import time
import csv
import re
from fake_useragent import UserAgent  # randomly generate a User-Agent
from lxml import etree  # XPath parsing
## Create the output file object
f = open("流浪地球2.csv", "w", encoding="utf-8-sig", newline="")
csv_write = csv.DictWriter(
    f, fieldnames=["评论者", "评分等级", "评论日期", "点赞数", "评论内容"]
)
csv_write.writeheader()  # write the header row
## Set the request headers: User-Agent, cookie, referer
headers = {
    # randomly generated User-Agent
    "User-Agent": UserAgent().random,
    # the cookie differs for every user and session; copy it from your own browser (how to obtain it is covered in another post)
    "cookie": '_pk_id.100001.4cf6=13dc5d924fb6c06d.1712935937.; __yadk_uid=qX18ajXpIeOjfikZvJ0IdCuwxPAJB7vf; viewed="35803666_30282496"; __gads=ID=18e80b5cd1f92607:T=1712935925:RT=1714094400:S=ALNI_MZuep4TG2PcWCOit9trcngPrJeuGg; __gpi=UID=00000dea49edbd04:T=1712935925:RT=1714094400:S=ALNI_MZnSID8bIFPhRmE2W3UERHumyMc1Q; ll="118172"; bid=d51V9_srCsk; __utma=30149280.1433237315.1666266083.1720251223.1734744201.8; __utmc=30149280; __utmz=30149280.1734744201.8.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; dbcl2="285494959:gs59tP0zdPQ"; ck=oDzf; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.28549; __utmb=30149280.5.10.1734744201; __utma=223695111.251309946.1712935937.1712935937.1734744286.2; __utmb=223695111.0.10.1734744286; __utmc=223695111; __utmz=223695111.1734744286.2.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1734744286%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.100001.4cf6=1; _vwo_uuid_v2=DFB8BAC67A1DA6A0BC24F463C0ABCB266|36be8ce713f0fdb2a9f7b74af7be8dcc; frodotk_db="dd7c74622ad324f998755d2680796eeb"',
    # the page we claim to have navigated from
    "referer": "https://movie.douban.com/subject/35267208/comments?limit=20&status=P&sort=new_score",
}
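## Optional sanity check (my own addition, not part of the original script): request the first
## page once and confirm Douban accepts the cookie and User-Agent before starting the full crawl.
probe = requests.get(
    "https://movie.douban.com/subject/35267208/comments?limit=20&status=P&sort=new_score",
    headers=headers,
)
print("probe status:", probe.status_code)  # expect 200; 403 usually means the headers were rejected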
## Crawl 20 pages of short comments, 20 comments per page
for i in range(20):
    url = "https://movie.douban.com/subject/35267208/comments?start={}&limit=20&status=P&sort=new_score".format(
        i * 20
    )
    # fetch the page with requests
    page_text = requests.get(url=url, headers=headers).text
    # parse the HTML document with lxml.etree
    tree = etree.HTML(page_text)
    # reviewer field
    reviewer = tree.xpath(
        "//div[@class='comment-item ']//span[@class='comment-info']/a/text()"
    )
    # rating field
    score = tree.xpath(
        "//div[@class='comment-item ']//span[@class='comment-info']/span[2]/@title"
    )
    # comment date field
    comment_date = tree.xpath(
        "//div[@class='comment-item ']//span[@class='comment-time ']/text()"
    )
    # upvote count field
    vote_count = tree.xpath(
        "//div[@class='comment-item ']//span[@class='votes vote-count']/text()"
    )
    # comment text field
    comments = tree.xpath("//p[@class=' comment-content']/span/text()")
    # strip newlines, tabs and spaces from the comment dates
    comment_date = list(
        map(lambda date: re.sub(r"\s+", "", date), comment_date)
    )
    comment_date = list(filter(None, comment_date))  # drop the empty strings produced above
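    # for example (an illustrative value, not taken from a real response):
    #   re.sub(r"\s+", "", "\n        2023-01-22\n    ") returns "2023-01-22"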
    # Write the fields of every comment on this page to the CSV file. Iterate over
    # the number of records actually extracted (normally 20 per page) so a short
    # page does not raise an IndexError.
    n = min(len(reviewer), len(score), len(comment_date), len(vote_count), len(comments))
    for j in range(n):
        data_dict = {
            "评论者": reviewer[j],
            "评分等级": score[j],
            "评论日期": comment_date[j],
            "点赞数": vote_count[j],
            "评论内容": comments[j],
        }
        csv_write.writerow(data_dict)
print("第{}页爬取成功".format(i + 1))
# 设置睡眠时间间隔,防止频繁访问网站
time.sleep(random.randint(4, 8))
print("---------------")
print("所有评论爬取成功")