'''
https://movie.douban.com/subject/35660795/comments?status=P
# 第一页
https://movie.douban.com/subject/35660795/comments?limit=20&status=P&sort=new_score
# 第二页
https://movie.douban.com/subject/35660795/comments?start=20&limit=20&status=P&sort=new_score
# 第三页
https://movie.douban.com/subject/35660795/comments?start=40&limit=20&status=P&sort=new_score
# 第四页
https://movie.douban.com/subject/35660795/comments?start=60&limit=20&status=P&sort=new_score
1 豆瓣爬虫 获取到某一部电影的 影评和星级评定
2 表格存储
3 数据分析 可以通过词云来展示评论
4 数据分析 可以通过图表来展示评级
'''
import requests
import random
from bs4 import BeautifulSoup
import time
import csv
# Fetch the short comments and star ratings of one movie
def getdata(movie_id="35267208", pages=20, out_path="流浪地球.csv", timeout=10, max_delay=0):
    """Scrape Douban short comments and their star ratings into a CSV file.

    Comment pages are requested 20 entries at a time. For every entry that
    has both a comment text and a rating element, one row
    ``[comment, rating title]`` is appended to ``out_path``.

    Args:
        movie_id: Douban subject id placed in the comments URL.
        pages: Maximum number of pages to request.
        out_path: CSV file that rows are appended to (UTF-8, created on
            the first page that yields at least one row).
        timeout: Seconds to wait for each HTTP response.
        max_delay: Upper bound, in seconds, of a random pause between
            pages; 0 disables the pause.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out; rows from pages already processed stay in the file.
    """
    base_url = "https://movie.douban.com/subject/%s/comments" % movie_id
    for i in range(pages):
        # The first page carries no "start" offset; later pages advance by 20.
        if i == 0:
            url = base_url + "?limit=20&status=P&sort=new_score"
        else:
            url = base_url + "?start=%d&limit=20&status=P&sort=new_score" % (i * 20)
        print("第%d页" % (i + 1))
        print(url)

        # Request; the timeout keeps a stalled connection from hanging the run.
        req = requests.get(url=url, headers=getheader(), timeout=timeout)
        if req.status_code != 200:
            # An error/blocked page holds no comments; further requests
            # would only repeat the failure, so stop here.
            print(req)
            break

        # Parse
        bf = BeautifulSoup(req.text, "html.parser")
        # The last child div is skipped, as in the original selection
        # (presumably a non-comment trailer such as pagination -- confirm
        # against the page markup).
        comment_item_list = bf.select("#comments > div")[0:-1]
        print(len(comment_item_list))

        rows = []
        for item in comment_item_list:
            # Comment text
            content = item.select_one("div.comment > p > span")
            # Star rating
            satr = item.select_one("div.comment > h3 > span.comment-info > span.rating")
            if content is None or satr is None:
                continue
            # get_text() also works when the span has nested markup,
            # where .string would be None.
            text = content.get_text()
            rating = satr.get("title")
            print("星级:", rating)
            print("评论:", text)
            rows.append([text, rating])

        # Save to the CSV once per page instead of reopening it per row.
        if rows:
            with open(out_path, "a", encoding="utf-8", newline="") as file:
                csv.writer(file).writerows(rows)
        print(req)

        if not comment_item_list:
            # No entries on this page: nothing further to fetch.
            break
        if max_delay > 0:
            time.sleep(random.uniform(0, max_delay))
def getheader():
    """Return request headers with a randomly chosen User-Agent.

    Returns:
        A new dict with two keys: ``"cookie"`` (the fixed cookie string
        below) and ``"User-Agent"`` (one of four browser identifiers,
        picked at random on every call).
    """
    # NOTE(review): this cookie is a hard-coded browser session captured at
    # one point in time; it looks like it contains login credentials and
    # will likely expire -- consider loading it from configuration instead.
    cookie = 'll="118374"; bid=5BWnT93nswI; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1687919030%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DqX37PTYhScD_awxCeP3Ktm7oufMdiPcsRBLJGfUXpF-Ae2HXb1bkvRff2_FogQao%26wd%3D%26eqid%3D9b766358000ba8a300000002649b99b2%22%5D; _pk_id.100001.4cf6=76f6078b4899eed2.1687919030.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.751936906.1687919030.1687919030.1687919030.1; __utmc=30149280; __utmz=30149280.1687919030.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.2094033427.1687919030.1687919030.1687919030.1; __utmb=223695111.0.10.1687919030; __utmc=223695111; __utmz=223695111.1687919030.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=OS3qPoepToTPqTYUkhMS1sWrhA1dE7NN; _vwo_uuid_v2=D8A732448233270BE48169B86529963B2|7b9fbf6d7844f3ca88bd806ae18e3b88; __gads=ID=0e8c2fb9e2ff39f4-22bf202afce100c4:T=1687919055:RT=1687919055:S=ALNI_MZMRaV7Kg1R7LALq5HoCGxMz5lbDQ; __gpi=UID=00000c772dca9e00:T=1687919055:RT=1687919055:S=ALNI_MZ0rWJuRVQHvCa5VHkoHTJER1dlZg; __utmt=1; __utmb=30149280.4.9.1687921460720; dbcl2="203371577:W3oBbzkRxao"; ck=fdSm; push_noty_num=0; push_doumail_num=0'
    user_agents = (
        # Edg
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58",
        # chrome
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        # 360
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        # mac
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    )
    # Only the User-Agent differs between the candidate headers, so pick it
    # directly instead of building four dicts and indexing by a random int.
    return {"cookie": cookie, "User-Agent": random.choice(user_agents)}
if __name__ == '__main__':
    # Script entry point: scrape the comments and ratings into the CSV file.
    getdata()
# 5 豆瓣评论下载
# 最新推荐文章于 2025-06-04 16:07:42 发布