1. 影评地址
https://movie.douban.com/subject/26885074/reviews?start=0
2. 获取影评数据
豆瓣的反爬措施比较严格，使用单线程爬取即可。
# @Time : 2020/1/15 14:52
# @Author : GKL
# FileName : spider.py
# Software : PyCharm
import requests
from lxml import etree
import json
import time
class Spider(object):
    """Scrape Douban reviews for movie subject 26885074 into a JSON Lines file.

    Every review found is appended to ``yewen.json`` as one JSON object per
    line with the keys ``author`` and ``text``.
    """

    # Review listing URL; ``start`` is the offset of the first review on the page.
    REVIEWS_URL = 'https://movie.douban.com/subject/26885074/reviews?start={}'
    # Offset step between two consecutive listing pages.
    PAGE_SIZE = 20
    # File the scraped reviews are appended to.
    OUTPUT_PATH = 'yewen.json'

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_data(self, url):
        """
        Fetch one review listing page and persist its reviews.

        :param url: URL of the review listing page to download.
        :return: None. Reviews are appended to ``yewen.json``.
        """
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=self.headers, timeout=10).content.decode('utf-8')
        # Parsed document used for the XPath queries below.
        page = etree.HTML(response)
        # Every direct child div of the review list is a candidate review node.
        node_list = page.xpath('//div[@class="review-list "]/div')
        lines = []
        for node in node_list:
            # Author name; skip nodes without one instead of raising IndexError.
            authors = node.xpath('.//header[@class="main-hd"]//a[2]/text()')
            if not authors:
                continue
            author = authors[0]
            # Text content of the short review body.
            text = node.xpath('string(.//div[@class="main-bd"]//div[@class="short-content"])')
            print(author)
            items = {
                'author': author,
                'text': text.strip()
            }
            lines.append(json.dumps(items, ensure_ascii=False) + '\n')
        # Persist once per page rather than reopening the file for every review.
        if lines:
            with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as f:
                f.writelines(lines)

    def run(self, pages=46, delay=3):
        """
        Walk the listing pages in order and scrape each one.

        :param pages: number of listing pages to fetch, starting from page 1.
        :param delay: seconds to sleep after each page.
        :return: None.
        """
        for i in range(1, pages + 1):
            # Page 1 starts at offset 0, so the offset is (page - 1) * PAGE_SIZE.
            url = self.REVIEWS_URL.format((i - 1) * self.PAGE_SIZE)
            print('正在爬取第{}页'.format(i))
            self.get_data(url)
            time.sleep(delay)
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    Spider().run()
3. 制作词云图
import jieba
from wordcloud import WordCloud
import json
# Phrases removed from the review text before word segmentation.
STOP_PHRASES = ('展开', '这篇', '影评', '电影', '这部', '可能', '剧情')

# Load the scraped reviews: one JSON object per line, each with a 'text' key.
# The context manager closes the file even if a line fails to parse, and
# blank lines are skipped so they do not crash json.loads.
with open("yewen.json", "r", encoding="utf-8") as f:
    texts = [json.loads(line)['text'] for line in f if line.strip()]

# Join once instead of growing a string in a loop (and avoid shadowing `str`).
review_text = ''.join(texts)
# Strip the unimportant phrases, in the listed order.
for phrase in STOP_PHRASES:
    review_text = review_text.replace(phrase, '')

# Segment with jieba and join the tokens with spaces, the form passed to
# WordCloud.generate below.
cut_text = jieba.lcut(review_text)
result = " ".join(cut_text)

wc = WordCloud(
    font_path='simhei.ttf',  # font file
    background_color="white",  # background colour
    max_words=600,  # maximum number of words drawn
    width=1000,  # output width
    height=1000,  # output height
    # font size limits
    min_font_size=20,
    max_font_size=100,
    # mask=plt.imread('snake.jpg')  # optional background image (disabled)
)
wc.generate(result)  # build the word cloud from the segmented text
wc.to_file("test.jpg")  # save the image