《叶问4》完结篇影评词云统计分析

最新推荐文章于 2020-02-26 19:11:10 发布

guokanglun

最新推荐文章于 2020-02-26 19:11:10 发布

阅读量425

点赞数

本文链接：https://blog.csdn.net/gklcsdn/article/details/103990742

版权

文章目录

1. 影评地址

https://movie.douban.com/subject/26885074/reviews?start=0

2. 获取影评数据

豆瓣的反爬措施比较严格，因此这里只用单线程抓取，并在每次请求之间暂停几秒。

# @Time : 2020/1/15 14:52
# @Author : GKL
# FileName : spider.py
# Software : PyCharm

import requests
from lxml import etree
import json
import time


class Spider(object):
    """Download Douban reviews for subject 26885074 and store them on disk.

    Every review is appended to ``yewen.json`` as one JSON object per line
    with the keys ``author`` and ``text``.
    """

    def __init__(self):
        # self.url = 'https://movie.douban.com/subject/26885074/reviews?start=0'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_data(self, url):
        """
        Fetch one review-list page and append its reviews to yewen.json.

        :param url: address of the review-list page to download
        :return: None
        """
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=self.headers, timeout=10).content.decode('utf-8')

        # Parsed document used for the xpath queries below
        page = etree.HTML(response)

        # One child <div> per entry of the review list
        node_list = page.xpath('//div[@class="review-list  "]/div')

        lines = []
        for node in node_list:

            # Author name
            authors = node.xpath('.//header[@class="main-hd"]//a[2]/text()')
            if not authors:
                # No author link found -- presumably not a review entry;
                # skip it instead of aborting the crawl with an IndexError.
                continue
            author = authors[0]

            # Abbreviated review text
            text = node.xpath('string(.//div[@class="main-bd"]//div[@class="short-content"])')

            print(author)
            items = {
                'author': author,
                'text': text.strip()
            }
            lines.append(json.dumps(items, ensure_ascii=False) + '\n')

        # Persist the page in one go; the file is untouched when nothing matched.
        if lines:
            with open('yewen.json', 'a', encoding='utf-8') as f:
                f.writelines(lines)

    def run(self, pages=47, delay=3):
        """
        Walk through the review-list pages and scrape each one.

        The ``start`` query parameter advances by 20 per page and begins at 0,
        so the first page of reviews is included.

        :param pages: number of list pages to fetch, counted from the first
        :param delay: seconds to sleep after each page
        :return: None
        """
        for page in range(pages):
            url = 'https://movie.douban.com/subject/26885074/reviews?start={}'.format(page * 20)
            print('正在爬取第{}页'.format(page + 1))
            self.get_data(url)
            time.sleep(delay)


if __name__ == '__main__':
    # Script entry point: build the spider and start crawling right away.
    Spider().run()

3. 制作词云图

import jieba
from wordcloud import WordCloud
import json


# Load the scraped reviews (one JSON object per line) and merge their texts.
# The context manager closes the file even if a line fails to parse, and
# blank lines are skipped so they cannot crash json.loads.
with open("yewen.json", "r", encoding="utf-8") as f:
    all_text = "".join(json.loads(line)['text'] for line in f if line.strip())

# Strip filler words that would otherwise dominate the word cloud
result_str = all_text
for filler in ('展开', '这篇', '影评', '电影', '这部', '可能', '剧情'):
    result_str = result_str.replace(filler, '')

# Segment the text with jieba; the tokens are joined with spaces before
# being passed to WordCloud.generate.
cut_text = jieba.lcut(result_str)
result = " ".join(cut_text)
wc = WordCloud(
    font_path='simhei.ttf',  # font file used to render the words
    background_color="white",  # background colour
    max_words=600,  # maximum number of words drawn
    width=1000,  # output width
    height=1000,  # output height
    # limits for the rendered font size
    min_font_size=20,
    max_font_size=100,
    # mask= plt.imread('snake.jpg')  # optional background image
)
wc.generate(result)  # build the word cloud from the segmented text
wc.to_file("test.jpg")  # save the image