电视剧《知否知否，应是绿肥红瘦》——剧评词云制作

本文链接：https://blog.csdn.net/sinat_37341950/article/details/87301550

前几天看到 CSDN 微信公众号发布的一篇关于《知否》剧评词云制作的文章，觉得很有趣，于是自己也动手模仿了一遍。实现过程中还有一些细节需要注意。

第一步代码如下：

# -*-coding:utf-8-*-
#导入相关模块
import requests
from lxml import etree
import jieba
import time

# Browser-like request headers, sent so the crawler is less likely to be detected.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.79 Safari/537.36"
    ),
}

# Get the next page's URL, matched with XPath.
# NOTE: the definition below is disabled. It sits inside a bare triple-quoted
# string literal, so getPageNum is never defined when this script runs.
'''
def getPageNum(url):
    if url:
        req = requests.get(url, headers=header)
        html = etree.HTML(req.text)
        pageNum = html.xpath(u"//div[@class='paginator']/a[last()]/text()")[0]
    return pageNum
'''
# Fetch one comment page and pull the comment texts out with XPath.
def getContent(url):
    """Return the short-comment strings found on the page at *url*.

    Args:
        url: Address of a comment-list page. A falsy value (``None`` or
            ``""``) means there is nothing to fetch.

    Returns:
        A list with the text of every ``<span class="short">`` node on the
        page, or an empty list when *url* is falsy.
    """
    # Guard first: the original reached ``return data`` with ``data`` unbound
    # when url was falsy, which raised UnboundLocalError.
    if not url:
        return []
    # A timeout keeps one stalled connection from hanging the whole crawl.
    req = requests.get(url, headers=header, timeout=10)
    html = etree.HTML(req.text)
    return html.xpath(u"//span[@class='short']/text()")
# Collect the URL of every comment page into a list.
def getUrl(pages=100, page_size=20):
    """Return the URLs of the first *pages* comment pages.

    Args:
        pages: Number of page URLs to generate. Defaults to 100; the original
            note says the real comment list has only 66 pages.
        page_size: Step between consecutive ``start`` offsets. Defaults to 20.

    Returns:
        A list of *pages* URLs whose ``start`` query value runs
        0, page_size, 2 * page_size, ...
    """
    template = "https://movie.douban.com/subject/26928226/comments?start=%s"
    # The original looped over range(1, 100), which yields only 99 pages even
    # though 100 were intended; range(pages) covers all of them.
    return [template % (page * page_size) for page in range(int(pages))]
# Script entry point: crawl every page, save the raw comments to 3.txt and
# print a simple word-frequency listing.
if __name__ == '__main__':
    # Imported locally so this block does not rely on the file-level imports.
    from collections import Counter

    data = getUrl()
    datas = []
    print(data)
    print(datas)
    # The context manager closes 3.txt even if a request fails part-way;
    # the original opened the file and never closed it.
    with open('./3.txt', mode='w+', encoding='utf-8') as file_handle:
        for u in data:
            # Pause between page requests to mimic a human visitor.
            time.sleep(1)
            for d in getContent(u):
                # Store the raw comment text.
                file_handle.write(d)
                print(d)
                # Per-comment pause, kept from the original pacing.
                time.sleep(1)
                # Segment with jieba and keep only tokens longer than one
                # character after trimming; this drops stray symbols and
                # whitespace.
                for token in jieba.cut(d):
                    token = token.strip()
                    if len(token) > 1:
                        datas.append(token)
    # Count every token in a single pass. The original called datas.count()
    # twice per token, which is quadratic in the number of tokens.
    # Only words seen more than once are kept, in first-seen order.
    dic = {word: total for word, total in Counter(datas).items() if total > 1}
    # Show the result on the console.
    for key, values in dic.items():
        print("%s===%d" % (key, values))

运行后，抓取到的短评会保存到当前目录下的 3.txt 文本文件中。第二步读取该文件并生成词云，代码如下：

# -*- coding: utf-8 -*-

# 导入必要的模块
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build the stop-word list.
def stopwordslist(filepath):
    """Read the stop-word file at *filepath* and return its lines as a list.

    Args:
        filepath: Path to a text file, decoded as UTF-8, with one stop word
            per line.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace (a blank line becomes an empty string).
    """
    # ``with`` closes the file as soon as it has been read; the original
    # left the handle open.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]

# Segment the text with jieba. WordCloud can split text too, but the author
# found jieba's segmentation results better.
def seg_sentence(sentence, stopwords_path='stopwords1893.txt'):
    """Split *sentence* into words and drop stop words and whitespace tokens.

    Args:
        sentence: The text to segment; it is stripped before segmentation.
        stopwords_path: Path of the stop-word file passed to
            ``stopwordslist``. Defaults to ``'stopwords1893.txt'``, the path
            the original hard-coded.

    Returns:
        A list of the remaining words, in their original order.
    """
    # A set gives constant-time membership tests; the original checked every
    # token against the stop-word list itself.
    stopwords = set(stopwordslist(stopwords_path))
    whitespace_tokens = ('\t', ' ', '\n')
    return [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word not in whitespace_tokens
    ]

# Read the crawled comments back as one string.
# 3.txt is written as UTF-8 by the crawler script, so decode it explicitly
# rather than relying on the platform's default encoding. The context
# manager closes the file even if reading fails.
with open("3.txt", 'r', encoding='utf-8') as f:
    sentence = f.read()

# Run the text through jieba segmentation (with stop-word filtering).
word_result_list = seg_sentence(sentence)
# Join the words into one comma-separated string.
word_result = ','.join(word_result_list)


plt.figure(figsize=(12, 6))

# Path of the font file used to render Chinese characters.
font = r'SimHei.ttf'

# Word-cloud settings, gathered in one mapping and passed as keyword arguments.
wordcloud_options = dict(
    background_color='white',  # white background
    colormap='winter',         # 'winter' colour scheme
    font_path=font,            # font for Chinese text
    width=1280,                # image width
    height=720,                # image height
    max_font_size=150,         # largest font size used
    max_words=200,             # maximum number of words shown
)
wc = WordCloud(**wordcloud_options)

# Feed the comma-joined words to the word cloud.
wc.generate(word_result)

# Draw the word cloud with the axes hidden ("off"), then show the figure.
plt.imshow(wc)
plt.axis("off")
plt.show()

# Save the word-cloud image to the current directory.
wc.to_file("pict_wordcloud.jpg")

这里要注意的是，运行前需要先下载两个文件，并放在运行脚本时的当前目录下：一个是停用词列表 stopwords1893.txt，另一个是中文字体文件 SimHei.ttf。

有兴趣的自行下载哦~~

效果图如下：