Reposted from https://www.cnblogs.com/qq1141100952com/p/8906070.html, with a few small changes on top of the original.
The word cloud produced from the Douban comments on 白日焰火 (Black Coal, Thin Ice) looks like this:
The required files are shown in the figure below:
alice_mask.png can be any image you like, and stopwords.txt can be found with a quick web search; the script also expects the simhei.ttf font passed to WordCloud(font_path=...). The code for 1.py is as follows:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings("ignore")
import jieba   # Chinese word segmentation
import numpy   # numerical computing
import codecs  # codecs.open lets you specify a file's encoding and decode to unicode on read (imported but not used below)
import re
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from urllib import request
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, ImageColorGenerator  # word cloud generation
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# Parse the "now playing" page and return the movies currently showing
def getNowPlayingMovie_list():
    resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']       # Douban subject id
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']  # movie title taken from the poster's alt text
            nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
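
# Illustrative return value (the id/name below are this post's example movie):
# [{'id': '21941804', 'name': '白日焰火'}, ...]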
# Fetch one page of short comments for a given movie id
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20  # Douban paginates comments 20 per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    resp = request.urlopen(requrl)
    html_data = resp.read().decode('utf-8')
    #print(html_data)
    soup = bs(html_data, 'html.parser')
    comment_div_lits = soup.find_all('div', class_='comment')
    #print(comment_div_lits)
    for item in comment_div_lits:
        #print(item)
        print(item.find_all('p')[0])
        #print(type(item.find_all('p')[0]))
        if item.find_all('p')[0] is not None:
            eachCommentList.append(item.find_all('p')[0])
    return eachCommentList
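
# Example: getCommentsById('21941804', 1) requests .../comments?start=0&limit=20,
# i.e. the first 20 comments; page 2 starts at 20, and so on.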
def main():
    # Collect the first 10 pages of comments for one movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)
    for i in range(10):
        num = i + 1
        #commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList_temp = getCommentsById('21941804', num)  # Douban id of 白日焰火 is 21941804
        print(commentList_temp)
        commentList.append(commentList_temp)
    #print(commentList)

    # Flatten the collected pages into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()
    #print(comments)

    # Keep only Chinese characters, which also strips punctuation and leftover HTML
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    #print(cleaned_comments)

    # Segment the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove stop words
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='gbk')  # quoting=3 (QUOTE_NONE): never treat quotes specially
    #print(stopwords)
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Count word frequencies (the dict-style agg rename used in the original post was removed in newer pandas)
    words_stat = words_df.groupby('segment').agg(计数=('segment', numpy.size))
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    # print(words_stat.head())

    bg_pic = numpy.array(Image.open("alice_mask.png"))
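    # Note: in the wordcloud library, pure-white regions of the mask image are treated
    # as background, so no words are drawn there.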
    # Render the word cloud
    wordcloud = WordCloud(
        font_path="simhei.ttf",   # a Chinese font is required, otherwise the words render as empty boxes
        background_color="white",
        max_font_size=80,
        width=2000,
        height=1800,
        mask=bg_pic,
        mode="RGBA"
    )
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}  # top 1000 words -> {word: count}
    print(word_frequence)
    """
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    #print(word_frequence_list)
    """
    wordcloud = wordcloud.fit_words(word_frequence)
    image_colors = ImageColorGenerator(bg_pic)  # color function derived from the mask image
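    # image_colors is not applied above; to color the cloud from the mask image you could,
    # optionally, display wordcloud.recolor(color_func=image_colors) instead of wordcloud.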
    plt.imshow(wordcloud)  # display the word cloud
    plt.axis("off")
    plt.show()
    wordcloud.to_file('show_Chinese.png')  # save the word cloud to a file
main()
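
The script assumes jieba, numpy, pandas, matplotlib, Pillow, beautifulsoup4 and wordcloud are installed (all available via pip), and that stopwords.txt, alice_mask.png and simhei.ttf sit in the same directory as 1.py. If the plain request.urlopen calls come back with an HTTP error, Douban may be rejecting the default urllib client; a minimal sketch of a workaround, assuming a browser-like User-Agent is enough for the pages being fetched, is the hypothetical helper fetch_html below, which could be used in place of request.urlopen in the two functions above:

from urllib import request

def fetch_html(url):
    # Hypothetical helper: send a browser-like User-Agent so Douban is less likely
    # to reject the request, then decode the response body as UTF-8.
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return request.urlopen(req).read().decode('utf-8')

For example, html_data = fetch_html(requrl) would replace the resp/urlopen pair in getCommentsById.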