#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Scrape short comments ("短评") for a Douban movie page, segment the Chinese
# text with jieba, count word frequencies with pandas, and render a word cloud.
import urllib.request
import urllib.error
import re
import threading
from lxml import etree
import time
import pandas as pd
from html.parser import HTMLParser
import sys
import jieba
import numpy #numerical computation package
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud#word-cloud rendering package
# Identify as a desktop browser so Douban serves the normal comment pages.
headers = ("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

url = "https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P"
response = opener.open(url).read().decode("utf-8", "ignore")
html = etree.HTML(response)
page = html.xpath('//li[@class="is-active"]/span/text()')
pagenum = page[0][3:-1]  # total short-comment count, extracted from text like "看过(12345)"
n = int(pagenum) // 20   # number of comment pages (20 comments per page); unused while not logged in

comments = ''
try:
    # Translation table mapping every code point above the BMP (emoji, etc.)
    # to U+FFFD (the replacement character). Built ONCE here — the original
    # rebuilt this ~1M-entry dict inside the inner loop on every comment.
    non_bmp = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    for i in range(10):  # not logged in, so only the first 10 pages are crawled
        url = ("https://movie.douban.com/subject/26363254/comments?start="
               + str(i * 20) + "&limit=20&sort=new_score&status=P")
        response = opener.open(url).read().decode("utf-8", "ignore")
        html = etree.HTML(response)
        sc_list = html.xpath('//span[@class="short"]/text()')  # the @ selects the class attribute
        for sc in sc_list:
            # Strip emoji and surrounding whitespace, accumulate into one string
            # so the whole corpus can be cleaned in a single pass below.
            comments += str(sc).translate(non_bmp).strip()

    # Keep only runs of CJK ideographs — this drops punctuation, digits,
    # Latin text, and the U+FFFD placeholders in one step.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(re.findall(pattern, comments))

    segment = jieba.lcut(cleaned_comments)  # word segmentation -> list of tokens
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 (csv.QUOTE_NONE): the stop-word file is plain text, one word
    # per line; index_col=False keeps the default integer row index.
    stopwords = pd.read_csv("chineseStopWords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    # Drop rows whose token appears in the stop-word list; still a DataFrame.
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # Named aggregation replaces the deprecated dict-on-Series form
    # (fixes: "FutureWarning: using a dict on a Series for aggregation is
    # deprecated and will be removed in a future version").
    words_stat = (words_df.groupby('segment')
                  .agg(计数=('segment', numpy.size))
                  .reset_index()
                  .sort_values(by=["计数"], ascending=False))

    # Font path must point at a font that contains CJK glyphs (e.g. SimHei).
    wordcloud = WordCloud(font_path="Lib/site-packages/wordcloud/font163/simheittf.ttf",
                          background_color="white", max_font_size=80,
                          width=1000, height=600)
    # Top 1000 words as {word: count}, e.g. {'吴京': 8, '主旋律': 6, ...}
    word_frequence = {row[0]: row[1] for row in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
    time.sleep(10)  # back off before any retry by the caller
except Exception as e:
    print("exception:" + str(e))  # generic failure: report and wait 10 seconds
    time.sleep(10)
# 词云如下图所示。注:原实现在运行过程中会出现
# "FutureWarning: using a dict on a Series for aggregation is deprecated
#  and will be removed in a future version";可改用命名聚合(named aggregation)解决。