Wolf Warrior 2 (战狼2) Douban Word Cloud

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
 

import urllib.request
import urllib.error
import re
import threading
from lxml import etree
import time
import pandas as pd
from html.parser import HTMLParser
import sys
import jieba
import numpy  # numerical computation package
import matplotlib.pyplot as plt
 
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package


headers=("user-agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
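# install_opener() makes these headers the default for urllib.request.urlopen() as well;
# the code below calls opener.open() directly, so either style would work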

url="https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P"
response  = opener.open(url).read().decode("utf-8","ignore")
html = etree.HTML(response) 
page = html.xpath('//li[@class="is-active"]/span/text()')
pagenum=page[0][3:-1]#获取短评条数
n=int(pagenum)//20#获取短评(short commentary)页数
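# NOTE: the is-active tab text appears to have the form "看过(261231)", so the [3:-1]
# slice drops the three leading characters 看过( and the trailing parenthesis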

comments = ''
try:
    # Build the translation table once: every code point above the Basic Multilingual
    # Plane (emoji and the like) maps to 0xfffd, the Unicode "replacement character"
    non_num = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    for i in range(0, 10):  # not logged in yet, so only the first 10 pages are crawled
        url = "https://movie.douban.com/subject/26363254/comments?start=" + str(i*20) + "&limit=20&sort=new_score&status=P"
        response = opener.open(url).read().decode("utf-8", "ignore")
        html = etree.HTML(response)
        sc_list = html.xpath('//span[@class="short"]/text()')  # don't overlook the @ in the attribute selector

        for j in range(0, len(sc_list)):
            # collect all comments into a single string so they can be cleaned in one pass
            comments = comments + sc_list[j].translate(non_num).strip()

    # Keep only runs of Chinese characters (one or more Han characters per match),
    # which strips punctuation and the replacement characters in one go
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)  # join the matched fragments into one cleaned string
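    # Example: a comment such as "燃爆了🔥" first has the emoji turned into \ufffd by
    # translate(), and the Han-only regex pass then reduces it to "燃爆了"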
                
    segment = jieba.lcut(cleaned_comments)  # returns a list of tokens
    words_df = pd.DataFrame({'segment': segment})  # single column named 'segment'
    stopwords = pd.read_csv("chineseStopWords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 disables quote handling; index_col=False keeps pandas' auto-generated row index
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]  # drop tokens that appear in the stopword list; words_df is still a DataFrame
    words_stat = words_df.groupby('segment').size().reset_index(name='count')  # occurrences per token; avoids the deprecated dict-on-Series agg
    words_stat = words_stat.sort_values(by=['count'], ascending=False)
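    # words_stat now has two columns, 'segment' and 'count', sorted by count descending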

    wordcloud = WordCloud(font_path="Lib/site-packages/wordcloud/font163/simhei.ttf", background_color="white", max_font_size=80, width=1000, height=600)  # font file (a Chinese font such as SimHei is required), background color and canvas size
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}  # dict mapping word to count, e.g. {'吴京': 8, '主旋律': 6}
    
    wordcloud=wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
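    # wordcloud.to_file("wolf_warrior2.png")  # optional: WordCloud.to_file saves the image (filename is arbitrary)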
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
        time.sleep(10)
except Exception as e:
    print("exception:" + str(e))
    time.sleep(10)  # on a generic exception, wait 10 seconds
    

The resulting word cloud is shown below. Note: on recent pandas versions, the original dict-style aggregation .agg({"计数": numpy.size}) raises FutureWarning: using a dict on a Series for aggregation is deprecated and will be removed in a future version; the groupby('segment').size().reset_index(name='count') form used above avoids the warning.
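For reference, a minimal, self-contained sketch of the warning-free counting, on a hypothetical toy token list:

import pandas as pd

words_df = pd.DataFrame({'segment': ['吴京', '主旋律', '吴京', '燃']})  # toy tokens, hypothetical
# Deprecated form: words_df.groupby('segment')['segment'].agg({'count': 'size'})
words_stat = words_df.groupby('segment').size().reset_index(name='count')  # one row per distinct token
words_stat = words_stat.sort_values(by=['count'], ascending=False)
print(words_stat)  # '吴京' counted twice, the others once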

(word cloud image)