#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Scrape short comments ("短评") for a Douban movie page, segment the Chinese
# text with jieba, count word frequencies with pandas, and render a word cloud.
import urllib.request
import urllib.error
import re
import threading
from lxml import etree
import time
import pandas as pd
from html.parser import HTMLParser
import sys
import jieba
import numpy #numerical computation package
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud#word-cloud rendering package
# Identify as a desktop browser so Douban serves the normal comment pages.
headers = ("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

url = "https://movie.douban.com/subject/26363254/comments?start=0&limit=20&sort=new_score&status=P"
response = opener.open(url).read().decode("utf-8", "ignore")
html = etree.HTML(response)
page = html.xpath('//li[@class="is-active"]/span/text()')
pagenum = page[0][3:-1]  # total short-comment count, extracted from text like "看过(12345)"
n = int(pagenum) // 20   # number of comment pages (20 comments per page); unused while not logged in

comments = ''
try:
    # Translation table mapping every code point above the BMP (emoji, etc.)
    # to U+FFFD (the replacement character). Built ONCE here — the original
    # rebuilt this ~1M-entry dict inside the inner loop on every comment.
    non_bmp = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    for i in range(10):  # not logged in, so only the first 10 pages are crawled
        url = ("https://movie.douban.com/subject/26363254/comments?start="
               + str(i * 20) + "&limit=20&sort=new_score&status=P")
        response = opener.open(url).read().decode("utf-8", "ignore")
        html = etree.HTML(response)
        sc_list = html.xpath('//span[@class="short"]/text()')  # the @ selects the class attribute
        for sc in sc_list:
            # Strip emoji and surrounding whitespace, accumulate into one string
            # so the whole corpus can be cleaned in a single pass below.
            comments += str(sc).translate(non_bmp).strip()

    # Keep only runs of CJK ideographs — this drops punctuation, digits,
    # Latin text, and the U+FFFD placeholders in one step.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(re.findall(pattern, comments))

    segment = jieba.lcut(cleaned_comments)  # word segmentation -> list of tokens
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 (csv.QUOTE_NONE): the stop-word file is plain text, one word
    # per line; index_col=False keeps the default integer row index.
    stopwords = pd.read_csv("chineseStopWords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    # Drop rows whose token appears in the stop-word list; still a DataFrame.
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # Named aggregation replaces the deprecated dict-on-Series form
    # (fixes: "FutureWarning: using a dict on a Series for aggregation is
    # deprecated and will be removed in a future version").
    words_stat = (words_df.groupby('segment')
                  .agg(计数=('segment', numpy.size))
                  .reset_index()
                  .sort_values(by=["计数"], ascending=False))

    # Font path must point at a font that contains CJK glyphs (e.g. SimHei).
    wordcloud = WordCloud(font_path="Lib/site-packages/wordcloud/font163/simheittf.ttf",
                          background_color="white", max_font_size=80,
                          width=1000, height=600)
    # Top 1000 words as {word: count}, e.g. {'吴京': 8, '主旋律': 6, ...}
    word_frequence = {row[0]: row[1] for row in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
    time.sleep(10)  # back off before any retry by the caller
except Exception as e:
    print("exception:" + str(e))  # generic failure: report and wait 10 seconds
    time.sleep(10)
# 词云如下图所示。注:原实现在运行过程中会出现
# "FutureWarning: using a dict on a Series for aggregation is deprecated
#  and will be removed in a future version";可改用命名聚合(named aggregation)解决。