# -*- coding: utf-8 -*-
"""
Created on Tue Aug 28 14:38:16 2018
@author: wenyun.wxw
"""
import jieba
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
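
# NOTE: ratecontent_less (Tmall reviews) and ratecontent_jd_less (JD reviews) are
# assumed to be left in the workspace by the earlier scraping scripts.  A possible
# standalone fallback (the JSON file names are placeholders, not part of the
# original pipeline) would be to load them from disk first:
if 'ratecontent_less' not in dir():
    import json
    with open('ratecontent_tmall.json', encoding='utf-8') as f:
        ratecontent_less = json.load(f)
    with open('ratecontent_jd.json', encoding='utf-8') as f:
        ratecontent_jd_less = json.load(f)
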
# Merge the Tmall reviews with the JD reviews
data = ratecontent_less + ratecontent_jd_less
# Drop JD's placeholder text for buyers who left no written comment
# (build a new list instead of deleting from the list being indexed, which skips items and can raise IndexError)
data = [d for d in data if d != '此用户没有填写评论!']


def fenci(ratecontent):
    # Register custom words so jieba keeps them as single tokens
    jieba.add_word('小度在家')
    jieba.add_word('摄像头')
    # Load the stop-word list, one word per line
    # (pass an explicit encoding= to open() if stop_words.txt is not in the platform default)
    with open('stop_words.txt') as f:
        stopwords = {}.fromkeys(line.rstrip() for line in f)
    word = []
    # ratecontent is a list of reviews; loop over each one ------------------------------------
    for i in range(len(ratecontent)):
        # jieba segmentation, accurate mode
        seg_list = jieba.cut(ratecontent[i], cut_all=False)
        sent = '/'.join(seg_list)
        # Split on punctuation ('hellip' and 'amp' inside the character class match
        # their individual letters, which is enough here since only Chinese is kept).
        # \s matches a single whitespace character (including tabs), so \s+ matches a run of one or more.
        sentlist = re.split(r'[/,;,\s,,,。,!,!,~,、,?,hellip,amp,&,(,),《,\d]+', sent)
        # Remove stop words and keep only Chinese characters
        sentlist2 = []
        for j in range(len(sentlist)):  # use j so the review index i is not shadowed
            sentlist[j] = re.sub(r'[^\u4e00-\u9fa5]', "", sentlist[j])  # keep Chinese only
            if sentlist[j] not in stopwords and sentlist[j] != '':
                sentlist2.append(sentlist[j])
        # One cleaned token list per review
        word.append(sentlist2)
    # Flatten the per-review token lists into one list
    wordall = []
    for w in word:
        wordall.extend(w)
    # Count word frequencies
    wordcount = {}  # dict: token -> count
    for item in wordall:
        if item not in wordcount:
            wordcount[item] = 1
        else:
            wordcount[item] += 1
    # Sort by frequency, highest first
    wordcount_sort = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)
    return word, wordcount, wordcount_sort
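

# Alternative sketch (not part of the original pipeline): the manual frequency
# count inside fenci() could equivalently be built with collections.Counter.
def count_words(tokens):
    """Return (frequency dict, (word, count) pairs sorted by descending count)."""
    from collections import Counter
    wordcount = dict(Counter(tokens))
    wordcount_sort = sorted(wordcount.items(), key=lambda kv: kv[1], reverse=True)
    return wordcount, wordcount_sort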


if __name__ == '__main__':
    word, wordcount, wordcount_sort = fenci(data)
    # Word-cloud visualisation: font size follows word frequency.
    # fit_words() expects a dict of word -> frequency, so the dict built in fenci()
    # (or jieba's own frequency output) can be passed in directly.
    # font_path must be a plain filesystem path, not a file:// URL.
    wordcloud = WordCloud(background_color="white", font_path="C:/Windows/Fonts/MSYH.TTC",
                          width=1000, height=860, margin=2).fit_words(wordcount)
    # In Spyder Preferences, switch the IPython graphics backend from inline to Qt5
    plt.imshow(wordcloud)
    plt.axis("off")
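    # Show the rendered figure; optionally save it to disk (the file name below
    # is only a placeholder, not part of the original script).
    plt.show()
    # wordcloud.to_file('wordcloud.png')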