# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 20:00:33 2018
@author: wenyun.wxw
"""
import jieba
import re
# ratecontent is assumed to hold the scraped review strings (built elsewhere).
# Keep an untouched copy, then drop the platform's default placeholder left by
# buyers who wrote no review.
ratecontent2 = list(ratecontent)
ratecontent = [r for r in ratecontent if r != '此用户没有填写评论!']
def fenci(ratecontent):
    # register custom words so jieba keeps them as single tokens
    jieba.add_word('天猫精灵')
    # load the stop-word list (assumed to be UTF-8, one word per line)
    with open('stop_words.txt', encoding='utf-8') as f:
        stopwords = {}.fromkeys(line.rstrip() for line in f)
    word = []
    # ratecontent holds one review per element, so loop over every review --------------------------
    for i in range(len(ratecontent)):
        # jieba segmentation, accurate mode
        seg_list = jieba.cut(ratecontent[i], cut_all=False)
        sent = '/'.join(seg_list)
        # split on the '/' separators, whitespace (\s also matches tabs etc.),
        # Chinese/ASCII punctuation, digits, and leftover HTML entities (hellip, amp)
        sentlist = re.split(r'(?:hellip|amp|[/;,\s,。!!~、?&()《\d])+', sent)
        # drop stop words and the empty strings left behind by the split
        sentlist2 = [w for w in sentlist if w and w not in stopwords]
        # one list of kept tokens per review
        word.append(sentlist2)
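    # each element of word is the list of kept tokens for one review,
    # e.g. ['天猫精灵', '音质', '不错'] (illustrative values only)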
    #------------------------------------------------------
    #print(word)
    #print("line " + str(i) + ": " + "/ ".join(seg_list))
    # alternative: thulac segmentation (kept for reference, not used)
    #import thulac
    #for i in range(0, len(ratecontent)):
    #    seg_list = thulac.thulac().cut(ratecontent[i])
    #    word.append('/'.join(seg_list))
    ## split on punctuation so each review's tokens end up on their own row
    #word2 = []
    #wordall = ''.join(word)  # concatenate word into one string
    ## \s matches any whitespace character (including tabs), so \s+ means one or more
    #word2 = re.split(r'[/,;,\s,,,。,!,!,~,、,?,hellip,amp,&,(,),《,\d]+', wordall)
    ## stop-word filtering
    #stopwords = {}.fromkeys([line.rstrip() for line in open('stop_words.txt')])
    #wordlist = []
    #for w in word:
    #    if w not in stopwords:
    #        wordlist.append(w)
    # flatten word into a single list containing every token
    wordall = []
    for w in word:
        wordall.extend(w)
    # word-frequency count
    wordcount = {}  # dict mapping token -> occurrence count
    for item in wordall:
        if item not in wordcount:
            wordcount[item] = 1
        else:
            wordcount[item] += 1
    # sort by frequency, highest first
    wordcount_sort = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)
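    # An equivalent, more concise alternative (a sketch, not used here):
    # collections.Counter builds the same token -> count mapping, and
    # most_common() returns the pairs already sorted by frequency.
    #from collections import Counter
    #wordcount = Counter(wordall)
    #wordcount_sort = wordcount.most_common()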
    return word, wordcount, wordcount_sort
word, wordcount, wordcount_sort = fenci(ratecontent)
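# Quick sanity check (illustrative only, not needed by the rest of the script):
# print the 20 most frequent (token, count) pairs.
#print(wordcount_sort[:20])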
# word-cloud visualisation
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#wordcloud = WordCloud(background_color="white", font_path="C:/Windows/Fonts/MSYH.TTC",
#                      width=1000, height=860, margin=2).generate(wordall)
# word size is driven by frequency; fit_words expects a dict, so the frequency dict
# built above (or jieba's own frequency-tagged output) can be passed in directly
# (font_path must be a plain filesystem path to a Chinese font)
wordcloud = WordCloud(background_color="white", font_path="C:/Windows/Fonts/MSYH.TTC",
                      width=1000, height=860, margin=2).fit_words(wordcount)
# in the IDE's Preferences (e.g. Spyder), switch the graphics backend from Inline to Qt5
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# export the results
# (utf-8-sig keeps the Chinese text readable when the CSV is opened in Excel)
file = open('分词结果.csv', 'w', encoding='utf-8-sig')
for tokens in word:
    file.write(','.join(tokens) + '\n')
file.close()
file = open('词频统计.csv', 'w', encoding='utf-8-sig')
for w, count in wordcount_sort:
    file.write(','.join((w, str(count))) + '\n')
file.close()
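# The same export could be done with the standard csv module, which also quotes
# any token that happens to contain a comma (a sketch reusing the same file name):
#import csv
#with open('词频统计.csv', 'w', newline='', encoding='utf-8-sig') as f:
#    csv.writer(f).writerows(wordcount_sort)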