# -*- coding:utf-8 -*-
import re #正则
import jieba
from wordcloud import WordCloud
import jieba.analyse
from collections import Counter
import matplotlib.pyplot as plt #图片包
import pandas as pd
# Load the custom jieba user dictionary.
jieba.load_userdict("E:/Z/Z&R语言/京东评论/zidingyi.txt")

# Load the stop-word list, one entry per line (trailing whitespace removed).
# The file is read inside a ``with`` block so the handle is closed as soon as
# the list is built; previously it was opened inside a comprehension and never
# closed explicitly.  A dict keyed by word is kept (values are None) so that
# ``word in stopwords`` remains a hash lookup.
with open('E:/Z/Z&R语言/京东评论/tingyong.txt', encoding="utf-8") as stopword_file:
    stopwords = dict.fromkeys(line.rstrip() for line in stopword_file)

# Read the whole review dump into memory.
# NOTE: the name ``str`` shadows the builtin.  It is kept on purpose because
# later top-level code in this script reads the raw text through this name.
with open("E:/Z/Z&R语言/京东评论/jdphonepinglun-utf8.txt", encoding="utf-8") as f:
    str = f.read()

# Product id: the text between `{"_id":"` and the next `",`.
shangpingid = re.compile(r"(?<=\{\"_id\":\").*?(?=\",)", re.S)
# Review payload: the text between `:[` and the next `]`.
pinglun = re.compile(r"(?<=:\[).*?(?=\])", re.S)

# Both lists are in order of appearance in the dump.
idlist = shangpingid.findall(str)
pinglunlist = pinglun.findall(str)
# print("id数量",len(idlist))
# print("评论数量",len(pinglunlist))
'''关键词和相应的权重
# 第一个参数:待提取关键词的文本
# 第二个参数:返回关键词的数量,重要性从高到低排序
# 第三个参数:是否同时返回每个关键词的权重
# 第四个参数:词性过滤,为空表示不过滤,若提供则仅返回符合词性要求的关键词
keywords = jieba.analyse.extract_tags(pinglunlist[1], topK=20, withWeight=True, allowPOS=())
# 访问提取结果
for item in keywords:
# 分别为关键词和相应的权重
print(item[0], item[1])
'''
'''分词
seg_list = jieba.cut(j, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut(j, cut_all=False)
print(",".join(seg_list)) # 精确模式 默认是精确模式
seg_list = jieba.cut_for_search(j) # 搜索引擎模式
print(",".join(seg_list))
'''
# Word-frequency export and tag-cloud rendering.
def Cloud(cut, name="", font_path='data/simhei.ttf', top_n=2000):
    """Write a word-frequency CSV and a tag-cloud image for a token list.

    Tokens that are exactly one character long, or that are found in the
    module-level ``stopwords`` collection, are dropped.  The remaining tokens
    are counted.  The ``top_n`` most frequent ones are written to
    ``<name>的词频数.csv`` and the complete frequency table is rendered to
    ``<name>标签云.jpg``.

    Args:
        cut: Iterable of token strings, e.g. the result of ``jieba.lcut``.
        name: Prefix used for both output file names; printed on success.
        font_path: Font file passed to ``WordCloud``.  The default is the
            path that was previously hard-coded.
        top_n: Maximum number of rows written to the CSV.  The default is
            the limit that was previously hard-coded.

    Returns:
        None.  When no token survives filtering, the CSV (header only) is
        still written but the image is skipped.
    """
    # The length test comes first so one-character tokens never reach the
    # stop-word lookup, exactly as in the original ``or`` condition.
    kept = [word for word in cut if len(word) != 1 and word not in stopwords]
    c = Counter(kept)

    # Write the CSV of the most frequent words.
    ranked = c.most_common(top_n)
    dataframe = pd.DataFrame({
        '词': [word for word, _ in ranked],
        '出现次数': [count for _, count in ranked],
    })
    dataframe.to_csv(name+"的词频数.csv", index=False, sep=',', encoding="utf_8_sig")

    if not c:
        # WordCloud refuses an empty frequency table (it raises ValueError),
        # so there is nothing to draw; report it and stop here.
        print(name + ": no words left after filtering; tag cloud skipped")
        return

    # Render the tag cloud on a black background.
    wordCloud = WordCloud(
        font_path=font_path,
        background_color='black',
    )
    wordCloud.fit_words(c)
    wordCloud.to_file(name+"标签云.jpg")
    print(name)
    # For an interactive preview instead of a file:
    #   plt.imshow(wordCloud); plt.axis("off"); plt.show()
# Whole corpus: segment the raw dump (precise mode) and chart it.
cutzong = jieba.lcut(str, cut_all=False)  # jieba.lcut returns a list directly
Cloud(cutzong, "总体")

# Per product: chart only products whose review text yields at least 20000
# tokens.  ``zip`` pairs each review blob with its id by position and stops at
# the shorter list, so a missing id can no longer raise IndexError.
for j, product_id in zip(pinglunlist, idlist):
    cutData = jieba.lcut(j, cut_all=False)
    if len(cutData) < 20000:
        continue
    try:
        Cloud(cutData, product_id)
        print(" ".join(cutData))
    except Exception as exc:
        # Best effort per product: report the failure and move on to the next
        # product.  The previous bare ``except: pass`` hid every error (even
        # KeyboardInterrupt) and abandoned all remaining products.  The
        # message is forced to ASCII so that reporting cannot itself fail on
        # a console with a limited encoding.
        print("Skipping product", ascii(product_id), "-",
              type(exc).__name__ + ":", ascii(exc.__str__()))
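# 结巴分词使用 (article title: "Using jieba word segmentation")
# 最新推荐文章于 2018-12-26 10:28:05 发布 (web-page footer captured with the code; not part of the script)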