import re
import jieba
import jieba.analyse # 提取关键内容
from pymongo import MongoClient
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
from pylab import *
from wordcloud import WordCloud
mpl.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei font so Chinese labels render correctly
mpl.rcParams['axes.unicode_minus'] = False # render the minus sign correctly when a CJK font is active
def load_file():
    """Run the whole pipeline over the five Taoguba collections.

    For each collection ``List1`` .. ``List5``: pull every article from
    MongoDB, strip punctuation/latin characters/stop characters, segment
    it with jieba, print per-article and per-collection keyword weights,
    then save a bar chart and a word cloud named after the collection.

    Side effects: reads MongoDB at localhost:27017, loads a user
    dictionary from disk, writes PNG/JPG files via make_pic / w_cloud.
    """
    jieba.load_userdict("C:/Lib/dict_lzf.txt")  # external custom dictionary
    # Chinese + ASCII punctuation to strip; hoisted out of the loops
    # (the original rebuilt it for every article).
    punct = '[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
    client = MongoClient('localhost', 27017)
    try:
        db = client['Taoguba']
        for n in range(5):  # fix: the original reused `i` for both loops
            db_name = 'List' + str(n + 1)
            collection = db[db_name]
            cleaned_articles = []
            for doc in collection.find():
                text = doc["Content"]
                text = re.sub(punct, '', text)
                text = re.sub('[a-zA-Z0-9]', '', text)  # drop latin letters and digits
                segmented = cut_package(stop_dict(text))
                cleaned_articles.append(segmented)
                print("----每一篇词频发布如下-----")
                find_(str(segmented))
            print("---------所有文章词频发布如下---------")
            label, weight = find_(str(cleaned_articles))
            make_pic(db_name, label, weight)
            w_cloud(db_name, str(cleaned_articles))
    finally:
        client.close()  # fix: the original leaked the MongoDB connection
def stop_dict(news):
    """Remove stop characters from *news* and return the filtered text.

    NOTE(review): the stopword file is read as one big string, so the
    filter is per-character — any character that appears anywhere in the
    file is dropped, not whole stopwords. That original behavior is
    preserved here.

    :param news: cleaned article text
    :return: *news* with stop characters removed
    """
    # Fix: context manager closes the file (the original leaked the
    # handle); a set gives O(1) membership instead of scanning the whole
    # file text for every character.
    with open("C:/Anaconda/stopworld.txt", 'r', encoding='utf-8') as fh:
        stop_chars = set(fh.read())
    return ''.join(ch for ch in news if ch not in stop_chars)
def cut_package(news):
    """Segment *news* with jieba and return a space-joined string.

    cut_all=False selects jieba's precise mode (also the default); the
    commented-out full / search-engine modes in the original were dead
    code and have been removed.

    :param news: text with punctuation and stop characters already removed
    :return: segmented words joined by single spaces
    """
    # join consumes the generator directly — no need to materialize a list
    return ' '.join(jieba.cut(news, cut_all=False))
def find_(item):
    """Extract the top-10 TF-IDF keywords of *item* and print their weights.

    withWeight=True makes jieba return (word, weight) pairs; allowPOS is
    left at its default (no part-of-speech filtering).

    :param item: text to analyse
    :return: (words, weights) — two parallel lists, highest weight first
    """
    pairs = jieba.analyse.extract_tags(item, topK=10, withWeight=True)
    words = []
    weights = []
    # tuple unpacking instead of the original's i[0]/i[1] indexing
    for word, weight in pairs:
        words.append(word)
        weights.append(weight)
        print("%s 的词频为:%s" % (word, weight))
    return words, weights
def make_pic(name, lable, x):
    """Draw a horizontal bar chart of keyword weights and save it as PNG.

    :param name: collection name, used in the output file name
    :param lable: keyword labels for the y axis
    :param x: keyword weights (bar lengths); must be non-empty
    """
    # Fix: open a fresh figure — this is called once per collection and
    # the original drew into the same implicit axes every time, so later
    # charts accumulated the bars of earlier ones.
    plt.figure()
    idx = np.arange(len(x))
    color = cm.jet(np.array(x) / max(x))  # color bars by relative weight
    plt.barh(idx, x, color=color)
    plt.yticks(idx + 0.4, lable)
    plt.grid(axis='x')
    plt.xlabel('出现频率')
    plt.ylabel('标签')
    plt.title('文档中出现频率最高的前十个词')
    # plt.savefig instead of the bare pylab `savefig` star import
    plt.savefig("C:/python/web/ %s.png" % name)
    plt.close()  # release the figure's memory
def w_cloud(name, seg):
    """Render a word cloud from the segmented text *seg*, save it to
    C:/python/ <name>.jpg and display it on screen (blocking)."""
    cloud = WordCloud(font_path="C:/Anaconda/font/GB2312.ttf",
                      background_color="black",
                      width=2300,
                      height=1000,
                      max_font_size=1000)
    cloud.generate(seg)
    cloud.to_file(r"C:/python/ %s.jpg" % name)
    plt.figure("词云图")
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
def emo_pic(name, data):
    """Plot a sentiment share pie chart (negative vs positive) and save it.

    :param name: output file name stem
    :param data: two-element sequence — (negative fraction, positive fraction)
    """
    plt.subplot(aspect=1)
    plt.pie(data,
            explode=(0, 0.08),       # pull the positive wedge out slightly
            labels=('消极', '积极'),
            colors=['yellow', 'red'],
            autopct='%.0f%%',        # whole-percent labels (%1.1f%% for one decimal)
            shadow=True,
            radius=1)
    plt.legend()
    plt.axis('equal')                # keep the pie circular
    plt.savefig("C:/python/ %s.png" % name)
    plt.show()
def main():
    """Entry point: run the full load-clean-analyse-plot pipeline."""
    load_file()


if __name__ == '__main__':
    main()
# Wordcloud text word-cloud chart
# (blog scrape footer: "latest recommended article published 2024-09-16 08:00:00")