# Pitfalls reference: https://blog.csdn.net/heyuexianzi/article/details/76851377
# https://mp.weixin.qq.com/s/0Bw8QUo1YfWZR_Boeaxu_Q
# Clear, detailed walkthrough of the logic: https://www.cnblogs.com/delav/p/7845539.html
# -*- coding: utf-8 -*-
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as pl
# Load the custom segmentation dictionary (for example a domain-specific
# lexicon downloaded from Sogou) before any text is cut.
jieba.load_userdict("news.txt")

# Read the text to be analysed.
import codecs

# The context manager closes the file even if reading raises.
with codecs.open('laojiumen.txt', 'r', encoding='utf-8') as text_file:
    content = text_file.read()

# Register an extra word with jieba's dictionary.
jieba.add_word(u'二月红')
# Remove stop words.
words_list1 = []
# cut_all=False selects jieba's accurate mode; the result is a lazy iterator.
word_generator = jieba.cut(content, cut_all=False)

# outwords.txt holds the stop words, one word per line.
with open('outwords.txt', 'r', encoding='utf-8') as f:
    unicode_text = f.read()

# Compare whole words against a set. A substring search in the raw file text
# would also reject any token that merely appears inside a longer stop word.
stopwords = {line.strip() for line in unicode_text.splitlines()}
stopwords.discard('')

for word in word_generator:
    token = word.strip()
    # Skip whitespace-only tokens and anything listed as a stop word.
    if token and token not in stopwords:
        words_list1.append(word)
# Synonym replacement: each key is rewritten to its value so that both forms
# are counted as the same word. Add further pairs here as needed.
synonyms = {"二爷": "二月红"}

# Single-character tokens are discarded; everything else is kept, with
# synonyms replaced.
words_list = [synonyms.get(seg, seg) for seg in words_list1 if len(seg) > 1]

# Convert the list into one space-separated string.
words_list = ' '.join(words_list)
# Word-frequency statistics.
from collections import Counter

# words_list is a single space-separated string at this point, so it has to be
# split back into words; iterating over the string itself would count
# individual characters (spaces included) instead of words.
segStat = dict(Counter(words_list.split()))
print(segStat)
# Build and display the word cloud.
wc = WordCloud(
    background_color='white',  # background colour of the image
    max_words=1000,  # maximum number of words to draw
    max_font_size=100,  # largest font size used for any word
    # A font containing Chinese glyphs is needed, otherwise the words render
    # as empty boxes; another font from C:/Windows/Fonts/ can be substituted.
    font_path="C:/Windows/Fonts/SimHei.ttf",
    random_state=42,  # fixed seed so repeated runs give the same result
    scale=2,  # scaling factor between layout computation and drawing
    # mask, width and height are not passed, so the library defaults apply.
)
wc.generate(words_list)
pl.imshow(wc)
pl.axis("off")
pl.show()
# Draw the word cloud using a base image as both the mask and colour source.
import numpy as np
import PIL.Image as Image

coloring = np.array(Image.open("C:/Users/Administrator/Pictures/pkq.jpg"))
wc = WordCloud(
    background_color='white',  # background colour of the image
    max_words=1000,  # maximum number of words to draw
    mask=coloring,  # when a mask is given, width and height are ignored
    max_font_size=100,  # largest font size used for any word
    # A font containing Chinese glyphs is needed, otherwise the words render
    # as empty boxes; another font from C:/Windows/Fonts/ can be substituted.
    font_path="C:/Windows/Fonts/SimHei.ttf",
    random_state=42,  # fixed seed so repeated runs give the same result
    scale=2,  # scaling factor between layout computation and drawing
)
wc.generate(words_list)

# Take the word colours from the base image. recolor() changes wc in place,
# so a single imshow call is enough to display the recoloured cloud.
image_colors = ImageColorGenerator(coloring)
wc.recolor(color_func=image_colors)
pl.imshow(wc)
pl.axis("off")
pl.show()