import sys, jieba
import jieba.posseg as psg
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
# After the first run, the segmentation pass below is commented out (its output is cached in dm_out.txt) to save time.
# with open('/Users/guoguojia/Desktop/dmbj.txt', 'r') as f:
# dm_txt = f.read()
#
# dm_words_with_attr = [] # 南派三叔 nr
# for x in psg.cut(dm_txt): # psg.cut jieba的分词
# if len(x.word) > 2:
# dm_words_with_attr.append((x.word, x.flag))
#
# print(len(dm_words_with_attr))
# with open('/Users/guoguojia/Desktop/dm_out.txt', 'w+') as f:
# for x in dm_words_with_attr:
# f.write(f'{x[0]}\t{x[1]}\n')
# Load the cached (word, POS-flag) pairs written by the (now commented-out)
# segmentation pass above; one "word<TAB>flag" pair per line.
dm_words_with_attr = []
# Explicit UTF-8: the file holds Chinese text, so relying on the platform
# default encoding would break on non-UTF-8 locales.
with open('/Users/guoguojia/Desktop/dm_out.txt', 'r', encoding='utf-8') as f:
    for line in f:  # iterate lazily instead of materializing readlines()
        pair = line.split()
        if len(pair) < 2:
            continue  # skip blank or malformed lines
        dm_words_with_attr.append((pair[0], pair[1]))
# POS flags to exclude from the cloud (English tokens, numerals, verbs,
# pronouns, conjunctions, adverbs, time words, etc. — jieba tag codes).
stop_attr = ['eng', 'm', 'v', 'r', 'l', 'rr', 'c', 'n', 'd', 'i', 't', ]
# Keep only the words whose POS flag is not filtered out.
# NOTE(review): the original also built a word->flag dict here, but that dict
# was reassigned further down before ever being read, so the dead loop is
# removed.
words = [x[0] for x in dm_words_with_attr if x[1] not in stop_attr]
# Extract the top-n (500) most frequent words as (word, count) pairs.
c = Counter(words).most_common(500)
print(c)
# Uncomment to dump just the top words to a file:
# with open('/Users/guoguojia/Desktop/most.txt', 'w+') as f:
#     for x in c:
#         f.write(f'{x[0]}\n')
# word -> frequency mapping consumed by generate_from_frequencies below.
# dict() on the (word, count) pairs replaces the original manual loop.
attr_dict = dict(c)
# Mask image that defines the word cloud's shape (and, below, its colours).
img = np.array(Image.open('/Users/guoguojia/Desktop/12.jpg'))
wc = WordCloud(
background_color="white", # background colour
mask=img,
max_words=500, # maximum number of words displayed
font_path="/System/Library/Fonts/Supplemental/Arial Unicode.ttf", # font with CJK glyph coverage
min_font_size=10,
max_font_size=50,
scale=2, # upscaling factor; larger values render a sharper image
# width=1680, # canvas width
# height=1050,
random_state=50,
relative_scaling=False,
).generate_from_frequencies(attr_dict)
# Colour generator sampling the mask image's pixel colours.
image_color = ImageColorGenerator(img)
# Recolour each word to match the underlying region of the mask image.
wc.recolor(color_func=image_color)
plt.figure() # create the figure
plt.imshow(wc, interpolation="bilinear")
# Hide the axes.
plt.axis("off")
plt.show()
# Save the rendered cloud to disk.
wc.to_file('/Users/guoguojia/Desktop/14.jpg')
# scale=2: upscales relative to the source image; larger values give a sharper result, only visible in the saved file.