工具
- Win7
- Python3.6.2
- 使用到的库 wordcloud jieba re matplotlib
开启项目
- 安装好jupyter后,执行
jupyter notebook
- 点击新建文件
相关代码
一般的词云统计
%matplotlib inline #执行才能在线查看图片
import re
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Read the blog text; `with` guarantees the file handle is closed (the
# original used a bare open() and leaked the handle).
with open('bobocode.txt', encoding="utf-8") as f:
    txt = f.read()

# Keep only runs of CJK characters (U+4E00..U+9FA5), dropping English words
# and punctuation. BUG FIX: the original pattern r'[u4e00-u9fa5]+' was
# missing the backslashes, so it matched literal characters like 'u' and
# digits instead of Chinese text.
new_txt = re.findall(r'[\u4e00-\u9fa5]+', txt)

# Concatenate the fragments, skipping any fragment already contained in the
# accumulated text.
# NOTE(review): substring-based dedup also suppresses repeats, which removes
# the frequency information a word cloud normally visualizes — confirm this
# is intentional.
txt_list = ''
for fragment in new_txt:
    if fragment not in txt_list:
        txt_list += fragment

seg_list = jieba.cut(txt_list)   # segment the Chinese text into words
new_text = " ".join(seg_list)    # WordCloud expects space-separated tokens

# Render with a font that covers CJK glyphs, then display and save.
wordcloud_txt = WordCloud(font_path="DroidSansFallbackFull.ttf").generate(new_text)
plt.imshow(wordcloud_txt)
wordcloud_txt.to_file('pic.png')  # save the rendered image
plt.axis("off")
plt.show()
输出结果
带有遮罩图片的词云
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud
# Custom color function for WordCloud.recolor(): each word is painted a
# random light shade of grey. The signature follows the wordcloud
# `color_func` contract; the positional details and random_state are
# accepted but unused (the module-level `random` supplies the shade).
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    lightness = random.randint(60, 100)
    return "hsl(0,0%,{}%)".format(lightness)
# Load the mask image as an array; the cloud is drawn only inside the
# non-white region of the mask.
mask = np.array(Image.open("lz.jpg"))

# Build the masked cloud. `random_state=1` makes the layout reproducible.
# NOTE(review): `new_text` must already exist from the earlier cell —
# confirm the notebook cells run in order.
result_word = WordCloud(font_path="DroidSansFallbackFull.ttf", max_words=2000,
                        mask=mask, margin=10,
                        random_state=1).generate(new_text)

# Snapshot the default-colored rendering before recoloring in place.
default_color = result_word.to_array()

# First figure: the grey-recolored cloud.
plt.title("Custom colors")
plt.imshow(result_word.recolor(color_func=grey_color_func, random_state=5))
result_word.to_file("site.png")  # save the recolored image
plt.axis("off")

# Second figure: the original colors, with a Chinese title.
plt.figure()
# BUG FIX: set the CJK-capable font BEFORE creating the title. matplotlib
# Text objects resolve their font from rcParams at creation time, so the
# original order (title first, rc after) rendered the Chinese characters
# as empty boxes.
plt.rc('font', family='SimHei', size=13)
plt.title(u'bobocode博客词频统计')
plt.imshow(default_color)
plt.axis("off")
输出结果
总结
通过以上可以看出我的博客频繁出现的词语,侧重哪个方面一目了然