"""Build a Chinese word cloud from a file of user comments.

Pipeline: load comments -> drop missing/duplicate rows -> segment with
jieba -> filter stop words -> count word frequencies -> render a word
cloud shaped and coloured by a background image, then save it to disk.
"""
import time
from collections import Counter
from os import path

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import ImageColorGenerator, WordCloud

# time.clock() was removed in Python 3.8; perf_counter() is the
# recommended wall-clock replacement for elapsed-time measurement.
start = time.perf_counter()

# Load the raw comments (one comment per line) and drop missing rows.
data = pd.read_table('D:/meidi.txt', header=None)
data = data.dropna()
data.describe()  # quick summary of the data; result intentionally discarded

# Remove duplicated comments so repeated posts don't skew frequencies.
dat_drop = data.drop_duplicates()

# Segment every comment into a list of words with jieba.
dat_fc = [list(jieba.cut(comment)) for comment in dat_drop[0]]

# Load the stop-word list; also treat single spaces and empty strings as
# stop words.  A set gives O(1) membership tests in the filter below.
stop_word = pd.read_table('D:/stopwords.txt', sep=r'\s+', encoding='utf-8',
                          header=None, engine='python')
stop = set(stop_word[0]) | {' ', ''}

# Drop stop words from each segmented comment.
comment_word = [[w for w in words if w not in stop] for words in dat_fc]

# Word -> frequency.  NOTE: the original added i.count(j) once per
# occurrence of j, inflating each per-comment count to its square;
# Counter.update accumulates the true occurrence counts.
pos_text = Counter()
for words in comment_word:
    pos_text.update(words)

# Background image used both as the mask shape and as the colour source.
# scipy.misc.imread was removed from SciPy; plt.imread returns the same
# kind of ndarray that WordCloud expects for its mask.
alice_coloring = plt.imread(path.join("E:/", "20160303160528046.jpg"))

# Configure the word cloud renderer.
wc = WordCloud(
    font_path='E:/font/simhei.ttf',   # a CJK-capable font is required for Chinese
    background_color="black",
    max_words=2000,                   # cap on the number of rendered words
    random_state=42,                  # reproducible layout
    mask=alice_coloring,              # shape the cloud like the background image
)

# Render from the frequency table.
wc_pos = wc.generate_from_frequencies(pos_text)

# Recolour the words from the background image and display the result.
image_colors = ImageColorGenerator(alice_coloring)
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.show()

# Persist the rendered cloud.
wc.to_file(path.join("D:/", "test2.png"))

stop = time.perf_counter()
print("Cost time is : %0.3f s" % (stop - start))  # total elapsed run time
# 底图 (base image used as the mask)
# 效果图 (rendered word-cloud result)