本篇文章讲pyecharts和wordcloud两种方式画词云图,代码如下:
pyecharts 词云图要求的输入数据是由 (词, 词频) 元组组成的列表,形如:
[('REX', 54), ('CASH+CARRY', 54), ('JUMBO', 824), ('SHOPPER', 147), ('BAG', 1951), ('VINTAGE', 1325), ('LEAF', 134), ('PEACE', 11), ('WOODEN', 507), ('BLOCK', 109)]
所以先把字符串 describe_document(其构造方式见下文 wordcloud 部分的"文本拼接")按空格切分并统计词频,得到"词 → 出现次数"的字典,然后再用 list(字典.items()) 转为 pyecharts 需要的数据格式。
from pyecharts.charts import WordCloud
# Tally how often each whitespace-separated token occurs in the text.
# Counter is a dict subclass, so .items() yields (word, count) pairs in
# first-seen order, the same result a manual dict tally produces.
from collections import Counter

frequency = Counter(describe_document.split())

# pyecharts takes its data as a list of (word, count) tuples.
word_list = list(frequency.items())
word_list  # bare expression: shows the pairs when it ends a notebook cell

# WordCloud here is the pyecharts chart class imported above.
mywordcloud = WordCloud()
mywordcloud.add('', word_list, shape='circle')
# Render the chart inline
mywordcloud.render_notebook()
wordcloud 库的 generate 函数可以直接对字符串文本进行分词,因此把拼接好的文本直接传入即可:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Sample the data. The size is capped at len(data) because sampling without
# replacement raises ValueError when more rows are requested than exist.
# NOTE(review): assumes `data` is a pandas DataFrame — confirm.
data = data.sample(min(20000, len(data)), random_state=22)
# Join the text column into one space-separated string.
# NOTE(review): 'describe_cutted' presumably holds already-segmented text
# with no missing values — verify against where the column is built.
describe_document = " ".join(data['describe_cutted'])
fig = plt.figure(figsize=(20, 10))
# Create the word-cloud object (this WordCloud is the one from the
# `wordcloud` package imported above, not the pyecharts chart class).
wordcloud = WordCloud(
    background_color="white",
    random_state=30,
    scale=2,
    collocations=False,  # count single words only, no two-word phrases
)
# Generate the word cloud directly from the raw string.
wordcloud.generate(describe_document)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()