在 Python 中,可以直接调用现成的库(如 wordcloud)来生成词云图。
首先把数据获取下来。我用 CSV 文件存储,每一行就是一条数据。
获取到的数据噪声较多,需要先分词,再去除停用词。
# Load the raw data to be cleaned (the file name is a placeholder; the
# author stores the data as CSV).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is the replacement that keeps the same behaviour:
# malformed rows are skipped and reported instead of aborting the read.
df = pd.read_csv('你的文件名字,我用csv', on_bad_lines='warn')
# Build the stop-word list
def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file to read.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace (a blank line yields ``''``).
    """
    # The context manager closes the handle even if reading fails; the
    # previous version opened the file and never closed it.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
stopwords = stopwordslist(r'去停用词.txt')
# Membership is tested once per token below, so use a set for O(1) lookups.
# `stopwords` itself is left as the list returned by stopwordslist().
stopword_lookup = set(stopwords)

# Reset the index to 0..n-1 so that word[0] below addresses the first row.
df.reset_index(drop=True, inplace=True)
content = df['text']  # the text to analyse is read from the 'text' column
# NOTE(review): psg.cut presumably expects str; rows with missing text may
# need to be dropped or converted first - confirm against the real data.


def worker(s):
    """Segment *s* with psg.cut and return the `.word` of each item."""
    return [x.word for x in psg.cut(s)]


word = content.apply(worker)  # custom segmentation: one token list per row
print(word[0])  # print the first row's tokens as a quick check

# Drop stop words and tab tokens. Every kept token is followed by one space,
# exactly as before, but the string is assembled in a single join instead of
# repeated `+=` concatenation.
outstr = ''.join(
    token + " "
    for tokens in word
    for token in tokens
    if token not in stopword_lookup and token != '\t'
)
然后再调用 matplotlib 和 wordcloud 生成并显示词云图即可。
以下是完整源码
import json
import os
import pandas as pd
import re
import jieba
import jieba.posseg as psg
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Load the raw data to be cleaned (the file name is a placeholder; the
# author stores the data as CSV).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is the replacement that keeps the same behaviour:
# malformed rows are skipped and reported instead of aborting the read.
df = pd.read_csv('你的文件名字,我用csv', on_bad_lines='warn')
# Build the stop-word list
def stopwordslist(filepath):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Args:
        filepath: Path of the stop-word file to read.

    Returns:
        A list containing every line of the file with surrounding
        whitespace stripped (a blank line becomes ``''``).
    """
    # Use a context manager so the file is closed deterministically; the
    # previous version leaked the open handle.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [entry.strip() for entry in handle]
stopwords = stopwordslist(r'去停用词.txt')
# A set makes the per-token stop-word test O(1); the `stopwords` list itself
# is left unchanged.
stopword_lookup = set(stopwords)

# Reset the index to 0..n-1 so that word[0] below addresses the first row.
df.reset_index(drop=True, inplace=True)

content = df['text']  # the text to analyse is read from the 'text' column
# NOTE(review): psg.cut presumably expects str; rows with missing text may
# need to be dropped or converted first - confirm against the real data.


def worker(s):
    """Segment *s* with psg.cut and return the `.word` of each item."""
    return [x.word for x in psg.cut(s)]


word = content.apply(worker)  # one token list per row
print(word[0])  # print the first row's tokens as a quick check

# Drop stop words and tab tokens. Every kept token is followed by one space,
# exactly as before, but the string is assembled in a single join instead of
# repeated `+=` concatenation.
outstr = ''.join(
    token + " "
    for tokens in word
    for token in tokens
    if token not in stopword_lookup and token != '\t'
)
# Build the word-cloud object from the space-separated tokens in `outstr`.
# The font path is a raw string: as a normal literal, `\W`, `\F`, `\M` and
# `\m` are invalid escape sequences (a SyntaxWarning on Python 3.12+). The
# resulting path value is unchanged.
# NOTE(review): the font is presumably chosen so Chinese glyphs render;
# confirm this path exists on the target machine.
wc = WordCloud(
    font_path=r"C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc",
    width=500,
    height=400,
    mode="RGBA",
    background_color='white',
).generate(outstr)
wc.to_file('词云图.png')

# Display the word cloud without axes.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()