1. Convert a CSV file to a dict, weight the frequent words, and draw a word cloud
import csv
import collections

import jieba.analyse
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# placeholder paths; the original snippet uses these constants without defining them
CSV_FILE_PATH = 'weibo.csv'
STOP_WORDS_FILE_PATH = 'stopwords.txt'

def read_csv_to_dict(index) -> dict:
    """
    Read the CSV data.
    Column layout: 'user id', 'user name', 'gender', 'region', 'birthday', 'weibo id', 'weibo content'
    :param index: which column to read, starting from 0
    :return: dict with the column values as keys and their counts as values
    """
    with open(CSV_FILE_PATH, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        column = [columns[index] for columns in reader]
        print(column)
        dic = collections.Counter(column)  # count how often each element occurs
        # drop empty strings
        if '' in dic:
            dic.pop('')
        print(dic)  # e.g. Counter({'我太难了': 3, '超话粉丝大咖': 2, '#周杰伦演唱会# 抵制黄牛!真正歌迷抢不到': 4})
        return dic
def analysis_sina_content():
    # read the weibo content column
    dic = read_csv_to_dict(6)
    # data cleaning: register the stop-word file so invalid words are dropped
    jieba.analyse.set_stop_words(STOP_WORDS_FILE_PATH)
    # keyword extraction with TextRank
    words_count_list = jieba.analyse.textrank(' '.join(dic.keys()), topK=50, withWeight=True)
    # signature: jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    #   sentence: the text to process
    #   topK: number of keywords to return, default 20
    #   withWeight: whether to return weights along with the words, default False
    #   allowPOS: part-of-speech tags to keep
    print(words_count_list)
    # e.g. [('杭州', 1.0), ('演唱会', 0.9047694519491188), ('抢到', 0.4155709853243528), ('门票', 0.32065316150633894)]
    print(len(words_count_list))  # 50
    # render the word cloud
    word_cloud = (
        WordCloud()
        .add("", words_count_list, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title="周杰伦打榜微博内容分析"))
    )
    word_cloud.render('word_cloud.html')
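The snippet only defines the two functions; nothing calls them. A minimal entry point, assuming CSV_FILE_PATH and STOP_WORDS_FILE_PATH point at real files:

if __name__ == '__main__':
    analysis_sina_content()  # writes word_cloud.html next to the script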
2. Count word frequencies from a pickled dict and generate a word cloud
Making a word cloud in Python roughly breaks down into these steps:
1. Prepare a stop-word text file (a minimal sketch follows this list)
2. Read the text and filter it with the stop words
3. Configure the word-cloud parameters and render the cloud in the shape of an image
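For step 1, the stop-word file is just UTF-8 plain text with one word per line. A minimal sketch that writes such a file (the entries here are arbitrary examples):

# write a tiny stop-word file: one word per line, UTF-8 encoded
stop_words = ['的', '了', '是', '我', '也']  # arbitrary sample entries
with open('./stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(stop_words) + '\n')

The trailing newline matters here: the reading code below does split('\n')[:-1], which silently drops the last element.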
The full code is as follows:
import os
import pickle

import jieba
from wordcloud import WordCloud  # the wordcloud package this time, not pyecharts

'''count word frequencies'''
def statistics(texts, stopwords):
    words_dict = {}
    for text in texts:
        temp = jieba.cut(text)
        for t in temp:
            if t in stopwords or t == 'unknow':  # skip stop words and the 'unknow' placeholder
                continue
            if t in words_dict:
                words_dict[t] += 1
            else:
                words_dict[t] = 1
    return words_dict

'''draw the word cloud'''
def drawWordCloud(words, title, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    wc = WordCloud(font_path='simkai.ttf', background_color='white', max_words=2000,
                   width=1920, height=1080, margin=5)
    wc.generate_from_frequencies(words)
    wc.to_file(os.path.join(savepath, title + '.png'))

with open('python_61.pkl', 'rb') as f:
    data = pickle.load(f)
# build the word cloud
stopwords = open('./stopwords.txt', 'r', encoding='utf-8').read().split('\n')[:-1]
texts = [j[2] for i, j in data.items()]
words_dict = statistics(texts, stopwords)
drawWordCloud(words_dict, 'python相关图书简介词云', savepath='./results')  # title: "word cloud of Python book introductions"
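The script expects python_61.pkl to hold a dict whose values carry the book introduction at index 2 (hence j[2]). The exact layout isn't shown here; a compatible dummy file for testing might look like this (hypothetical structure):

import pickle

# hypothetical layout: {book_id: (title, author, introduction)}
dummy = {
    1: ('Fluent Python', 'Luciano Ramalho', 'A hands-on guide to idiomatic Python.'),
    2: ('Python Cookbook', 'David Beazley', 'Recipes for mastering Python 3.'),
}
with open('python_61.pkl', 'wb') as f:
    pickle.dump(dummy, f)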
3. Count word frequencies from a txt file and draw a word cloud
import imageio
import jieba
from wordcloud import WordCloud

def jieba_cut():
    # load the stop words for Journey to the West
    with open('西游记停用词.txt', 'r', encoding='utf-8') as fr:
        stop_word_list = fr.readlines()
    new_stop_word_list = []
    for stop_word in stop_word_list:
        stop_word = stop_word.replace('\ufeff', '').strip()  # strip the BOM and surrounding whitespace
        new_stop_word_list.append(stop_word)
    print(new_stop_word_list)  # print the cleaned stop words

    # count how often each word appears in the novel
    with open('xyj.txt', 'r', encoding='utf-8') as fr_xyj:
        s = fr_xyj.read()
    words = jieba.cut(s, cut_all=False)
    word_dict = {}
    word_list = ''
    for word in words:
        if len(word) > 1 and word not in new_stop_word_list:
            word_list = word_list + ' ' + word
            if word in word_dict:
                word_dict[word] += 1
            else:
                word_dict[word] = 1

    # sort by frequency, descending
    sort_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sort_words[0:101])  # print the 101 most frequent words (index 0-100)

    # generate the word cloud in the shape of the mask image
    color_mask = imageio.imread("1.png")
    wc = WordCloud(
        background_color="black",  # background color
        max_words=500,             # maximum number of words shown
        font_path="D:/软件(学习)/Python/PyCharm/font/simsun.ttc",  # font to use
        min_font_size=15,
        max_font_size=50,
        width=400,                 # canvas width
        height=860,                # canvas height
        mask=color_mask)           # mask that defines the shape
    wc.generate(word_list)
    wc.to_file("why.png")

jieba_cut()
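A note on the mask: the wordcloud package only draws words where the mask image is not pure white, so 1.png should be a dark shape on a white background. A quick way to inspect the mask, assuming Pillow and numpy are installed:

import numpy as np
from PIL import Image

mask = np.array(Image.open('1.png'))  # equivalent to the imageio.imread call above
print(mask.shape, mask.dtype)         # white (255) regions are left blank in the cloud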
The mask image that gives the cloud its shape: 1.png
The final generated word cloud: