import wordcloud
import re
import jieba
import pandas as pd
import numpy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Query-string suffixes used in the source file names: all reviews, then
# high / medium / low rating buckets (Douban's &percent_type parameter).
percent_types = ['', '&percent_type=h', '&percent_type=m', '&percent_type=l']

for j in range(4):
    # Read the raw review text for this bucket (the .txt file can be swapped out).
    with open('影评{}'.format(percent_types[j]) + '.txt', "r", encoding='utf-8') as f:
        data = f.readlines()

    # Collapse all lines into one string, dropping the trailing newlines.
    # str.join is O(n), unlike repeated `col = col + k` concatenation.
    col = ''.join(line.strip('\n') for line in data)

    # Keep only Chinese characters (CJK Unified Ideographs block).
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, col)
    cleanedcol = ''.join(filterdata)

    # Segment the cleaned text into words with jieba.
    segment = jieba.lcut(cleanedcol)
    words_df = pd.DataFrame({'segment': segment})

    # Load the stop-word list (one word per line, GBK-encoded) and drop any
    # segmented word that appears in it.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='gbk')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count each word's frequency ("计数" column) and sort descending.
    words_stat = words_df.groupby(by=['segment'])['segment'].agg([("计数", numpy.size)])
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # Take the 1000 most frequent words as a {word: count} mapping —
    # exactly the shape WordCloud.fit_words expects, no intermediate
    # list-of-tuples round trip needed.
    word_fre = {row[0]: row[1] for row in words_stat.head(1000).values}

    # Render the word cloud and save it as a PNG for this bucket.
    # NOTE: `wc` deliberately avoids shadowing the imported `wordcloud` module.
    wc = WordCloud(font_path="simhei.ttf", background_color="white",
                   width=2000, height=1000)
    wc = wc.fit_words(word_fre)
    wc.to_file("F://PycharmDemo/Project/DouBan/temp{}".format(percent_types[j]) + ".png")
# Python word filtering and word-cloud generation
# (scraped blog metadata — latest recommended article published 2024-07-17 09:15:39)