The previous post drew a simple word cloud; this post works through a few improvements.
from wordcloud import WordCloud
import jieba
import imageio
with open('threekingdom.txt', 'r', encoding='utf-8') as f1:
    content = f1.read()
words = jieba.lcut(content)
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "如何", "东吴",
            "商议", "军马", "引兵", "次日", "主公", "左右", "大喜", "军士", "天下",
            "今日", "于是", "不敢", "魏兵", "都督", "陛下", "玄德曰", "孔明曰"}
counts = {}  # word -> frequency dictionary
for word in words:
    if len(word) <= 1:
        continue  # skip single-character tokens
    counts[word] = counts.get(word, 0) + 1
# merge synonyms into a single canonical name
counts["孔明"] = counts["孔明"] + counts["孔明曰"]
counts["玄德"] = counts["玄德"] + counts["玄德曰"] + counts["刘备"]
counts["关公"] = counts["关公"] + counts["云长"]
del counts["刘备"], counts["云长"]  # already merged above, so drop them to avoid double counting
# remove unwanted high-frequency words ("玄德曰" and "孔明曰" are covered by excludes)
for word in excludes:
    counts.pop(word, None)  # pop avoids a KeyError if a word is absent
# print(counts)
# convert the dict into a list of (word, frequency) tuples so it can be sorted
new_word_list = list(counts.items())
# print(new_word_list)
# sort by frequency, descending
new_word_list.sort(key=lambda x: x[1], reverse=True)
text_list = []
for x in range(10):  # keep the ten most frequent words
    # print(new_word_list[x])
    role, freq = new_word_list[x]  # tuple unpacking
    print(role, freq)
    # repeat each word by its frequency so WordCloud sizes it accordingly
    for j in range(freq):
        text_list.append(role)
print(text_list)
text = " ".join(text_list)
mask = imageio.imread("china.jpg")
wc = WordCloud(
    width=1000,
    height=800,
    background_color='white',
    font_path="MSYH.TTC",
    mask=mask,
    collocations=False
).generate(text).to_file("三国词云.png")
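As an aside, repeating each word by its frequency is only one way to feed WordCloud. The library also offers generate_from_frequencies, which accepts a word-to-count dict directly, so the repeat-and-join step can be skipped entirely. A minimal sketch reusing the variables above (top_counts and wc2 are names I made up):

# keep only the ten most frequent words (top_counts is a hypothetical name;
# new_word_list is the sorted list built earlier)
top_counts = dict(new_word_list[:10])
wc2 = WordCloud(
    width=1000,
    height=800,
    background_color='white',
    font_path="MSYH.TTC",
    mask=mask,
).generate_from_frequencies(top_counts)
wc2.to_file("三国词云2.png")

Note that collocations only matters for generate(), which tokenizes raw text; generate_from_frequencies works on the counts directly, so the parameter is not needed here.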
The main improvements in the code above are: deleting high-frequency words I do not need, merging synonyms into one canonical name, and then removing the merged duplicates from the counts. Note the collocations parameter of WordCloud (left at its default, it also counts two-word phrases, which can make a word appear twice in the cloud), the conversion of the dict into a list of tuples for sorting, and the join function; all three are worth studying.
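To make the tuple conversion, the lambda sort, and join concrete, here is a tiny self-contained demo with made-up counts:

# toy word counts (made-up data, just to trace the pipeline)
demo_counts = {"甲": 3, "乙": 1, "丙": 2}
pairs = list(demo_counts.items())             # [('甲', 3), ('乙', 1), ('丙', 2)]
pairs.sort(key=lambda x: x[1], reverse=True)  # sort by count, descending
# repeat each word by its count, then join with spaces
text = " ".join(word for word, freq in pairs for _ in range(freq))
print(text)  # 甲 甲 甲 丙 丙 乙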