# 一:《红楼梦》高频中文词语统计
import jieba
from collections import Counter

# Section 1: print the 15 most frequent multi-character words of the novel.

# NOTE(review): no encoding is passed, so the platform default is used;
# confirm the encoding of this file and pass it explicitly if it differs.
with open('红楼梦.txt', 'r') as f:
    txt = f.read()

# jieba exposes cut()/lcut(); there is no jieba.icut, so the original call
# failed with AttributeError. lcut() returns the segmented tokens as a list.
words = jieba.lcut(txt)

# Count every token except single-character ones.
counts = Counter(word for word in words if len(word) != 1)

# (word, frequency) pairs ordered by descending frequency.
items = counts.most_common()

# Slicing instead of indexing avoids IndexError when the text yields fewer
# than 15 distinct words.
for word, count in items[:15]:
    print('{0:<15},{1:>5}'.format(word, count))
# 二:《红楼梦》人物出场统计
import jieba
from collections import Counter

# Section 2: print the 10 most frequent multi-character words after removing
# a hand-picked list of frequent words that should not appear in the ranking.

# Words dropped from the frequency table before ranking.
excludes = [
    '什么', '一个', '我们', '那是', '你们', '如今',
    '说到', '知道', '老太太', '起来', '姑娘', '这里',
    '出来', '他们', '众人', '自己', '一面', '太太',
    '只见', '怎么', '奶奶', '两个', '没有', '不是',
    '不如', '这个', '听见',
]

with open('hongloumeng.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# jieba exposes cut()/lcut(); there is no jieba.icut, so the original call
# failed with AttributeError. As before, `text` is rebound to the token list.
text = jieba.lcut(text)

# Count every token except single-character ones.
count = Counter(token for token in text if len(token) != 1)

# pop() with a default tolerates excluded words that never occur in the
# text; a plain `del` would raise KeyError for those.
for word in excludes:
    count.pop(word, None)

# (word, frequency) pairs ordered by descending frequency.
item = count.most_common()

# Slicing instead of indexing avoids IndexError when fewer than 10 distinct
# words remain.
for w, c in item[:10]:
    print('{0:<20}{1:>10}'.format(w, c))
# 三:《红楼梦》人物出场词云
import jieba
from wordcloud import WordCloud

# Section 3: segment the novel with jieba and render a word cloud image,
# leaving out a hand-picked list of frequent words.

# Words passed to WordCloud as stop words.
excludes = [
    '什么', '一个', '我们', '那是', '你们', '如今',
    '说到', '知道', '老太太', '起来', '姑娘', '这里',
    '出来', '他们', '众人', '自己', '一面', '太太',
    '只见', '怎么', '奶奶', '两个', '没有', '不是',
    '不如', '这个', '听见',
]

with open('hongloumeng.txt', 'r', encoding='utf-8') as file:
    txt = file.read()

# jieba exposes cut()/lcut(); there is no jieba.icut, so the original call
# failed with AttributeError. As before, `txt` is rebound to the token list.
txt = jieba.lcut(txt)

# WordCloud.generate() takes one string and splits it into words itself,
# so the segmented tokens are joined with spaces.
words = ' '.join(txt)

wordcloud = WordCloud(
    # The keyword is `background_color`; the original `background=` is not
    # a WordCloud parameter and raised TypeError.
    background_color='white',
    width=800,
    height=600,
    font_path='msyh.ttc',
    max_words=200,
    max_font_size=80,
    # WordCloud documents `stopwords` as a set of strings.
    stopwords=set(excludes),
).generate(words)

wordcloud.to_file('红楼梦基本词云.png')