# 1、读取文件 (step 1: read the corpus file)
import jieba

# Load the whole corpus into memory as one string; the context manager
# guarantees the file handle is closed even if reading fails.
with open('china145.txt', 'r', encoding='utf-8') as f:
    renmin = f.read()
# 2、分词 (step 2: word segmentation)
# NOTE(review): jieba.load_userdict() normally expects a "word [freq] [tag]"
# dictionary file, not the raw corpus itself — confirm that 'china145.txt'
# is really the intended user dictionary here.
jieba.load_userdict('china145.txt')

# Segment the corpus in precise (non-full) mode; cut() yields tokens lazily.
word_list = jieba.cut(renmin, cut_all=False)

# Frequency table: word -> number of occurrences. dict.get with a default
# replaces the original four-line if/else counting pattern.
wd = {}
for word in word_list:
    wd[word] = wd.get(word, 0) + 1
# Snapshot the keys first: we mutate wd while iterating, so we must not
# iterate the live dict view.
ci = list(wd.keys())

# Remove useless words. The stopword file is Chinese text, so open it with
# an explicit UTF-8 encoding (the original relied on the platform default,
# which can fail to decode). Splitting into a set gives exact-match lookup;
# the original `word in stopword` was a substring scan over the whole file
# text and silently dropped any word that happened to appear inside another.
with open('stopword.txt', 'r', encoding='utf-8') as ft:
    stopword = set(ft.read().split())

for word in ci:
    # Drop rare words (<5 hits), single characters, stopwords, and any
    # word containing the character "一".
    if wd[word] < 5 or len(word) < 2 or word in stopword or "一" in word:
        wd.pop(word)

# 打印新的词典 — show the cleaned frequency table.
print(wd)
# 数个数 — order words by descending frequency.
ci, num = list(wd.keys()), list(wd.values())

# (count, word) pairs, highest count first. One sorted(..., reverse=True)
# call replaces the original append-loop + sort() + reverse() three-step.
data = sorted(zip(num, ci), reverse=True)

# Guard the debug print: the original indexed data[0] unconditionally and
# raised IndexError whenever the filtered table came out empty.
if data:
    print(len(data), data[0], data[0][0], data[0][1])

# Rebuild word -> count in descending-frequency order (dict preserves
# insertion order in Python 3.7+).
wd_sorted = {word: count for count, word in data}
print(wd_sorted)
# 3、词云 (step 3: word cloud)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Shape mask: the word cloud is drawn only inside the non-white region
# of the heart image.
mask = np.array(Image.open('heart.png'))

# A Chinese-capable font is required, otherwise CJK glyphs render as boxes.
font = r'c:\windows\fonts\simfang.ttf'

# Build the cloud directly from the word -> count mapping.
cloud = WordCloud(
    background_color="white",
    mask=mask,
    font_path=font,
).generate_from_frequencies(wd)

plt.imshow(cloud)
plt.axis("off")
plt.show()
# 【其他方法】(alternative approach)
# 分词:利用jieba的方式有所不同 (segmentation: a different way of using jieba)
import jieba

# Use a context manager so the file handle is closed (the original leaked
# the handle returned by open()).
with open("file.txt", "r+", encoding="utf-8") as fh:
    text = fh.read()

# BUG FIX: the original did `txt = [].append(word)` inside a loop.
# list.append returns None, so txt ended up as None and the counting loops
# below crashed with TypeError. jieba.lcut already returns the token list
# we want to count, so use it directly.
txt = jieba.lcut(text)
print(txt)

# Initialise every token's count to zero...
pin = {}
for token in txt:
    pin[token] = 0
print(pin)

# ...then tally occurrences in a second pass, as in the original.
for token in txt:
    pin[token] = pin[token] + 1
print(pin)