TF-IDF 用于衡量每个词在文档中的重要程度：它把词频（TF）与逆文档频率（IDF）结合起来，可以过滤掉常见词语，保留重要词语。
在编程中，可以使用 jieba 包自带的 jieba.analyse.extract_tags 方法进行计算。
import jieba.analyse
# Script: segment the text in "testtxt" with jieba, write the de-duplicated
# tokens to "fenciescult", then append the TF-IDF weights of the adjective
# keywords to "tfidf".

# Read the whole corpus as a single string. The earlier per-line loop called
# line.strip() without using its result, so line endings were always kept;
# reading the file in one go produces exactly the same text.
with open(r"testtxt", 'r', encoding="UTF-8") as test:
    test1 = test.read()

# Segment the text and drop duplicate tokens. dict.fromkeys keeps
# first-occurrence order, so the vocabulary file is reproducible between runs
# (the order of list(set(...)) changes with string hash randomization).
fencilist = list(dict.fromkeys(jieba.cut(test1)))

with open(r"fenciescult", 'w', encoding="UTF-8") as f:
    f.writelines(token + '\n' for token in fencilist)

# Compute the weights once. topK is capped by the vocabulary size, so no
# keyword is cut off. allowPOS is passed as a tuple: a bare string is iterated
# character by character, which only works by accident for one-letter tags.
tfidf_weights = dict(
    jieba.analyse.extract_tags(
        test1,
        topK=len(fencilist),
        withWeight=True,
        allowPOS=("a",),
    )
)

# NOTE(review): the file is opened in append mode, so re-running the script
# adds another copy of the results; the word and its weight are written with
# no separator between them. Both are kept as-is to preserve the existing
# output format - confirm whether that is intended.
with open("tfidf", "a", encoding="UTF-8") as out:
    for word, weight in tfidf_weights.items():
        out.write(word + str(weight) + "\n")