**TF-IDF算法:**
TF-IDF(词频-逆文档频率)是信息检索中衡量一个词语重要程度的统计指标。公式为:$\mathrm{TF\text{-}IDF}(t,d)=\mathrm{TF}(t,d)\times\mathrm{IDF}(t)$
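TF是词频,代表一个词在文中出现的次数(本文代码中再除以文章的总词数做归一化),公式为:$\mathrm{TF}(t,d)=\dfrac{n_{t,d}}{\sum_{k} n_{k,d}}$,其中 $n_{t,d}$ 是词 $t$ 在文档 $d$ 中出现的次数。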
DF是文档频率,代表有多少篇文章包含该词,DF的倒数(inverse)称为IDF,实际计算时再取对数,公式为:$\mathrm{IDF}(t)=\log\dfrac{N}{\mathrm{DF}(t)+1}$,其中 $N$ 是语料库的文档总数,分母加1是为了避免分母为0。
log函数的好处:当x的值非常大时,log x的值也不大。由于log函数是单调函数,优化目标是一致的,因此不影响结果的计算。
效果图:
代码(使用pyhanlp中的词典):
# -*- coding:utf-8 -*-
"""
1.创建一个语料库---初始状态
2.文档分词,并将文档存入语料库中
3.计算tf,计算idf
4.排序,获取关键字,通过数值来控制输出关键字的个数
"""
import glob
import math
import time
from collections import Counter

from pyhanlp import HanLP, JClass
class InitCorpusAndDictionary:
    """Corpus and dictionary bootstrap for the TF-IDF keyword extractor.

    The corpus is every ``*.txt`` file in the current working directory.
    The dictionary is the HanLP core dictionary, loaded each time an
    instance is created and exposed as ``self.dic``.
    """

    def __init__(self):
        # Load the dictionary eagerly so ``self.dic`` is usable right away.
        self.dic = self.dictionary()

    def getFlieList(self):
        """Return the names of all ``*.txt`` files in the working directory."""
        return glob.glob("*.txt")

    def dictionary(self):
        """Load the HanLP core dictionary and return its entries as a set.

        Returns:
            A set built from the key set of the loaded dictionary.
        """
        IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
        path = HanLP.Config.CoreDictionaryPath  # path of the core dictionary
        dic = IOUtil.loadDictionary([path])  # map-like object; only keys are kept
        return set(dic.keySet())

    def AddCorpus(self, text):
        """Store *text* as a new corpus document in the working directory.

        The file is named ``<unix-seconds>.txt``. If that name is already
        taken (for example when several documents are added within the same
        second), ``_1``, ``_2``, ... is appended before the extension so an
        existing document is never overwritten.

        Failures while writing are reported on stdout and otherwise ignored
        (best effort), so callers never see an exception from this method.

        Args:
            text: Document content to write, encoded as UTF-8.
        """
        stamp = str(int(time.time()))
        attempt = 0
        while True:
            if attempt == 0:
                filename = f"{stamp}.txt"
            else:
                filename = f"{stamp}_{attempt}.txt"
            try:
                # "x" fails instead of truncating when the file already exists.
                with open(filename, "x", encoding="utf-8") as f:
                    f.write(text)
            except FileExistsError:
                attempt += 1  # name taken: try the next suffix
            except Exception as e:
                print("文件写入失败", e)
                return
            else:
                return
class ParseWord:
    """Split a sentence into words using backward longest matching."""

    def __init__(self, text):
        """Load the dictionary and register *text* as a corpus document.

        Args:
            text: The sentence to segment.
        """
        corpus = InitCorpusAndDictionary()
        self.a = corpus
        self.dic = corpus.dic
        self.text = text
        # Creating a parser also stores the text in the corpus.
        corpus.AddCorpus(self.text)

    def backward_segment(self):
        """Segment ``self.text`` from right to left.

        At each step the longest dictionary word ending at the current
        position is taken; when no multi-character word matches, the single
        character at that position is used.

        Returns:
            The words in their original left-to-right order.
        """
        sentence = self.text
        vocabulary = self.dic
        collected = []  # words in right-to-left order
        end = len(sentence)  # exclusive end of the part still to segment
        while end > 0:
            # Start offsets ascend, so candidates run from longest to
            # shortest and the first dictionary hit is the longest match.
            longest = next(
                (
                    sentence[start:end]
                    for start in range(end - 1)
                    if sentence[start:end] in vocabulary
                ),
                sentence[end - 1],
            )
            collected.append(longest)
            end -= len(longest)
        collected.reverse()
        return collected
class CountTFIDF:
    """Compute TF, IDF and TF-IDF for the words of one text.

    tf(word)  = occurrences of the word in the text / total words in the text
    idf(word) = log(corpus documents / (documents containing the word + 1));
                the +1 keeps the denominator non-zero
    tf-idf    = tf * idf
    """

    def __init__(self, text):
        """Segment *text*, list the corpus files and compute TF and IDF.

        Args:
            text: The document to analyse.
        """
        self.text = text
        parser = ParseWord(self.text)
        self.wordlist = parser.backward_segment()
        # Reuse the parser's corpus helper instead of building a second one,
        # which would load the whole dictionary again just to list files.
        self.flielist = parser.a.getFlieList()
        self.tf()
        self.idf()

    def tf(self):
        """Compute the term frequency of every distinct word.

        Stores the result in ``self.worddict`` as ``{word: count / total}``.
        """
        total = len(self.wordlist)
        # Counter counts every word in a single pass; an empty word list
        # yields an empty dict, so there is no division by zero.
        self.worddict = {
            word: count / total
            for word, count in Counter(self.wordlist).items()
        }

    def idf(self):
        """Compute the inverse document frequency of every word.

        A corpus file counts as containing a word when the word occurs as a
        substring of the file content. Stores the result in
        ``self.idf_dict``. With an empty corpus list every IDF is 0.0
        rather than raising on ``log(0)``.
        """
        doc_freq = dict.fromkeys(self.worddict, 0)
        for filename in self.flielist:
            with open(filename, encoding="utf-8") as f:
                content = f.read()
            for word in doc_freq:
                if word in content:
                    doc_freq[word] += 1
        total_docs = len(self.flielist)
        self.idf_dict = {
            word: math.log(total_docs / (count + 1)) if total_docs else 0.0
            for word, count in doc_freq.items()
        }

    def if_idf(self):
        """Return ``{word: tf * idf}`` with values rounded to 4 decimals.

        The method name is kept as-is for compatibility. The result is not
        stored on the instance under the same name, so the method can be
        called repeatedly.
        """
        return {
            word: round(self.worddict[word] * idf, 4)
            for word, idf in self.idf_dict.items()
        }
class ControlKeywordOutput:
    """Rank the words of a text by TF-IDF and output the top ones."""

    def __init__(self, text, num):
        """Compute TF-IDF scores for *text*.

        Args:
            text: The document to extract keywords from.
            num: Maximum number of keywords to output.
        """
        # Build the model once: every CountTFIDF(text) segments the text
        # again and, through ParseWord, writes it to the corpus again.
        model = CountTFIDF(text)
        self.if_idf = model.if_idf()
        self.wordlist = model.wordlist
        self.num = num

    def extractionKeyword(self):
        """Print and return the highest-scoring keywords.

        ``self.num`` is clamped to the number of distinct words so that
        asking for more keywords than exist is not an error.

        Returns:
            The keywords, best score first (at most ``self.num`` items).
        """
        word_sort = sorted(self.if_idf.items(), key=lambda x: x[1], reverse=True)
        self.num = min(self.num, len(word_sort))
        # max(..., 0): a negative count selects nothing, not "all but the last n".
        extract_word = [word for word, _ in word_sort[:max(self.num, 0)]]
        print(f'文档的关键字为:{",".join(extract_word)}')
        return extract_word
if __name__ == "__main__":
    # Demo: print the keywords of three short sentences, each paired with
    # the maximum number of keywords to output.
    samples = (
        ("当下雨天地面积水", 3),
        ("东方巨龙正在觉醒", 8),
        ("下雨天的积水", 2),
    )
    for sentence, keyword_count in samples:
        ControlKeywordOutput(sentence, keyword_count).extractionKeyword()