#本程序的作用是通过TF/IDF算法完成对文本的关键词提取,输出前十个关键词。
import math
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from jieba import analyse
import functools
class TfIdf(object):
# 四个参数分别是:训练好的idf字典,默认idf值,处理后的待提取文本,关键词数量
def __init__(self, idf_dic, default_idf, word_list, keyword_num):
self.word_list = word_list
self.idf_dic, self.default_idf = idf_dic, default_idf
self.tf_dic = self.get_tf_dic()
self.keyword_num = keyword_num
def get_tf_dic(self):
tf_dic = {}
# 任务:完成word_list的tf值的统计函数,将结果存储到tf_dic变量中
# ** Begin *****#
for word in self.word_list:
tf_dic[word] = tf_dic.get(word, 0.0) + 1.0
tt_count = len(self.word_list)
for k, v in tf_dic.items():
tf_dic[k] = float(v) / tt_count
# ** End **#
return tf_dic
# 按公式计算tf-idf
def get_tfidf(self):
tfidf_dic = {}
for word in self.word_list:
idf = self.idf_dic.get(word, self.default_idf)
tf = self.tf_dic.get(word, 0)
tfidf = tf * idf
tfidf_dic[word] = tfidf
tfidf_dic.items()
# 根据tf-idf排序,去排名前keyword_num的词作为关键词
for k, v in sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)[:self.keyword_num]:
print(k + "/ ", end='')
print()
#排序函数,用于topK关键词的按值排序
def cmp(e1, e2):
import numpy as np
res = np.sign(e1[1] - e2[1])
if res != 0:
return res
else:
a = e1[0] + e2[0]
b = e2[0] + e1[0]
if a > b:
return 1
elif a == b:
return 0
else:
return -1