1. First, we need to understand the mathematical formulas behind TF-IDF:
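Writing $n_{t,d}$ for the number of times term $t$ occurs in document $d$, $N$ for the total number of documents, and $\mathrm{df}(t)$ for the number of documents containing $t$ (the $+1$ smoothing matches the code below):

$$\mathrm{tf}(t,d)=\frac{n_{t,d}}{\sum_k n_{k,d}}$$

$$\mathrm{idf}(t)=\log\frac{N}{\mathrm{df}(t)+1}$$

$$\text{tf-idf}(t,d)=\mathrm{tf}(t,d)\times\mathrm{idf}(t)$$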
The three formulas above make up the TF-IDF computation, which we carry out in three stages: first compute the term frequencies, then use the words in the frequency dictionaries to compute each word's inverse document frequency, and finally multiply the two to obtain the TF-IDF value.
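As a minimal, self-contained sketch of these three stages (toy, pre-tokenized documents standing in for the segmented Weibo texts used below):

import math
from collections import Counter

docs = [['地震', '北京', '地震'], ['北京', '下雨'], ['地震', '救援']]  # toy corpus
N = len(docs)

for tokens in docs:
    counts = Counter(tokens)
    tf = {w: c / len(tokens) for w, c in counts.items()}   # stage 1: term frequency
    df = {w: sum(w in d for d in docs) for w in counts}    # how many documents contain w
    idf = {w: math.log(N / (df[w] + 1)) for w in counts}   # stage 2: smoothed idf
    tf_idf = {w: tf[w] * idf[w] for w in counts}           # stage 3: tf-idf
    print(sorted(tf_idf.items(), key=lambda x: x[1], reverse=True))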
2. Computing term frequency:
'''
Count the occurrences of selected words along with the total word count,
then keep the top one hundred keywords per entity type and compute each
keyword's tf-idf value for the text. The inverse document frequency is
computed over an external corpus of Weibo documents.
'''
import os
import pickle
import json
def get_cluster_weibos():  # return the list of segmented Weibo posts
    with open(os.path.join('../cluster/output', 'segment.pkl'), 'rb') as file_read:
        segments = pickle.load(file_read)
    return segments
def construct_dict(dictionary, word):  # update the word-frequency count for the text
    if word in dictionary:
        dictionary[word] += 1
    else:
        dictionary[word] = 1
    return dictionary

def dict_sort(d):  # sort a dict's items by value, descending
    return sorted(d.items(), key=lambda item: item[1], reverse=True)
def count_words(lines):
    np = {}  # person names
    ns = {}  # place names
    ni = {}  # organization names
    t = {}   # time words
    v = {}   # verbs
    word_count = 0
    line_count = 0
    for line in lines:
        try:  # some records are malformed to begin with
            line = json.loads(line)
        except (ValueError, TypeError):
            continue
        line_count += 1
        print(line_count)  # progress
        for word_type in line:
            word_count += 1
            word = word_type[0]  # the word itself
            pos = word_type[1]   # its part-of-speech tag
            if pos == 'np':
                np = construct_dict(np, word)
            elif pos == 'ns':
                ns = construct_dict(ns, word)
            elif pos == 'ni':
                ni = construct_dict(ni, word)
            elif pos == 't':  # note: time expressions need careful handling
                t = construct_dict(t, word)
            elif pos == 'v':
                v = construct_dict(v, word)
            else:
                continue
    return (np, ns, ni, t, v, word_count)
def save_dict(lines, entity_name_list_path):
    entity_name_list = count_words(lines)
    with open(entity_name_list_path, 'wb') as file_write:
        pickle.dump(entity_name_list, file_write)
    del entity_name_list

def main():
    output_dir = 'output_dict'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_path = os.path.join(output_dir, 'entity_name_list.pkl')
    lines = get_cluster_weibos()  # read the segmented texts
    save_dict(lines[2], output_path)  # here I take only a single event

if __name__ == '__main__':
    main()
The code above extracts named entities (entity extraction was already part of my original project); at the end it pickles a tuple holding the word-frequency dictionary of each entity type together with the total word count.
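A quick sketch of the input format count_words() expects: each element of lines is a JSON string encoding a list of [word, POS-tag] pairs (the sample words below are made up):

import json
sample_lines = [
    json.dumps([['张三', 'np'], ['昨天', 't'], ['到达', 'v'], ['北京', 'ns']], ensure_ascii=False),
    json.dumps([['北京', 'ns'], ['发生', 'v'], ['地震', 'v']], ensure_ascii=False),
]
np_d, ns_d, ni_d, t_d, v_d, total = count_words(sample_lines)
print(ns_d)   # {'北京': 2}
print(total)  # 7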
3. Computing inverse document frequency and TF-IDF
'''
This code computes word weights with the tf-idf formula.
The parts worth noting are the list/dict conversions and
sorting a dictionary (by value or by key).
'''
import os
import pickle
import math
from DateBase.connect_DB import connect_db
def get_tf_dict():  # load the entity-name frequency counts
    input_path = os.path.join('output_dict', 'entity_name_list.pkl')
    with open(input_path, 'rb') as file_read:
        entity_name_list = pickle.load(file_read)
    return entity_name_list
def get_outside_weibo():  # fetch the external documents used for IDF
    db = connect_db()
    cursor = db.cursor()
    sql = "select text from weibo_output"
    lines = []  # initialized so the function still returns if the query fails
    try:
        cursor.execute(sql)
        lines = cursor.fetchall()
    except Exception:
        db.rollback()
    return lines
def init_idf(dict_tf, dict_idf):  # initialize every keyword's document frequency to 0
    for key in dict_tf.keys():
        dict_idf[key] = 0
    return dict_idf

def count_idf(idf_dicts, dict_idf, words):  # count document frequencies: +1 per document containing the keyword
    for key in dict_idf.keys():
        if key in words:
            idf_dicts[key] += 1
    return idf_dicts
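# A hypothetical example of how count_idf() behaves: with
# dict_idf = {'地震': 0, '北京': 0} and words = ['北京', '发生', '地震'],
# the call adds 1 to idf_dicts['地震'] and to idf_dicts['北京'], i.e. each
# keyword's document frequency grows by one for this document.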
def count_tf_idf(idf_dicts, tf_dicts, words_count, documents_count):  # compute each word's tf-idf value
    tf_idf = {}
    for tf_dict in tf_dicts:  # take one tf dictionary
        for key in tf_dict.keys():  # take one keyword
            tf = tf_dict[key] / words_count
            idf = math.log(documents_count / (idf_dicts[key] + 1))
            # write into a fresh dict so a key shared by two tf dicts is not
            # re-fed its own tf-idf score as a document frequency later on
            tf_idf[key] = tf * idf
    return tf_idf
def list2dict(temp_list):
    '''Convert a list of (key, value) pairs into a dict.'''
    return dict(temp_list)

def dict2list(dic: dict):
    '''Convert a dict into a list of (key, value) pairs.'''
    return list(dic.items())

def sort_dict(tf_dict):  # sort by value in descending order
    return sorted(dict2list(tf_dict), key=lambda x: x[1], reverse=True)
def save_file(obj, file_path):
    with open(file_path, 'wb') as file_write:
        pickle.dump(obj, file_write)
def classify_dict(tf_dicts_100, idf_dicts):  # regroup the tf-idf scores by entity type
    tf_idf_list = []
    for tf_dict_100 in tf_dicts_100:
        tf_idf_dict = {}
        for key in tf_dict_100.keys():
            tf_idf_dict[key] = idf_dicts[key]
        tf_idf_list.append(tf_idf_dict)
    return tf_idf_list
def get_tf_idf(lines):
    '''First gather everything we need: the document count, the documents,
    the total word count and the term-frequency dictionaries (trimmed to
    the top one hundred words here).'''
    documents_count = len(lines)  # number of external documents
    idf_dicts = {}
    entity_name_list = get_tf_dict()
    words_count = entity_name_list[-1]  # total word count
    tf_dicts = entity_name_list[: len(entity_name_list) - 1]  # the term-frequency dictionaries
    tf_list = []
    for tf_dict in tf_dicts:
        tf_list.append(sort_dict(tf_dict)[:100])  # keep the top one hundred entries
    tf_dicts_100 = []
    for tf in tf_list:
        tf_dicts_100.append(list2dict(tf))
    save_file(tf_dicts_100, os.path.join('output_dict', 'tf_dict_100'))
    for tf_dict in tf_dicts_100:  # initialize the idf dictionary
        idf_dicts = init_idf(tf_dict, idf_dicts)
    for tf_dict in tf_dicts_100:  # count document frequencies over the external corpus
        for line in lines:
            words = line[0].split(' ')
            idf_dicts = count_idf(idf_dicts, tf_dict, words)
    idf_dicts = count_tf_idf(idf_dicts, tf_dicts_100, words_count, documents_count)
    tf_idf_list = classify_dict(tf_dicts_100, idf_dicts)
    sort_tf_idf_list = []
    for tf_idf in tf_idf_list:
        sort_tf_idf_list.append(list2dict(sort_dict(tf_idf)))
    return sort_tf_idf_list
def main():
    input_dir = 'outside_input'
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    input_file = 'outside_segment.pkl'
    if not os.path.exists(os.path.join(input_dir, input_file)):
        lines = get_outside_weibo()
        with open(os.path.join(input_dir, input_file), 'wb') as file_write:
            pickle.dump(lines, file_write)
    else:
        with open(os.path.join(input_dir, input_file), 'rb') as file_read:
            lines = pickle.load(file_read)
    tf_idf_dicts = get_tf_idf(lines)
    save_file(tf_idf_dicts, os.path.join('output_dict', 'tf_idf.pkl'))

if __name__ == '__main__':
    main()
In this code, count_idf() computes the inverse document frequency and count_tf_idf() computes the final tf-idf scores.
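As a quick sanity check of the formula, here is the arithmetic for a made-up word that occurs 5 times among 100 words and appears in 10 of 1000 external documents:

import math
tf = 5 / 100                     # term frequency: 0.05
idf = math.log(1000 / (10 + 1))  # smoothed idf: ≈ 4.51
print(tf * idf)                  # tf-idf ≈ 0.2255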