1. First, we need to understand the mathematical formulas behind TF-IDF:
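Writing $n_{t,d}$ for the number of times term $t$ occurs in document $d$, $N$ for the total number of documents, and $\mathrm{df}(t)$ for the number of documents containing $t$ (the $+1$ smoothing matches the code below):

$$\mathrm{tf}(t,d)=\frac{n_{t,d}}{\sum_k n_{k,d}}$$

$$\mathrm{idf}(t)=\log\frac{N}{\mathrm{df}(t)+1}$$

$$\text{tf-idf}(t,d)=\mathrm{tf}(t,d)\times\mathrm{idf}(t)$$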
The three formulas above make up the TF-IDF computation, which we carry out in three stages: first compute the term frequencies, then use the words in the frequency dictionaries to compute each word's inverse document frequency, and finally multiply the two to obtain the TF-IDF value.
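As a minimal, self-contained sketch of these three stages (toy, pre-tokenized documents standing in for the segmented Weibo texts used below):

import math
from collections import Counter

docs = [['地震', '北京', '地震'], ['北京', '下雨'], ['地震', '救援']]  # toy corpus
N = len(docs)

for tokens in docs:
    counts = Counter(tokens)
    tf = {w: c / len(tokens) for w, c in counts.items()}   # stage 1: term frequency
    df = {w: sum(w in d for d in docs) for w in counts}    # how many documents contain w
    idf = {w: math.log(N / (df[w] + 1)) for w in counts}   # stage 2: smoothed idf
    tf_idf = {w: tf[w] * idf[w] for w in counts}           # stage 3: tf-idf
    print(sorted(tf_idf.items(), key=lambda x: x[1], reverse=True))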
2. Computing term frequency:
'''
Count the occurrences of selected words along with the total word count,
then keep the top one hundred keywords per entity type and compute each
keyword's tf-idf value for the text. The inverse document frequency is
computed over an external corpus of Weibo documents.
'''
import os
import pickle
import json
def get_cluster_weibos():  # return the list of segmented Weibo posts
    with open(os.path.join('../cluster/output', 'segment.pkl'), 'rb') as file_read:
        segments = pickle.load(file_read)
    return segments
def construct_dict(dictionary, word):  # update the word-frequency count for the text
    if word in dictionary:
        dictionary[word] += 1
    else:
        dictionary[word] = 1
    return dictionary

def dict_sort(d):  # sort a dict's items by value, descending
    return sorted(d.items(), key=lambda item: item[1], reverse=True)
def count_words(lines):
    np = {}  # person names
    ns = {}  # place names
    ni = {}  # organization names
    t = {}   # time words
    v = {}   # verbs
    word_count = 0
    line_count = 0
    for line in lines:
        try:  # some records are malformed to begin with
            line = json.loads(line)
        except (ValueError, TypeError):
            continue
        line_count += 1
        print(line_count)  # progress
        for word_type in line:
            word_count += 1
            word = word_type[0]  # the word itself
            pos = word_type[1]   # its part-of-speech tag
            if pos == 'np':
                np = construct_dict(np, word)
            elif pos == 'ns':
                ns = construct_dict(ns, word)
            elif pos == 'ni':
                ni = construct_dict(ni, word)
            elif pos == 't':  # note: time expressions need careful handling
                t = construct_dict(t, word)
            elif pos == 'v':
                v = construct_dict(v, word)
            else:
                continue
    return (np, ns, ni, t, v, word_count)
def save_dict(lines, entity_name_list_path):
    entity_name_list = count_words(lines)
    with open(entity_name_list_path, 'wb') as file_write:
        pickle.dump(entity_name_list, file_write)
    del entity_name_list

def main():
    output_dir = 'output_dict'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_path = os.path.join(output_dir, 'entity_name_list.pkl')
    lines = get_cluster_weibos()  # read the segmented texts
    save_dict(lines[2], output_path)  # here I take only a single event

if __name__ == '__main__':
    main()
The code above extracts named entities (entity extraction was already part of my original project); at the end it pickles a tuple holding the word-frequency dictionary of each entity type together with the total word count.
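A quick sketch of the input format count_words() expects: each element of lines is a JSON string encoding a list of [word, POS-tag] pairs (the sample words below are made up):

import json
sample_lines = [
    json.dumps([['张三', 'np'], ['昨天', 't'], ['到达', 'v'], ['北京', 'ns']], ensure_ascii=False),
    json.dumps([['北京', 'ns'], ['发生', 'v'], ['地震', 'v']], ensure_ascii=False),
]
np_d, ns_d, ni_d, t_d, v_d, total = count_words(sample_lines)
print(ns_d)   # {'北京': 2}
print(total)  # 7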
3. Computing inverse document frequency and TF-IDF
'''
This code computes word weights with the tf-idf formula.
The parts worth noting are the list/dict conversions and
sorting a dictionary (by value or by key).
'''
import os
import pickle
import math
from DateBase.connect_DB import connect_db
def get_tf_dict():  # load the entity-name frequency counts
    input_path = os.path.join('output_dict', 'entity_name_list.pkl')
    with open(input_path, 'rb') as file_read:
        entity_name_list = pickle.load(file_read)
    return entity_name_list
def get_outside_weibo():  # fetch the external documents used for IDF
    db = connect_db()
    cursor = db.cursor()
    sql = "select text from weibo_output"
    lines = []  # initialized so the function still returns if the query fails
    try:
        cursor.execute(sql)
        lines = cursor.fetchall()
    except Exception:
        db.rollback()
    return lines
def init_idf(dict_tf, dict_idf):  # initialize every keyword's document frequency to 0
    for key in dict_tf.keys():
        dict_idf[key] = 0
    return dict_idf

def count_idf(idf_dicts, dict_idf, words):  # count document frequencies: +1 per document containing the keyword
    for key in dict_idf.keys():
        if key in words:
            idf_dicts[key] += 1
    return idf_dicts
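# A hypothetical example of how count_idf() behaves: with
# dict_idf = {'地震': 0, '北京': 0} and words = ['北京', '发生', '地震'],
# the call adds 1 to idf_dicts['地震'] and to idf_dicts['北京'], i.e. each
# keyword's document frequency grows by one for this document.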
def count_tf_idf(idf_dicts, tf_dicts, words_count, documents_count):  # compute each word's tf-idf value
    tf_idf = {}
    for tf_dict in tf_dicts:  # take one tf dictionary
        for key in tf_dict.keys():  # take one keyword
            tf = tf_dict[key] / words_count
            idf = math.log(documents_count / (idf_dicts[key] + 1))
            # write into a fresh dict so a key shared by two tf dicts is not
            # re-fed its own tf-idf score as a document frequency later on
            tf_idf[key] = tf * idf
    return tf_idf
def list2dict(temp_list):
    '''Convert a list of (key, value) pairs into a dict.'''
    return dict(temp_list)

def dict2list(dic: dict):
    '''Convert a dict into a list of (key, value) pairs.'''
    return list(dic.items())

def sort_dict(tf_dict):  # sort by value in descending order
    return sorted(dict2list(tf_dict), key=lambda x: x[1], reverse=True)
def save_file(obj, file_path):
    with open(file_path, 'wb') as file_write:
        pickle.dump(obj, file_write)
def classify_dict(tf_dicts_100, idf_dicts):  # regroup the tf-idf scores by entity type
    tf_idf_list = []
    for tf_dict_100 in tf_dicts_100:
        tf_idf_dict = {}
        for key in tf_dict_100.keys():
            tf_idf_dict[key] = idf_dicts[key]
        tf_idf_list.append(tf_idf_dict)
    return tf_idf_list
def get_tf_idf(lines):
    '''First gather everything we need: the document count, the documents,
    the total word count and the term-frequency dictionaries (trimmed to
    the top one hundred words here).'''
    documents_count = len(lines)  # number of external documents
    idf_dicts = {}
    entity_name_list = get_tf_dict()
    words_count = entity_name_list[-1]  # total word count
    tf_dicts = entity_name_list[: len(entity_name_list) - 1]  # the term-frequency dictionaries
    tf_list = []
    for tf_dict in tf_dicts:
        tf_list.append(sort_dict(tf_dict)[:100])  # keep the top one hundred entries
    tf_dicts_100 = []
    for tf in tf_list:
        tf_dicts_100.append(list2dict(tf))
    save_file(tf_dicts_100, os.path.join('output_dict', 'tf_dict_100'))
    for tf_dict in tf_dicts_100:  # initialize the idf dictionary
        idf_dicts = init_idf(tf_dict, idf_dicts)
    for tf_dict in tf_dicts_100:  # count document frequencies over the external corpus
        for line in lines:
            words = line[0].split(' ')
            idf_dicts = count_idf(idf_dicts, tf_dict, words)
    idf_dicts = count_tf_idf(idf_dicts, tf_dicts_100, words_count, documents_count)
    tf_idf_list = classify_dict(tf_dicts_100, idf_dicts)
    sort_tf_idf_list = []
    for tf_idf in tf_idf_list:
        sort_tf_idf_list.append(list2dict(sort_dict(tf_idf)))
    return sort_tf_idf_list
def main():
    input_dir = 'outside_input'
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    input_file = 'outside_segment.pkl'
    if not os.path.exists(os.path.join(input_dir, input_file)):
        lines = get_outside_weibo()
        with open(os.path.join(input_dir, input_file), 'wb') as file_write:
            pickle.dump(lines, file_write)
    else:
        with open(os.path.join(input_dir, input_file), 'rb') as file_read:
            lines = pickle.load(file_read)
    tf_idf_dicts = get_tf_idf(lines)
    save_file(tf_idf_dicts, os.path.join('output_dict', 'tf_idf.pkl'))

if __name__ == '__main__':
    main()
In this code, count_idf() computes the inverse document frequency and count_tf_idf() computes the final tf-idf scores.
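As a quick sanity check of the formula, here is the arithmetic for a made-up word that occurs 5 times among 100 words and appears in 10 of 1000 external documents:

import math
tf = 5 / 100                     # term frequency: 0.05
idf = math.log(1000 / (10 + 1))  # smoothed idf: ≈ 4.51
print(tf * idf)                  # tf-idf ≈ 0.2255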