TF-IDF方法学习&手工细节实现

最新推荐文章于 2022-05-05 21:58:20 发布

数学小牛马

最新推荐文章于 2022-05-05 21:58:20 发布

阅读量402

点赞数

分类专栏：数学 python NLP

本文链接：https://blog.csdn.net/qq_43409560/article/details/111306997

版权

数学同时被 3 个专栏收录

39 篇文章 3 订阅

订阅专栏

python

11 篇文章 0 订阅

订阅专栏

NLP

3 篇文章 0 订阅

订阅专栏

参照莫凡python
理解型的敲了敲代码
过多解释不说，代码理解仅供参考，同时感谢莫凡的知识分享

鄙人尚在学习，原理层方面知识不太了解
实现为主，并且主要以学习NLP知识为主

'''
Description: 
Author: ssw
Date: 2020-12-17 00:14:45
LastEditors: ssw
LastEditTime: 2020-12-17 02:59:42
'''
'''
TF-IDF方法分为TF与IDF
Term Frequency And Inverse Document Frequency
前者(某篇文章)为单个文章词语词频率
后者(全部文章)用于把在大多数文章中都会出现的常见词语的权重拉低，从而突出关键词部分

TF-IDF方法是构建向量矩阵
横向为文档，竖向为文档关键字词频
每篇文章对应矩阵中的一个向量；搜索时对比搜索关键字向量与各文章向量的距离(COSINE距离方法)

'''

import numpy as np
from collections import Counter
import itertools
# from visual import show_tfidf

# Toy corpus: one short sentence per document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Tokenise each document: remove commas, then split on single spaces.
# Tokens are case-sensitive ("It" and "it" are different words).
docs_words = [d.replace(",", "").split(" ") for d in docs]

# Every distinct token that appears anywhere in the corpus.
vocab = set(itertools.chain(*docs_words))

# word -> row index.
# Indices are taken from the sorted vocabulary rather than from raw set
# iteration: the iteration order of a set of strings can change from one
# interpreter run to the next (string hash randomisation), which would make
# the row layout of every matrix built from v2i non-reproducible.
v2i = {v: i for i, v in enumerate(sorted(vocab))}

# row index -> word (inverse of v2i).
i2v = {i: v for v, i in v2i.items()}

def safe_log(x):
    '''
    Element-wise natural logarithm that never evaluates log(0).

    Args:
        x: array-like of numbers.

    Returns:
        A new float64 ndarray shaped like x that holds log(x) wherever
        x != 0 and 0 everywhere else.

    The input is never modified. The result is computed on a float64 copy,
    so callers can keep using x afterwards and integer inputs do not have
    their logarithms truncated back to integers.
    '''
    out = np.array(x, dtype=np.float64)  # np.array copies by default
    mask = out != 0
    out[mask] = np.log(out[mask])
    return out

def _tf_log(x):
    # log(x + 1); the +1 keeps log(0) from being evaluated for zero entries.
    return np.log(x + 1)


def _tf_augmented(x):
    # Scale each row by its own maximum (axis=1), then map into [0.5, 1].
    # keepdims=True keeps the maxima 2-D so they broadcast against x.
    row_max = np.max(x, axis=1, keepdims=True)
    return 0.5 + 0.5 * x / row_max


def _tf_boolean(x):
    # Cap every entry at 1.
    return np.minimum(x, 1)


def _tf_log_avg(x):
    # (1 + log x) / (1 + log of the row mean), using safe_log for both logs.
    # NOTE: Python evaluates the numerator, and therefore safe_log(x),
    # before np.mean(x, ...) is computed for the denominator.
    return (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True)))


# Name -> weighting function applied to the raw term-frequency matrix.
tf_methods = {
    "log": _tf_log,
    "augmented": _tf_augmented,
    "boolean": _tf_boolean,
    "log_avg": _tf_log_avg,
}

def _idf_log(x):
    # 1 + log(number of documents / (document frequency + 1)).
    # With the corpus size fixed, the more documents contain a word,
    # the smaller its weight.
    return 1 + np.log(len(docs) / (x + 1))


def _idf_prob(x):
    # log((N - df) / (df + 1)), clipped below at 0.
    return np.maximum(0, np.log((len(docs) - x) / (x + 1)))


def _idf_len_norm(x):
    # Each entry divided by (sum of squares of all entries + 1).
    return x / (np.sum(np.square(x)) + 1)


# Name -> function that turns a document-frequency array into IDF weights.
idf_methods = {
    "log": _idf_log,
    "prob": _idf_prob,
    "len_norm": _idf_len_norm,
}

def get_tf(methods="log"):
    '''
    Build the weighted term-frequency (TF) matrix of the corpus.

    Args:
        methods: key into tf_methods selecting the weighting function.

    Returns:
        The selected weighting function applied to an array of shape
        [len(vocab), len(docs)]. Before weighting, entry [w, d] is the
        count of word w in document d divided by the count of the most
        frequent word in document d.

    Raises:
        ValueError: if methods is not a key of tf_methods.
    '''
    # Validate before doing any work so a bad name fails fast and readably.
    weight_tf = tf_methods.get(methods)
    if weight_tf is None:
        raise ValueError(
            "unknown TF method {!r}; expected one of {}".format(
                methods, sorted(tf_methods)))

    # Rows are words, columns are documents.
    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)
    for i, d in enumerate(docs_words):
        # word -> number of occurrences within this document
        counter = Counter(d)
        # The highest count in this document is the same for every word in
        # it, so look it up once per document rather than once per word.
        max_count = counter.most_common(1)[0][1]
        for v, count in counter.items():
            _tf[v2i[v], i] = count / max_count
    return weight_tf(_tf)

def get_idf(methods="log"):
    '''
    Build the inverse-document-frequency (IDF) column for the vocabulary.

    Args:
        methods: key into idf_methods selecting the IDF formula.

    Returns:
        The selected formula applied to an array of shape [len(i2v), 1]
        whose row i holds the number of documents containing word i2v[i].

    Raises:
        ValueError: if methods is not a key of idf_methods.
    '''
    # Validate before doing any work so a bad name fails fast and readably.
    idf_fn = idf_methods.get(methods)
    if idf_fn is None:
        raise ValueError(
            "unknown IDF method {!r}; expected one of {}".format(
                methods, sorted(idf_methods)))

    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        word = i2v[i]
        # Document frequency: a document counts once, however many times
        # the word occurs inside it.
        df[i, 0] = sum(1 for d in docs_words if word in d)
    return idf_fn(df)

def _unit_columns(m):
    '''
    Scale every column of m to unit Euclidean length.

    Columns whose norm is 0 are returned as all zeros instead of being
    divided by zero (which would produce NaN).
    '''
    m = np.asarray(m, dtype=np.float64)
    norms = np.sqrt(np.sum(np.square(m), axis=0, keepdims=True))
    # Divide only where the norm is non-zero; other entries keep the 0
    # they were initialised with in `out`.
    return np.divide(m, norms, out=np.zeros_like(m), where=norms != 0)


def cosine_similarity(q, _tf_idf):
    '''
    Cosine similarity between a query vector and every document column.

    Args:
        q: query vector, one column (shape [n_words, 1]).
        _tf_idf: matrix with one column per document (shape [n_words, n_docs]).

    Returns:
        1-D array with one similarity per document column. A zero query
        vector, or a zero document column, yields similarity 0 rather
        than NaN.
    '''
    unit_q = _unit_columns(q)
    unit_ds = _unit_columns(_tf_idf)
    # With both sides normalised, the cosine is just the dot product;
    # ravel flattens the [n_docs, 1] result into a 1-D score array.
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity
    
def docs_score(q, len_norm=False):
    '''
    Score every document in the corpus against the query string q.

    Args:
        q: query sentence; commas are removed and it is split on single
            spaces, the same way the corpus documents are tokenised.
        len_norm: when True, divide each score by the number of tokens in
            the corresponding document.

    Returns:
        1-D array with one score per document, in corpus order.

    Query words that are not in the vocabulary get extra all-zero rows in
    call-local copies of idf and tf_idf. Their IDF is 0, so they contribute
    nothing to the query vector. The global v2i / i2v / idf / tf_idf are
    never modified, so the function can be called repeatedly.
    '''
    q_words = q.replace(",", "").split(" ")

    # Give each out-of-vocabulary word a row index past the end of idf.
    # This mapping lives only for the duration of the call.
    n_rows = idf.shape[0]
    unknown = {}
    for v in q_words:
        if v not in v2i and v not in unknown:
            unknown[v] = n_rows + len(unknown)

    if unknown:
        # Pad both matrices with zero rows for the unknown words.
        # np.concatenate returns new arrays; the globals stay untouched.
        pad_idf = np.zeros((len(unknown), 1), dtype=np.float64)
        pad_tf_idf = np.zeros((len(unknown), tf_idf.shape[1]), dtype=np.float64)
        _idf = np.concatenate((idf, pad_idf), axis=0)
        _tf_idf = np.concatenate((tf_idf, pad_tf_idf), axis=0)
    else:
        _idf, _tf_idf = idf, tf_idf

    # Raw term counts of the query, laid out like one document column.
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)
    for v, count in Counter(q_words).items():
        row = v2i[v] if v in v2i else unknown[v]
        q_tf[row, 0] = count

    q_vec = q_tf * _idf
    q_score = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        # Normalise by document length measured in tokens.
        len_docs = [len(d) for d in docs_words]
        q_score = q_score / np.array(len_docs)

    return q_score


def get_keywords(n=2, n_docs=3):
    '''
    Print the highest-weighted TF-IDF words of the first documents.

    Args:
        n: number of keywords to print per document. A value of 0 or less
            prints an empty keyword list.
        n_docs: number of leading documents to report (default 3). Capped
            at the number of columns in tf_idf.

    Returns:
        None. One line per document is written to stdout.
    '''
    for c in range(min(n_docs, tf_idf.shape[1])):
        # TF-IDF column of document c.
        col = tf_idf[:, c]
        # argsort is ascending, so the last n indices are the n largest
        # weights (listed from lower to higher). The n > 0 guard matters
        # because [-0:] would select the whole array, not nothing.
        idx = np.argsort(col)[-n:] if n > 0 else []
        print("doc {}, top {} keywords {}" \
            .format(c+1, n, [i2v[i] for i in idx]))
        
        


# Build the corpus matrices with the default ("log") TF and IDF variants.
tf = get_tf()
idf = get_idf()
# Element-wise product of the TF matrix and the IDF matrix.
tf_idf = np.multiply(tf, idf)

# get_keywords(2)
q = "I get a coffee cup"
scores = docs_score(q)
# Indices of the three highest-scoring documents, best first.
d_ids = np.argsort(scores)[::-1][:3]

top_docs = [docs[i] for i in d_ids]
print(f"\ntop 3 docs for '{q}':\n{top_docs}")

效果：

top 3 docs for 'I get a coffee cup':
['I have a party today', 'I like coffee, I like book and I like apple', 'It is coffee time, bring your cup']

数学小牛马

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
TF-IDF方法学习&手工细节实现

参照莫凡python理解型的敲了敲代码过多解释不说，代码理解仅供参考，同时感谢莫凡的知识分享鄙人尚在学习，原理层方面知识不太了解实现为主，并且主要以学习NLP知识为主'''Descripttion: Author: sswDate: 2020-12-17 00:14:45LastEditors: sswLastEditTime: 2020-12-17 02:59:42''''''TF-IDF方法分为TF与IDFTerm Frequency And Inverse Docume
复制链接

扫一扫