TF-IDF: Learning the Method & Implementing the Details by Hand


Based on 莫凡Python (Mofan Python).
I typed out the code myself to build understanding.
I won't give lengthy explanations; my reading of the code is for reference only. Many thanks to 莫凡 for sharing this knowledge.

  • I am still learning and not yet deeply familiar with the underlying theory.
  • The focus is on implementation, mainly as a way of studying NLP.
'''
Description: 
Author: ssw
Date: 2020-12-17 00:14:45
LastEditors: ssw
LastEditTime: 2020-12-17 02:59:42
'''
'''
TF-IDF combines two parts, TF and IDF:
Term Frequency and Inverse Document Frequency.
The former (per document) measures each word's frequency within a single document.
The latter (across all documents) pulls down the weight of words that appear in
many documents, so that the distinctive keywords stand out.

TF-IDF builds a vector matrix:
one column per document, one row per word (its weighted frequency).
Each document thus gets a column vector; search compares the query's keywords
against these vectors by distance (the cosine-distance method).
'''
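# A minimal worked example of the textbook formulas (my own summary; the "log"
# variants implemented below add their own +1 smoothing):
#   tf(w, d)    = count of w in d / count of the most frequent word in d
#   idf(w)      = log(N / (n_w + 1)),  N = total docs, n_w = docs containing w
#   tfidf(w, d) = tf(w, d) * idf(w)
# e.g. with N = 15 documents and a word present in 3 of them:
#   idf = log(15 / 4) ≈ 1.32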

import numpy as np
from collections import Counter
import itertools
# from visual import show_tfidf

docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]
# the corpus
docs_words = [d.replace(",", "").split(" ") for d in docs]
# strip the commas from every document, then split on spaces to get word lists
# d.replace(old, new).split(" ")
# replaces every occurrence of old with new in d, then splits the string on spaces
vocab = set(itertools.chain(*docs_words))
# flatten all the word lists and collect the vocabulary as a set
v2i = {v: i for i, v in enumerate(vocab)}
# dict mapping word -> index
i2v = {i: v for v, i in v2i.items()}
# dict mapping index -> word
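# Quick sanity check (my own addition, not part of the original tutorial):
# the two dicts must be exact inverses of each other; the concrete indices
# vary between runs because set() ordering is not deterministic.
assert all(i2v[i] == v for v, i in v2i.items())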

def safe_log(x):
    '''
    Safe log.
    Mark the non-zero positions, take the log only there, and return the
    result, so zero entries never produce log(0) = -inf.
    '''
    mask = x != 0
    x[mask] = np.log(x[mask])
    return x
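# Illustration of the behavior (values worked out by hand):
#   safe_log(np.array([0.0, 1.0, np.e])) -> array([0., 0., 1.])
# zeros stay zero instead of turning into -inf; note that the input array
# is modified in place.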

tf_methods = {
    "log": lambda x: np.log(x + 1),
    # log weighting; the +1 guards against log(0)
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    # augmented weighting; keepdims=True keeps the result 2-D so broadcasting works
    "boolean": lambda x: np.minimum(x, 1),
    # boolean weighting: 1 if the word is present, 0 otherwise
    "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))),
    # a more elaborate weighting that uses the safe_log helper above
}
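# How the weightings differ on toy counts (rows = words, columns = docs;
# numbers worked out by hand, not produced by the code above):
#   x = np.array([[3., 0.], [1., 1.]])
#   tf_methods["boolean"](x) -> [[1., 0.], [1., 1.]]
#   tf_methods["log"](x)     -> [[log(4), 0.], [log(2), log(2)]]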

idf_methods = {
    # essentially: log(total documents / number of documents containing word w)
    # intuitively, with the document count fixed, the more documents contain w,
    # the smaller w's weight; the three variants below are not studied further here
    "log": lambda x: 1 + np.log(len(docs) / (x + 1)),
    "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x + 1))),
    "len_norm": lambda x: x / (np.sum(np.square(x)) + 1),
}
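# Worked numbers for the "log" variant with len(docs) == 15 (my own arithmetic):
#   a word in  1 document:  1 + log(15 / 2)  ≈ 3.01
#   a word in 14 documents: 1 + log(15 / 15) = 1.0
# so rare words get roughly three times the weight of ubiquitous ones.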

def get_tf(methods="log"):
    '''
    Return the TF matrix.
    '''
    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)
    # TF matrix of shape [n_vocab, n_doc] => [vocab size, number of documents]
    for i, d in enumerate(docs_words):
        # iterate over the 15 documents; i is the document index, d its word list
        counter = Counter(d)
        # count the words in this document; Counter maps "word: count"
        for v in counter.keys():
            _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1]
            # for each word of the current document, [v2i[v], i] => [word index, doc index]
            # raw TF = this word's count / the count of the document's most frequent word
    weight_tf = tf_methods.get(methods, None)
    # look up the lambda for the requested weighting
    if weight_tf is None:
        raise ValueError("unknown TF method: {}".format(methods))
        # reject unknown method names
    return weight_tf(_tf)
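# Usage sketch (shapes only; the actual values depend on the corpus above):
#   tf = get_tf()           # shape (len(vocab), len(docs))
#   tf = get_tf("boolean")  # same shape, different weighting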

def get_idf(methods="log"):
    '''
    Return the IDF matrix.
    '''
    df = np.zeros((len(i2v), 1))
    # initial document-frequency matrix of shape [n_vocab, 1]
    for i in range(len(i2v)):
        # iterate over the vocabulary
        d_count = 0
        for d in docs_words:
            # iterate over the documents
            d_count += 1 if i2v[i] in d else 0
            # increment if this document contains the word
        df[i, 0] = d_count
        # record in how many documents the word appears
    idf_fn = idf_methods.get(methods, None)
    # look up the lambda for the requested variant
    if idf_fn is None:
        raise ValueError("unknown IDF method: {}".format(methods))
    return idf_fn(df)
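# Usage sketch:
#   idf = get_idf()        # shape (len(vocab), 1), one weight per word
#   idf = get_idf("prob")  # the probabilistic variant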

def cosine_similarity(q, _tf_idf):
    '''
    Cosine similarity between the query vector and every document vector.
    '''
    unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    # normalize each column to unit length
    similarity = unit_ds.T.dot(unit_q).ravel()
    # with unit vectors the cosine denominator is 1, so a plain dot product gives
    # the cosine; ravel flattens the result into a 1-D array of scores
    return similarity
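# Tiny hand-checked example (hypothetical vectors, not corpus data):
#   q = np.array([[1.], [0.]])
#   m = np.array([[1., 0.], [0., 1.]])
#   cosine_similarity(q, m) -> array([1., 0.])
# the query matches column 0 exactly and is orthogonal to column 1.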
    
def docs_score(q, len_norm=False):
    q_words = q.replace(",", "").split(" ")
    # tokenize the query the same way as the corpus
    
    unknown_v = 0
    # number of out-of-vocabulary words
    for v in set(q_words):
        # walk over the distinct query words
        if v not in v2i:
            # if a word is missing, add it to the vocabulary and count it as unknown
            v2i[v] = len(v2i)
            i2v[len(v2i) - 1] = v
            unknown_v += 1
    
    if unknown_v > 0:
        # if there are unknown words,
        # pad the idf and tf-idf matrices with zero rows
        # (np.concatenate joins the arrays along axis 0)
        _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float64)), axis=0)
        _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float64)), axis=0)
    else:
        _idf, _tf_idf = idf, tf_idf
    
    counter = Counter(q_words)
    # compute the query's raw TF
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)
    for v in counter.keys():
        # fill in the count of every query word
        q_tf[v2i[v], 0] = counter[v]
        
    q_vec = q_tf * _idf
    q_score = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        # optionally divide each score by its document's length, which is
        # closer to a keyword-density measure per document
        len_docs = [len(d) for d in docs_words]
        q_score = q_score / np.array(len_docs)
    
    return q_score
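# Usage sketch (higher score = more similar; note that unseen query words are
# appended to the global v2i / i2v as a side effect):
#   scores = docs_score("I like dog")
#   best_doc = docs[scores.argmax()]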


def get_keywords(n=2):
    '''
    Print each document's top-n keywords.
    '''
    for c in range(3):
        # look at the first three documents
        col = tf_idf[:, c]
        # the TF-IDF column vector of document c
        idx = np.argsort(col)[-n:]
        # indices of the n highest-scoring words
        # (argsort returns indices in ascending order, so take the last n)
        print("doc {}, top {} keywords {}" \
            .format(c+1, n, [i2v[i] for i in idx]))
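# Example call (must come after tf_idf is computed below):
#   get_keywords(2)
# prints, for each of the first three documents, its two highest-TF-IDF words.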
        
        


tf = get_tf()
idf = get_idf()
tf_idf = tf * idf
# multiplying the TF matrix [n_vocab, n_docs] by the IDF column [n_vocab, 1]
# (broadcast over documents) gives the TF-IDF matrix

# get_keywords(2)
q = "I get a coffee cup"
scores = docs_score(q)
d_ids = scores.argsort()[-3:][::-1]

print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))

Output:

top 3 docs for 'I get a coffee cup':
['I have a party today', 'I like coffee, I like book and I like apple', 'It is coffee time, bring your cup']