参照莫烦Python
理解型地敲了敲代码
过多解释不说,代码理解仅供参考,同时感谢莫烦的知识分享
- 鄙人尚在学习,原理层方面知识不太了解
- 实现为主,并且主要以学习NLP知识为主
'''
Description:
Author: ssw
Date: 2020-12-17 00:14:45
LastEditors: ssw
LastEditTime: 2020-12-17 02:59:42
'''
'''
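TF-IDF combines two factors: Term Frequency (TF) and Inverse Document Frequency (IDF).
TF is computed per document: how often each word occurs in that single document.
IDF is computed over the whole corpus: it lowers the weight of words that appear
in many documents, so that the distinctive keywords stand out.
The TF-IDF method builds a matrix of vectors.
In this file each row is a vocabulary word and each column is a document; an entry is that word's weighted frequency in that document.
Each document is therefore one column vector, and a search compares the query vector with every document vector using cosine similarity.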
'''
import numpy as np
from collections import Counter
import itertools
# from visual import show_tfidf
# Toy corpus: every string is one document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Tokenise each document: remove commas, then split on single spaces.
docs_words = [sentence.replace(",", "").split(" ") for sentence in docs]

# The vocabulary is every distinct token that occurs in any document.
vocab = set(itertools.chain.from_iterable(docs_words))

# word -> row index. The numbering follows the set's iteration order.
v2i = dict(zip(vocab, range(len(vocab))))

# row index -> word (the inverse of v2i).
i2v = {index: word for word, index in v2i.items()}
def safe_log(x):
    """Return the natural log of the non-zero entries of ``x``; zeros stay 0.

    The input is never modified: the result is a fresh float64 array. This
    matters for callers that keep using ``x`` after the call, and it keeps
    integer inputs from having their logs truncated on write-back.

    Args:
        x: array-like of numbers.

    Returns:
        A new ``numpy.ndarray`` (float64) with the same shape as ``x`` where
        each non-zero entry is replaced by its natural log and each zero
        entry is left at 0.
    """
    # np.array(...) always copies, so the caller's data is left untouched.
    result = np.array(x, dtype=np.float64)
    nonzero = result != 0
    # Only take the log where it is defined, avoiding log(0) = -inf.
    result[nonzero] = np.log(result[nonzero])
    return result
def _tf_log(x):
    """Log scaling; the +1 keeps a zero entry at log(1) = 0 instead of log(0)."""
    return np.log(x + 1)


def _tf_augmented(x):
    """Augmented frequency: 0.5 + 0.5 * x / (maximum along axis 1).

    NOTE(review): get_tf builds its matrix as [word, document], so axis=1
    takes the maximum of each word across documents - confirm this is
    intended rather than a per-document maximum.
    """
    # keepdims keeps the maximum two-dimensional so it broadcasts against x.
    row_max = np.max(x, axis=1, keepdims=True)
    return 0.5 + 0.5 * x / row_max


def _tf_boolean(x):
    """Presence indicator: every entry is capped at 1."""
    return np.minimum(x, 1)


def _tf_log_avg(x):
    """Log-average weighting built on safe_log."""
    # Evaluation order is significant and matches a single expression:
    # the numerator's safe_log(x) call runs before the mean of x is taken.
    numerator = 1 + safe_log(x)
    denominator = 1 + safe_log(np.mean(x, axis=1, keepdims=True))
    return numerator / denominator


# Term-frequency weighting functions, selected by name.
tf_methods = {
    "log": _tf_log,
    "augmented": _tf_augmented,
    "boolean": _tf_boolean,
    "log_avg": _tf_log_avg,
}
def _idf_log(x):
    """1 + log(number of documents / (document frequency + 1))."""
    return 1 + np.log(len(docs) / (x + 1))


def _idf_prob(x):
    """log((N - df) / (df + 1)) with negative results clipped to 0."""
    n_docs = len(docs)
    ratio = (n_docs - x) / (x + 1)
    return np.maximum(0, np.log(ratio))


def _idf_len_norm(x):
    """Document frequencies divided by (sum of their squares + 1)."""
    return x / (np.sum(np.square(x)) + 1)


# Inverse-document-frequency weighting functions, selected by name.
# In essence these are variations of log(total documents / documents
# containing the word): with the corpus size fixed, the more documents a
# word appears in, the smaller its weight.
idf_methods = {
    "log": _idf_log,
    "prob": _idf_prob,
    "len_norm": _idf_len_norm,
}
def get_tf(methods="log"):
    """Build the weighted term-frequency matrix for the corpus.

    The raw matrix has one row per vocabulary word and one column per
    document (shape ``[len(vocab), len(docs)]``). Each entry is the word's
    count in that document divided by the count of the document's most
    frequent word. The raw matrix is then passed through the weighting
    function registered under ``methods`` in ``tf_methods``.

    Args:
        methods: key into ``tf_methods`` selecting the weighting function.

    Returns:
        Whatever the selected weighting function returns for the raw matrix.

    Raises:
        ValueError: if ``methods`` is not a key of ``tf_methods``.
    """
    # Validate first so a bad method name fails before any work is done.
    weight_tf = tf_methods.get(methods)
    if weight_tf is None:
        raise ValueError(
            "unknown tf method {!r}; expected one of {}".format(
                methods, sorted(tf_methods)
            )
        )

    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)
    for doc_idx, words in enumerate(docs_words):
        counter = Counter(words)
        if not counter:
            # A document without tokens contributes an all-zero column.
            continue
        # The normaliser is the same for every word of this document,
        # so compute it once instead of once per word.
        max_count = max(counter.values())
        for word, count in counter.items():
            _tf[v2i[word], doc_idx] = count / max_count
    return weight_tf(_tf)
def get_idf(methods="log"):
    """Build the inverse-document-frequency column vector for the corpus.

    First counts, for every word in ``i2v``, how many documents of
    ``docs_words`` contain it (the document frequency). That ``[len(i2v), 1]``
    column is then passed through the weighting function registered under
    ``methods`` in ``idf_methods``.

    Args:
        methods: key into ``idf_methods`` selecting the weighting function.

    Returns:
        Whatever the selected weighting function returns for the
        document-frequency column.

    Raises:
        ValueError: if ``methods`` is not a key of ``idf_methods``.
    """
    idf_fn = idf_methods.get(methods)
    if idf_fn is None:
        raise ValueError(
            "unknown idf method {!r}; expected one of {}".format(
                methods, sorted(idf_methods)
            )
        )

    # One set per document turns each membership test into a hash lookup
    # instead of a scan over the token list.
    doc_word_sets = [set(words) for words in docs_words]

    df = np.zeros((len(i2v), 1))
    # i2v maps row index -> word, with indices 0 .. len(i2v) - 1.
    for idx, word in i2v.items():
        df[idx, 0] = sum(word in words for words in doc_word_sets)
    return idf_fn(df)
def cosine_similarity(q, _tf_idf):
    """Cosine similarity between a query column and every document column.

    Both inputs are normalised to unit length along axis 0, so the dot
    product of the unit vectors is the cosine of the angle between them.

    Args:
        q: query vector as a column, shape ``[n_words, 1]``.
        _tf_idf: matrix with one column per document, shape
            ``[n_words, n_docs]``.

    Returns:
        1-D array with one similarity per document column. A zero-length
        query or document column yields 0 rather than NaN.
    """
    q_norm = np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    docs_norm = np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    # A zero vector has no direction. Dividing it by 1 leaves it all-zero,
    # so its similarity becomes 0 instead of 0/0 = NaN.
    unit_q = q / np.where(q_norm == 0, 1.0, q_norm)
    unit_ds = _tf_idf / np.where(docs_norm == 0, 1.0, docs_norm)
    # With unit vectors the cosine denominator is 1, so only the dot
    # product remains; ravel flattens the [n_docs, 1] result to 1-D.
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity
def docs_score(q, len_norm=False):
    """Score every document against the query string ``q``.

    The query is tokenised the same way as the corpus (commas removed, split
    on spaces), turned into a count vector weighted by ``idf``, and compared
    with every column of ``tf_idf`` through ``cosine_similarity``.

    Words that are not in the corpus vocabulary get extra rows for this call
    only: the global ``v2i`` / ``i2v`` tables are not modified, so repeated
    queries with the same unknown word keep working. Unknown words receive an
    idf of 0 and therefore do not influence the query vector.

    Args:
        q: query sentence.
        len_norm: if true, divide each score by the number of tokens in the
            corresponding document.

    Returns:
        1-D array with one score per document.
    """
    q_words = q.replace(",", "").split(" ")

    # Query-local copy of the word index, so unknown words can be numbered
    # without leaking into the shared vocabulary tables.
    local_v2i = dict(v2i)
    for word in set(q_words):
        if word not in local_v2i:
            local_v2i[word] = len(local_v2i)

    # Pad idf and tf_idf with zero rows for the words the corpus never saw.
    pad = len(local_v2i) - idf.shape[0]
    if pad > 0:
        _idf = np.concatenate(
            (idf, np.zeros((pad, 1), dtype=np.float64)), axis=0
        )
        _tf_idf = np.concatenate(
            (tf_idf, np.zeros((pad, tf_idf.shape[1]), dtype=np.float64)), axis=0
        )
    else:
        _idf, _tf_idf = idf, tf_idf

    # Raw term counts of the query, as a column aligned with _idf.
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)
    for word, count in Counter(q_words).items():
        q_tf[local_v2i[word], 0] = count

    q_vec = q_tf * _idf
    q_score = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        # Normalise by document length (number of tokens per document).
        len_docs = [len(d) for d in docs_words]
        q_score = q_score / np.array(len_docs)
    return q_score
def get_keywords(n=2, n_docs=3):
    """Print the ``n`` highest-weighted words of the first documents.

    Args:
        n: number of keywords to print per document. Values <= 0 print an
            empty list.
        n_docs: how many documents (columns of ``tf_idf``) to report,
            starting from the first. Clamped to the number of columns, so a
            corpus with fewer documents does not raise.

    Returns:
        None. One line per document is written to standard output.
    """
    for c in range(min(n_docs, tf_idf.shape[1])):
        # TF-IDF vector of document c.
        col = tf_idf[:, c]
        # argsort is ascending, so the last n indices are the n largest
        # weights. Guard n <= 0 because [-0:] would select every word.
        idx = np.argsort(col)[-n:] if n > 0 else []
        print("doc {}, top {} keywords {}".format(c + 1, n, [i2v[i] for i in idx]))
# Build the corpus matrices once; docs_score and get_keywords read
# idf / tf_idf as module-level names.
tf = get_tf()
idf = get_idf()
# Element-wise product: the [n_words, 1] idf column is broadcast across
# every document column of tf.
tf_idf = tf * idf

# To print the top keywords of the first documents, call get_keywords(2).

# Rank all documents against a sample query and show the three best matches.
q = "I get a coffee cup"
scores = docs_score(q)
# Indices sorted by ascending score, reversed, then cut to the best three.
d_ids = np.argsort(scores)[::-1][:3]
print(
    "\ntop 3 docs for '{}':\n{}".format(
        q,
        [docs[i] for i in d_ids],
    )
)
效果:
top 3 docs for 'I get a coffee cup':
['I have a party today', 'I like coffee, I like book and I like apple', 'It is coffee time, bring your cup']