TF:
words 在单篇文章中出现的频次(词频)。单篇文章中某一个词出现频率很高的话,它在这篇文档中的重要性就比较高。
IDF:
IDF(逆文档频率)衡量 words 在所有文档中的普遍程度:包含某个词的文档越多,这个词的 IDF 就越低。IDF 把语气词等在所有文档中都可能出现的词的重要性通通降低,从而突出某篇文档中真正的关键词。
注释1:
将列表 docs 中每一个元素 d 里的 "," 替换为 ""(即删除逗号),然后用 " " 对每一个元素进行拆分,每个元素拆分出的单词组成一个新的子列表。
docs_words = [d.replace(",","").split(" ") for d in docs]
注释2:
chain来源于包 import itertools
先用星号表达式对 docs_words 进行解包(unpacking):把其中的每个子列表作为一个单独的参数传入(这与 C++ 中的指针运算符并不是一回事),然后使用 itertools 的 chain 方法把这些子列表拼接到一块,最后使用 set 进行去重,得到一个集合。
vocab = set(itertools.chain(*docs_words))
注释3:
组成一个字典
v2i = {v: i for i, v in enumerate(vocab)}
i2v = {i: v for v, i in v2i.items()}
注释4:
tf_methods.get("augmented")(np.array([[1,2,3]]))
array([[0.66666667, 0.83333333, 1. ]])
注释5:
构建列表的一种方法: [i2v[i] for i in idx]
def get_keywords(n=2):
    for c in range(3):
        col = tf_idf[:, c]
        idx = np.argsort(col)[-n:]
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))
注释6:
ravel() :将多维数组转换为一维数组。
注释7:Counter可以对 列表/元组 元素出现的次数进行统计,返回字典,key是元素,value是该元素出现的次数
from collections import Counter
a=[1,2,3,4,1,1,1,]
Python 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.3.0 -- An enhanced Interactive Python. Type '?' for help.
PyDev console: using IPython 8.3.0
Counter(a)
Out[2]: Counter({1: 4, 2: 1, 3: 1, 4: 1})
a=(1,2,3,4,1,1,1,)
Counter(a)
Out[4]: Counter({1: 4, 2: 1, 3: 1, 4: 1})
注释8: .T 数组转置 && .dot() 矩阵运算
def cosine_similarity(q, _tf_idf):
    unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity
import numpy as np
from collections import Counter
import itertools
from visual import show_tfidf # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)
# Toy corpus: every entry is one short document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Tokenise each document: drop commas, then split on single spaces.
docs_words = [sentence.replace(",", "").split(" ") for sentence in docs]

# Every distinct token seen anywhere in the corpus (case-sensitive).
vocab = set(itertools.chain.from_iterable(docs_words))

# Two-way lookup between a token and its row index in the tf / idf matrices.
# The index order follows the iteration order of the `vocab` set.
v2i = dict(zip(vocab, itertools.count()))
i2v = dict(zip(v2i.values(), v2i))
def safe_log(x):
    """Return the element-wise natural log of ``x``, leaving zeros as zero.

    The input is never modified: the result is written into a fresh float64
    array. (The previous version overwrote ``x`` in place, which corrupted the
    caller's matrix and silently truncated the logs for integer arrays.)

    :param x: array-like of numbers.
    :return: float64 ndarray with ``log(x)`` where ``x != 0`` and ``0``
        elsewhere.
    """
    x = np.asarray(x, dtype=np.float64)
    # `where` skips the zero entries, so they keep the 0 from the `out` buffer
    # and no log(0) is ever evaluated.
    return np.log(x, out=np.zeros_like(x), where=x != 0)


# Term-frequency weighting schemes. Each callable takes a 2-D array of raw
# term frequencies and returns the weighted frequencies.
tf_methods = {
    "log": lambda x: np.log(1+x),
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    "boolean": lambda x: np.minimum(x, 1),
    # Relies on safe_log NOT mutating x: the mean in the denominator must be
    # taken over the original frequencies, not over their logs.
    "log_avg": lambda x: (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True))),
}
def _idf_log(x):
    """Smoothed logarithmic idf: 1 + log(n_docs / (df + 1))."""
    return 1 + np.log(len(docs) / (x+1))


def _idf_prob(x):
    """Probabilistic idf, clipped below at zero."""
    odds = (len(docs) - x) / (x+1)
    return np.maximum(0, np.log(odds))


def _idf_len_norm(x):
    """Scale the counts by one plus the sum of their squares."""
    denominator = np.sum(np.square(x))+1
    return x / denominator


# Inverse-document-frequency schemes. Each callable takes the array of
# document frequencies and returns an array of the same shape.
idf_methods = {
    "log": _idf_log,
    "prob": _idf_prob,
    "len_norm": _idf_len_norm,
}
def get_tf(method="log"):
    """Build the weighted term-frequency matrix for the corpus.

    term frequency: how frequent a word appears in a doc. The raw value for a
    word is its count in the document divided by the count of that document's
    most frequent word; the selected scheme from ``tf_methods`` is then applied
    to the whole matrix.

    :param method: key into ``tf_methods``.
    :return: array of shape [n_vocab, n_doc].
    :raises ValueError: if ``method`` is not a key of ``tf_methods``.
    """
    # Validate first so a typo fails before any work is done.
    weighted_tf = tf_methods.get(method)
    if weighted_tf is None:
        raise ValueError(
            "unknown tf method {!r}; expected one of {}".format(method, sorted(tf_methods))
        )

    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)    # [n_vocab, n_doc]
    for i, d in enumerate(docs_words):
        counter = Counter(d)
        if not counter:
            # A document with no tokens contributes an all-zero column.
            continue
        # Loop-invariant: the highest count in this document.
        max_count = max(counter.values())
        for v, count in counter.items():
            _tf[v2i[v], i] = count / max_count
    return weighted_tf(_tf)
def get_idf(method="log"):
    """Compute the inverse document frequency of every indexed word.

    inverse document frequency: low idf for a word appears in more docs, mean
    less important. The document frequency of a word is the number of documents
    in ``docs_words`` that contain it at least once; the selected scheme from
    ``idf_methods`` converts those counts to idf values.

    :param method: key into ``idf_methods``.
    :return: array of shape [len(i2v), 1].
    :raises ValueError: if ``method`` is not a key of ``idf_methods``.
    """
    # Validate first so a typo fails before any work is done.
    idf_fn = idf_methods.get(method)
    if idf_fn is None:
        raise ValueError(
            "unknown idf method {!r}; expected one of {}".format(method, sorted(idf_methods))
        )

    # Build one set per document once, so each membership test below is O(1)
    # instead of a scan through the document's word list.
    doc_sets = [set(d) for d in docs_words]
    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        word = i2v[i]
        df[i, 0] = sum(word in words for words in doc_sets)
    return idf_fn(df)
def _unit_columns(m):
    """Scale every column of ``m`` to unit L2 norm; all-zero columns stay zero."""
    norms = np.sqrt(np.sum(np.square(m), axis=0, keepdims=True))
    # Dividing a zero column by 1 keeps it zero instead of producing NaN.
    return m / np.where(norms == 0, 1.0, norms)


def cosine_similarity(q, _tf_idf):
    """Cosine similarity between a query vector and every document column.

    :param q: query vector, shape [n_vocab, 1].
    :param _tf_idf: document matrix, shape [n_vocab, n_doc].
    :return: 1-D array of length n_doc. A zero query or a zero document
        column gives similarity 0 rather than NaN.
    """
    unit_q = _unit_columns(q)
    unit_ds = _unit_columns(_tf_idf)
    # [n_doc, n_vocab] . [n_vocab, 1] -> [n_doc, 1], flattened to [n_doc]
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity
def docs_score(q, len_norm=False):
    """Score every document in the corpus against a free-text query.

    The query is tokenised the same way as the corpus (commas removed, split
    on single spaces). Words missing from the vocabulary are appended to the
    module-level ``v2i`` / ``i2v`` maps and given an idf of 0.

    :param q: query string.
    :param len_norm: if true, divide each score by the number of tokens in
        the corresponding document.
    :return: 1-D array with one score per document, as returned by
        ``cosine_similarity``.
    """
    q_words = q.replace(",", "").split(" ")

    # add unknown words (in first-occurrence order, so indices are repeatable)
    for v in dict.fromkeys(q_words):
        if v not in v2i:
            v2i[v] = len(v2i)
            i2v[v2i[v]] = v

    # Pad up to the CURRENT vocabulary size, not just by the words added in
    # this call: words added by an earlier call are already in v2i, and without
    # padding for them the lookup below would index past the end of q_tf.
    n_extra = len(v2i) - idf.shape[0]
    if n_extra > 0:
        # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
        _idf = np.concatenate((idf, np.zeros((n_extra, 1), dtype=np.float64)), axis=0)
        _tf_idf = np.concatenate(
            (tf_idf, np.zeros((n_extra, tf_idf.shape[1]), dtype=np.float64)), axis=0
        )
    else:
        _idf, _tf_idf = idf, tf_idf

    counter = Counter(q_words)
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)     # [n_vocab, 1]
    for v, count in counter.items():
        q_tf[v2i[v], 0] = count
    q_vec = q_tf * _idf            # [n_vocab, 1]

    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores
def get_keywords(n=2, n_docs=3):
    """Print the ``n`` highest-scoring tf-idf words of the first documents.

    Words are listed in ascending score order, so the best keyword is last.

    :param n: number of keywords to print per document.
    :param n_docs: how many leading documents to report. Defaults to 3 and is
        capped at the number of columns in ``tf_idf``, so a corpus with fewer
        documents no longer raises IndexError.
    """
    for c in range(min(n_docs, tf_idf.shape[1])):
        col = tf_idf[:, c]
        # argsort is ascending, so the last n indices are the n largest scores.
        idx = np.argsort(col)[-n:]
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))
# Build the corpus matrices with the default ("log") weighting schemes.
# These three names are module globals read by docs_score() and get_keywords(),
# so they must be bound before either function is called.
tf = get_tf() # [n_vocab, n_doc]
idf = get_idf() # [n_vocab, 1]
# The single idf column is broadcast across every document column.
tf_idf = tf * idf # [n_vocab, n_doc]
# Print the shape and the first two vocabulary rows of each matrix.
print("tf shape(vecb in each docs): ", tf.shape)
print("\ntf samples:\n", tf[:2])
print("\nidf shape(vecb in all docs): ", idf.shape)
print("\nidf samples:\n", idf[:2])
print("\ntf_idf shape: ", tf_idf.shape)
print("\ntf_idf sample:\n", tf_idf[:2])
# test
# Print the top keywords of the first documents.
get_keywords()
# Score every document against a sample query.
q = "I get a coffee cup"
scores = docs_score(q)
# argsort is ascending: take the last three indices and reverse them so the
# best-matching document comes first.
d_ids = scores.argsort()[-3:][::-1]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
# Plot the matrix transposed to [n_doc, n_vocab], labelling the columns with
# the words for the rows of tf_idf.
show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix")
show_tfidf()(来自 visual.py):
def show_tfidf(tfidf, vocab, filename):
    """Draw a tf-idf matrix as a heat map, save it as a PNG and display it.

    :param tfidf: 2-D array laid out as [n_doc, n_vocab].
    :param vocab: labels for the x axis, one per column of ``tfidf``.
    :param filename: base name (without extension) of the saved image.
    """
    # [n_doc, n_vocab]
    lowest, highest = tfidf.min(), tfidf.max()
    plt.imshow(tfidf, cmap="YlGn", vmin=lowest, vmax=highest)

    # Columns are labelled with the words, rows with 1-based document numbers.
    column_positions = np.arange(tfidf.shape[1])
    plt.xticks(column_positions, vocab, fontsize=6, rotation=90)
    row_positions = np.arange(tfidf.shape[0])
    row_labels = np.arange(1, tfidf.shape[0]+1)
    plt.yticks(row_positions, row_labels, fontsize=6)
    plt.tight_layout()

    # creating the output folder
    output_folder = './visual/results/'
    os.makedirs(output_folder, exist_ok=True)
    image_path = os.path.join(output_folder, '%s.png') % filename
    plt.savefig(image_path, format="png", dpi=500)
    plt.show()
Result:
tf shape(vecb in each docs): (47, 15)
tf samples:
[[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0.69314718]
[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0.69314718]]
idf shape(vecb in all docs): (47, 1)
idf samples:
[[3.01490302]
[3.01490302]]
tf_idf shape: (47, 15)
tf_idf sample:
[[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 2.08977153]
[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 2.08977153]]
doc0, top2 keywords ['here', 'stay']
doc1, top2 keywords ['here', 'happy']
doc2, top2 keywords ['am', 'bob']
top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I have a party today']