tf-idf

tf-idf

案例:用 TF-IDF 找出与 "I get a coffee cup" 相似度最高的 3 句话
莫烦python

环境:python3.6

import numpy as np
from collections import Counter
import itertools
from visual import show_tfidf   # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)

# Toy corpus: every string is one document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

# Tokenise each document: strip commas, then split on single spaces.
# Case is preserved, so "It" and "it" are distinct tokens.
docs_words = [sentence.replace(",", "").split(" ") for sentence in docs]

# Vocabulary: the distinct tokens over the whole corpus.
vocab = set(itertools.chain.from_iterable(docs_words))

# word -> integer id, numbered in the iteration order of the vocabulary set.
v2i = {word: idx for idx, word in enumerate(vocab)}

# Inverse lookup: integer id -> word.
i2v = {idx: word for word, idx in v2i.items()}
print()

def safe_log(x):
    """Return the element-wise natural log of ``x``, keeping zeros as zero.

    The input is never modified: the result is a fresh float64 array.
    Working on a copy matters for callers that use ``x`` again after the
    call (e.g. computing a mean of the same array), and it avoids the
    truncation that happens when log values are written back into an
    integer array.

    Args:
        x: array-like of numbers.

    Returns:
        A new ``numpy.ndarray`` of dtype float64 with the same shape as
        ``x``; entries that were 0 stay 0, all others are ``log(entry)``.
    """
    result = np.array(x, dtype=np.float64)  # np.array copies by default
    nonzero = result != 0
    result[nonzero] = np.log(result[nonzero])
    return result


# --- term-frequency weighting schemes -------------------------------------
# Each function receives the raw tf matrix and returns a re-weighted matrix.

def _tf_log(x):
    """Logarithmic damping: log(1 + tf)."""
    return np.log(1+x)


def _tf_augmented(x):
    """Scale each row by its maximum, then map into the range [0.5, 1]."""
    return 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True)


def _tf_boolean(x):
    """Cap every entry at 1."""
    return np.minimum(x, 1)


def _tf_log_avg(x):
    """(1 + log tf) divided by (1 + log of the row-wise mean tf)."""
    return (1 + safe_log(x)) / (1 + safe_log(np.mean(x, axis=1, keepdims=True)))


tf_methods = {
    "log": _tf_log,
    "augmented": _tf_augmented,
    "boolean": _tf_boolean,
    "log_avg": _tf_log_avg,
}


# --- inverse-document-frequency weighting schemes -------------------------
# Each function receives the document-frequency array and returns idf values.

def _idf_log(x):
    """1 + log(N / (df + 1)), with N the number of documents."""
    return 1 + np.log(len(docs) / (x+1))


def _idf_prob(x):
    """log((N - df) / (df + 1)), clipped below at 0."""
    return np.maximum(0, np.log((len(docs) - x) / (x+1)))


def _idf_len_norm(x):
    """df divided by (sum of squared df + 1)."""
    return x / (np.sum(np.square(x))+1)


idf_methods = {
    "log": _idf_log,
    "prob": _idf_prob,
    "len_norm": _idf_len_norm,
}


def get_tf(method="log"):
    """Build the weighted term-frequency matrix for the corpus.

    The raw tf of a word in a document is its count divided by the count of
    the most frequent word in that same document; the matrix of raw values
    is then passed through the weighting function selected by ``method``.

    Args:
        method: key into ``tf_methods`` naming the weighting scheme.

    Returns:
        numpy.ndarray of shape [n_vocab, n_doc].

    Raises:
        ValueError: if ``method`` is not a key of ``tf_methods``.
    """
    # Validate first so a bad method name fails before any work is done.
    weighted_tf = tf_methods.get(method)
    if weighted_tf is None:
        raise ValueError(
            "unknown tf method {!r}; expected one of {}".format(method, sorted(tf_methods))
        )

    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)    # [n_vocab, n_doc]
    for doc_idx, words in enumerate(docs_words):
        counter = Counter(words)
        if not counter:
            continue  # no tokens: the column stays all zero
        # Count of the most frequent word; invariant for the whole document.
        max_count = max(counter.values())
        for word, count in counter.items():
            _tf[v2i[word], doc_idx] = count / max_count

    return weighted_tf(_tf)


def get_idf(method="log"):
    """Build the inverse-document-frequency column vector for the vocabulary.

    For every word id the document frequency (number of documents that
    contain the word at least once) is counted, and the resulting column is
    passed through the weighting function selected by ``method``.

    Args:
        method: key into ``idf_methods`` naming the weighting scheme.

    Returns:
        numpy.ndarray of shape [len(i2v), 1].

    Raises:
        ValueError: if ``method`` is not a key of ``idf_methods``.
    """
    # Validate first so a bad method name fails before any work is done.
    idf_fn = idf_methods.get(method)
    if idf_fn is None:
        raise ValueError(
            "unknown idf method {!r}; expected one of {}".format(method, sorted(idf_methods))
        )

    # One set per document makes each membership test O(1) instead of a
    # linear scan over the token list.
    doc_word_sets = [set(words) for words in docs_words]

    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        word = i2v[i]
        df[i, 0] = sum(word in words for words in doc_word_sets)

    return idf_fn(df)


def cosine_similarity(q, _tf_idf):
    """Cosine similarity between a query vector and every document column.

    Args:
        q: query vector of shape [n_vocab, 1].
        _tf_idf: document matrix of shape [n_vocab, n_doc]; one column per
            document.

    Returns:
        1-D numpy.ndarray of length n_doc with the cosine similarity of each
        document to the query. A zero-length query or document vector yields
        a similarity of 0 rather than NaN.
    """
    q = np.asarray(q, dtype=np.float64)
    _tf_idf = np.asarray(_tf_idf, dtype=np.float64)

    q_norm = np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    d_norm = np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))

    # A zero norm means the vector itself is all zeros; dividing by 1 keeps
    # it at zero and avoids the 0/0 -> NaN that would poison the ranking.
    unit_q = q / np.where(q_norm == 0, 1.0, q_norm)
    unit_ds = _tf_idf / np.where(d_norm == 0, 1.0, d_norm)

    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity


def docs_score(q, len_norm=False):
    """Score every document against the query string ``q``.

    The query is tokenised the same way as the corpus (commas removed, split
    on single spaces). Words that are not in the vocabulary yet are appended
    to the module-level ``v2i`` / ``i2v`` mappings; they get an idf of 0 and
    all-zero rows in the document matrix, so they do not contribute to the
    score.

    Args:
        q: the query sentence.
        len_norm: when True, divide each score by the token count of the
            corresponding document.

    Returns:
        1-D numpy.ndarray with one score per document.
    """
    q_words = q.replace(",", "").split(" ")

    # Register words the vocabulary has not seen yet.
    for v in set(q_words):
        if v not in v2i:
            new_id = len(v2i)
            v2i[v] = new_id
            i2v[new_id] = v

    # Pad by the size difference rather than by "words added during this
    # call": ids registered by an earlier query are already in v2i but still
    # lie past the end of idf / tf_idf and would otherwise index out of range.
    pad = len(v2i) - idf.shape[0]
    if pad > 0:
        _idf = np.concatenate((idf, np.zeros((pad, 1), dtype=np.float64)), axis=0)
        _tf_idf = np.concatenate(
            (tf_idf, np.zeros((pad, tf_idf.shape[1]), dtype=np.float64)), axis=0
        )
    else:
        _idf, _tf_idf = idf, tf_idf

    # Raw term counts of the query.
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)  # [n_vocab, 1]
    for v, count in Counter(q_words).items():
        q_tf[v2i[v], 0] = count

    q_vec = q_tf * _idf            # [n_vocab, 1] tf-idf of the query

    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores


def get_keywords(n=2, n_docs=3):
    """Print the ``n`` highest tf-idf words of the first ``n_docs`` documents.

    Args:
        n: number of keywords to show per document.
        n_docs: number of leading documents to report; clamped to the number
            of columns in ``tf_idf``. Defaults to 3.

    Keywords are listed in ascending order of weight (strongest last).
    """
    for c in range(min(n_docs, tf_idf.shape[1])):
        col = tf_idf[:, c]
        idx = np.argsort(col)[-n:]  # ids of the n largest weights
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))


# Build the corpus matrices once; docs_score() and get_keywords() read the
# module-level ``idf`` and ``tf_idf`` defined here.
tf = get_tf()           # [n_vocab, n_doc]
idf = get_idf()         # [n_vocab, 1]
tf_idf = tf * idf       # [n_vocab, n_doc], idf broadcast across the columns

# To inspect the intermediate matrices, print tf.shape / idf.shape /
# tf_idf.shape, or call get_keywords().

# Demo: rank the documents against a query and show the three best matches.
q = "I get a coffee cup"
scores = docs_score(q)
d_ids = np.argsort(scores)[::-1][:3]    # best-scoring document first
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))

# Optional heat-map of the matrix (needs visual.py from the tutorial repo):
# show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix")

使用 scikit-learn(稀疏矩阵)的简化实现

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from visual import show_tfidf   # this refers to visual.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)


# Same toy corpus as the hand-written version: one string per document.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)  # sparse matrix, one row per document

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old versions that do not provide it yet.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    feature_names = vectorizer.get_feature_names()
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, feature_names)])
print("v2i: ", vectorizer.vocabulary_)


q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])  # sparse matrix with a single row
res = cosine_similarity(tf_idf, qtf_idf)  # accepts the sparse matrices directly
res = res.ravel().argsort()[-3:]  # ids of the three best documents, ascending
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))


# i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
# dense_tfidf = tf_idf.todense()
# show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix")

输出

top 3 docs for 'I get a coffee cup':
['It is coffee time, bring your cup', 'I like coffee, I like book and I like apple', 'I have a party today']
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值