'''
Description: Search extension - a simplified TF-IDF implementation using sklearn
Author: 365JHWZGo
Date: 2021-11-18 14:39:15
LastEditors: 365JHWZGo
LastEditTime: 2021-11-18 14:59:20
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Toy corpus: every string is one document that can be indexed and searched.
docs = [
    "it is a good day, I like to stay here",
    "I am happy to be here",
    "I am bob",
    "it is sunny today",
    "I have a party today",
    "it is a dog and that is a cat",
    "there are dog and cat on the tree",
    "I study hard this morning",
    "today is a good day",
    "tomorrow will be a good day",
    "I like coffee, I like book and I like apple",
    "I do not like it",
    "I am kitty, I like bob",
    "I do not care who like bob, but I like kitty",
    "It is coffee time, bring your cup",
]
# Fit TF-IDF on the corpus; each row of `tf_idf` is one document's vector.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)

# Project the query into the vocabulary learned from the corpus.
question = "what is the weather today"
qtf_idf = vectorizer.transform([question])

# cosine_similarity returns an ndarray of shape (n_docs, 1); flatten it so
# that scores[i] is the similarity between docs[i] and the question.
scores = cosine_similarity(tf_idf, qtf_idf).ravel()

# Rank best-first. Sorting the negated scores with a stable sort makes ties
# deterministic: equally scored documents keep their corpus order.
top_k = 3
ranked = (-scores).argsort(kind="stable")

# Drop documents with zero similarity so that a question sharing no terms
# with the corpus does not report arbitrary, unrelated documents.
top_docs = [docs[i] for i in ranked[:top_k] if scores[i] > 0]
print(f'\ntop {top_k} docs for "{question}":\n{top_docs}')
# Output:
# top 3 docs for "what is the weather today":
# ['today is a good day', 'it is sunny today', 'there are dog and cat on the tree']