搜索引擎
TF-IDF
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from numpy import argsort

# Load the searchable collection: one address per whitespace-separated token.
with open('中国行政区划.txt', encoding='utf-8') as f:
    texts = f.read().split()

# Build the retrieval pipeline: character-level bag-of-words -> TF-IDF -> sparse similarity index.
corpora = [list(text) for text in texts]           # split each address into single characters
dictionary = Dictionary(corpora)                   # character -> integer id
num_features = len(dictionary.token2id)
corpora = [dictionary.doc2bow(c) for c in corpora]
tfidf = TfidfModel(corpora)
index = SparseMatrixSimilarity(tfidf[corpora], num_features)


def search(kw, topn=5):
    """Return the `topn` addresses most similar to query string `kw` (TF-IDF cosine)."""
    kw_vec = dictionary.doc2bow(list(kw))
    sims = index[tfidf[kw_vec]]
    # argsort of the negated scores = indices in descending similarity order
    return [texts[i] for i in argsort(-sims)[:topn]]


if __name__ == '__main__':
    # Interactive loop: print the 5 best matches for each query.
    while True:
        for t in search(input('输入:').strip()):
            print(t)
效果
Word2Vec
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
PATH = '中国行政区划.txt'
size = 100  # word-vector dimensionality

# Load the searchable collection: one address per whitespace-separated token.
with open(PATH, encoding='utf-8') as f:
    texts = f.read().split()

# Train character-level vectors.
# min_count=1 keeps rare characters; the gensim default (5) would silently
# drop them so they fall back to the zero vector below — the companion
# Word2Vec+TF-IDF script sets min_count=1 for exactly this reason.
# NOTE(review): positional `size` matches gensim<4; on gensim>=4 use
# Word2Vec(corpora, vector_size=size, min_count=1) and wv.index_to_key.
corpora = [list(text) for text in texts]
model = Word2Vec(corpora, size, min_count=1)

# Map each character to an id starting at 1; row 0 is an all-zero vector
# reserved for out-of-vocabulary characters.
w2i = {w: i for i, w in enumerate(model.wv.index2word, 1)}
vectors = np.concatenate((np.zeros((1, size)), model.wv.vectors), axis=0)

w2v = lambda w: vectors[w2i.get(w, 0)]  # word -> vector (zero vector when unseen)
s2v = lambda s: np.sum([w2v(w) for w in s], axis=0)  # sentence -> vector (sum pooling)
# s2v = lambda s: np.mean([w2v(w) for w in s], axis=0)  # alternative: mean pooling

# Vectorize the whole collection once, up front.
corpora = np.array([s2v(words) for words in corpora])

if __name__ == '__main__':
    # Interactive loop: print the 5 nearest addresses by cosine similarity.
    while True:
        kw = input('输入:').strip()
        sims = cosine_similarity(corpora, [s2v(kw)]).reshape(-1)
        for i in np.argsort(-sims)[:5]:
            print(texts[i])
效果
Word2Vec+TF-IDF
import numpy as np
from gensim.models import TfidfModel, Word2Vec
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
PATH = '中国行政区划.txt'
size = 100
min_count = 1

# Data preparation: one address per whitespace-separated token,
# each address split into its individual characters.
with open(PATH, encoding='utf-8') as f:
    texts = f.read().split()
corpora = [list(text) for text in texts]

# Character-granularity Word2Vec model.
word2vec = Word2Vec(corpora, size, min_count=min_count)

# Characters get integer ids starting at 1; id 0 is reserved for
# out-of-vocabulary characters and maps to an all-zero vector.
w2i = {w: i for i, w in enumerate(word2vec.wv.index2word, 1)}
vectors = np.concatenate((np.zeros((1, size)), word2vec.wv.vectors), axis=0)

# TF-IDF model over (char_id, count) pairs; every character vector is then
# scaled by its IDF weight (ids absent from the model get weight 0).
tfidf = TfidfModel([Counter(w2i[w] for w in words).most_common() for words in corpora])
idfs = np.array([[tfidf.idfs.get(i, 0)] for i in range(len(w2i) + 1)])
vectors = vectors * idfs


def w2v(w):
    """Character -> IDF-weighted vector (zero vector when unseen)."""
    return vectors[w2i.get(w, 0)]


def s2v(s):
    """Sentence -> vector: mean of its characters' vectors."""
    return np.mean([w2v(w) for w in s], axis=0)
    # alternative pooling: np.sum([w2v(w) for w in s], axis=0)


# Vectorize the searchable collection once, up front.
corpora = np.array([s2v(words) for words in corpora])

# Interactive search: show the five nearest addresses by cosine similarity.
while True:
    kw = input('输入:').strip()
    scores = cosine_similarity(corpora, [s2v(kw)]).reshape(-1)
    for idx in np.argsort(-scores)[:5]:  # descending-similarity order, top 5
        print(texts[idx])
效果
倒排索引
创建倒排索引,优化检索效率
"""被检索文档"""
docs = {
'a', 'aa', 'aaa', 'aab', 'aac', 'ab', 'aba', 'abb', 'abc', 'ac', 'aca', 'acb', 'acc',
'b', 'ba', 'baa', 'bab', 'bac', 'bb', 'bba', 'bbb', 'bbc', 'bc', 'bca', 'bcb', 'bcc',
'c', 'ca', 'caa', 'cab', 'cac', 'cb', 'cba', 'cbb', 'cbc', 'cc', 'cca', 'ccb', 'ccc'
}
"""倒排索引"""
inverted = {w: {i for i in docs if w in i} for w in 'abc'}
# inverted = {
# 'a': {'a', 'aa', 'aaa', 'aab', 'aac', 'ab', 'aba', 'abb', 'abc', 'ac', 'aca', 'acb', 'acc',
# 'ba', 'baa', 'bab', 'bac', 'bba', 'bca', 'ca', 'caa', 'cab', 'cac', 'cba', 'cca'},
# 'b': {'aab', 'ab', 'aba', 'abb', 'abc', 'acb', 'b', 'ba', 'baa', 'bab', 'bac', 'bb', 'bba',
# 'bbb', 'bbc', 'bc', 'bca', 'bcb', 'bcc', 'cab', 'cb', 'cba', 'cbb', 'cbc', 'ccb'},
# 'c': {'aac', 'abc', 'ac', 'aca', 'acb', 'acc', 'bac', 'bbc', 'bc', 'bca', 'bcb', 'bcc',
# 'c', 'ca', 'caa', 'cab', 'cac', 'cb', 'cba', 'cbb', 'cbc', 'cc', 'cca', 'ccb', 'ccc'}
# }
def search(kw):
s = set(docs) # 全集
for w in set(kw):
s &= inverted[w] # 求交集
return s
while True:
kw = input('输入任意abc组合:').strip()
print(search(kw))
时间复杂度比较
from time import time


def timer(f, kw, times=99999):
    """Run f(kw) `times` times and print the elapsed seconds plus f's docstring."""
    t = time()
    for _ in range(times):
        f(kw)
    # f.__doc__ is printed as the label, so the search functions' docstrings
    # are runtime data and must stay as written.
    print('%.2f秒' % (time() - t), f.__doc__)


# Documents to be searched: every non-empty string of length 1-3 over {a, b, c}.
dt = {0: 'a', 1: 'b', 2: 'c'}
docs = set()
for i in range(3):
    docs.add(dt[i])
    for j in range(3):
        docs.add(dt[i] + dt[j])
        for k in range(3):
            docs.add(dt[i] + dt[j] + dt[k])
print(len(docs), docs)  # 39 documents: 3 + 9 + 27

# Inverted index: letter -> set of documents containing that letter.
inverted = {w: {i for i in docs if w in i} for w in 'abc'}
print(inverted)


def search1(kw):
    """直接检索"""
    # Linear scan: keep any document containing at least one query letter (OR semantics).
    s = set()
    kw = set(kw)
    for doc in docs:
        for w in kw:
            if w in doc:
                s.add(doc)
                break  # one match is enough; skip the remaining letters
    return s


def search2(kw):
    """使用倒排索引来检索"""
    # Union of the posting sets — same OR semantics, one set op per letter.
    s = set()
    for w in set(kw):
        s |= inverted[w]
    return s


if __name__ == '__main__':
    # Guarded so the module can be imported (and tested) without blocking on input().
    while True:
        kw = input('输入任意abc组合:').strip()
        timer(search1, kw)
        timer(search2, kw)
图解
附录
相关基础知识:TF-IDF文本相似度的gensim实现
- https://blog.csdn.net/Yellow_python/article/details/81021142
语料下载地址:
- https://download.csdn.net/download/yellow_python/11064982