实现步骤
1. 对语料建立 word2index 倒排索引(词 → 包含该词的句子 index 集合)
2. 构建形如 {"word": {"tfidf": float, "b2index": set}} 的搜索模型
3. 查询:计算哪个 index 对应的 tfidf 之和最大,并以此实现搜索功能
实现代码
封装成搜索类
logging.basicConfig(format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.INFO) class TFIDFSearch: def __init__(self, corpus): self.corpus = corpus # 待检索语料 # idf计算 self.idf_fuc = lambda word, count_lst: math.log(len(count_lst)) / ( 1 + sum(1 for count in count_lst if word in count)) def word2index(self): """实现倒排索引""" count_lst = [] # Counter结果 index_dic = {} # 倒排索引结果 tf_dic = {} # tf结果 for index, sent in enumerate(self.corpus): words = seg.segment(sent) word_coun