Python学习打卡--day33（练习：优化搜索引擎）

最新推荐文章于 2021-05-31 18:34:48 发布

hengxiaogu

最新推荐文章于 2021-05-31 18:34:48 发布

阅读量192

点赞数

分类专栏：python学习打卡 | 文章标签：python

本文链接：https://blog.csdn.net/hengxiaogu/article/details/91060817

版权

python学习打卡专栏收录该内容

58 篇文章 5 订阅

订阅专栏

优化搜索引擎

"""
搜索引擎：支持按照顺序搜索单词，返回所在文件位置
"""
from test06.test0605.search_base import *
import re


class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words search engine backed by an inverted index.

    The index maps each lower-cased word to the list of ids of the documents
    whose text contains it. ``search`` intersects those lists to find the
    documents that contain every word of a query.
    """

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # word -> list of document ids, in the order the documents were added
        self.inverted_index = {}

    def process_corpus(self, id, text):
        """Add one document to the inverted index.

        Args:
            id: Identifier recorded for the document (for example a file
                path such as ``'./1.txt'``).
            text: Raw text of the document.
        """
        # The words form a set, so ``id`` is appended at most once per word
        # for each call.
        for word in self.parse_text_to_words(text):
            self.inverted_index.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of the documents that contain every query word.

        The posting lists of the query words are intersected with a
        multi-pointer merge: one cursor per list, advancing the cursor that
        points at the smallest id until all cursors agree or one list ends.

        NOTE: the merge compares ids with ``min``, so it relies on every
        posting list being in ascending id order, i.e. on documents having
        been added through ``process_corpus`` in ascending id order. Nothing
        in this class enforces that.

        Args:
            query: Free-text query; it is tokenised exactly like the corpus.

        Returns:
            A list of matching document ids in posting-list order. The list
            is empty when the query holds no words or when any query word is
            absent from the index.
        """
        query_words = list(self.parse_text_to_words(query))
        if not query_words:
            # Nothing to look up; without this guard the merge below would
            # index into an empty list.
            return []

        # Check EVERY word before merging: a single unknown word means no
        # document can match.
        posting_lists = []
        for word in query_words:
            if word not in self.inverted_index:
                return []
            posting_lists.append(self.inverted_index[word])

        cursors = [0] * len(posting_lists)
        result = []
        while True:
            # Collect the id each cursor currently points at.
            current_ids = []
            for postings, cursor in zip(posting_lists, cursors):
                # One list is exhausted, so no further common id can exist.
                if cursor >= len(postings):
                    return result
                current_ids.append(postings[cursor])

            # All cursors agree: this document contains every query word.
            if all(doc_id == current_ids[0] for doc_id in current_ids):
                result.append(current_ids[0])
                cursors = [cursor + 1 for cursor in cursors]
                continue

            # Otherwise advance the cursor that lags behind (smallest id).
            lagging = current_ids.index(min(current_ids))
            cursors[lagging] += 1

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of unique, lower-cased words.

        Args:
            text: Arbitrary text.

        Returns:
            A ``set`` of the words in ``text``; empty when ``text`` holds no
            word characters.
        """
        # Replace every non-word character (punctuation, whitespace,
        # newlines) with a space.
        text = re.sub(r'[^\w]', ' ', text)
        text = text.lower()
        # Only spaces separate the words now, so split() yields exactly the
        # non-empty words; the set removes duplicates.
        return set(text.split())


# Build the inverted-index engine and hand it to main(). main is not defined
# in this file; it is expected to come from the star import of search_base.
# NOTE(review): presumably main() loads the corpus and runs the query loop
# against the engine — confirm against search_base.
serch_engin = BOWInvertedIndexEngine()
main(serch_engin)

PS：本文代码参考极客时间的 Python 课程编写。

hengxiaogu

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python学习打卡--day33（练习：优化搜索引擎）

优化搜索引擎"""搜索引擎：支持按照顺序搜索单词，返回所在文件位置"""from test06.test0605.search_base import *import reclass BOWInvertedIndexEngine(SearchEngineBase): def __init__(self): super(BOWInvertedIndexEn...
复制链接

扫一扫