简易搜索引擎

maplea2012

于 2023-07-13 16:48:22 发布

阅读量189

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/weixin_37901366/article/details/131706345

版权

Python 专栏收录该内容

36 篇文章 1 订阅

订阅专栏

一、基本需求

# 实现一个简易版的搜索引擎，基本功能：

"""

>> 输入搜索词,比如I love you,返回包含全部这三个词的【所有】file列表

Sample

1.txt: I Will love you forever until the end of the world

2.txt: I like your toy, can you share with me

3.txt: Do you love me,I hope so

说明: 由于只有 1.txt 和 3.txt 同时包含 I、love 和 you 这三个词,所以最终应该返回 [1.txt, 3.txt]

"""

二、代码实现

1、定义父类

class BasicSearchEngine:
  """Base class for a simple keyword search engine.

  It only stores the query words; subclasses must override
  ``words_to_id`` and ``read_dir`` to provide the actual corpus access.
  """

  def __init__(self, queryWords):
    """Remember the search terms supplied by the caller.

    Args:
      queryWords: the words to search for.
    """
    self.queryWords = queryWords

  def words_to_id(self):
    """Return a mapping of word -> [file name, ...].

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    # NotImplementedError is still an Exception subclass, so callers that
    # catch Exception keep working.
    raise NotImplementedError("该方法需要被继承")

  def read_dir(self):
    """Read every file in the directory and return {file name: content}.

    Raises:
      NotImplementedError: always; subclasses must override this method.
    """
    raise NotImplementedError("该方法需要被继承")

2、定义子类

import os
import re

class MySearchEngine(BasicSearchEngine):
  """Search engine that reports the files containing every query word.

  The corpus is the set of regular files directly inside ``path``. Each
  file is split into case-insensitive runs of ASCII letters/digits, and an
  inverted index (word -> file names) is built from them on every search.
  """

  def __init__(self, queryWords, path):
    """Store the query words and the directory to search.

    Args:
      queryWords: sequence of search terms; matching is case-insensitive.
      path: directory whose regular files make up the corpus.
    """
    super().__init__(queryWords)
    self.path = path

  def words_to_id(self):
    """Build the inverted index for the corpus.

    Returns:
      dict mapping each lower-cased word to the list of file names that
      contain it; a file appears at most once per word.
    """
    id_to_text = self.read_dir()
    print(id_to_text)

    words_to_id = {}
    for file_name, text in id_to_text.items():
      # A set de-duplicates the words so that a file is appended only once
      # per word, however often the word occurs in it.
      unique_words = {
        word.lower() for word in re.findall(r'[a-zA-Z0-9]+', text)
      }
      for word in unique_words:
        words_to_id.setdefault(word, []).append(file_name)

    return words_to_id

  def read_dir(self):
    """Read every regular file directly inside ``self.path``.

    Returns:
      dict mapping file name -> file content. Sub-directories are skipped.
    """
    dir_list = os.listdir(self.path)
    print(dir_list)

    id_to_text = {}
    for file_n in dir_list:
      file_path = os.path.join(self.path, file_n)
      if os.path.isfile(file_path):
        # Only ASCII letters/digits are indexed, so undecodable bytes can be
        # replaced instead of aborting the whole search on one odd file.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as fin:
          id_to_text[file_n] = fin.read()

    return id_to_text

  def invert_index(self):
    """Return the files that contain all of the query words.

    The posting lists of the query words are intersected as sets, so the
    result does not depend on the order in which the directory was listed.

    Returns:
      Sorted list of file names containing every query word. The list is
      empty when there are no query words or when any query word does not
      occur in the corpus at all.
    """
    words_to_id = self.words_to_id()
    query = [word.lower() for word in self.queryWords]

    # Every query word must be present somewhere, otherwise no file can
    # match; an empty query matches nothing rather than raising.
    if not query or any(word not in words_to_id for word in query):
      return []

    matches = set(words_to_id[query[0]])
    for word in query[1:]:
      matches &= set(words_to_id[word])
      if not matches:
        break  # the intersection can only shrink from here on

    # Sorting makes the output deterministic; os.listdir gives no ordering
    # guarantee.
    return sorted(matches)

  def main(self):
    """Run the search and print the matching file names."""
    result = self.invert_index()
    print("search result:" + str(result))

3、测试

# Smoke tests: run three queries against the same sample directory.
# Each entry pairs the banner to print with the query words to search for.
QUERY_DIR = "/dbfs/FileStore/Maple/query"
TEST_CASES = (
  ('**********测试1**************', ["I", "love", "you", "world"]),
  ('**********测试2**************', ["I", "love", "you"]),
  ('**********测试3**************', ["I"]),
)

for banner, words in TEST_CASES:
  print(banner)
  mySearchEngine = MySearchEngine(words, QUERY_DIR)
  mySearchEngine.main()

maplea2012

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
简易搜索引擎

"""获取单词:[文件名,...] 的键值对"""raise Exception("该方法需要被继承")"""读取目录中的所有文件,然后返回文件名:文件内容的字典"""raise Exception("该方法需要被继承")2、定义子类import os# 返回单词：[file_name1,file_name2] 的键值对# 正则匹配出所有单词/数字# 将单词/数字放到一个集合(去重)#对于集合中的单词/数字,判断归属于哪个文件，然后放到 words_to_id。
复制链接

扫一扫