python实现简单的搜索引擎（不含爬虫）

最新推荐文章于 2024-05-13 01:51:12 发布

youroldz

最新推荐文章于 2024-05-13 01:51:12 发布

阅读量1k

点赞数 3

分类专栏： python 文章标签： python 搜索引擎 tkinter

本文链接：https://blog.csdn.net/qq_37601846/article/details/103238826

版权

python 专栏收录该内容

10 篇文章 2 订阅

订阅专栏

以前在极客时间学习 Python 的时候，按照课程步骤实现了一个很简单的搜索引擎，这里主要做个记录。当时觉得作者给出的子类部分写得比较麻烦，所以这里做了修改，使其更加浅显易懂。另外那时候正在学习 tkinter，就顺便加了一个很辣眼睛的 GUI，大家看看就好 :>

还有就是本程序不包含爬虫，只是搜索本地的 txt 文件，太深入的内容就没有啦，主要还是给自己做个总结吧，有需要的朋友也可以做个简单的参考。

直接上程序吧，如果有不懂的可以问我（估计没有）

GUI的类实现(windows_class.py)

from tkinter import *
from maindo import main


class App:
    """Main window of the search GUI.

    Lays out a title, a query entry, a search button and two output frames
    on ``master``. Pressing the button calls ``main`` with the engine, the
    entry and both frames.
    """

    def __init__(self, master, width, height, search_engine):
        """Remember the window parameters, then build the widgets.

        Args:
            master: Tk container every widget is attached to.
            width: Window width used in the geometry string.
            height: Window height used in the geometry string.
            search_engine: Engine object handed to ``main`` on every click.
        """
        self.master = master
        self.width = width
        self.height = height
        self.search_engine = search_engine
        self.initWidgets()

    def initWidgets(self):
        """Configure the window and create/place every widget."""
        window = self.master
        window.geometry("{}x{}".format(self.width, self.height))
        window.config(background="WHITE")

        def run_search():
            # The entry and frames are resolved when the button is pressed,
            # by which time they have been created further down.
            main(self.search_engine, query_entry, tip_frame, result_frame)

        # Captions
        title_label = Label(
            window,
            text='oldz\'s simple search engine',
            font=('Courier New', '24', 'bold'),
            bg='WHITE',
        )
        result_caption = Label(
            window,
            text='result: ',
            font=('Courier New', '30'),
            bg='RED',
        )
        title_label.place(relx=0.005, rely=0.005)
        result_caption.place(relx=0.005, rely=0.13)

        # Query entry
        query_entry = Entry(
            window,
            width=80,
            borderwidth=2,
            font=('Helvetica', '20'),
            highlightcolor='BLACK',
            selectforeground='RED',
        )
        query_entry.place(relx=0.005, rely=0.05)

        # Search button
        search_button = Button(
            window,
            text='search',
            font=('Helvetica', '14', 'bold'),
            activebackground='RED',
            bg='YELLOW',
            fg='BLUE',
            command=run_search,
        )
        search_button.pack_propagate(0)
        search_button.place(relx=0.80, rely=0.051)

        # Output frames: one for the summary line, one for the hit list.
        tip_frame = Frame(window, height=140, width=800, bg='WHITE', border=2)
        # Keep the frame at its configured size; otherwise it would follow
        # the size of the labels packed into it.
        tip_frame.pack_propagate(0)
        tip_frame.place(relx=0.005, rely=0.20)

        result_frame = Frame(window, height=500, width=800, bg='WHITE', border=2)
        result_frame.pack_propagate(0)
        result_frame.place(relx=0.005, rely=0.30)

搜索引擎(search_engine_class.py)

import re
import pylru
from collections import Counter
from file_io import txtfile_read

# Base class
class SearchEngineBase(object):
    """Abstract base of the search engines.

    ``add_corpus`` reads one text file and forwards its content to
    ``process_corpus``; subclasses must implement ``process_corpus`` (index a
    document) and ``search`` (answer a query).
    """

    def __init__(self):
        # Intentionally empty: the base class keeps no state of its own.
        pass

    def add_corpus(self, file_path):
        """Read ``file_path`` and index its text under that path.

        Files that ``txtfile_read`` cannot find (it returns ``None`` for
        them) are skipped instead of being passed on as ``None``.
        """
        text = txtfile_read(file_path)
        if text is None:
            return
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Index ``text`` under document id ``id``.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError('search not implemented.')


class OLDZBOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words search engine backed by an inverted index.

    ``inverted_index`` maps every lower-cased word to the list of document
    ids (here: file paths) whose text contains that word, e.g.
    ``'that': ['search_engine_local_file/1.txt', 'search_engine_local_file/2.txt']``.
    A query matches the documents that contain every word of the query.
    """

    def __init__(self):
        super(OLDZBOWInvertedIndexEngine, self).__init__()
        # word -> list of ids of the documents containing that word
        self.inverted_index = {}

    def process_corpus(self, id, text):
        """Register every distinct word of ``text`` as occurring in ``id``.

        Indexing the same document again is a no-op for words it is already
        listed under, so repeated ``add_corpus`` calls do not grow the
        posting lists with duplicate ids.
        """
        for word in self.parse_text_to_words(text):
            postings = self.inverted_index.setdefault(word, [])
            if id not in postings:
                postings.append(id)

    def search(self, query):
        """Return the ids of the documents containing all words of ``query``.

        Returns an empty list when the query contains no word at all or when
        any query word is unknown to the index. Ids are returned in the order
        in which they were indexed.
        """
        query_words = self.parse_text_to_words(query)
        if not query_words:
            # Empty / punctuation-only query: nothing to look up.
            return []

        posting_lists = []
        for word in query_words:
            postings = self.inverted_index.get(word)
            if not postings:
                # One missing word makes an all-words match impossible.
                return []
            posting_lists.append(postings)

        # Walk the shortest posting list and keep the ids present in all
        # the other ones (set intersection that preserves indexing order).
        shortest = min(posting_lists, key=len)
        others = [set(postings) for postings in posting_lists
                  if postings is not shortest]
        result = []
        seen = set()
        for doc_id in shortest:
            if doc_id in seen:
                continue
            seen.add(doc_id)
            if all(doc_id in other for other in others):
                result.append(doc_id)
        return result

    # Text processing
    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into its set of distinct lower-cased words.

        Every character that is neither a word character nor a space
        (punctuation, line breaks, ...) is treated as a separator.
        """
        # Replace punctuation and line breaks by spaces, then lower-case.
        cleaned = re.sub(r'[^\w ]', ' ', text).lower()
        # split() without argument also discards the empty strings that
        # runs of spaces would otherwise produce.
        return set(cleaned.split())

    # Count the elements of L and return the most frequent ones
    @staticmethod
    def l_output_same(L):
        """Return the elements occurring most often across the lists in ``L``.

        Args:
            L: iterable of lists of hashable elements.

        Returns:
            The elements sharing the highest total count, in order of first
            appearance; an empty list when ``L`` holds no element at all.
        """
        counts = Counter()
        for ids in L:
            counts.update(ids)
        if not counts:
            return []
        max_count = max(counts.values())
        return [doc_id for doc_id, count in counts.items()
                if count == max_count]

# Cache handling
class LRUCache(object):
    """Small has/get/set facade over a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        """Create the backing cache; ``size`` is passed to ``pylru.lrucache``."""
        self.cache = pylru.lrucache(size)

    def has(self, key):
        """Return whether ``key`` is present in the cache."""
        store = self.cache
        return key in store

    def get(self, key):
        """Return the value stored under ``key`` (subscript lookup)."""
        store = self.cache
        return store[key]

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        store = self.cache
        store[key] = value


class OLDZBOWInvertedIndexEngineWithCache(OLDZBOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that remembers the result of each query string."""

    def __init__(self):
        # Initialise the engine side through the MRO, then the cache side
        # explicitly.
        super().__init__()
        LRUCache.__init__(self)

    def search(self, query):
        """Return the cached result for ``query``, computing it on a miss."""
        if not self.has(query):
            fresh = super().search(query)
            self.set(query, fresh)
            return fresh

        print('cache hit!')
        return self.get(query)

读本地txt（file_io.py）

import linecache
import os
def txtfile_read(txtfilename):
    """Return the whole content of the text file ``txtfilename``.

    The file is read from disk on every call (no caching), decoded as UTF-8
    with an optional BOM; undecodable bytes are replaced rather than making
    the read fail.

    Returns:
        The file content as ``str``; ``''`` if the path exists but cannot be
        read; ``None`` (after printing a notice) if the path does not exist.
    """
    if not os.path.exists(txtfilename):
        print('the "{}" is not existed'.format(txtfilename))
        return None
    try:
        with open(txtfilename, encoding='utf-8-sig', errors='replace') as fh:
            return fh.read()
    except OSError as err:
        # e.g. the path is a directory or permission is denied: report it
        # and fall back to empty text so indexing can carry on.
        print('the "{}" could not be read: {}'.format(txtfilename, err))
        return ''

maindo.py(我都不知道为什么这么取名，很久的代码了，懒得改了)

import os
from tkinter import *
import tkinter.font as tkFont

# Directory that main() passes to file_through_dir() to collect the txt
# files to index.
file_dir = 'search_engine_local_file/'


def file_through_dir(filedir, file_type):
    """Collect the files below ``filedir`` whose extension is ``file_type``.

    Args:
        filedir: Directory that is walked recursively.
        file_type: Wanted extension, with or without the leading dot
            (``'txt'`` or ``'.txt'``); any length is supported.

    Returns:
        List of paths (``filedir``-relative directory joined with the file
        name) in the order ``os.walk`` yields them.
    """
    wanted_ext = '.' + file_type.lstrip('.')
    file_list = []
    for root, _dirs, files in os.walk(filedir):
        for name in files:
            # Compare the real extension so that a name merely ending in the
            # same letters (e.g. "notxt") is not mistaken for a match.
            if os.path.splitext(name)[1] == wanted_ext:
                file_list.append(os.path.join(root, name))
    return file_list

def _index_new_files(search_engine, file_list):
    """Feed the files of ``file_list`` not indexed yet to ``search_engine``.

    The set of already indexed paths is kept on the engine object itself
    (attribute ``_indexed_files``) so that it lives exactly as long as the
    index it describes.
    """
    indexed = getattr(search_engine, '_indexed_files', None)
    if indexed is None:
        indexed = set()
        search_engine._indexed_files = indexed
    for file_path in file_list:
        if file_path not in indexed:
            search_engine.add_corpus(file_path)
            indexed.add(file_path)


def main(search_engine, search_content, f_label, f_result):
    """Run one search and render its outcome into the two output frames.

    Args:
        search_engine: Engine offering ``add_corpus(path)`` and
            ``search(query)``.
        search_content: Entry widget holding the query text.
        f_label: Frame receiving the summary line.
        f_result: Frame receiving the scrollable list of hits.
    """
    # Remove everything the previous search rendered.
    for frame in (f_label, f_result):
        for widget in frame.winfo_children():
            widget.destroy()

    # Index the corpus; files handled by an earlier search are skipped so
    # the index does not accumulate duplicates on every click.
    _index_new_files(search_engine, file_through_dir(file_dir, 'txt'))

    # Evaluate the query typed by the user.
    query = search_content.get()
    results = search_engine.search(query)

    # Summary line
    if results:
        tip = 'found {} result(s) for you : '.format(len(results))
    else:
        tip = 'No relevant content found!'
    la_tip = Label(f_label, text=tip, font=('Verdana', 30), width=30, bg='WHITE')
    la_tip.grid(row=0, column=0, sticky=NW)
    if not results:
        return

    # Two blank rows below the summary line.
    for row in (1, 2):
        la_space = Label(f_label, text='  ', font=('Verdana', 20), width=30, bg='WHITE')
        la_space.grid(row=row, column=0)

    # Scrollable, numbered list of hits
    scroll = Scrollbar(f_result)
    scroll.pack(side=RIGHT, fill=Y)
    ft = tkFont.Font(family='Verdana', size=20, weight=tkFont.NORMAL, underline=1)
    res = Listbox(f_result,
                  yscrollcommand=scroll.set,
                  height=30,
                  width=200,
                  font=ft)
    for number, path in enumerate(results, start=1):
        res.insert(END, '{}. {}'.format(number, path))
    res.pack(side=LEFT)
    scroll.config(command=res.yview)

主程序（do.py）

from search_engine_class import OLDZBOWInvertedIndexEngineWithCache
from tkinter import *
from windows_class import App


def sys_out(even):
    """Ask for confirmation and, if given, close the root window.

    ``even`` is the event object supplied by the key binding; it is unused.
    """
    from tkinter import messagebox

    confirmed = messagebox.askokcancel('Exit', 'Confirm to exit?')
    if not confirmed:
        return
    root.destroy()

# Create the engine and the root window, then let App build the GUI on it.
search_engine = OLDZBOWInvertedIndexEngineWithCache()
root = Tk()
root.title("OLDZ")
# Use the maximum size Tk reports for the root window as the window size.
w, h = root.maxsize()
App(root, w, h, search_engine)
# Pressing Escape triggers sys_out, which asks before closing the window.
root.bind('<Escape>', sys_out)

# Start the Tk event loop.
root.mainloop()

有些取名取的有点无意义，理解下 :<