以前在极客时间学习 Python 的时候,按照步骤实现了一个很简单的搜索引擎,这里主要记录一下。当时觉得作者给出的子类部分程序写得比较麻烦,所以这里修改了一下,使其更加浅显易懂。另外那时候正在学习 tkinter,就顺便加了点很辣眼睛的 GUI,大家看看就好 :>
还有就是本程序不包含爬虫,只是搜索本地文件的txt,太深入的内容就没有啦,主要还是给自己做个总结吧,有需要的朋友也可以做个简单的参考。
直接上程序吧,如果有不懂的可以问我(估计没有)
- GUI的类实现(windows_class.py)
from tkinter import *
from maindo import main
class App:
    """Main window of the search GUI.

    Places a title, a query entry, a search button and two output frames on
    ``master``. Clicking the button calls ``main`` with the search engine,
    the entry widget and both frames.
    """

    def __init__(self, master, width, height, search_engine):
        """Remember the window parameters and build the widgets.

        Args:
            master: Tk container the widgets are placed on.
            width: Window width used in the geometry string.
            height: Window height used in the geometry string.
            search_engine: Engine object handed to ``main`` on every search.
        """
        self.master = master
        self.width = width
        self.height = height
        self.search_engine = search_engine
        self.initWidgets()

    def initWidgets(self):
        """Create and place every widget of the window."""
        window = self.master
        window.geometry(f"{self.width}x{self.height}")
        window.config(background="WHITE")

        # Static text: the title line and the "result" caption.
        title = Label(
            window,
            text="oldz's simple search engine",
            font=('Courier New', '24', 'bold'),
            bg='WHITE',
        )
        caption = Label(
            window,
            text='result: ',
            font=('Courier New', '30'),
            bg='RED',
        )
        title.place(relx=0.005, rely=0.005)
        caption.place(relx=0.005, rely=0.13)

        # Query input box.
        entry_options = dict(
            width=80,
            borderwidth=2,
            font=('Helvetica', '20'),
            highlightcolor='BLACK',
            selectforeground='RED',
        )
        search_content = Entry(window, **entry_options)
        search_content.place(relx=0.005, rely=0.05)

        def run_search():
            # The two frames are created further down; the closure only
            # looks them up when the button is actually clicked.
            return main(self.search_engine, search_content, f_label, f_result)

        # Search button.
        search_button = Button(
            window,
            text='search',
            font=('Helvetica', '14', 'bold'),
            activebackground='RED',
            bg='YELLOW',
            fg='BLUE',
            command=run_search,
        )
        search_button.pack_propagate(0)
        search_button.place(relx=0.80, rely=0.051)

        # Output frames. pack_propagate(0) keeps each frame at its configured
        # size instead of letting it resize to fit packed children.
        frame_look = dict(width=800, bg='WHITE', border=2)
        f_label = Frame(window, height=140, **frame_look)
        f_label.pack_propagate(0)
        f_label.place(relx=0.005, rely=0.20)

        f_result = Frame(window, height=500, **frame_look)
        f_result.pack_propagate(0)
        f_result.place(relx=0.005, rely=0.30)
- 搜索引擎(search_engine_class.py)
import re
import pylru
from collections import Counter
from file_io import txtfile_read
# 基类
class SearchEngineBase(object):
    """Base class for the search engines.

    Subclasses implement ``process_corpus`` (index one document) and
    ``search`` (answer a query); ``add_corpus`` reads a file and feeds it to
    ``process_corpus``.
    """

    def __init__(self):
        pass

    def add_corpus(self, file_path):
        """Read ``file_path`` and index its text under that path as the id.

        Args:
            file_path: Path of the text file to add; also used as document id.
        """
        text = txtfile_read(file_path)
        # txtfile_read reports a missing file and returns None; there is
        # nothing to index in that case, so do not pass None downstream.
        if text is None:
            return
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Index ``text`` under document ``id``. Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Return the documents matching ``query``. Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented.')
class OLDZBOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words search engine backed by an inverted index.

    ``inverted_index`` maps every lower-cased word to the list of document
    ids whose text contains it, e.g.
    ``{'that': ['search_engine_local_file/1.txt',
    'search_engine_local_file/2.txt']}``.
    """

    def __init__(self):
        super(OLDZBOWInvertedIndexEngine, self).__init__()
        # Filled by process_corpus: {word: [doc_id, ...]}.
        self.inverted_index = {}

    def process_corpus(self, id, text):
        """Add every distinct word of ``text`` to the index under ``id``.

        Args:
            id: Document identifier stored in the posting lists.
            text: Raw document text.
        """
        for word in self.parse_text_to_words(text):
            postings = self.inverted_index.setdefault(word, [])
            # Callers may index the same document more than once; keep a
            # single entry per id so posting lists do not grow with repeats.
            if id not in postings:
                postings.append(id)

    def search(self, query):
        """Return the ids of the documents that contain every query word.

        Args:
            query: Free-text query; it is tokenised like the documents.

        Returns:
            A list of document ids, in the order they appear in the posting
            list of one of the query words. Empty when the query has no
            words, when any word is unknown, or when no document contains
            all of the words.
        """
        query_words = self.parse_text_to_words(query)
        if not query_words:
            return []

        posting_lists = []
        for word in query_words:
            postings = self.inverted_index.get(word)
            # One unknown word is enough to rule out every document.
            if not postings:
                return []
            posting_lists.append(postings)

        first = posting_lists[0]
        others = [set(postings) for postings in posting_lists[1:]]
        # dict.fromkeys drops repeated ids while keeping their order.
        return [
            doc_id
            for doc_id in dict.fromkeys(first)
            if all(doc_id in other for other in others)
        ]

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space is
        replaced by a space before splitting, so punctuation and line breaks
        act as separators.
        """
        cleaned = re.sub(r'[^\w ]', ' ', text)
        return set(cleaned.lower().split())

    @staticmethod
    def l_output_same(L):
        """Return the elements that occur most often across the lists in ``L``.

        Args:
            L: Iterable of lists of hashable elements.

        Returns:
            The elements sharing the highest total count, in first-seen
            order; an empty list when ``L`` holds no elements at all.
        """
        counts = Counter()
        for items in L:
            counts.update(items)
        if not counts:
            return []
        max_count = max(counts.values())
        return [item for item, count in counts.items() if count == max_count]
# 处理缓存
class LRUCache(object):
    """Small has/get/set facade over a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        """Create the underlying ``pylru.lrucache`` with the given ``size``."""
        self.cache = pylru.lrucache(size)

    def has(self, key):
        """Return whether ``key`` is currently in the cache."""
        present = key in self.cache
        return present

    def get(self, key):
        """Return the value stored under ``key`` (plain subscript lookup)."""
        value = self.cache[key]
        return value

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        self.cache[key] = value
class OLDZBOWInvertedIndexEngineWithCache(OLDZBOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that remembers the result of each query string."""

    def __init__(self):
        super().__init__()
        # The engine's __init__ chain does not reach LRUCache, so the cache
        # side is initialised explicitly.
        LRUCache.__init__(self)

    def search(self, query):
        """Return the search result for ``query``, using the cache if possible.

        On a miss the parent engine's ``search`` runs and its result is
        stored under the raw query string; on a hit ``'cache hit!'`` is
        printed and the stored result is returned.
        """
        if not self.has(query):
            fresh = super().search(query)
            self.set(query, fresh)
            return fresh

        print('cache hit!')
        return self.get(query)
- 读本地txt(file_io.py)
import linecache
import os
def txtfile_read(txtfilename):
    """Return the whole content of a text file as one string.

    Args:
        txtfilename: Path of the file to read.

    Returns:
        The file's lines joined into a single string, or ``None`` (after
        printing a notice) when the path does not exist.
    """
    if not os.path.exists(txtfilename):
        print('the "{}" is not existed'.format(txtfilename))
        return None

    # linecache keeps file contents in memory between calls; drop the entry
    # if the file changed on disk so that edits are picked up on re-read.
    linecache.checkcache(txtfilename)
    return ''.join(linecache.getlines(txtfilename))
- maindo.py(我都不知道为什么这么取名,很久的代码了,懒得改了)
import os
from tkinter import *
import tkinter.font as tkFont
# Directory that main() walks for the txt files to index (relative path).
file_dir = 'search_engine_local_file/'
def file_through_dir(filedir, file_type):
    """Collect the paths of all files under ``filedir`` with a given extension.

    Args:
        filedir: Directory to walk recursively.
        file_type: Extension to match, with or without the leading dot
            (``'txt'`` and ``'.txt'`` are equivalent). Matching is
            case-sensitive.

    Returns:
        A list of paths (``os.path.join`` of the walked directory and the
        file name) in the order ``os.walk`` yields them.
    """
    # Match on the dotted suffix so extensions of any length work and a name
    # that merely ends in the same letters (e.g. 'ctxt') is not picked up.
    suffix = '.' + file_type.lstrip('.')
    file_list = []
    for root, _dirs, names in os.walk(filedir):
        for name in names:
            if name.endswith(suffix):
                file_list.append(os.path.join(root, name))
    return file_list
def main(search_engine, search_content, f_label, f_result):
    """Run one search and render its outcome into the two output frames.

    Args:
        search_engine: Object providing ``add_corpus(path)`` and
            ``search(query)``.
        search_content: Entry widget holding the query text.
        f_label: Frame that receives the headline label.
        f_result: Frame that receives the scrollable result list.
    """
    # NOTE(review): the original indentation was lost; the result list below
    # is assumed to belong to the "results found" branch only - confirm.

    # Remove whatever a previous search left in the frames.
    for frame in (f_label, f_result):
        for child in frame.winfo_children():
            child.destroy()

    # Index every txt file under file_dir, then evaluate the query.
    for path in file_through_dir(file_dir, 'txt'):
        search_engine.add_corpus(path)
    results = search_engine.search(search_content.get())

    if results == []:
        nothing_found = Label(f_label, text='No relevant content found!',
                              font=('Verdana', 30), width=30, bg='WHITE')
        nothing_found.grid(row=0, column=0, sticky=NW)
        return

    headline_text = 'found {} result(s) for you : '.format(len(results))
    headline = Label(f_label, text=headline_text,
                     font=('Verdana', 30), width=30, bg='WHITE')
    headline.grid(row=0, column=0, sticky=NW)

    # Two blank rows below the headline.
    for row in (1, 2):
        spacer = Label(f_label, text=' ', font=('Verdana', 20), width=30, bg='WHITE')
        spacer.grid(row=row, column=0)

    # Scrollable list with one numbered line per result.
    scroll = Scrollbar(f_result)
    scroll.pack(side=RIGHT, fill=Y)
    list_font = tkFont.Font(family='Verdana', size=20,
                            weight=tkFont.NORMAL, underline=1)
    listing = Listbox(f_result, yscrollcommand=scroll.set,
                      height=30, width=200, font=list_font)
    for number, doc in enumerate(results, start=1):
        listing.insert(END, str(number) + '. ' + doc)
    listing.pack(side=LEFT)
    scroll.config(command=listing.yview)
- 主程序(do.py)
from search_engine_class import OLDZBOWInvertedIndexEngineWithCache
from tkinter import *
from windows_class import App
def sys_out(even):
    """Ask for confirmation and, if given, destroy the root window.

    Args:
        even: Event object passed by the key binding; not used.
    """
    from tkinter import messagebox

    confirmed = messagebox.askokcancel('Exit', 'Confirm to exit?')
    if confirmed:
        root.destroy()
# Program entry: build the cached search engine and the Tk root window,
# then hand both to the GUI class.
search_engine = OLDZBOWInvertedIndexEngineWithCache()
root = Tk()
root.title("OLDZ")
# Use the maximum size Tk reports for the root window as the window size.
w, h = root.maxsize()
App(root, w, h, search_engine)
# Pressing Escape triggers the exit confirmation in sys_out.
root.bind('<Escape>', sys_out)
root.mainloop()
有些命名取得没什么意义,请大家多多包涵 :<