用 Python 一步一步实现简单的搜索引擎


待测试文档:

1.txt

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.

2.txt

This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . .

3.txt

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God’s children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: “Free at last! Free at last! Thank God Almighty, we are free at last!”

1. 简单的全文匹配

最直接的做法:保存每个文档的全文,查询时逐个判断待查找的字符串是否出现在文档中(子串匹配)。

import re

class SearchEngineBase(object):
    """Abstract base class shared by the search-engine variants.

    Subclasses implement ``processCorpus`` to index one document and
    ``search`` to answer a query; ``addCorpus`` only handles file loading.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at ``filePath`` and index its text.

        The path itself is used as the document id passed to
        ``processCorpus``.

        Args:
            filePath: Path of the text file to load.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so non-ASCII characters decode the same way on every
                platform instead of depending on the locale's default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of documents matching ``query``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# 文本对比模型
class TextSearchEngine(SearchEngineBase):
    """Engine that keeps each document's raw text and matches by substring."""

    def __init__(self):
        super(TextSearchEngine, self).__init__()
        # Maps document id -> full text of that document.
        self.__idToTexts = {}

    def processCorpus(self, id, text):
        """Store ``text`` verbatim under ``id``, replacing any earlier entry."""
        self.__idToTexts.update({id: text})

    def search(self, query):
        """Return ids of the stored documents whose text contains ``query``.

        The test is a plain, case-sensitive substring check; ids come back
        in the iteration order of the internal dict.
        """
        return [
            docId
            for docId, body in self.__idToTexts.items()
            if query in body
        ]
    
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries interactively.

    The prompt loop runs until input ends (EOF, e.g. Ctrl-D) or the user
    interrupts it (Ctrl-C); either one makes the function return instead of
    ending the program with a traceback.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Paths of the documents to index. Defaults to the three
            sample files of the demo.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # End the pending prompt line so the shell prompt starts cleanly.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Build the substring-matching engine and start the interactive query prompt.
engine = TextSearchEngine()
main(engine)

在这里插入图片描述

2. 词袋模型

将每个文本中的词语去重后放入词袋(集合),查询时检查待查找的词语是否全部出现在某个文本的词袋中。这样做是因为一个文本中往往包含大量重复的单词,直接保存全文会浪费空间。

import re

class SearchEngineBase(object):
    """Abstract base class shared by the search-engine variants.

    Subclasses implement ``processCorpus`` to index one document and
    ``search`` to answer a query; ``addCorpus`` only handles file loading.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at ``filePath`` and index its text.

        The path itself is used as the document id passed to
        ``processCorpus``.

        Args:
            filePath: Path of the text file to load.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so non-ASCII characters decode the same way on every
                platform instead of depending on the locale's default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of documents matching ``query``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# 词袋模型
class BowEngine(SearchEngineBase):
    """Bag-of-words engine: each document is reduced to its set of words."""

    def __init__(self):
        super(BowEngine, self).__init__()
        # Maps document id -> set of distinct words found in that document.
        self.__idToWords = {}

    def processCorpus(self, id, text):
        """Store the word set of ``text`` under ``id``."""
        bag = self.textToWords(text)
        self.__idToWords[id] = bag

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in ``text``.

        A word is a maximal run of regex word characters; every other
        character acts as a separator. Words are kept case-sensitive.
        """
        return set(re.findall(r'\w+', text))

    def search(self, querys):
        """Return ids of documents that contain every word of ``querys``.

        A query that yields no words matches every stored document, because
        the subset test below is then trivially true.
        """
        wanted = self.textToWords(querys)
        return [
            docId
            for docId, bag in self.__idToWords.items()
            if wanted <= bag
        ]
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries interactively.

    The prompt loop runs until input ends (EOF, e.g. Ctrl-D) or the user
    interrupts it (Ctrl-C); either one makes the function return instead of
    ending the program with a traceback.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Paths of the documents to index. Defaults to the three
            sample files of the demo.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # End the pending prompt line so the shell prompt starts cleanly.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Build the bag-of-words engine and start the interactive query prompt.
engine = BowEngine()
main(engine)

在这里插入图片描述

3. 词袋倒排索引模型

上面的做法是逐个文件检查其词袋是否包含查询词;这里改为建立倒排索引,即记录每个词语出现在哪些文件中,查询时对各个查询词对应的文件列表求交集。

import re

class SearchEngineBase(object):
    """Abstract base class shared by the search-engine variants.

    Subclasses implement ``processCorpus`` to index one document and
    ``search`` to answer a query; ``addCorpus`` only handles file loading.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at ``filePath`` and index its text.

        The path itself is used as the document id passed to
        ``processCorpus``.

        Args:
            filePath: Path of the text file to load.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so non-ASCII characters decode the same way on every
                platform instead of depending on the locale's default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of documents matching ``query``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# 词袋反向索引模型
class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index (word -> doc ids)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps each word to the list of ids of the documents containing it,
        # in the order those documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append ``id`` to the posting list of every distinct word in ``text``."""
        for word in self.textToWords(text):
            # Create the posting list on first sight of the word.
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents that contain every word of ``query``.

        The result is the intersection of the query words' posting lists,
        listed in the order the documents were indexed. Unlike a pointer
        merge, this does not require posting lists to be sorted by id, but
        ids must be hashable.

        Returns:
            A new list of matching ids. It is empty when the query contains
            no words or when any query word is absent from the index.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Empty or punctuation-only query: there is nothing to look up.
            return []

        postings = []
        for word in queryWords:
            ids = self.invertedIndex.get(word)
            if ids is None:
                # If one word is missing from the index, no document can
                # contain the whole query.
                return []
            postings.append(ids)

        # Scan the shortest posting list and keep ids present in all others.
        postings.sort(key=len)
        shortest = postings[0]
        others = [set(ids) for ids in postings[1:]]
        return [
            docId
            for docId in shortest
            if all(docId in ids for ids in others)
        ]

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in ``text``.

        A word is a maximal run of regex word characters; every other
        character acts as a separator. Words are kept case-sensitive.
        """
        return set(re.findall(r'\w+', text))
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries interactively.

    The prompt loop runs until input ends (EOF, e.g. Ctrl-D) or the user
    interrupts it (Ctrl-C); either one makes the function return instead of
    ending the program with a traceback.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Paths of the documents to index. Defaults to the three
            sample files of the demo.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # End the pending prompt line so the shell prompt starts cleanly.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Build the inverted-index engine and start the interactive query prompt.
engine = BOWInvertedIndexEngine()
main(engine)

在这里插入图片描述

4. 增加缓存

对于重复的查询,可以直接从缓存中返回结果。缓存可以考虑使用 LRU 算法,此处借助第三方库 pylru 仅作示意。

import re
import pylru

class SearchEngineBase(object):
    """Abstract base class shared by the search-engine variants.

    Subclasses implement ``processCorpus`` to index one document and
    ``search`` to answer a query; ``addCorpus`` only handles file loading.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at ``filePath`` and index its text.

        The path itself is used as the document id passed to
        ``processCorpus``.

        Args:
            filePath: Path of the text file to load.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so non-ASCII characters decode the same way on every
                platform instead of depending on the locale's default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of documents matching ``query``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# 词袋反向索引模型
class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index (word -> doc ids)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps each word to the list of ids of the documents containing it,
        # in the order those documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append ``id`` to the posting list of every distinct word in ``text``."""
        for word in self.textToWords(text):
            # Create the posting list on first sight of the word.
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents that contain every word of ``query``.

        The result is the intersection of the query words' posting lists,
        listed in the order the documents were indexed. Unlike a pointer
        merge, this does not require posting lists to be sorted by id, but
        ids must be hashable.

        Returns:
            A new list of matching ids. It is empty when the query contains
            no words or when any query word is absent from the index.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Empty or punctuation-only query: there is nothing to look up.
            return []

        postings = []
        for word in queryWords:
            ids = self.invertedIndex.get(word)
            if ids is None:
                # If one word is missing from the index, no document can
                # contain the whole query.
                return []
            postings.append(ids)

        # Scan the shortest posting list and keep ids present in all others.
        postings.sort(key=len)
        shortest = postings[0]
        others = [set(ids) for ids in postings[1:]]
        return [
            docId
            for docId in shortest
            if all(docId in ids for ids in others)
        ]

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in ``text``.

        A word is a maximal run of regex word characters; every other
        character acts as a separator. Words are kept case-sensitive.
        """
        return set(re.findall(r'\w+', text))
    
class LRUCache(object):
    """Small has/get/set facade over a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        # ``size`` is passed straight through to pylru.lrucache.
        backing = pylru.lrucache(size)
        self.cache = backing

    def has(self, key):
        """Return whether ``key`` is currently present in the cache."""
        present = key in self.cache
        return present

    def get(self, key):
        """Return the value stored under ``key`` via a subscript lookup."""
        value = self.cache[key]
        return value

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        self.cache[key] = value
        
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that memoises query results in an LRU cache."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # LRUCache.__init__ is not reached through the super() chain above,
        # so the cache has to be initialised explicitly.
        LRUCache.__init__(self)

    def processCorpus(self, id, text):
        """Index the document, then drop every cached result.

        Results cached before this call were computed without the new
        document and could otherwise be served stale.
        """
        super(BOWInvertedIndexEngineWithCache, self).processCorpus(id, text)
        self.cache.clear()

    def search(self, query):
        """Return matching ids for ``query``, serving repeats from the cache.

        The cache stores and hands out copies of the result list, so a
        caller mutating its result cannot corrupt later cache hits.
        """
        if self.has(query):
            return list(self.get(query))
        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        self.set(query, list(result))
        return result
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries interactively.

    The prompt loop runs until input ends (EOF, e.g. Ctrl-D) or the user
    interrupts it (Ctrl-C); either one makes the function return instead of
    ending the program with a traceback.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Paths of the documents to index. Defaults to the three
            sample files of the demo.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # End the pending prompt line so the shell prompt starts cleanly.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Build the cached inverted-index engine and start the interactive query prompt.
engine = BOWInvertedIndexEngineWithCache()
main(engine)

在这里插入图片描述

转自:https://www.zhenxiangsimple.com/2020/03/09/tech/python-spider/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

放羊郎

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值