待测试文档:
1.txt
I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.
2.txt
This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . .
3.txt
And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God’s children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: “Free at last! Free at last! Thank God Almighty, we are free at last!”
1. 简单的全文匹配
将待查找词语直接在所有文本中进行全文搜索
import re
class SearchEngineBase(object):
    """Abstract base class for the search engines in this file.

    ``addCorpus`` reads a file and hands its text to ``processCorpus``;
    subclasses must override ``processCorpus`` and ``search``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at *filePath* and index it under that path.

        Args:
            filePath: path of the text file; it is also used as the
                document id passed to ``processCorpus``.
            encoding: text encoding used to decode the file.  It is passed
                explicitly so the result does not depend on the platform's
                default encoding (the sample documents contain typographic
                quotes, which are not ASCII).
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index *text* under the document id *id* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching *query* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('search not implemented')
# Plain full-text comparison model
class TextSearchEngine(SearchEngineBase):
    """Engine that keeps each document's raw text and does substring search."""

    def __init__(self):
        super(TextSearchEngine, self).__init__()
        # Maps document id -> the full text of that document.
        self.__idToTexts = {}

    def processCorpus(self, id, text):
        """Store *text* unchanged under the document id *id*."""
        self.__idToTexts[id] = text

    def search(self, query):
        """Return the ids of all documents whose text contains *query*.

        The check is a literal, case-sensitive substring test.  Ids are
        returned in the iteration order of the internal dict.
        """
        return [
            docId
            for docId, body in self.__idToTexts.items()
            if query in body
        ]
def main(engine):
    """Index the three sample files, then answer queries interactively.

    Args:
        engine: object providing ``addCorpus(filePath)`` and
            ``search(query)``.

    The prompt repeats until input ends (EOF) or the user presses Ctrl-C;
    the function then returns instead of terminating with a traceback.
    """
    for filePath in ['1.txt', '2.txt', '3.txt']:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and end the session.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
# Run the interactive demo with the plain full-text engine.
engine = TextSearchEngine()
main(engine)
2. 词袋模型
将每个文本中的词语去重后放入词袋(集合),查询时只需判断待查找的词语是否都出现在某个文本的词袋中。这样做是因为一个文本中往往包含大量重复的单词,直接保存并搜索全文会浪费空间。
import re
class SearchEngineBase(object):
    """Abstract base class for the search engines in this file.

    ``addCorpus`` reads a file and hands its text to ``processCorpus``;
    subclasses must override ``processCorpus`` and ``search``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at *filePath* and index it under that path.

        Args:
            filePath: path of the text file; it is also used as the
                document id passed to ``processCorpus``.
            encoding: text encoding used to decode the file.  It is passed
                explicitly so the result does not depend on the platform's
                default encoding (the sample documents contain typographic
                quotes, which are not ASCII).
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index *text* under the document id *id* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching *query* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('search not implemented')
# Bag-of-words model
class BowEngine(SearchEngineBase):
    """Engine that stores each document as the set of its distinct words."""

    def __init__(self):
        super(BowEngine, self).__init__()
        # Maps document id -> set of words occurring in that document.
        self.__idToWords = {}

    def processCorpus(self, id, text):
        """Reduce *text* to its word set and store it under *id*."""
        self.__idToWords[id] = self.textToWords(text)

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in *text*.

        Every character that is not a regex word character (``\\w``) is
        turned into a space before splitting; letter case is kept as-is.
        """
        cleaned = re.sub(r'[^\w]', ' ', text)
        # Splitting on single spaces yields empty strings between
        # consecutive separators; drop them.
        return {word for word in cleaned.split(' ') if word}

    def search(self, querys):
        """Return the ids of the documents containing every word of *querys*.

        A query with no words matches every stored document, because the
        empty word set is a subset of any bag.
        """
        wanted = self.textToWords(querys)
        return [
            docId
            for docId, bag in self.__idToWords.items()
            if wanted.issubset(bag)
        ]
def main(engine):
    """Index the three sample files, then answer queries interactively.

    Args:
        engine: object providing ``addCorpus(filePath)`` and
            ``search(query)``.

    The prompt repeats until input ends (EOF) or the user presses Ctrl-C;
    the function then returns instead of terminating with a traceback.
    """
    for filePath in ['1.txt', '2.txt', '3.txt']:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and end the session.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
# Run the interactive demo with the bag-of-words engine.
engine = BowEngine()
main(engine)
3. 词袋倒排索引模型
上面的做法是逐个文件检测待查找的词语是否存在;这里改为建立倒排索引,即记录每个词语出现在哪些文件中,查询时取出各个词语对应的文件列表并求交集。
import re
class SearchEngineBase(object):
    """Abstract base class for the search engines in this file.

    ``addCorpus`` reads a file and hands its text to ``processCorpus``;
    subclasses must override ``processCorpus`` and ``search``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at *filePath* and index it under that path.

        Args:
            filePath: path of the text file; it is also used as the
                document id passed to ``processCorpus``.
            encoding: text encoding used to decode the file.  It is passed
                explicitly so the result does not depend on the platform's
                default encoding (the sample documents contain typographic
                quotes, which are not ASCII).
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index *text* under the document id *id* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching *query* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('search not implemented')
# Bag-of-words inverted-index model
class BOWInvertedIndexEngine(SearchEngineBase):
    """Engine backed by an inverted index (word -> ids of documents with it)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps each word to the list of ids of the documents containing it,
        # in the order those documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append *id* to the posting list of every distinct word in *text*."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of the documents containing every word of *query*.

        The result is the intersection of the words' posting lists, without
        duplicates, in the order the documents were indexed.  A query with
        no words, or with a word that was never indexed, yields ``[]``.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Nothing to look up; without this guard an empty query would
            # have no posting list to intersect.
            return []

        postings = []
        for word in queryWords:
            ids = self.invertedIndex.get(word)
            if not ids:
                # One missing word means no document can match the query.
                return []
            postings.append(ids)

        # Walk the shortest posting list and test membership in the others.
        # Unlike a pointer merge that compares ids, this does not require
        # documents to have been indexed in ascending id order.
        postings.sort(key=len)
        others = [set(ids) for ids in postings[1:]]
        result = []
        emitted = set()
        for docId in postings[0]:
            if docId in emitted:
                # The same document was indexed more than once.
                continue
            emitted.add(docId)
            if all(docId in ids for ids in others):
                result.append(docId)
        return result

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in *text*.

        Every character that is not a regex word character (``\\w``) is
        turned into a space before splitting; letter case is kept as-is.
        """
        text = re.sub(r'[^\w]', ' ', text)
        # split() without arguments already discards empty pieces.
        return set(text.split())
def main(engine):
    """Index the three sample files, then answer queries interactively.

    Args:
        engine: object providing ``addCorpus(filePath)`` and
            ``search(query)``.

    The prompt repeats until input ends (EOF) or the user presses Ctrl-C;
    the function then returns instead of terminating with a traceback.
    """
    for filePath in ['1.txt', '2.txt', '3.txt']:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and end the session.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
# Run the interactive demo with the inverted-index engine.
engine = BOWInvertedIndexEngine()
main(engine)
4. 增加缓存
对于重复出现的查询,可以把结果缓存起来,并考虑使用LRU(最近最少使用)算法淘汰旧的缓存项;此处借助pylru只是示意了一个简单的缓存。
import re
import pylru
class SearchEngineBase(object):
    """Abstract base class for the search engines in this file.

    ``addCorpus`` reads a file and hands its text to ``processCorpus``;
    subclasses must override ``processCorpus`` and ``search``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read the file at *filePath* and index it under that path.

        Args:
            filePath: path of the text file; it is also used as the
                document id passed to ``processCorpus``.
            encoding: text encoding used to decode the file.  It is passed
                explicitly so the result does not depend on the platform's
                default encoding (the sample documents contain typographic
                quotes, which are not ASCII).
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index *text* under the document id *id* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching *query* (hook for subclasses).

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError('search not implemented')
# Bag-of-words inverted-index model
class BOWInvertedIndexEngine(SearchEngineBase):
    """Engine backed by an inverted index (word -> ids of documents with it)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps each word to the list of ids of the documents containing it,
        # in the order those documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append *id* to the posting list of every distinct word in *text*."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of the documents containing every word of *query*.

        The result is the intersection of the words' posting lists, without
        duplicates, in the order the documents were indexed.  A query with
        no words, or with a word that was never indexed, yields ``[]``.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Nothing to look up; without this guard an empty query would
            # have no posting list to intersect.
            return []

        postings = []
        for word in queryWords:
            ids = self.invertedIndex.get(word)
            if not ids:
                # One missing word means no document can match the query.
                return []
            postings.append(ids)

        # Walk the shortest posting list and test membership in the others.
        # Unlike a pointer merge that compares ids, this does not require
        # documents to have been indexed in ascending id order.
        postings.sort(key=len)
        others = [set(ids) for ids in postings[1:]]
        result = []
        emitted = set()
        for docId in postings[0]:
            if docId in emitted:
                # The same document was indexed more than once.
                continue
            emitted.add(docId)
            if all(docId in ids for ids in others):
                result.append(docId)
        return result

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words in *text*.

        Every character that is not a regex word character (``\\w``) is
        turned into a space before splitting; letter case is kept as-is.
        """
        text = re.sub(r'[^\w]', ' ', text)
        # split() without arguments already discards empty pieces.
        return set(text.split())
class LRUCache(object):
    """Small has/get/set facade over a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        # *size* is handed straight to pylru.lrucache (presumably the
        # maximum number of entries -- see the pylru documentation).
        self.cache = pylru.lrucache(size)

    def has(self, key):
        """Return whether *key* is currently present in the cache."""
        return key in self.cache

    def get(self, key):
        """Return the value stored under *key* via subscript lookup."""
        return self.cache[key]

    def set(self, key, value):
        """Store *value* under *key*."""
        self.cache[key] = value
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that memoizes query results in an LRU cache."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # SearchEngineBase.__init__ does not call super(), so the chain
        # above never reaches LRUCache; initialise the cache explicitly.
        LRUCache.__init__(self)

    def processCorpus(self, id, text):
        """Index the document and drop every cached result.

        Results cached before this call were computed without the new
        document and could otherwise be served stale.
        """
        super(BOWInvertedIndexEngineWithCache, self).processCorpus(id, text)
        self.cache.clear()

    def search(self, query):
        """Return the matching document ids, served from the cache if possible.

        The cache is keyed by the raw query string.  Copies are stored and
        returned so a caller mutating its result list cannot alter what
        later identical queries receive.
        """
        if self.has(query):
            return list(self.get(query))
        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        self.set(query, list(result))
        return result
def main(engine):
    """Index the three sample files, then answer queries interactively.

    Args:
        engine: object providing ``addCorpus(filePath)`` and
            ``search(query)``.

    The prompt repeats until input ends (EOF) or the user presses Ctrl-C;
    the function then returns instead of terminating with a traceback.
    """
    for filePath in ['1.txt', '2.txt', '3.txt']:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and end the session.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
# Run the interactive demo with the cached inverted-index engine.
engine = BOWInvertedIndexEngineWithCache()
main(engine)
转自:https://www.zhenxiangsimple.com/2020/03/09/tech/python-spider/