目录
布尔查询之BSBI与索引压缩
使用斯坦福大学CS 276 / LING 286: Information Retrieval and Web Search课程的代码框架来实现。
代码描述:
-
对文档块逐个扫描,得到所有的词项-文档ID对;
-
在每个文档块的扫描过程中,构建临时倒排索引文件(.index),可将倒排索引文件压缩,并存储以词项ID为主键的,描述索引列表的元组(开始位置,出现在文档的数目,bytes长);
-
最后进行合并,将排序好的小文件进行归并排序,得到最终的倒排索引,同样记录以词项ID为主键的元组(开始位置,出现在文档的数目,bytes长);
-
布尔查询将用空格分隔的词项查询出的文档id合并。
-
压缩方法:先使用gap-encoding,再使用VB编码(可变长字节编码)。
数据集与代码
介绍几个重要的类。
数据集
数据目录下有10个子目录(命名0-9),每一个子目录下的文件都包含一个独立网页的内容。
IdMap
用到字典来将字符串转换为数字,用列表来将数字转换为字符串。
class IdMap:
    """Bidirectional mapping between strings and dense integer ids.

    Strings are assigned consecutive ids (0, 1, 2, ...) in first-seen
    order; an id maps back to its string in O(1) via a list.
    """

    def __init__(self):
        self.str_to_id = {}   # str -> int id
        self.id_to_str = []   # id (list index) -> str

    def __len__(self):
        """Return the number of distinct strings stored."""
        return len(self.id_to_str)

    def _get_str(self, i):
        """Return the string with id *i* (IndexError if unknown)."""
        return self.id_to_str[i]

    def _get_id(self, s):
        """Return the id of string *s*, assigning a fresh id on first use."""
        if s not in self.str_to_id:  # direct membership test, no .keys()
            self.str_to_id[s] = len(self.id_to_str)
            self.id_to_str.append(s)
        return self.str_to_id[s]

    def __getitem__(self, key):
        """Dispatch on key type: int key -> string, str key -> id.

        `type(...) is` is kept deliberately (instead of isinstance) so
        that bool keys are rejected rather than treated as ints.
        """
        if type(key) is int:
            return self._get_str(key)
        elif type(key) is str:
            return self._get_id(key)
        else:
            raise TypeError
InvertedIndex
之后会在此基础上构建它的子类InvertedIndexWriter, InvertedIndexIterator 和 InvertedIndexMapper
class InvertedIndex:
    """Base class bundling an on-disk postings file with its metadata.

    The metadata maps each term id to a tuple
    (start offset, number of postings, length in bytes) locating that
    term's encoded postings inside the .index file.
    """

    def __init__(self, index_name, postings_encoding=None, directory=''):
        # Paths of the postings file and the pickled metadata file.
        self.index_file_path = os.path.join(directory, index_name + '.index')
        self.metadata_file_path = os.path.join(directory, index_name + '.dict')
        # Fall back to uncompressed postings when no codec is supplied.
        self.postings_encoding = (UncompressedPostings
                                  if postings_encoding is None
                                  else postings_encoding)
        self.directory = directory
        self.postings_dict = {}   # term id -> (start, n_postings, n_bytes)
        self.terms = []           # term ids in on-disk order

    def __enter__(self):
        """Open the index file and restore metadata from disk."""
        self.index_file = open(self.index_file_path, 'rb+')
        with open(self.metadata_file_path, 'rb') as meta:
            self.postings_dict, self.terms = pkl.load(meta)
        self.term_iter = iter(self.terms)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Close the index file and persist metadata back to disk."""
        self.index_file.close()
        with open(self.metadata_file_path, 'wb') as meta:
            pkl.dump([self.postings_dict, self.terms], meta)
InvertedIndexWriter
倒排表不会存储在内存中而是直接写入到磁盘里
class InvertedIndexWriter(InvertedIndex):
    """Write-only inverted index: postings go straight to disk, not memory."""

    def __enter__(self):
        # Truncate/create the index file; metadata is persisted by
        # InvertedIndex.__exit__ when the context closes.
        self.index_file = open(self.index_file_path, 'wb+')
        return self

    def append(self, term, postings_list):
        """Append *postings_list* for *term* at the end of the index file.

        Terms must be appended in sorted order.  Records the tuple
        (start offset, number of postings, encoded length in bytes)
        in self.postings_dict under *term*.
        """
        encoded = self.postings_encoding.encode(postings_list)
        if self.terms:  # truthiness instead of `!= []`
            # New postings start where the previous term's bytes ended.
            prev_start, _, prev_len = self.postings_dict[self.terms[-1]]
            start = prev_start + prev_len
        else:
            start = 0
        self.postings_dict[term] = (start, len(postings_list), len(encoded))
        self.terms.append(term)
        self.index_file.write(encoded)
InvertedIndexIterator
迭代地从磁盘上每次读取文件的一个倒排列表
class InvertedIndexIterator(InvertedIndex):
    """Read an inverted index from disk one (term, postings list) at a time."""

    def __enter__(self):
        """Adds an initialization_hook to the __enter__ function of super class"""
        super().__enter__()
        self._initialization_hook()
        return self

    def _initialization_hook(self):
        """Use this function to initialize the iterator"""
        # Sequential cursor over self.terms; rewind the file pointer so
        # reads proceed from the first postings list.
        self.start = 0
        self.index_file.seek(0)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next (term, postings list) pair; StopIteration at end."""
        if self.start >= len(self.terms):
            raise StopIteration
        term = self.terms[self.start]
        self.start += 1
        # Only the byte length matters here: reads are sequential.
        _, _, n_bytes = self.postings_dict[term]
        postings = self.postings_encoding.decode(self.index_file.read(n_bytes))
        return (term, postings)

    def delete_from_disk(self):
        """Mark this (intermediate) index for deletion when the context exits."""
        self.delete_upon_exit = True

    def __exit__(self, exception_type, exception_value, traceback):
        self.index_file.close()
        if hasattr(self, 'delete_upon_exit') and self.delete_upon_exit:
            # Intermediate index: drop both the postings and metadata files.
            os.remove(self.index_file_path)
            os.remove(self.metadata_file_path)
        else:
            with open(self.metadata_file_path, 'wb') as meta:
                pkl.dump([self.postings_dict, self.terms], meta)
InvertedIndexMapper
找到对应terms在索引文件中位置并取出它的倒排记录表
class InvertedIndexMapper(InvertedIndex):
    """Random-access view: locate one term's postings in the index file."""

    def __getitem__(self, key):
        return self._get_postings_list(key)

    def _get_postings_list(self, term):
        """Return the decoded postings list stored for *term* ([] if absent).

        Fix: the original tested `term >= len(self.postings_dict)`, which
        only detects unknown terms when term ids are dense 0..n-1 and
        raised KeyError otherwise; a membership test is correct for any key.
        """
        if term not in self.postings_dict:
            return []
        start, _, n_bytes = self.postings_dict[term]
        self.index_file.seek(start, 0)  # absolute seek to this term's bytes
        return self.postings_encoding.decode(self.index_file.read(n_bytes))
BSBIIndex
class BSBIIndex:
    """Blocked sort-based indexing: build, merge and query an inverted index.

    Each sub-directory of *data_dir* is one block; every block is inverted
    into an intermediate on-disk index, and the intermediates are merged
    into the final index named *index_name* under *output_dir*.
    """

    def __init__(self, data_dir, output_dir, index_name="BSBI",
                 postings_encoding=None):
        self.term_id_map = IdMap()
        self.doc_id_map = IdMap()
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.index_name = index_name
        self.postings_encoding = postings_encoding
        # Stores names of intermediate indices
        self.intermediate_indices = []

    def save(self):
        """Persist the term and document id maps next to the index."""
        with open(os.path.join(self.output_dir, 'terms.dict'), 'wb') as f:
            pkl.dump(self.term_id_map, f)
        with open(os.path.join(self.output_dir, 'docs.dict'), 'wb') as f:
            pkl.dump(self.doc_id_map, f)

    def load(self):
        """Restore the term and document id maps written by save()."""
        with open(os.path.join(self.output_dir, 'terms.dict'), 'rb') as f:
            self.term_id_map = pkl.load(f)
        with open(os.path.join(self.output_dir, 'docs.dict'), 'rb') as f:
            self.doc_id_map = pkl.load(f)

    def index(self):
        """Build the full index: invert each block, then merge all blocks."""
        for block_dir_relative in sorted(next(os.walk(self.data_dir))[1]):
            td_pairs = self.parse_block(block_dir_relative)
            index_id = 'index_' + block_dir_relative
            self.intermediate_indices.append(index_id)
            with InvertedIndexWriter(index_id, directory=self.output_dir,
                                     postings_encoding=self.postings_encoding
                                     ) as index:
                self.invert_write(td_pairs, index)
                td_pairs = None  # release the pair list before the next block
        self.save()
        with InvertedIndexWriter(self.index_name, directory=self.output_dir,
                                 postings_encoding=self.postings_encoding
                                 ) as merged_index:
            # ExitStack keeps every intermediate iterator open for the merge.
            with contextlib.ExitStack() as stack:
                indices = [stack.enter_context(
                    InvertedIndexIterator(index_id,
                                          directory=self.output_dir,
                                          postings_encoding=self.postings_encoding))
                           for index_id in self.intermediate_indices]
                self.merge(indices, merged_index)

    def parse_block(self, block_dir_relative):
        """Scan one block directory and return its (term id, doc id) pairs.

        Tokenization is lowercase whitespace splitting; duplicate pairs
        within a document are collapsed by the set.
        """
        pairs = set()
        block_path = os.path.join(self.data_dir, block_dir_relative)
        for file_name in os.listdir(block_path):
            file_rel = os.path.join(block_dir_relative, file_name)
            # `with` closes the handle (the original leaked open files).
            with open(os.path.join(self.data_dir, file_rel)) as fh:
                tokens = fh.read().lower().split()
            doc_id = self.doc_id_map[file_rel]
            for token in tokens:
                pairs.add((self.term_id_map[token], doc_id))
        return list(pairs)

    def invert_write(self, td_pairs, index):
        """Group *td_pairs* by term and append them to *index* in sorted order."""
        grouped = {}
        for term_id, doc_id in td_pairs:
            grouped.setdefault(term_id, []).append(doc_id)
        for term_id in sorted(grouped):  # index.append requires sorted terms
            doc_ids = grouped[term_id]
            doc_ids.sort()
            index.append(term_id, doc_ids)

    def retrieve(self, query):
        """Boolean AND query: names of docs containing every query term.

        Returns [] for an empty query (the original raised IndexError) and
        for any unseen term — checked without mutating term_id_map (the
        original's IdMap lookup assigned a fresh id as a side effect).
        """
        if len(self.term_id_map) == 0 or len(self.doc_id_map) == 0:
            self.load()
        term_ids = []
        for term in query.split():
            if term not in self.term_id_map.str_to_id:
                return []  # unknown term -> empty intersection, no mutation
            term_ids.append(self.term_id_map[term])
        if not term_ids:
            return []
        with InvertedIndexMapper(self.index_name,
                                 directory=self.output_dir,
                                 postings_encoding=self.postings_encoding
                                 ) as mapper:
            result = mapper[term_ids[0]]
            for term_id in term_ids[1:]:
                result = sorted_intersect(mapper[term_id], result)
        return [self.doc_id_map[doc_id] for doc_id in result]

    # heapq.merge lazily interleaves the already-sorted block iterators.
    def merge(self, indices, merged_index):
        """Merge sorted intermediate indices into *merged_index*.

        Fix: the original seeded the current term id with 0 and always
        flushed at the end, emitting a spurious empty postings list for
        term 0 when the input was empty or term ids did not start at 0;
        a None sentinel avoids both.
        """
        curr_term = None
        postings = []
        for term_id, plist in heapq.merge(*indices, key=lambda pair: pair[0]):
            if curr_term is not None and term_id != curr_term:
                postings.sort()
                merged_index.append(curr_term, postings)
                postings = []
            curr_term = term_id
            postings.extend(plist)
        if curr_term is not None:  # flush the final term, skip if no input
            postings.sort()
            merged_index.append(curr_term, postings)
sorted_intersect
遍历两个有序列表并在线性时间内合并
def sorted_intersect(list1, list2):
    """Intersect two sorted lists in O(len(list1) + len(list2)).

    Two-pointer merge walk: advance whichever side holds the smaller
    value, collect values present in both lists.
    """
    common = []
    a = b = 0
    len1, len2 = len(list1), len(list2)
    while a < len1 and b < len2:
        left, right = list1[a], list2[b]
        if left < right:
            a += 1
        elif right < left:
            b += 1
        else:
            common.append(left)
            a += 1
            b += 1
    return common
CompressedPostings
对索引进行压缩,对gap进行编码
class CompressedPostings:
    """Gap + variable-byte (VB) compressed postings lists.

    encode(): sorted doc ids -> gaps (first gap is relative to 0) -> VB
    bytes.  Each gap is written as big-endian base-128 digits with the
    high bit (0x80) set on its final byte, so decode() knows where each
    number ends.
    """

    @staticmethod
    def vbcode(n):
        """Return the VB encoding of non-negative int *n* as a list of ints.

        Digits are collected least-significant first; the terminator bit
        is added to the least-significant digit, then the list is
        reversed so the terminated byte comes last on the wire.
        """
        digits = []
        while True:
            digits.append(n % 128)
            if n < 128:
                break
            n //= 128
        digits[0] += 128  # mark the last wire byte of this number
        digits.reverse()
        return digits

    @staticmethod
    def encode(postings_list):
        """Encode a sorted postings list into compressed bytes."""
        stream = []
        previous = 0
        for doc_id in postings_list:
            # Gap relative to the previous posting (or 0 for the first).
            stream.extend(CompressedPostings.vbcode(doc_id - previous))
            previous = doc_id
        # bytes() accepts the int list directly; no array round-trip needed.
        return bytes(stream)

    @staticmethod
    def decode(encoded_postings_list):
        """Decode bytes produced by encode() back into a postings list."""
        gaps = []
        n = 0
        for byte in bytearray(encoded_postings_list):  # raw byte values
            if byte < 128:
                n = n * 128 + byte  # continuation digit: keep accumulating
            else:
                gaps.append(n * 128 + byte - 128)  # terminated: emit the gap
                n = 0
        # Undo the gap encoding with a running prefix sum.
        doc_ids = []
        total = 0
        for gap in gaps:
            total += gap
            doc_ids.append(total)
        return doc_ids
总结
历时两周,我沉浸在这里涂涂改改,敲敲打打,只感觉时光飞逝。从一头雾水到日思夜想,再到初见雏形,日益完善,我感觉到了一点点功利之外的快乐。
写代码的过程中,我深刻体会到,快速并且精准地写出一些测试代码非常重要。确保每一个模块的完美正确,这样可以避免在最终的测试中焦头烂额。
如君也想尝试,请移步官网CS 276 / LING 286: Information Retrieval and Web Search。
如有错漏之处,欢迎留言。