1.Python代码
1.1 构建索引
def build_index(file_path):
    """Index every line of a UTF-8 text file into the Whoosh index "indexdir".

    Each line of the file becomes one document whose ``title`` is
    "<n> line" (1-based line number), whose ``path`` is ``file_path`` and
    whose ``content`` is the text of the line.

    Args:
        file_path: Path of the UTF-8 encoded text file to index. A byte
            string is decoded as UTF-8 before being stored in the ``path``
            field; text is stored as given.

    Returns:
        A status message ending with the elapsed time in seconds.

    Raises:
        IOError / OSError: If the file cannot be opened or the index
            directory cannot be created.
        UnicodeDecodeError: If the file (or a byte-string path) is not
            valid UTF-8.
    """
    import io
    import os
    from timeit import default_timer

    index_dir = "indexdir"

    # default_timer is available on both Python 2 and 3; time.clock() is
    # platform-dependent and no longer exists on Python 3.8+.
    start_time = default_timer()

    # Whoosh fields take text, so normalise a byte-string path once here
    # instead of calling .decode() unconditionally (which breaks on text).
    path_text = file_path
    if isinstance(path_text, bytes):
        path_text = path_text.decode("utf-8")

    schema = Schema(title=TEXT(stored=True), path=ID(stored=False), content=TEXT(stored=True))

    # Open the input first: create_in() replaces any index already present
    # in the directory, so a bad path should fail before that happens.
    with io.open(file_path, "r", encoding="utf-8") as filereader:
        # create_in() needs the target directory to exist.
        if not os.path.isdir(index_dir):
            os.makedirs(index_dir)
        ix = create_in(index_dir, schema)
        # As a context manager the writer commits on normal exit and
        # cancels (releasing the write lock) if an exception escapes.
        with ix.writer() as writer:
            for num, row in enumerate(filereader, 1):
                writer.add_document(title=u"%d line" % num, path=path_text, content=row)

    elapsed = default_timer() - start_time
    return "Step One: 索引已经构建完成--------------------- " + str(elapsed)
1.2 检索
def search_word(search):
    """Search the ``content`` field of the "indexdir" index and print the hits.

    Prints a status line with the elapsed time, then pretty-prints every
    matching hit (no result limit).

    Args:
        search: The query string, in Whoosh query syntax. A byte string is
            decoded as UTF-8 before parsing.

    Returns:
        The string "Finished.".

    Raises:
        UnicodeDecodeError: If a byte-string query is not valid UTF-8.
    """
    from timeit import default_timer

    # default_timer is available on both Python 2 and 3; time.clock() is
    # platform-dependent and no longer exists on Python 3.8+.
    start_time = default_timer()

    # The indexed content is decoded as UTF-8, so decode a byte-string query
    # the same way rather than leaving byte handling to the parser.
    # NOTE(review): assumes the caller's input encoding is UTF-8 - confirm.
    if isinstance(search, bytes):
        search = search.decode("utf-8")

    ix = open_dir("indexdir")
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(search)
        results = searcher.search(query, limit=None)
        elapsed = default_timer() - start_time
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Step Two: 搜索已经完成--------------------- " + str(elapsed))
        # Printed while the searcher is still open, since the hits are
        # produced by it.
        pprint.pprint(results[:])
    return "Finished."
1.3 测试
if __name__ == '__main__':
    def _run_step(step, argument, fallback_message):
        """Call ``step(argument)`` and always print a status line.

        The step's return value is printed on success. If the step raises,
        ``fallback_message`` is printed instead and the exception keeps
        propagating, which ends the script before any later step runs.
        """
        status = fallback_message
        try:
            status = step(argument)
        finally:
            print(status)

    # Step one: ask for the file to index and build the index from it.
    index_source = raw_input("Please input the path of file you want to build index: ")
    _run_step(build_index, index_source, "Something went wrong.......")

    # Step two: ask for a query and search the freshly built index.
    keyword = raw_input("Please input the word you want to search: ")
    _run_step(search_word, keyword, "Something went wrong......")
2.参考资料
1.https://whoosh.readthedocs.io/en/latest/releases/index.html