Using Whoosh
Official documentation: https://whoosh.readthedocs.io/en/latest/quickstart.html
The main goals are:
(1) add a record to Whoosh when the corresponding data model instance is created (see the sketch after this list);
(2) remove the record from Whoosh when the instance is deleted;
(3) support Chinese queries;
(4) support fuzzy queries.
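Goals (1) and (2) boil down to keeping the index in sync with the database. A minimal sketch of what that could look like with SQLAlchemy mapper events is shown below; the Post model and the two listener bodies are assumptions for illustration only, not code from this post.
# Hypothetical sketch: keep the Whoosh index in sync with a SQLAlchemy model.
# Requires SQLAlchemy 1.4+ for this declarative_base import.
from sqlalchemy import Column, Integer, String, event
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Post(Base):  # assumed model, not defined in this post
    __tablename__ = "posts"
    id = Column(Integer, primary_key=True)
    body = Column(String)

def _index_post(mapper, connection, target):
    # would call writer.add_document(path=str(target.id), content=target.body)
    pass

def _unindex_post(mapper, connection, target):
    # would call writer.delete_by_term("path", str(target.id))
    pass

event.listen(Post, "after_insert", _index_post)
event.listen(Post, "after_delete", _unindex_post)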
Creating the storage directory
def get_search():
    """Create the storage directory for the index."""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path
Defining the index schema
class MySchema(SchemaClass):
    """Index schema: path and title are stored, content and tags are only indexed."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
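Only fields declared with stored=True come back in search results; content here is indexed but not stored, so a hit exposes title and path but not the body. A quick sketch, assuming the index written in the next section already exists:
# Sketch: stored=True controls what a Hit can return.
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir(get_search())
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("document")
    for hit in searcher.search(query):
        print(hit["title"], hit["path"])  # stored fields are available on the hit
        # hit["content"] would raise KeyError: the field is indexed but not stored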
Writing to the index
def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() builds a new, empty index in the directory and returns it
    ix = index.create_in(path, schema)
    writer = ix.writer()
    # add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()
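Note that index.create_in() always starts a new, empty index and wipes anything already in the directory. When the function may run repeatedly, a common pattern is to create the index only the first time; a minimal sketch:
# Sketch: create the index only if it does not exist yet, otherwise reuse it.
from whoosh import index

def open_or_create_index(path, schema):
    if index.exists_in(path):
        return index.open_dir(path)
    return index.create_in(path, schema)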
Reading from the index
def read_index():
    """Search the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # wildcard query against the content field
        query = QueryParser("content", ix.schema).parse("*do*")
        result = searcher.search(query)
        print(result)
        print(list(result))
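Goal (4), fuzzy matching, is not enabled in QueryParser by default; Whoosh ships a FuzzyTermPlugin that adds the term~ syntax. A hedged sketch, reusing the schema and helpers above:
from whoosh import index
from whoosh.qparser import QueryParser, FuzzyTermPlugin

def fuzzy_read_index():
    """Sketch: "documnt~" also matches "document" (edit distance 1)."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        parser.add_plugin(FuzzyTermPlugin())
        query = parser.parse(u"documnt~")
        print(list(searcher.search(query)))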
Deleting from the index
def delete_index():
    """Delete a document from the index."""
    # open the existing index; calling create_in() here would wipe it first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()
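For goals (1) and (2) it is often more convenient to replace a document than to delete and re-add it. Whoosh's update_document() deletes any existing document whose unique fields match and then adds the new version; for that to work, path would have to be declared with unique=True (ID(stored=True, unique=True)), which the schema above does not do yet. A small sketch under that assumption:
from whoosh import index

def update_index():
    """Sketch: replace the document at path "/b" in one call."""
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # update_document() matches on fields marked unique=True (assumed here
    # for path); the old version is deleted and this one is added.
    writer.update_document(title=u"my second document, edited",
                           content=u"this is the edited second document",
                           path=u"/b", tags=u"second short")
    writer.commit()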
Complete test code
import os
from whoosh import index
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser
def get_search():
    """Create the storage directory for the index."""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


class MySchema(SchemaClass):
    """Index schema: path and title are stored, content and tags are only indexed."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    ix = index.create_in(path, schema)
    writer = ix.writer()
    # add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()


def delete_index():
    """Delete a document from the index."""
    # open the existing index; recreating it here would wipe the data first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Search the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # query = QueryParser("content", ix.schema).parse("*")
        query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        print(result)
        print(list(result))
if __name__ == "__main__":
write_index()
read_index()
delete_index()
Combining Whoosh with jieba segmentation
Building a Chinese tokenizer with jieba
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # segment the text with jieba's search-oriented tokenizer
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                # note: value.find(w) locates the first occurrence only, so
                # offsets of repeated words are approximate
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            # yield one token per segment
            yield t
def get_analyzer():
    """Return the jieba-based analyzer."""
    return ChineseTokenizer()
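A bare Tokenizer already works as a Whoosh analyzer, but it can also be chained with Whoosh's built-in filters using the | operator. A possible variant of get_analyzer() under that assumption (the choice of LowercaseFilter is just an example):
from whoosh.analysis import LowercaseFilter

def get_analyzer():
    """Sketch: jieba tokenizer piped into a lowercase filter."""
    return ChineseTokenizer() | LowercaseFilter()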
Specifying the analyzer on the relevant field
class MySchema(SchemaClass):
    """Index schema: content now uses a Chinese analyzer and is stored."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD
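The schema above uses jieba's bundled ChineseAnalyzer; the hand-written tokenizer from the previous section could be wired in the same way, e.g.:
# Alternative (sketch): use the custom jieba-based analyzer defined above
# instead of jieba's bundled ChineseAnalyzer.
content = TEXT(stored=True, analyzer=get_analyzer())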
Highlighting matches when reading results
def read_index():
    """Search the index and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # highlight the matched terms in the search results
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))
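highlights() uses a default fragmenter and HTML formatter; both can be swapped on the results object before iterating. A small sketch (the WholeFragmenter and the em tag are only assumptions for demonstration):
from whoosh import highlight, index
from whoosh.qparser import QueryParser

def read_index_highlight():
    """Sketch: customize how matched terms are fragmented and marked up."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(u"我们")
        results = searcher.search(query)
        results.fragmenter = highlight.WholeFragmenter()           # keep the whole field value
        results.formatter = highlight.HtmlFormatter(tagname="em")  # wrap matches in <em> tags
        for hit in results:
            print(hit.highlights("content"))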
Complete test code
import os
import jieba
from jieba.analyse import ChineseAnalyzer
from whoosh import index
from whoosh.analysis import Tokenizer, Token
from whoosh.compat import text_type
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # segment the text with jieba's search-oriented tokenizer
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            # yield one token per segment
            yield t


def get_analyzer():
    """Return the jieba-based analyzer."""
    return ChineseTokenizer()


class MySchema(SchemaClass):
    """Index schema: content uses a Chinese analyzer and is stored."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD


def get_search():
    """Create the storage directory for the index."""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    ix = index.create_in(path, schema)
    writer = ix.writer()
    # add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.add_document(title=u"First document", path=u"/c", content=u"先生说我们都是好学生")
    writer.add_document(title=u"Second document", path=u"/d", content=u"我们要树立科学发展观")
    writer.commit()


def delete_index():
    """Delete a document from the index."""
    # open the existing index; recreating it here would wipe the data first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Search the index and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # highlight the matched terms in the search results
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))
if __name__ == "__main__":
write_index()
read_index()
# delete_index()
Using Elasticsearch
Reference: using full-text search in Flask