On text retrieval in Python

Using Whoosh

Official documentation: https://whoosh.readthedocs.io/en/latest/quickstart.html
The features to implement:
(1) add a record to the Whoosh index when the corresponding data model is created (a sketch for (1) and (2) follows this list);
(2) remove the record from the Whoosh index when the data model is deleted;
(3) support Chinese queries;
(4) support fuzzy queries.
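For (1) and (2), a minimal sketch of how the hooks might look with Flask-SQLAlchemy, assuming a hypothetical Post model (with title and body columns) and the index directory used later in this post; it attaches SQLAlchemy mapper events so the Whoosh index stays in sync with the database:

from sqlalchemy import event
from whoosh import index

def add_to_index(mapper, connection, post):
    """(1) Add the record to the Whoosh index when the model instance is inserted."""
    ix = index.open_dir("/Users/furuiyang/codes/microblog/index")
    writer = ix.writer()
    writer.add_document(path=str(post.id), title=post.title, content=post.body)
    writer.commit()

def remove_from_index(mapper, connection, post):
    """(2) Remove the record from the Whoosh index when the model instance is deleted."""
    ix = index.open_dir("/Users/furuiyang/codes/microblog/index")
    writer = ix.writer()
    writer.delete_by_term("path", str(post.id))
    writer.commit()

# Post, post.title and post.body are assumptions about the data model:
# event.listen(Post, "after_insert", add_to_index)
# event.listen(Post, "after_delete", remove_from_index)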

Creating the storage directory
def get_search():
    """Create the index storage directory and return its path."""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path
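Re-creating an index that already exists wipes its contents, so it can help to guard with whoosh.index.exists_in(); a small sketch built on get_search():

from whoosh import index

def open_or_create_index(schema):
    """Open the index if it already exists, otherwise create it with the given schema."""
    path = get_search()
    if index.exists_in(path):
        return index.open_dir(path)
    return index.create_in(path, schema)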
Defining the index schema
class MySchema(SchemaClass):
    """
    Index schema definition.
    """
    path = ID(stored=True)    # stored=True makes the value retrievable from hits
    title = TEXT(stored=True)
    content = TEXT            # indexed for search but not stored
    tags = KEYWORD
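Only fields declared with stored=True can be read back from a search hit; a short illustration of that behaviour (the helper name is just for this example):

def show_stored_fields(ix, query):
    """Print the stored fields of each hit; non-stored fields are not available here."""
    with ix.searcher() as searcher:
        for hit in searcher.search(query):
            print(hit["title"], hit["path"])  # stored fields can be read from the hit
            # hit["content"] would raise KeyError because content is indexed but not stored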
Writing index content
def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()
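The writer can also be used as a context manager, which commits automatically when the block exits; a minimal sketch for appending a single document to the existing index instead of re-creating it:

from whoosh import index

def add_one_document(title, content, path, tags):
    """Append one document to the existing index; commit happens on leaving the with-block."""
    ix = index.open_dir(get_search())
    with ix.writer() as writer:
        writer.add_document(title=title, content=content, path=path, tags=tags)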
Reading from the index
def read_index():
    """Read search results back from the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Wildcard query against the content field
        query = QueryParser("content", ix.schema).parse("*do*")
        result = searcher.search(query)
        print(result)
        print(list(result))
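The *do* pattern above already covers wildcard matching, which QueryParser supports by default. For true edit-distance fuzzy queries (goal (4)), the FuzzyTermPlugin has to be enabled explicitly; a sketch:

from whoosh import index
from whoosh.qparser import QueryParser, FuzzyTermPlugin

def fuzzy_read_index():
    """Search with an edit-distance fuzzy term: 'documnt~' should match 'document'."""
    ix = index.open_dir(get_search())
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())  # enables the term~ / term~2 query syntax
    query = parser.parse(u"documnt~")
    with ix.searcher() as searcher:
        print(list(searcher.search(query)))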
Deleting index content
def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()
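When a record changes rather than disappears, writer.update_document() replaces the existing entry instead of requiring a delete plus add. It matches on unique fields, so this sketch assumes path is declared as ID(unique=True, stored=True) in the schema:

def update_index():
    """Replace the document whose path is '/b' (assumes path is a unique=True field)."""
    ix = index.open_dir(get_search())
    writer = ix.writer()
    writer.update_document(title=u"my second document (revised)",
                           content=u"this is the revised second document",
                           path=u"/b", tags=u"second short")
    writer.commit()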
Complete test code
import os

from whoosh import index
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser


def get_search():
    """建立存储目录"""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


class MySchema(SchemaClass):
    """
    建立索引模式
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()


def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Read search results back from the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Match-all query on the title field; the commented line queries content instead
        # query = QueryParser("content", ix.schema).parse("*")
        query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        print(result)
        print(list(result))


if __name__ == "__main__":
    write_index()
    read_index()
    delete_index()

Combining Whoosh with jieba word segmentation

Building a Chinese tokenizer with jieba
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # Segment the text with jieba in search mode
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos+value.find(w)
            if chars:
                t.startchar = start_char+value.find(w)
                t.endchar = start_char+value.find(w)+len(w)
            # Yield each segmented word as a token
            yield t


def get_analyzer():
    """
    Return the analyzer (in Whoosh a tokenizer can serve directly as an analyzer).
    :return:
    """
    return ChineseTokenizer()
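Since a Whoosh analyzer is just a callable that yields Token objects, the tokenizer above can be plugged into a field directly. A sketch of that variant (the complete code below uses jieba's built-in ChineseAnalyzer instead; MyChineseSchema is only an illustrative name):

class MyChineseSchema(SchemaClass):
    """Schema variant that uses the custom jieba-backed tokenizer as the content analyzer."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=get_analyzer())
    tags = KEYWORD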
Specifying the analyzer on the target field
class MySchema(SchemaClass):
    """
    Index schema definition (content uses jieba's ChineseAnalyzer).
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD

Highlighting the results when reading them back

def read_index():
    """Read search results back and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Chinese query against the content field
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # Highlight the matched terms in the content of each hit
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))
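The excerpt style can be tuned by setting the fragmenter and formatter on the Results object before calling hit.highlights(); a small sketch using Whoosh's built-in ContextFragmenter and HtmlFormatter:

from whoosh import highlight, index
from whoosh.qparser import QueryParser

def read_index_highlighted():
    """Print highlighted excerpts with a wider context window and <em> tags around matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("我们")
        result = searcher.search(query)
        result.fragmenter = highlight.ContextFragmenter(maxchars=100, surround=40)
        result.formatter = highlight.HtmlFormatter(tagname="em")
        for hit in result:
            print(hit.highlights("content"))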
Complete test code
import os

import jieba
from jieba.analyse import ChineseAnalyzer
from whoosh import index
from whoosh.analysis import Tokenizer, Token
from whoosh.compat import text_type
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser


class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # Segment the text with jieba in search mode
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos+value.find(w)
            if chars:
                t.startchar = start_char+value.find(w)
                t.endchar = start_char+value.find(w)+len(w)
            # Yield each segmented word as a token
            yield t


def get_analyzer():
    """
    Return the analyzer (in Whoosh a tokenizer can serve directly as an analyzer).
    :return:
    """
    return ChineseTokenizer()


class MySchema(SchemaClass):
    """
    建立索引模式
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD


def get_search():
    """建立存储目录"""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents (the last two contain Chinese text for the analyzer to segment)
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.add_document(title=u"First document", path=u"/c", content=u"先生说我们都是好学生")
    writer.add_document(title=u"Second document", path=u"/d", content=u"我们要树立科学发展观")
    writer.commit()


def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Read search results back and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Chinese query against the content field
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # Highlight the matched terms in the content of each hit
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))


if __name__ == "__main__":
    write_index()
    read_index()
    # delete_index()

Using Elasticsearch

Reference: using full-text search in Flask
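For comparison, a minimal sketch with the official elasticsearch Python client (7.x-style body= calls; the index name, document fields and local node address are assumptions):

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # assumed local node

# Index a document; the index is created on first use with dynamic mapping
es.index(index="posts", id=1, body={"title": "my document", "content": "this is my document"})
es.indices.refresh(index="posts")

# Full-text match query on the content field
resp = es.search(index="posts", body={"query": {"match": {"content": "document"}}})
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["title"])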
