On text retrieval in Python

Using Whoosh

Official documentation: https://whoosh.readthedocs.io/en/latest/quickstart.html
The features to implement:
(1) add a record to the Whoosh index when the corresponding data model is created (a sketch for (1) and (2) follows this list);
(2) remove the record from the Whoosh index when the data model is deleted;
(3) support Chinese queries;
(4) support fuzzy queries.
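For (1) and (2), a minimal sketch of how the hooks might look with Flask-SQLAlchemy, assuming a hypothetical Post model (with title and body columns) and the index directory used later in this post; it attaches SQLAlchemy mapper events so the Whoosh index stays in sync with the database:

from sqlalchemy import event
from whoosh import index

def add_to_index(mapper, connection, post):
    """(1) Add the record to the Whoosh index when the model instance is inserted."""
    ix = index.open_dir("/Users/furuiyang/codes/microblog/index")
    writer = ix.writer()
    writer.add_document(path=str(post.id), title=post.title, content=post.body)
    writer.commit()

def remove_from_index(mapper, connection, post):
    """(2) Remove the record from the Whoosh index when the model instance is deleted."""
    ix = index.open_dir("/Users/furuiyang/codes/microblog/index")
    writer = ix.writer()
    writer.delete_by_term("path", str(post.id))
    writer.commit()

# Post, post.title and post.body are assumptions about the data model:
# event.listen(Post, "after_insert", add_to_index)
# event.listen(Post, "after_delete", remove_from_index)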

Creating the storage directory
def get_search():
    """Create the index storage directory and return its path."""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path
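Re-creating an index that already exists wipes its contents, so it can help to guard with whoosh.index.exists_in(); a small sketch built on get_search():

from whoosh import index

def open_or_create_index(schema):
    """Open the index if it already exists, otherwise create it with the given schema."""
    path = get_search()
    if index.exists_in(path):
        return index.open_dir(path)
    return index.create_in(path, schema)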
Defining the index schema
class MySchema(SchemaClass):
    """
    Index schema definition.
    """
    path = ID(stored=True)    # stored=True makes the value retrievable from hits
    title = TEXT(stored=True)
    content = TEXT            # indexed for search but not stored
    tags = KEYWORD
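Only fields declared with stored=True can be read back from a search hit; a short illustration of that behaviour (the helper name is just for this example):

def show_stored_fields(ix, query):
    """Print the stored fields of each hit; non-stored fields are not available here."""
    with ix.searcher() as searcher:
        for hit in searcher.search(query):
            print(hit["title"], hit["path"])  # stored fields can be read from the hit
            # hit["content"] would raise KeyError because content is indexed but not stored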
Writing index content
def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()
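The writer can also be used as a context manager, which commits automatically when the block exits; a minimal sketch for appending a single document to the existing index instead of re-creating it:

from whoosh import index

def add_one_document(title, content, path, tags):
    """Append one document to the existing index; commit happens on leaving the with-block."""
    ix = index.open_dir(get_search())
    with ix.writer() as writer:
        writer.add_document(title=title, content=content, path=path, tags=tags)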
Reading from the index
def read_index():
    """Read search results back from the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Wildcard query against the content field
        query = QueryParser("content", ix.schema).parse("*do*")
        result = searcher.search(query)
        print(result)
        print(list(result))
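The *do* pattern above already covers wildcard matching, which QueryParser supports by default. For true edit-distance fuzzy queries (goal (4)), the FuzzyTermPlugin has to be enabled explicitly; a sketch:

from whoosh import index
from whoosh.qparser import QueryParser, FuzzyTermPlugin

def fuzzy_read_index():
    """Search with an edit-distance fuzzy term: 'documnt~' should match 'document'."""
    ix = index.open_dir(get_search())
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())  # enables the term~ / term~2 query syntax
    query = parser.parse(u"documnt~")
    with ix.searcher() as searcher:
        print(list(searcher.search(query)))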
Deleting index content
def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()
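When a record changes rather than disappears, writer.update_document() replaces the existing entry instead of requiring a delete plus add. It matches on unique fields, so this sketch assumes path is declared as ID(unique=True, stored=True) in the schema:

def update_index():
    """Replace the document whose path is '/b' (assumes path is a unique=True field)."""
    ix = index.open_dir(get_search())
    writer = ix.writer()
    writer.update_document(title=u"my second document (revised)",
                           content=u"this is the revised second document",
                           path=u"/b", tags=u"second short")
    writer.commit()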
Complete test code
import os

from whoosh import index
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser


def get_search():
    """建立存储目录"""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


class MySchema(SchemaClass):
    """
    建立索引模式
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.commit()


def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Read search results back from the index."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Match-all query on the title field; the commented line queries content instead
        # query = QueryParser("content", ix.schema).parse("*")
        query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        print(result)
        print(list(result))


if __name__ == "__main__":
    write_index()
    read_index()
    delete_index()

Combining Whoosh with jieba word segmentation

Building a Chinese tokenizer with jieba
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # Segment the text with jieba in search mode
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos+value.find(w)
            if chars:
                t.startchar = start_char+value.find(w)
                t.endchar = start_char+value.find(w)+len(w)
            # Yield each segmented word as a token
            yield t


def get_analyzer():
    """
    Return the analyzer (in Whoosh a tokenizer can serve directly as an analyzer).
    :return:
    """
    return ChineseTokenizer()
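Since a Whoosh analyzer is just a callable that yields Token objects, the tokenizer above can be plugged into a field directly. A sketch of that variant (the complete code below uses jieba's built-in ChineseAnalyzer instead; MyChineseSchema is only an illustrative name):

class MyChineseSchema(SchemaClass):
    """Schema variant that uses the custom jieba-backed tokenizer as the content analyzer."""
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=get_analyzer())
    tags = KEYWORD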
Specifying the analyzer on the target field
class MySchema(SchemaClass):
    """
    Index schema definition (content uses jieba's ChineseAnalyzer).
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD

Highlighting the results when reading them back

def read_index():
    """Read search results back and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Chinese query against the content field
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # Highlight the matched terms in the content of each hit
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))
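The excerpt style can be tuned by setting the fragmenter and formatter on the Results object before calling hit.highlights(); a small sketch using Whoosh's built-in ContextFragmenter and HtmlFormatter:

from whoosh import highlight, index
from whoosh.qparser import QueryParser

def read_index_highlighted():
    """Print highlighted excerpts with a wider context window and <em> tags around matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("我们")
        result = searcher.search(query)
        result.fragmenter = highlight.ContextFragmenter(maxchars=100, surround=40)
        result.formatter = highlight.HtmlFormatter(tagname="em")
        for hit in result:
            print(hit.highlights("content"))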
Complete test code
import os

import jieba
from jieba.analyse import ChineseAnalyzer
from whoosh import index
from whoosh.analysis import Tokenizer, Token
from whoosh.compat import text_type
from whoosh.fields import SchemaClass, ID, TEXT, KEYWORD
from whoosh.qparser import QueryParser


class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # Segment the text with jieba in search mode
        seglist = jieba.cut_for_search(value)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos+value.find(w)
            if chars:
                t.startchar = start_char+value.find(w)
                t.endchar = start_char+value.find(w)+len(w)
            # Yield each segmented word as a token
            yield t


def get_analyzer():
    """
    Return the analyzer (in Whoosh a tokenizer can serve directly as an analyzer).
    :return:
    """
    return ChineseTokenizer()


class MySchema(SchemaClass):
    """
    建立索引模式
    """
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT(stored=True, analyzer=ChineseAnalyzer())
    tags = KEYWORD


def get_search():
    """建立存储目录"""
    index_path = "/Users/furuiyang/codes/microblog/index"
    os.makedirs(index_path, exist_ok=True)
    return index_path


def write_index():
    """Write documents into the index."""
    schema = MySchema()
    path = get_search()
    # create_in() returns the freshly created index, so a separate open_dir() is not needed
    ix = index.create_in(path, schema)

    writer = ix.writer()
    # Add the documents (the last two contain Chinese text for the analyzer to segment)
    writer.add_document(title=u"my document", content=u"this is my document", path=u"/a",
                        tags=u"first short")
    writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b",
                        tags=u"second short")
    writer.add_document(title=u"First document", path=u"/c", content=u"先生说我们都是好学生")
    writer.add_document(title=u"Second document", path=u"/d", content=u"我们要树立科学发展观")
    writer.commit()


def delete_index():
    """
    Delete a document from the index.
    :return:
    """
    # Open the existing index; re-creating it here would wipe its contents first
    ix = index.open_dir(get_search())
    writer = ix.writer()
    # Delete the document by its path -- this field must be indexed
    writer.delete_by_term('path', u'/a')
    # Save the deletion to disk
    writer.commit()


def read_index():
    """Read search results back and highlight the matches."""
    ix = index.open_dir(get_search())
    with ix.searcher() as searcher:
        # Chinese query against the content field
        query = QueryParser("content", ix.schema).parse("我们")
        # query = QueryParser("title", ix.schema).parse("*")
        result = searcher.search(query)
        # Highlight the matched terms in the content of each hit
        if len(result) != 0:
            for hit in result:
                print(hit.highlights("content"))


if __name__ == "__main__":
    write_index()
    read_index()
    # delete_index()

Using Elasticsearch

Reference: using full-text search in Flask
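For comparison, a minimal sketch with the official elasticsearch Python client (7.x-style body= calls; the index name, document fields and local node address are assumptions):

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # assumed local node

# Index a document; the index is created on first use with dynamic mapping
es.index(index="posts", id=1, body={"title": "my document", "content": "this is my document"})
es.indices.refresh(index="posts")

# Full-text match query on the content field
resp = es.search(index="posts", body={"query": {"match": {"content": "document"}}})
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["title"])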
