Lucene is a subproject of the Apache Software Foundation's Jakarta project: an open-source full-text search toolkit. It is not a complete full-text search engine but rather a framework for one, providing a full query engine and indexing engine plus partial text-analysis support (out of the box, for two Western languages: English and German). Lucene's goal is to give developers a simple, easy-to-use toolkit for adding full-text search to a target system, or for building a complete full-text search engine on top of it.
import os
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.analysis.core import WhitespaceAnalyzer  # analyzer (tokenizer)
from org.apache.lucene.queryparser.classic import QueryParser  # query expressions

lucene.initVM()  # initialize the JVM

curdir = os.path.dirname(os.path.abspath(__file__))
chat_directory_path = Paths.get(os.path.join(curdir, "xxx"))  # directory under which the index files live
chat_directory = SimpleFSDirectory(chat_directory_path)
chat_search = IndexSearcher(DirectoryReader.open(chat_directory))
analyzer = WhitespaceAnalyzer()

# run a query
qc = "post:(+我 是 +谁)"  # find sentences whose post field must contain "我" and "谁"
query = QueryParser("contents", analyzer).parse(qc)  # "contents" is only the default field; qc names "post" explicitly
o = chat_search.search(query, 5).scoreDocs
for z in o:
    doc = chat_search.doc(z.doc)
    print("id:%s" % doc.get("id"))
    print("score:%s" % z.score)  # the score lives on the ScoreDoc, not in a stored field
    print("post:%s" % doc.get("post"))  # the matched content
Notes on Lucene's built-in analyzers:
1. WhitespaceAnalyzer: only splits on whitespace; it does not handle Chinese.
2. SimpleAnalyzer: splits the text on non-letter characters, lowercases every token, and drops numeric characters.
3. StopAnalyzer: SimpleAnalyzer plus removal of common English stop words such as "the" and "a".
4. StandardAnalyzer: handles Chinese by splitting it into single characters, and removes stop words and punctuation.
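To see the difference concretely, you can run the same text through each analyzer and print the tokens. Below is a minimal sketch; it assumes the JVM has already been initialized with lucene.initVM(), and that you are on a Lucene version where StopAnalyzer still has a no-argument constructor:

from org.apache.lucene.analysis.core import WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokens(analyzer, text):
    stream = analyzer.tokenStream("f", text)  # the field name does not matter here
    term = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    out = []
    while stream.incrementToken():
        out.append(term.toString())
    stream.close()
    return out

print(tokens(WhitespaceAnalyzer(), "The quick Fox 42"))  # ['The', 'quick', 'Fox', '42']
print(tokens(SimpleAnalyzer(), "The quick Fox 42"))      # ['the', 'quick', 'fox']
print(tokens(StopAnalyzer(), "The quick Fox 42"))        # ['quick', 'fox'] -- "the" is a stop word
print(tokens(StandardAnalyzer(), "我是谁"))               # ['我', '是', '谁'] -- one character per token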
Notes on query syntax:
Terms with no prefix are OR'ed together by default; a leading + means the term must appear, and a leading - means it must not appear.
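Parsing a few query strings makes the operators visible (a small sketch; the field name "terms" matches the fields indexed below, and printing a parsed Query shows its rewritten form):

parser = QueryParser("terms", WhitespaceAnalyzer())
print(parser.parse("我 是 谁"))    # terms:我 terms:是 terms:谁    -- plain terms are OR'ed
print(parser.parse("+我 是 +谁"))  # +terms:我 terms:是 +terms:谁  -- "我" and "谁" must appear
print(parser.parse("我 -是"))      # terms:我 -terms:是            -- "是" must not appear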
A fuller example that covers both building an index and searching it:
import os
import lucene
import common_utils  # the author's helper module (smart_open, create_dir)
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer  # analyzer
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser  # query expressions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# segmentation helper: returns the sentence's tokens joined by spaces, i.e. " ".join(words)
def _get_post_terms(sentence):
    words, tags = tokenizer.word_segment(sentence)  # tokenizer is a jieba word-segmentation wrapper
    if len(words) > 0:
        return " ".join(words)
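The tokenizer object used above is the author's jieba wrapper and is not included in the post; a hypothetical stand-in with the same word_segment interface might look like this (assuming the jieba package is installed):

import jieba.posseg

class JiebaTokenizer(object):
    # hypothetical replacement for the author's tokenizer wrapper
    def word_segment(self, sentence):
        pairs = list(jieba.posseg.cut(sentence))  # each pair carries .word and .flag (the POS tag)
        words = [p.word for p in pairs]
        tags = [p.flag for p in pairs]
        return words, tags

tokenizer = JiebaTokenizer()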
# build a query expression: nouns (POS tags starting with "n") become required (+) terms
def build_query_condition(words, tags):
    result = ""
    for (x, y) in zip(words, tags):
        if y.startswith("n"):
            result += "+%s " % x
        else:
            result += "%s " % x
    return result
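For example, with jieba-style POS tags (the example words and tags here are made up for illustration):

words, tags = ["广发", "开户", "了"], ["nz", "v", "ul"]
print(build_query_condition(words, tags))  # "+广发 开户 了 " -- only the noun is marked required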
# build an index under the target directory from the data in file_path
def indexing(file_path, target, remove=True):
    if not os.path.exists(file_path):
        raise IOError("index error: %s does not exist." % file_path)
    if remove:
        common_utils.create_dir(target, remove=True)  # start from an empty index directory
    directory = SimpleFSDirectory(Paths.get(target))
    analyzer = LimitTokenCountAnalyzer(WhitespaceAnalyzer(), 10000)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)
    with common_utils.smart_open(file_path) as fin:  # each line of file_path is "id sentence"; the file is opened in binary mode
        for x in fin.readlines():
            o = x.split(None, 1)  # split into at most two pieces so the sentence keeps its internal spaces
            id = o[0].strip()
            post = o[1].strip()
            terms = _get_post_terms(post)
            doc = Document()
            # write the index data
            if terms:
                doc.add(Field("id", id, StringField.TYPE_STORED))
                doc.add(Field("post", post, TextField.TYPE_STORED))
                doc.add(Field("terms", terms, TextField.TYPE_STORED))
                writer.addDocument(doc)
    writer.commit()
    writer.close()
    return True
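Usage is a single call (the paths here are placeholders):

indexing("corpus/posts.txt", "tmp/post_index")  # reads "id sentence" lines and builds the index under tmp/post_index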
# the search class
class LuceneSearch(object):

    def __init__(self, index_path, analyzer=None):
        self.index_path = index_path
        if analyzer:
            self.analyzer = analyzer
        else:
            self.analyzer = WhitespaceAnalyzer()

    # create an index directory at self.index_path and write data into it
    def index(self, data, remove=False):
        if remove:
            common_utils.create_dir(self.index_path, remove=True)
        directory = SimpleFSDirectory(Paths.get(self.index_path))
        config = IndexWriterConfig(self.analyzer)
        writer = IndexWriter(directory, config)
        for o in data:
            id = o[0].strip()
            post = o[1].strip()
            terms = _get_post_terms(post)
            doc = Document()
            if terms:
                doc.add(Field("id", id, StringField.TYPE_STORED))
                doc.add(Field("post", post, TextField.TYPE_STORED))
                doc.add(Field("terms", terms, TextField.TYPE_STORED))
                writer.addDocument(doc)
        writer.commit()
        writer.close()

    # query the index, returning at most size hits
    def query(self, q, size=50):
        query = QueryParser("terms", self.analyzer).parse(q)
        directory = SimpleFSDirectory(Paths.get(self.index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        scoreDocs = searcher.search(query, size).scoreDocs
        results = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.append({
                "post": doc.get("post"),
                "id": doc.get("id"),
                "score": scoreDoc.score,
                "terms": doc.get("terms")
            })
        return results
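Before the full unit test below, a minimal usage sketch (the index path, ids, and posts are made-up examples):

search = LuceneSearch("tmp/chat_index")
search.index([["1", "今天天气不错"], ["2", "请问如何开户"]], remove=True)
for hit in search.query("开户"):
    print("id: %s, score: %.3f, post: %s" % (hit["id"], hit["score"], hit["post"]))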
import unittest

class Test(unittest.TestCase):

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_index_and_search(self):
        print("test_index_files")
        # rootdir is the project root, assumed to be defined elsewhere
        to_ = os.path.join(rootdir, 'tmp', 'test_index')
        from_ = os.path.join(rootdir, 'corpus', 'gfzq', 'gfzq.2017-08-25.visitor')
        search = LuceneSearch(index_path=to_)
        data = []
        with common_utils.smart_open(from_) as fin:  # note: the file is opened in binary mode
            for x in fin.readlines():
                o = x.split(None, 1)  # split into at most two pieces: id and sentence
                id = o[0].strip()
                post = o[1].strip()
                data.append([id, post])  # data is [[id, post], [id, post], ...]
        search.index(data)
        matched = search.query("合规")  # look up "合规" among the indexed terms
        for x in matched:
            print("id: %s, post: %s, score: %s" % (x['id'], x['post'], x['score']))

    def search_index_files(self):  # no "test_" prefix, so unittest will not run it automatically
        print("search_index_files")
        from_ = os.path.join(rootdir, 'tmp', 'test_index')
        search = LuceneSearch(from_)
        search.query("合规")

def test():
    unittest.main()

if __name__ == "__main__":
    test()