PyLucene学习之二

文档和域

文档是Lucene索引和搜索的原子单位,文档为包含一个或多个域的容器,而域则依次包含“真正的”被索引内容。


索引

提取文本->创建对应Document实例->通过分析将域文本处理成大量语汇单元->将语汇单元加入段结构
使用倒排索引的数据结构进行存储,能够有效的利用磁盘空间,把文档中提取出的语汇单元作为查询关键字

索引步骤

1 首先创建Directory对象用于存放索引

store=SimpleFSDirectory(File(storeDir))

2 接下来在Directory对象上创建IndexWriter对象

config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer=IndexWriter(store,config)

3 创建Document对象和Fields对象,并将Document加入索引


域索引选项

Field.Index.* 通过倒排索引来控制域文本是否可被索引。
Index.ANALYZED:被分析器分析,分析器提供的主要功能是将文本处理成大量语汇单元,例如文章正文等就需要被解析。
Index.NOT_ANALYZED:对当前域不进行分析,例如一些不想被改变的内容。

域存储选项

Field.Store.* 用来确定是否需要存储域的真实值, 以便后续搜索时能恢复这个值。

域的项向量选项

项向量是介于索引域和存储域的一个中间结构。

域选项组合

| 索引选项 | 存储选项 | 项向量 | 使用范例 |
| --- | --- | --- | --- |
| NOT_ANALYZED_NO_NORMS | YES | NO | 标识符, 姓名, 电话, 日期 |
| ANALYZED | YES | WITH_POSITIONS_OFFSETS | 文档标题, 摘要 |
| ANALYZED | NO | WITH_POSITIONS_OFFSETS | 文档正文 |
| NO | YES | NO | 文档类型, 数据库主键 |
| NOT_ANALYZED | NO | NO | 隐藏的关键词 |

例如content的FieldType:

t2=FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

索引代码:

#!/usr/bin/env python
#coding:utf-8

INDEX_DIR = "IndexFiles.index"

import sys, os, lucene, threading, time
from datetime import datetime

from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents.  The
resulting Lucene index will be placed in the current directory and called
'index'.
"""

class Ticker(object):
    """Emits a progress dot to stdout once per second until stopped.

    Another thread stops the loop by setting ``tick`` to False.
    """

    def __init__(self):
        # Polled by run(); cleared externally to terminate the loop.
        self.tick = True

    def run(self):
        """Write '.' and flush every second while ``self.tick`` is truthy."""
        emit = sys.stdout.write
        while self.tick:
            emit('.')
            sys.stdout.flush()
            time.sleep(1.0)

class IndexFiles(object):
    """Usage: python IndexFiles <doc_dand will index all of the files in that directory and downward recursively.
irectory>"""

    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'

    def indexDocs(self, root, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                #print filename
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                #try:
                path = os.path.join(root, filename)
                file = open(path)
                # contents = unicode(file.read(), 'iso-8859-1')
                contents = unicode(file.read(), 'utf-8')
                file.close()
                if len(contents) > 0:
                    sentences = contents.split('###')
                    i = 0
                    for sentence in sentences:
                        i += 1
                        #print i
                        doc = Document()
                        doc.add(Field("name", filename, t1))
                        doc.add(Field("path", root, t1))
                        doc.add(Field("sentence_id", str(i), Field.Store.YES, Field.Index.NOT_ANALYZED))
                        doc.add(Field("contents", sentence, t2))
                        writer.addDocument(doc)
                else:
                    print "warning: no content in %s" % filename

if __name__ == '__main__':
    # Start the JVM; headless mode avoids AWT/X11 initialization.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    #try:
    # NOTE(review): dirname(abspath('.')) is the PARENT of the working
    # directory, so the index lands one level above cwd -- confirm intended.
    base_dir = os.path.dirname(os.path.abspath('.'))
    print base_dir
    # Index the current directory; whitespace tokenization only (no stemming).
    IndexFiles(".", os.path.join(base_dir, INDEX_DIR),
               WhitespaceAnalyzer(Version.LUCENE_CURRENT))
    end = datetime.now()
    # Report total indexing wall-clock time.
    print end - start

搜索

#!/usr/bin/env python
#coding:utf-8

INDEX_DIR = "IndexFiles.index"

import sys, os, lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexReader, Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, Explanation
from org.apache.lucene.util import Version

"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles.  It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field.  It will then display the
'path' and 'name' fields for each of the hits it finds in the index.  Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer, reader):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command

        term = Term("contents", command)
        print term.toString()
        term_vector = reader.totalTermFreq(term)
        print "%s total terms" % term_vector

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 10000).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            explanation = searcher.explain(query, scoreDoc.doc)
            #print explanation.toString()
            print 'path:', doc.get("path"), 'name:', doc.get("name"), doc.get("sentence_id")


if __name__ == '__main__':
    # Start the JVM; headless mode avoids AWT/X11 initialization.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # NOTE(review): dirname(abspath('.')) is the PARENT of the working
    # directory -- must match the location the indexer wrote to.
    base_dir = os.path.dirname(os.path.abspath("."))
    print base_dir
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # Separate reader used only for totalTermFreq lookups in run().
    reader = IndexReader.open(directory)
    # Must match the analyzer used at index time (whitespace tokenization).
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer, reader)
    # Release the searcher; searcher.close() is avoided (see module docstring).
    del searcher

很好的参考文献:http://www.cppblog.com/baby-fly/archive/2010/03/08/109189.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值