Lucene 在Ubuntu+Python2的环境下进行搜索

本文介绍如何使用Lucene在Ubuntu系统中结合Python2环境建立文本索引并进行搜索。通过两个Python脚本IndexFiles.py和SearchFile.py实现对指定目录下文件的索引创建及搜索功能。
摘要由CSDN通过智能技术生成

Lucene 在Ubuntu+Python2的环境下进行搜索

IndexFiles.py:

import sys, os, lucene, threading, time, traceback
from datetime import datetime
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import TextField, Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from java.nio.file import Paths
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents.  The
resulting Lucene index will be placed in the current directory and called
'index'.
"""

class Ticker(object):
    """Progress indicator that writes a dot to stdout once per second.

    ``run`` keeps going for as long as the ``tick`` attribute is truthy;
    assign ``False`` to it (typically from another thread) to stop it.
    """

    def __init__(self):
        # Flag polled by run() at the top of every iteration.
        self.tick = True

    def run(self):
        """Emit '.', flush, sleep one second; repeat until ``tick`` is falsy."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        path = Paths.get(storeDir)
        store = SimpleFSDirectory(path)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        writer.close()
        ticker.tick = False
        print 'done'

    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            print root
            try:
                sroot = unicode(root, 'GBK')
                print sroot
            except:
                print "*****************************unicode error"
                print root
                continue
            #FieldType:
            t1 = FieldType()
            t1.setStored(True)
            t1.setTokenized(False)

            t2 = FieldType()
            t2.setStored(True)
            t2.setTokenized(True)



            #add dir
            doc = Document()

            #doc.add(TextField("path", sroot,Field.Store.YES))
            #doc.add(TextField("name", sroot,Field.Store.YES))
            #writer.addDocument(doc)

            for filename in filenames:
                try:
                    filename = unicode(filename, 'GBK')
                    print filename
                except:
                    print "*****************************unicode error"
                    print filename
                    continue
                print "adding", filename
                try:

                    path = unicode(root, 'GBK')
                    path =os.path.join(sroot, filename)
                    print "read file: ", path
                    file = open(path)
                    #contents = unicode(file.read(), 'iso-8859-1')
                    #contents = unicode(file.read(), 'GBK')
                    contents = file.read()
                    print "contents is:", contents
                    file.close()
                    doc = Document()

                    doc.add(TextField("path", path,
                                           Field.Store.YES))

                    doc.add(TextField("name", filename,
                                           Field.Store.YES))

                    if len(contents) > 0:
                        doc.add(TextField("contents", contents,
                                               Field.Store.YES))
                    else:
                        print "warning: no content in %s" % filename

                    writer.addDocument(doc)
                    writer.commit()
                    print "[THIS FILE]:",doc
                except Exception, e:
                    print "Failed in indexDocs:", e
__debug = 0
if __name__ == '__main__':
    if __debug != 1:
        if len(sys.argv) < 2:
            print IndexFiles.__doc__
            sys.exit(1)


    print 'Lucene', lucene.VERSION
    start = datetime.now()
    lucene.initVM()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", StandardAnalyzer())
        else:
            IndexFiles(r'../corpus/', "index", StandardAnalyzer())
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        print "traceback: ", traceback.print_exc()

SearchFile.py:

import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Query, TermQuery
from org.apache.lucene.util import Version
from java.nio.file import Paths


"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles.  It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field.  It will then display the
'path' and 'name' fields for each of the hits it finds in the index.  Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
        print "Searching for:", command
        parser = QueryParser("contents",analyzer)
        query = parser.parse(command)
        #query = QueryParser("name", analyzer).parse(command)
        hits = searcher.search(query, 1000)
        print "%s total matching documents." % hits.totalHits

        scoredocs = hits.scoreDocs
        for docs in scoredocs:
            doc = searcher.doc(docs.doc)
            print "path: ",doc.get("path"),"name: ",doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "index"
    print 'Lucene', lucene.VERSION
    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    run(searcher, analyzer)

运行过程中首先运行IndexFiles.py建立索引:

sudo python IndexFiles.py ../corpus/

corpus目录下放的是语料文件。

然后进行搜索:

sudo python SearchFile.py

然后在Query后面输入要搜索的内容。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值