Lucene 在Ubuntu+Python2的环境下进行搜索

最新推荐文章于 2024-06-09 16:30:00 发布

金泽尚

最新推荐文章于 2024-06-09 16:30:00 发布

阅读量746

点赞数

本文链接：https://blog.csdn.net/u014566794/article/details/74188531

版权

Lucene 同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

Python

1 篇文章 0 订阅

订阅专栏

本文介绍如何在 Ubuntu 系统的 Python 2 环境中通过 PyLucene 调用 Lucene 建立文本索引并进行搜索：IndexFiles.py 递归遍历指定目录，为其中的每个文件建立索引（路径、文件名、文件内容），并把索引写入当前目录下的 index 目录；SearchFile.py 读取该索引，按文件内容（contents 字段）进行交互式检索，并输出命中文件的路径和文件名。

摘要由CSDN通过智能技术生成

Lucene 在Ubuntu+Python2的环境下进行搜索

IndexFiles.py:

import sys, os, lucene, threading, time, traceback
from datetime import datetime
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import TextField, Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from java.nio.file import Paths
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles.  It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents.  The
resulting Lucene index will be placed in the current directory and called
'index'.
"""

class Ticker(object):
    """Console progress indicator that emits one '.' per second.

    The loop in run() keeps going for as long as the ``tick`` attribute is
    truthy; clearing the attribute makes the loop stop at its next check.
    """

    def __init__(self):
        # Polled by run() before each dot is written.
        self.tick = True

    def run(self):
        """Write dots to stdout (flushing each one) until ``tick`` is falsy."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

class IndexFiles(object):
    """Usage: python IndexFiles <doc_directory>"""

    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        path = Paths.get(storeDir)
        store = SimpleFSDirectory(path)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        writer.close()
        ticker.tick = False
        print 'done'

    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            print root
            try:
                sroot = unicode(root, 'GBK')
                print sroot
            except:
                print "*****************************unicode error"
                print root
                continue
            #FieldType:
            t1 = FieldType()
            t1.setStored(True)
            t1.setTokenized(False)

            t2 = FieldType()
            t2.setStored(True)
            t2.setTokenized(True)



            #add dir
            doc = Document()

            #doc.add(TextField("path", sroot,Field.Store.YES))
            #doc.add(TextField("name", sroot,Field.Store.YES))
            #writer.addDocument(doc)

            for filename in filenames:
                try:
                    filename = unicode(filename, 'GBK')
                    print filename
                except:
                    print "*****************************unicode error"
                    print filename
                    continue
                print "adding", filename
                try:

                    path = unicode(root, 'GBK')
                    path =os.path.join(sroot, filename)
                    print "read file: ", path
                    file = open(path)
                    #contents = unicode(file.read(), 'iso-8859-1')
                    #contents = unicode(file.read(), 'GBK')
                    contents = file.read()
                    print "contents is:", contents
                    file.close()
                    doc = Document()

                    doc.add(TextField("path", path,
                                           Field.Store.YES))

                    doc.add(TextField("name", filename,
                                           Field.Store.YES))

                    if len(contents) > 0:
                        doc.add(TextField("contents", contents,
                                               Field.Store.YES))
                    else:
                        print "warning: no content in %s" % filename

                    writer.addDocument(doc)
                    writer.commit()
                    print "[THIS FILE]:",doc
                except Exception, e:
                    print "Failed in indexDocs:", e
__debug = 0
if __name__ == '__main__':
    if __debug != 1:
        if len(sys.argv) < 2:
            print IndexFiles.__doc__
            sys.exit(1)


    print 'Lucene', lucene.VERSION
    start = datetime.now()
    lucene.initVM()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", StandardAnalyzer())
        else:
            IndexFiles(r'../corpus/', "index", StandardAnalyzer())
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        print "traceback: ", traceback.print_exc()

SearchFile.py:

import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Query, TermQuery
from org.apache.lucene.util import Version
from java.nio.file import Paths


"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles.  It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field.  It will then display the
'path' and 'name' fields for each of the hits it finds in the index.  Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'GBK')
        if command == '':
            return

        print
        print "Searching for:", command
        parser = QueryParser("contents",analyzer)
        query = parser.parse(command)
        #query = QueryParser("name", analyzer).parse(command)
        hits = searcher.search(query, 1000)
        print "%s total matching documents." % hits.totalHits

        scoredocs = hits.scoreDocs
        for docs in scoredocs:
            doc = searcher.doc(docs.doc)
            print "path: ",doc.get("path"),"name: ",doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "index"
    print 'Lucene', lucene.VERSION
    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    run(searcher, analyzer)