Lucene 在Ubuntu+Python2的环境下进行搜索
IndexFiles.py:
import sys, os, lucene, threading, time, traceback
from datetime import datetime
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import TextField, Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from java.nio.file import Paths
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents. The
resulting Lucene index will be placed in the current directory and called
'index'.
"""
class Ticker(object):
    """Console progress indicator: emits one dot per second until stopped."""

    def __init__(self):
        # Polled by run(); another thread sets this to False to stop the dots.
        self.tick = True

    def run(self):
        """Write '.' to stdout once a second for as long as self.tick is true."""
        while True:
            if not self.tick:
                break
            sys.stdout.write('.')
            # Flush right away so each dot shows up without a newline.
            sys.stdout.flush()
            time.sleep(1.0)
class IndexFiles(object):
"""Usage: python IndexFiles <doc_directory>"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
path = Paths.get(storeDir)
store = SimpleFSDirectory(path)
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
self.indexDocs(root, writer)
ticker = Ticker()
print 'optimizing index',
threading.Thread(target=ticker.run).start()
writer.close()
ticker.tick = False
print 'done'
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
print root
try:
sroot = unicode(root, 'GBK')
print sroot
except:
print "*****************************unicode error"
print root
continue
#FieldType:
t1 = FieldType()
t1.setStored(True)
t1.setTokenized(False)
t2 = FieldType()
t2.setStored(True)
t2.setTokenized(True)
#add dir
doc = Document()
#doc.add(TextField("path", sroot,Field.Store.YES))
#doc.add(TextField("name", sroot,Field.Store.YES))
#writer.addDocument(doc)
for filename in filenames:
try:
filename = unicode(filename, 'GBK')
print filename
except:
print "*****************************unicode error"
print filename
continue
print "adding", filename
try:
path = unicode(root, 'GBK')
path =os.path.join(sroot, filename)
print "read file: ", path
file = open(path)
#contents = unicode(file.read(), 'iso-8859-1')
#contents = unicode(file.read(), 'GBK')
contents = file.read()
print "contents is:", contents
file.close()
doc = Document()
doc.add(TextField("path", path,
Field.Store.YES))
doc.add(TextField("name", filename,
Field.Store.YES))
if len(contents) > 0:
doc.add(TextField("contents", contents,
Field.Store.YES))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
writer.commit()
print "[THIS FILE]:",doc
except Exception, e:
print "Failed in indexDocs:", e
__debug = 0
if __name__ == '__main__':
if __debug != 1:
if len(sys.argv) < 2:
print IndexFiles.__doc__
sys.exit(1)
print 'Lucene', lucene.VERSION
start = datetime.now()
lucene.initVM()
try:
if __debug != 1:
IndexFiles(sys.argv[1], "index", StandardAnalyzer())
else:
IndexFiles(r'../corpus/', "index", StandardAnalyzer())
end = datetime.now()
print end - start
except Exception, e:
print "Failed: ", e
print "traceback: ", traceback.print_exc()
SearchFiles.py:
import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import Query, TermQuery
from org.apache.lucene.util import Version
from java.nio.file import Paths
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
the searcher and its index reader are never closed explicitly: search.close()
was left out because it causes a stack overflow in some cases.
"""
def run(searcher, analyzer):
while True:
print
print "Hit enter with no input to quit."
command = raw_input("Query:")
command = unicode(command, 'GBK')
if command == '':
return
print
print "Searching for:", command
parser = QueryParser("contents",analyzer)
query = parser.parse(command)
#query = QueryParser("name", analyzer).parse(command)
hits = searcher.search(query, 1000)
print "%s total matching documents." % hits.totalHits
scoredocs = hits.scoreDocs
for docs in scoredocs:
doc = searcher.doc(docs.doc)
print "path: ",doc.get("path"),"name: ",doc.get("name")
if __name__ == '__main__':
STORE_DIR = "index"
print 'Lucene', lucene.VERSION
lucene.initVM()
directory = SimpleFSDirectory(Paths.get(STORE_DIR))
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = StandardAnalyzer()
run(searcher, analyzer)
运行过程中首先运行IndexFiles.py建立索引:
sudo python IndexFiles.py ../corpus/
corpus下放的是语料文件。
然后进行搜索:
sudo python SearchFiles.py
然后在Query后面写相关搜索内容。