用PyLucene实现本地文件名全文索引
功能
如果我们想在硬盘上查找一个文件或目录,可以使用Windows下的“搜索”功能,但每次搜索都要遍历整个硬盘,速度很慢;另外,如果要同时查找多个关键字也很困难。能否做一个类似于“本地搜索”功能的简单搜索程序,在空闲的时候对硬盘的目录名和文件名进行索引,当需要查找文件名时可以“非常”快地查找到文件的位置?这可以通过Lucene实现。
实现
我们用Python来实现。首先安装PyLucene,可以参考《PyLucene安装及使用》。
PyLucene Samples目录下的IndexFiles.py和SearchFiles.py完成了对指定目录下的.txt文件内容进行索引,我们可以修改这两个文件来实现上面的功能。另外,为了能够检索简体中文、繁体中文的文件名和目录名,对文件名和目录名进行了Unicode编码。源码如下:
IndexFiles.py
# -*- coding: GB2312 -*-
import sys, os, PyLucene, threading, time
from datetime import datetime
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the directories and files in that directory and downward
recursively. It will index on the path and the name only; indexing of the
file contents is disabled in this version. The resulting Lucene index will be
placed in the current directory and called 'index'.
"""
class Ticker(object):
    """Progress indicator that writes a dot to stdout at a fixed interval.

    Intended to be run on a helper thread via ``run``; the owner stops it by
    setting ``tick`` to False.
    """

    def __init__(self, interval=1.0):
        """Create a ticker.

        interval -- seconds to sleep between dots (defaults to 1.0).
        """
        # Loop flag polled by run(); set to False from another thread to stop.
        self.tick = True
        self.interval = interval

    def run(self):
        """Write a dot and sleep, repeating until ``tick`` becomes False."""
        while self.tick:
            sys.stdout.write(' . ')
            # Flush so each dot shows up immediately instead of being buffered.
            sys.stdout.flush()
            time.sleep(self.interval)
class IndexFiles(object):
""" Usage: python IndexFiles <doc_directory> """
def __init__ (self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = PyLucene.FSDirectory.getDirectory(storeDir, False)
writer = PyLucene.IndexWriter(store, analyzer, False)
writer.setMaxFieldLength( 1048576 )
self.indexDocs(root, writer)
ticker = Ticker()
print ' optimizing index ' ,
threading.Thread(target = ticker.run).start()
writer.optimize()
writer.close()
ticker.tick = False
print ' done '
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
print root
try :
sroot = unicode(root, ' GBK ' )
print sroot
except :
print " *****************************unicode error "
print root
continue
# add dir
doc = PyLucene.Document()
doc.add(PyLucene.Field( " path " , sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field( " name " , sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
writer.addDocument(doc)
for filename in filenames:
try :
filename = unicode(filename, ' GBK ' )
except :
print " *****************************unicode error "
print filename
continue
print " adding " , filename
try :
# path = unicode(root, 'GB2312')#
#
path = os.path.join(sroot, filename)
# file = open(path)
# contents = unicode(file.read(), 'iso-8859-1')
# contents = unicode(file.read(), 'GBK')
# file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field( " path " , path,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field( " name " , filename,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
'''
if len(contents) > 0:
doc.add(PyLucene.Field("contents", contents,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
else:
print "warning: no content in %s" % filename
'''
writer.addDocument(doc)
except Exception, e:
print " Failed in indexDocs: " , e
__debug = 0
if __name__ == ' __main__ ' :
if __debug != 1 :
if len(sys.argv) < 2 :
print IndexFiles. __doc__
sys.exit( 1 )
print ' PyLucene ' , PyLucene.VERSION, ' Lucene ' , PyLucene.LUCENE_VERSION
start = datetime.now()
try :
if __debug != 1 :
IndexFiles(sys.argv[ 1 ], " index " , PyLucene.StandardAnalyzer())
else :
IndexFiles(r ' c:/testccc ' , " index " , PyLucene.StandardAnalyzer())
end = datetime.now()
print end - start
except Exception, e:
print " Failed: " , e
import sys, os, PyLucene, threading, time
from datetime import datetime
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the directories and files in that directory and downward
recursively. It will index on the path and the name only; indexing of the
file contents is disabled in this version. The resulting Lucene index will be
placed in the current directory and called 'index'.
"""
class Ticker(object):
    """Progress indicator that writes a dot to stdout at a fixed interval.

    Intended to be run on a helper thread via ``run``; the owner stops it by
    setting ``tick`` to False.
    """

    def __init__(self, interval=1.0):
        """Create a ticker.

        interval -- seconds to sleep between dots (defaults to 1.0).
        """
        # Loop flag polled by run(); set to False from another thread to stop.
        self.tick = True
        self.interval = interval

    def run(self):
        """Write a dot and sleep, repeating until ``tick`` becomes False."""
        while self.tick:
            sys.stdout.write(' . ')
            # Flush so each dot shows up immediately instead of being buffered.
            sys.stdout.flush()
            time.sleep(self.interval)
class IndexFiles(object):
""" Usage: python IndexFiles <doc_directory> """
def __init__ (self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = PyLucene.FSDirectory.getDirectory(storeDir, False)
writer = PyLucene.IndexWriter(store, analyzer, False)
writer.setMaxFieldLength( 1048576 )
self.indexDocs(root, writer)
ticker = Ticker()
print ' optimizing index ' ,
threading.Thread(target = ticker.run).start()
writer.optimize()
writer.close()
ticker.tick = False
print ' done '
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
print root
try :
sroot = unicode(root, ' GBK ' )
print sroot
except :
print " *****************************unicode error "
print root
continue
# add dir
doc = PyLucene.Document()
doc.add(PyLucene.Field( " path " , sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field( " name " , sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
writer.addDocument(doc)
for filename in filenames:
try :
filename = unicode(filename, ' GBK ' )
except :
print " *****************************unicode error "
print filename
continue
print " adding " , filename
try :
# path = unicode(root, 'GB2312')#
#
path = os.path.join(sroot, filename)
# file = open(path)
# contents = unicode(file.read(), 'iso-8859-1')
# contents = unicode(file.read(), 'GBK')
# file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field( " path " , path,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field( " name " , filename,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
'''
if len(contents) > 0:
doc.add(PyLucene.Field("contents", contents,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
else:
print "warning: no content in %s" % filename
'''
writer.addDocument(doc)
except Exception, e:
print " Failed in indexDocs: " , e
__debug = 0
if __name__ == ' __main__ ' :
if __debug != 1 :
if len(sys.argv) < 2 :
print IndexFiles. __doc__
sys.exit( 1 )
print ' PyLucene ' , PyLucene.VERSION, ' Lucene ' , PyLucene.LUCENE_VERSION
start = datetime.now()
try :
if __debug != 1 :
IndexFiles(sys.argv[ 1 ], " index " , PyLucene.StandardAnalyzer())
else :
IndexFiles(r ' c:/testccc ' , " index " , PyLucene.StandardAnalyzer())
end = datetime.now()
print end - start
except Exception, e:
print " Failed: " , e
SearchFiles.py
# The first import was split over four lines in the listing, which does not
# parse; it is rejoined into a single statement here.
from PyLucene import QueryParser, IndexSearcher, StandardAnalyzer, FSDirectory
from PyLucene import VERSION, LUCENE_VERSION
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'name' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
the sample this is based on left searcher.close() commented out because it
causes a stack overflow in some cases; this script does call it on exit.
"""
def run(searcher, analyzer):
while True:
print
print " Hit enter with no input to quit. "
command = raw_input( " Query: " )
command = unicode(command, ' GBK ' )
if command == '' :
return
print
print " Searching for: " , command
# query = QueryParser("contents", analyzer).parse(command)
query = QueryParser( " name " , analyzer).parse(command)
hits = searcher.search(query)
print " %s total matching documents. " % hits.length()
for i, doc in hits:
print ' path: ' , doc.get( " path " ), ' name: ' , doc.get( " name " )
if __name__ == ' __main__ ' :
STORE_DIR = " index "
print ' PyLucene ' , VERSION, ' Lucene ' , LUCENE_VERSION
directory = FSDirectory.getDirectory(STORE_DIR, False)
searcher = IndexSearcher(directory)
analyzer = StandardAnalyzer()
run(searcher, analyzer)
searcher.close()
from PyLucene import VERSION, LUCENE_VERSION
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'name' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
the sample this is based on left searcher.close() commented out because it
causes a stack overflow in some cases; this script does call it on exit.
"""
def run(searcher, analyzer):
while True:
print " Hit enter with no input to quit. "
command = raw_input( " Query: " )
command = unicode(command, ' GBK ' )
if command == '' :
return
print " Searching for: " , command
# query = QueryParser("contents", analyzer).parse(command)
query = QueryParser( " name " , analyzer).parse(command)
hits = searcher.search(query)
print " %s total matching documents. " % hits.length()
for i, doc in hits:
print ' path: ' , doc.get( " path " ), ' name: ' , doc.get( " name " )
if __name__ == ' __main__ ' :
STORE_DIR = " index "
print ' PyLucene ' , VERSION, ' Lucene ' , LUCENE_VERSION
directory = FSDirectory.getDirectory(STORE_DIR, False)
searcher = IndexSearcher(directory)
analyzer = StandardAnalyzer()
run(searcher, analyzer)
searcher.close()
建立索引,运行:
python IndexFiles.py c:/
查找的时候,运行:
python SearchFiles.py
如果只查找一个关键词则直接输入;如果想同时查找两个关键词,如Python 网络,则输入:Python AND 网络;如果想查找Python或网络则:Python 网络,也可以Python OR 网络。
其他的查询请参考API文档。
如果上面代码中用到的函数不太明白可以参考《实战 Lucene》