Lucene是一个简单而功能强大的基于Java的搜索库,可以为任何应用程序提供搜索功能。它是一个可扩展的高性能库,可用于索引和搜索几乎任何类型的文本。
项目中使用Lucene实现业务菜单的搜索功能:客户输入业务菜单的部分文字,通过Lucene检索,即可查询到相符合的菜单目录并进行业务操作。闲话少说,本人根据项目中Lucene的使用情况,结合新版(6.6)Lucene的用法写了个DEMO用于学习。
首先是DEMO中Lucene使用的公共常量类。
/**
 * Shared constants for the Lucene demo: the names of the document fields
 * used when indexing and searching, plus the search result limit.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 *
 * @author zhouyi
 */
public final class LuceneConstants {
    /** Name of the document field that holds the analyzed file contents. */
    public static final String CONTENTS = "contents";

    /** Name of the document field that holds the file name. */
    public static final String FILE_NAME = "filename";

    /** Name of the document field that holds the file path. */
    public static final String FILE_PATH = "filepath";

    /** Maximum number of hits requested per search (10). */
    public static final int MAX_SEARCH = 10;

    private LuceneConstants() {
        // Constants holder; never instantiated.
    }
}
然后对需要索引的文件做类别区分,这里暂时只对TXT文件进行索引。
import java.io.File;
import java.io.FileFilter;
/**
 * File filter that accepts only paths whose name ends with {@code .txt},
 * compared case-insensitively.
 */
public class TextFileFilter implements FileFilter {
    /** Lower-case suffix a file name must end with to be accepted. */
    private static final String TEXT_SUFFIX = ".txt";

    /**
     * Tests whether the given path names a text file.
     *
     * @param pathname the path to test
     * @return {@code true} if the lower-cased name ends with {@code .txt}
     */
    @Override
    public boolean accept(File pathname) {
        final String lowerCaseName = pathname.getName().toLowerCase();
        return lowerCaseName.endsWith(TEXT_SUFFIX);
    }
}
下面开始对需要检索的文件建立索引。注意:新版的Lucene改用了NIO2(java.nio.file)中的Path、Files等API,摒弃了File等传统IO方式。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import cn.zhouyi.demo.lucene.LuceneConstants;
/**
 * Builds a Lucene index from files on disk.
 *
 * <p>Each accepted file becomes one document with three fields: the analyzed
 * contents ({@link LuceneConstants#CONTENTS}), and the stored file name and
 * file path ({@link LuceneConstants#FILE_NAME}, {@link LuceneConstants#FILE_PATH}).
 *
 * <p>Call {@link #close()} when done; it releases both the index writer and
 * the underlying index directory.
 */
public class Indexer {
    /** Index storage; kept so it can be closed together with the writer. */
    private final Directory indexDirectory;

    /** Writer that adds documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at the given location.
     *
     * <p>NOTE: the writer uses the default {@code IndexWriterConfig}; per the
     * Lucene documentation its default open mode appends to an existing index,
     * so indexing the same files twice into the same directory adds them twice.
     *
     * @param indexDirectoryPath file-system path of the index directory
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDirectoryPath) throws IOException {
        // Lucene's FSDirectory takes a NIO2 Path rather than a java.io.File.
        Directory directory = FSDirectory.open(Paths.get(indexDirectoryPath));
        IndexWriter indexWriter;
        try {
            // The analyzer must be the same kind the searcher uses for queries.
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            indexWriter = new IndexWriter(directory, iwc);
        } catch (IOException | RuntimeException ex) {
            // Do not leak the directory when the writer cannot be created.
            try {
                directory.close();
            } catch (IOException closeEx) {
                ex.addSuppressed(closeEx);
            }
            throw ex;
        }
        this.indexDirectory = directory;
        this.writer = indexWriter;
    }

    /**
     * Closes the index writer and then the index directory.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if closing the writer or the directory fails
     */
    public void close() throws CorruptIndexException, IOException {
        // try-with-resources closes the directory even if writer.close() throws.
        try (Directory directory = indexDirectory) {
            writer.close();
        }
    }

    /**
     * Adds one file to the index as a single document.
     *
     * @param path the file to index; its contents are read as UTF-8
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private void indexFile(Path path) throws IOException {
        // The contents field wraps a Reader over this stream, so the document
        // must be added before try-with-resources closes the stream.
        try (InputStream stream = Files.newInputStream(path)) {
            Document document = new Document();
            Field contentField = new TextField(
                    LuceneConstants.CONTENTS,
                    new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)));
            Field fileNameField = new StringField(
                    LuceneConstants.FILE_NAME, path.getFileName().toString(), Field.Store.YES);
            Field filePathField = new StringField(
                    LuceneConstants.FILE_PATH, path.toString(), Field.Store.YES);
            document.add(contentField);
            document.add(fileNameField);
            document.add(filePathField);
            System.out.println("Indexing " + path.toString());
            writer.addDocument(document);
        }
    }

    /**
     * Indexes the given file, or every accepted file below the given directory.
     *
     * @param docPath path of a file or of a directory to walk recursively
     * @param filter decides which files are indexed
     * @return the document count reported by the index writer after indexing
     * @throws IOException if {@code docPath} is not readable or does not exist,
     *     or if walking the tree or indexing a single file fails
     */
    public int createIndex(String docPath, FileFilter filter) throws IOException {
        Path path = Paths.get(docPath);
        if (!Files.isReadable(path)) {
            // Report the problem to the caller instead of terminating the JVM.
            throw new IOException("Document directory '" + path.toAbsolutePath()
                    + "' is not readable or does not exist");
        }
        if (Files.isDirectory(path)) {
            Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                    try {
                        if (filter.accept(file.toFile())) {
                            indexFile(file);
                        }
                    } catch (IOException ex) {
                        // Best effort: report the failing file and keep walking.
                        System.err.println("Failed to index " + file);
                        ex.printStackTrace();
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
        } else {
            if (filter.accept(path.toFile())) {
                indexFile(path);
            }
        }
        return writer.numDocs();
    }
}
上面给对应目录的文件创建好了分词索引后,下面开始读取索引进行搜索。
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import cn.zhouyi.demo.lucene.LuceneConstants;
/**
 * Searches a Lucene index on the {@link LuceneConstants#CONTENTS} field.
 *
 * <p>Call {@link #close()} when done; it releases both the index reader and
 * the underlying index directory.
 */
public class Searcher {
    /** Index storage; kept so it can be closed together with the reader. */
    private final Directory directory;

    /** Reader over the index. */
    private final IndexReader reader;

    /** Executes queries against {@link #reader}. */
    private final IndexSearcher indexSearcher;

    /** Turns the user's search text into a Lucene query. */
    private final QueryParser queryParser;

    /**
     * Opens the index stored at the given location for searching.
     *
     * @param indexDirectoryPath file-system path of the index directory
     * @throws IOException if the directory or the index cannot be opened
     */
    public Searcher(String indexDirectoryPath) throws IOException {
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        IndexReader indexReader;
        try {
            indexReader = DirectoryReader.open(indexDirectory);
        } catch (IOException | RuntimeException ex) {
            // Do not leak the directory when the index cannot be opened.
            try {
                indexDirectory.close();
            } catch (IOException closeEx) {
                ex.addSuppressed(closeEx);
            }
            throw ex;
        }
        this.directory = indexDirectory;
        this.reader = indexReader;
        this.indexSearcher = new IndexSearcher(indexReader);
        // The analyzer must be the same kind the indexer used.
        Analyzer analyzer = new StandardAnalyzer();
        this.queryParser = new QueryParser(LuceneConstants.CONTENTS, analyzer);
    }

    /**
     * Parses the search text and runs it against the index.
     *
     * @param searchQuery search text in Lucene query-parser syntax
     * @return the top hits, at most {@link LuceneConstants#MAX_SEARCH}
     * @throws ParseException if the search text cannot be parsed
     * @throws IOException if reading the index fails
     */
    public TopDocs search(String searchQuery) throws ParseException, IOException {
        Query query = queryParser.parse(searchQuery);
        return indexSearcher.search(query, LuceneConstants.MAX_SEARCH);
    }

    /**
     * Loads the stored document a hit refers to.
     *
     * @param scoreDoc one element of {@code TopDocs.scoreDocs}
     * @return the document identified by {@code scoreDoc.doc}
     * @throws IOException if reading the index fails
     */
    public Document getDocument(ScoreDoc scoreDoc) throws IOException {
        return indexSearcher.doc(scoreDoc.doc);
    }

    /**
     * Closes the index reader and then the index directory.
     *
     * @throws IOException if closing the reader or the directory fails
     */
    public void close() throws IOException {
        // try-with-resources closes the directory even if reader.close() throws.
        try (Directory indexDirectory = directory) {
            reader.close();
        }
    }
}
以上简单的索引类和搜索类已经写好了,下面写一个测试类来测试一下lucene的功能。
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import cn.zhouyi.demo.lucene.LuceneConstants;
/**
 * Demo driver: indexes the {@code .txt} files under {@link #dataDir} into
 * {@link #indexDir}, then searches the index for the word "you" and prints
 * the paths of the matching files.
 */
public class LuceneTester {
    /** Location of the index on disk. */
    String indexDir = "E:\\code\\lucence\\index" ;
    /** Location of the files to index. */
    String dataDir = "E:\\code\\lucence\\docs" ;
    Indexer indexer ;
    Searcher searcher ;

    /**
     * Runs the indexing step followed by a sample search; any failure is
     * printed as a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            LuceneTester tester = new LuceneTester();
            tester.createIndex();
            tester.search("you");
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Indexes the text files under {@link #dataDir} and prints the document
     * count and the elapsed time.
     *
     * @throws IOException if opening, writing or closing the index fails
     */
    private void createIndex() throws IOException {
        indexer = new Indexer(indexDir);
        int numIndexed;
        long startTime = System.currentTimeMillis();
        long endTime;
        try {
            // A Java 8 lambda serves as the FileFilter here; passing
            // new TextFileFilter() would select the same files.
            numIndexed = indexer.createIndex(dataDir,
                    pathname -> pathname.getName().toLowerCase().endsWith(".txt"));
            endTime = System.currentTimeMillis();
        } finally {
            // Always close the indexer, even when indexing fails.
            indexer.close();
        }
        System.out.println(numIndexed+" File indexed, time taken: "+(endTime-startTime)+" ms");
    }

    /**
     * Searches the index and prints the hit count, the elapsed time and the
     * stored path of every returned document.
     *
     * @param searchQuery search text in Lucene query-parser syntax
     * @throws IOException if reading the index fails
     * @throws ParseException if the search text cannot be parsed
     */
    private void search(String searchQuery) throws IOException, ParseException {
        searcher = new Searcher(indexDir);
        try {
            long startTime = System.currentTimeMillis();
            TopDocs hits = searcher.search(searchQuery);
            long endTime = System.currentTimeMillis();
            System.out.println(hits.totalHits+" documents found. Time :" + (endTime - startTime));
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.getDocument(scoreDoc);
                System.out.println("File: " + doc.get(LuceneConstants.FILE_PATH));
            }
        } finally {
            // Always close the searcher, even when the search fails.
            searcher.close();
        }
    }
}
执行上面测试类得到的结果:
Indexing E:\code\lucence\docs\doc1.txt
1 File indexed, time taken: 105 ms
1 documents found. Time :24
File: E:\code\lucence\docs\doc1.txt
收工完毕。