1、所用的Jar包
lucene-core-4.7.2.jar
lucene-analyzers-common-4.7.2.jar
lucene-queryparser-4.7.2.jar
lucene-highlighter-4.7.2.jar //高亮
IKAnalyzer2012FF_u1.jar //中文分析器
JDK1.6
2、测试Txt
在D:\lucene\test目录下创建4个txt文件
3、代码示例
package com.search.lucene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Demo of indexing .txt files with Lucene 4.7 and the IK Analyzer (Chinese
 * tokenizer), then searching the index with term / parser queries, paging
 * through results, and highlighting matched keywords.
 *
 * <p>Expects the source .txt files under {@code dirPath} and writes the
 * Lucene index under {@code indexPath}. Files are read as GBK text.
 */
public class IndexFile {
    private Directory directory;
    private String indexPath = "D://lucene/index"; // directory where the index is stored
    private String dirPath = "D://lucene/test"; // directory containing the source txt files
    private Analyzer analyzer = new IKAnalyzer();
    private IndexWriter indexWriter;

    /** Opens the index directory and creates the shared IndexWriter. */
    @Before
    public void init() {
        try {
            directory = FSDirectory.open(new File(indexPath));
            indexWriter = getIndexWriter(directory);
        } catch (Exception e) {
            System.out.println("索引打开异常!");
            // Keep the root cause visible instead of swallowing it.
            e.printStackTrace();
        }
    }

    /**
     * Collects all .txt files directly under the given directory (non-recursive).
     *
     * @param dirPath directory to scan
     * @return the .txt files found; empty if the directory is missing or empty
     */
    public List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        File[] files = new File(dirPath).listFiles();
        // listFiles() returns null when the path does not exist or is not a
        // directory; guard against the NPE the original code had.
        if (files == null) {
            return fileList;
        }
        for (File file : files) {
            if (isTxtFile(file.getName())) {
                fileList.add(file);
            }
        }
        return fileList;
    }

    /**
     * Builds the index: one Lucene Document per txt file.
     *
     * @throws Exception on any indexing or I/O failure
     */
    @Test
    public void createIndex() throws Exception {
        List<File> fileList = getFileList(dirPath);
        for (File file : fileList) {
            Document document = fileToDocument(file);
            indexWriter.addDocument(document);
            System.out.println("filename==" + document.get("filename"));
        }
        // Commit once after the loop instead of per document — same result,
        // far fewer fsyncs.
        indexWriter.commit();
        closeWriter();
    }

    /**
     * Checks whether a file name has the ".txt" extension.
     *
     * @param fileName name to test
     * @return true if the name ends with ".txt"
     */
    public boolean isTxtFile(String fileName) {
        // endsWith() instead of lastIndexOf(...) > 0: the old check wrongly
        // accepted names such as "notes.txt.bak".
        return fileName != null && fileName.endsWith(".txt");
    }

    /**
     * Converts a txt file into a Lucene Document with filename, content and
     * size fields.
     *
     * @param file source file
     * @return the populated document
     * @throws Exception if the file cannot be read
     */
    public Document fileToDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", getFileContent(file), Store.YES));
        // file.length() is the file's size in bytes; the original used
        // getTotalSpace(), which is the size of the whole disk partition.
        document.add(new LongField("size", file.length(), Store.YES));
        return document;
    }

    /**
     * Creates an IndexWriter for the given directory using the IK analyzer.
     *
     * @param dir index directory
     * @return a new IndexWriter
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getIndexWriter(Directory dir) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Closes the shared IndexWriter if it was opened.
     *
     * @throws Exception if closing fails
     */
    public void closeWriter() throws Exception {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Reads the whole file as GBK text, one line per iteration.
     *
     * @param file file to read
     * @return file content with a leading "\n" before each line (preserves the
     *         original output format)
     * @throws Exception if the file cannot be read
     */
    public String getFileContent(File file) throws Exception {
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "GBK"));
        try {
            StringBuilder result = new StringBuilder();
            String line;
            // Bug fix: the original called readLine() twice per iteration,
            // which dropped every other line and appended "null" at EOF.
            while ((line = br.readLine()) != null) {
                result.append("\n").append(line);
            }
            return result.toString();
        } finally {
            // Closing the BufferedReader also closes the underlying streams.
            br.close();
        }
    }

    /** Searches the index for "中国" in the content field and prints the hits. */
    @Test
    public void search() throws Exception {
        String filePath = indexPath;
        Directory dir = FSDirectory.open(new File(filePath));
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        // Option 1: TermQuery — exact term lookup, no analysis applied.
        Term term = new Term("content", "中国");
        TermQuery query = new TermQuery(term);
        // Option 2: QueryParser — analyzes the query text with the IK analyzer.
        // QueryParser parser = new QueryParser(Version.LUCENE_47, "content", analyzer);
        // parser.setDefaultOperator(Operator.AND);
        // Query query = parser.parse("中国");
        TopDocs topdocs = searcher.search(query, 10);
        ScoreDoc[] scoreDocs = topdocs.scoreDocs;
        System.out.println("命中:" + topdocs.totalHits);
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = searcher.doc(scoreDocs[i].doc);
            System.out.println("内容=:" + targetDoc.toString());
        }
        // Paging + highlighting demo.
        higherIndex(analyzer, searcher, query, topdocs);
        reader.close();
        dir.close();
    }

    /**
     * Demonstrates result paging and HTML keyword highlighting.
     *
     * @param analyzer analyzer used to re-tokenize stored fields for highlighting
     * @param isearcher searcher over the open index
     * @param query the query whose terms should be highlighted
     * @param topDocs result of the initial search (used for the hit count)
     * @throws IOException on index access failure
     * @throws Exception on highlighting failure
     */
    public void higherIndex(Analyzer analyzer, IndexSearcher isearcher, Query query, TopDocs topDocs)
            throws IOException, Exception {
        // create() rejects numHits <= 0, so clamp to 1 when there are no hits.
        TopScoreDocCollector results = TopScoreDocCollector.create(Math.max(topDocs.totalHits, 1), false);
        isearcher.search(query, results);
        // Page through the collected docs: topDocs(start, howMany).
        ScoreDoc[] docs = results.topDocs(1, 2).scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            Document targetDoc = isearcher.doc(docs[i].doc);
            System.out.println("内容:" + targetDoc.toString());
        }
        // HTML tags wrapped around matched keywords; requires lucene-highlighter-x.jar.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        for (int i = 0; i < docs.length; i++) {
            Document doc = isearcher.doc(docs[i].doc);
            // Highlight matches in the title field.
            TokenStream tokenStream1 = analyzer.tokenStream("filename", new StringReader(doc.get("filename")));
            String title = highlighter.getBestFragment(tokenStream1, doc.get("filename"));
            // Highlight matches in the content field.
            TokenStream tokenStream2 = analyzer.tokenStream("content", new StringReader(doc.get("content")));
            String content = highlighter.getBestFragment(tokenStream2, doc.get("content"));
            System.out.println(doc.get("filename") + " : " + title + " : " + content);
        }
    }
}
4、运行结果