package net.lucene.edu;
/**
* Parses each file with the tool matching its file type and builds Lucene index files from the extracted text.
* @author lvx
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
public class TxtFileIndexer {

    /**
     * Walks every file in the data directory and, depending on the file
     * extension, extracts its plain text with the matching tool
     * (plain reader / PDFBox / HTMLParser / POI / JXL) and adds one
     * Lucene document per file to the index.
     *
     * @param args unused
     * @throws Exception if index creation or any extractor fails
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File("D://tmp//luceneIndex"); // 索引文件存放路径
        File dataDir = new File("D://tmp//luceneData");   // 文件所存路径
        Analyzer luceneAnalyzer = new CJKAnalyzer();

        File[] dataFiles = dataDir.listFiles();
        // listFiles() returns null when the directory does not exist or is unreadable;
        // the original code would have thrown a NullPointerException here.
        if (dataFiles == null) {
            System.out.println("数据目录不存在或不可读: " + dataDir.getPath());
            return;
        }

        // 创建索引文件的时候创建indexWriter时候指定true,往指定索引文件中加入新的索引指定false
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, false);
        try {
            for (int i = 0; i < dataFiles.length; i++) {
                File f = dataFiles[i];
                if (!f.isFile()) {
                    // Skip directories; the original .htm/.html branch was missing
                    // this check even though every other branch had it.
                    continue;
                }
                String name = f.getName();
                if (name.endsWith(".txt")) {
                    indexWriter.addDocument(buildTxtDocument(f));
                } else if (name.endsWith(".pdf")) {
                    // PDFBox builds the whole Lucene Document itself.
                    indexWriter.addDocument(LucenePDFDocument.getDocument(f));
                } else if (name.endsWith(".htm") || name.endsWith(".html")) {
                    indexWriter.addDocument(buildHtmlDocument(f));
                } else if (name.endsWith(".doc")) {
                    indexWriter.addDocument(buildWordDocument(f));
                } else if (name.endsWith(".xls")) {
                    indexWriter.addDocument(buildExcelDocument(f));
                } else if (name.endsWith(".ppt")) {
                    indexWriter.addDocument(buildPptDocument(f));
                }
            }
            indexWriter.optimize();
        } finally {
            // Always close the writer so the index lock is released even on failure.
            indexWriter.close();
        }
        System.out.println("索引建立完成!");
    }

    /**
     * Builds a document for a plain-text file. The contents field is fed from a
     * Reader (tokenized, not stored); Lucene closes the Reader itself after
     * inverting the document.
     */
    private static Document buildTxtDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new Field("path", file.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
        document.add(new Field("contents", new FileReader(file)));
        return document;
    }

    /**
     * Builds a document for an HTML page by collecting all text nodes via
     * HTMLParser. Pages are assumed to be GBK encoded, matching the original code.
     */
    private static Document buildHtmlDocument(File file) throws Exception {
        NodeFilter filter = new NodeClassFilter(TextNode.class);
        Parser parser = new Parser();
        parser.setURL(file.getPath());
        parser.setEncoding("GBK");
        NodeList list = parser.extractAllNodesThatMatch(filter);
        Document document = new Document();
        document.add(new Field("path", file.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
        document.add(new Field("contents", list.asString(), Field.Store.YES, Field.Index.TOKENIZED));
        return document;
    }

    /** Builds a document for a Word (.doc) file via POI's WordExtractor. */
    private static Document buildWordDocument(File file) throws Exception {
        FileInputStream stream = new FileInputStream(file);
        String text;
        try {
            text = new WordExtractor(stream).getText();
        } finally {
            stream.close(); // the original code leaked this stream
        }
        Document document = new Document();
        document.add(new Field("path", file.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
        document.add(new Field("contents", text, Field.Store.YES, Field.Index.TOKENIZED));
        return document;
    }

    /**
     * Builds a document for an Excel (.xls) file by concatenating the contents
     * of every cell of every sheet via JXL.
     */
    private static Document buildExcelDocument(File file) throws Exception {
        StringBuffer text = new StringBuffer();
        InputStream in = new FileInputStream(file);
        try {
            Workbook workbook = Workbook.getWorkbook(in); // 获得总 Sheets
            try {
                Sheet[] sheets = workbook.getSheets();
                for (int s = 0; s < sheets.length; s++) { // 获得单个Sheets 含有的行数
                    Sheet sheet = sheets[s];
                    for (int r = 0; r < sheet.getRows(); r++) {
                        Cell[] row = sheet.getRow(r);
                        for (int c = 0; c < row.length; c++) {
                            text.append(row[c].getContents());
                        }
                    }
                }
            } finally {
                workbook.close(); // release JXL's internal buffers (leaked before)
            }
        } finally {
            in.close(); // the original code leaked this stream
        }
        Document document = new Document();
        document.add(new Field("path", file.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
        document.add(new Field("contents", text.toString(), Field.Store.YES, Field.Index.TOKENIZED));
        return document;
    }

    /** Builds a document for a PowerPoint (.ppt) file via POI's PowerPointExtractor. */
    private static Document buildPptDocument(File file) throws Exception {
        FileInputStream stream = new FileInputStream(file);
        String text;
        try {
            text = new PowerPointExtractor(stream).getText();
        } finally {
            stream.close(); // the original code leaked this stream
        }
        Document document = new Document();
        document.add(new Field("path", file.getCanonicalPath(), Field.Store.YES, Field.Index.NO));
        document.add(new Field("contents", text, Field.Store.YES, Field.Index.TOKENIZED));
        return document;
    }
}
package net.lucene.edu;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
public class TxtFileSearcher {

    /**
     * Searches the index built by TxtFileIndexer for a fixed keyword and, for
     * every hit, prints the file path and a highlighted fragment (up to 50
     * characters) of its stored contents.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or the query fails
     */
    public static void main(String[] args) throws Exception {
        String queryStr = "吕兴";                    // 检索关键字
        String indexDirect = "D://tmp//luceneIndex"; // 索引文件存放路径
        // BUG FIX: the original printed a hard-coded keyword ("双鸭山") that did
        // not match the keyword actually searched for.
        System.out.println("开始检索关键字:" + queryStr);

        Analyzer analyzer = new CJKAnalyzer();
        IndexSearcher searcher = new IndexSearcher(indexDirect);
        try {
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(queryStr);
            Hits hits = searcher.search(query);

            Highlighter highlighter = new Highlighter(new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(50));

            for (int i = 0; i < hits.length(); i++) {
                Document document = hits.doc(i);
                System.out.println("检索文件: " + document.get("path"));
                // null for documents whose contents were indexed from a Reader
                // and therefore not stored (e.g. the .txt files).
                String content = document.get("contents");
                if (content != null) {
                    // BUG FIX: the original tokenized field "content" (singular),
                    // which does not match the indexed field name "contents".
                    TokenStream tokenStream =
                            analyzer.tokenStream("contents", new StringReader(content));
                    System.out.println(highlighter.getBestFragment(tokenStream, content));
                }
            }
        } finally {
            searcher.close(); // release the underlying index reader
        }
    }
}
完成。测试运行时需要在 classpath 中导入相应的依赖包(PDFBox、POI、Lucene、JXL、HTMLParser)。