lucene实践小例

最近要用到lucene,看了一下文档,动手写个小例子。
电脑中有很多 PDF 文档,查找起来不方便,索性写个能对这些文档做全文搜索的小工具。
建索引的类:

import java.io.File;
import java.io.FileInputStream;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Builds a Lucene index over the PDF files found directly inside a fixed directory.
 *
 * <p>Each indexed document has two fields: {@code path} (stored, not indexed) and
 * {@code body} (stored, analyzed, with term vectors) holding the text extracted by PDFBox.
 */
public class LuceneMain {

    /**
     * Indexes every regular file whose name ends with {@code .pdf} in the source directory
     * and prints the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws Exception if the source directory cannot be listed, a PDF cannot be parsed,
     *         or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        File fileDir = new File("C:\\Documents and Settings\\zhanglu\\桌面\\pdf文档");
        File indexDir = new File("C:\\index");

        // listFiles() returns null when the path is not a directory or cannot be read.
        // Check before opening the writer so a bad path fails with a clear message
        // instead of a NullPointerException after the index has already been opened.
        File[] textFiles = fileDir.listFiles();
        if (textFiles == null) {
            throw new java.io.FileNotFoundException("Cannot list PDF directory: " + fileDir);
        }

        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        // The third argument (true) asks for a fresh index rather than appending to one.
        IndexWriter indexWriter = new IndexWriter(FSDirectory.open(indexDir), luceneAnalyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        long start = System.currentTimeMillis();
        try {
            for (File f : textFiles) {
                if (f.isFile() && f.getName().endsWith(".pdf")) {
                    String text = FileReaderAll(f.getCanonicalPath(), "GBK");
                    Document document = new Document();
                    Field pathField = new Field("path", f.getPath(), Field.Store.YES, Field.Index.NO);
                    Field bodyField = new Field("body", text, Field.Store.YES, Field.Index.ANALYZED,
                            Field.TermVector.WITH_POSITIONS_OFFSETS);
                    document.add(pathField);
                    document.add(bodyField);
                    indexWriter.addDocument(document);
                }
            }
            indexWriter.optimize();
        } finally {
            // Always close the writer, even when a PDF fails to parse.
            indexWriter.close();
        }
        long end = System.currentTimeMillis();
        System.out.println("time waste " + (end - start));
    }

    /**
     * Extracts the text of a PDF file using PDFBox.
     *
     * @param path    path of the PDF file to read
     * @param charset ignored; kept only so existing callers keep compiling
     * @return the extracted text, or an empty string when the document is encrypted
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String FileReaderAll(String path, String charset) throws Exception {
        String docText = "";
        FileInputStream input = new FileInputStream(path);
        try {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();
            try {
                if (!cosDoc.isEncrypted()) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    PDDocument pdd = new PDDocument(cosDoc);
                    try {
                        docText = stripper.getText(pdd);
                    } finally {
                        pdd.close();
                    }
                }
            } finally {
                cosDoc.close();
            }
        } finally {
            // Release the file handle on every path; closing a FileInputStream
            // that has already been closed is a no-op.
            input.close();
        }
        return docText;
    }

}

查找类:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a fixed keyword query against the index in {@code C:\index} and prints the
 * stored {@code path} field of each matching document.
 */
public class TestQuery {

    /**
     * Searches the {@code body} field for the hard-coded term and prints the matches.
     *
     * @param args unused
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws CorruptIndexException, IOException, ParseException {
        String queryString = "socket";

        // Parse the query before opening the index so a parse failure leaves nothing open.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer);
        Query query = qp.parse(queryString);

        // The collector keeps at most 100 hits, so the count printed below is capped at 100.
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);

        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("C:\\index")));
        try {
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            if (hits.length > 0) {
                System.out.println("共找到" + hits.length + "个结果");
                for (int i = 0; i < hits.length; i++) {
                    System.out.println(searcher.doc(hits[i].doc).getField("path").stringValue());
                }
            }
        } finally {
            // Close the searcher on every path so the index files it opened are released.
            searcher.close();
        }
    }

}

运行结果:
共找到3个结果
C:\Documents and Settings\zhanglu\桌面\pdf文档\java socket(IBM).pdf
C:\Documents and Settings\zhanglu\桌面\pdf文档\Apache_Mina_Server_2.0中文参考手册V1.0.pdf
C:\Documents and Settings\zhanglu\桌面\pdf文档\JavaEye论坛热点推荐_-_2009年10月_-_总第17期.pdf
用到的依赖:Lucene 3.0 核心包,以及 PDFBox(操作 PDF 文档的开源工具)。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值