废话不说了,直接上实例代码,如果你看过前面几篇文章,这些代码对你来说都是小case了,理解最重要
下面两个代码是一个工程:
IndexDocument.java
package baseSample;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
public class IndexDocument {

    /**
     * Builds a small sample index of four news documents in the given
     * directory and returns that same directory.
     *
     * @param directory the Lucene directory to write the index into;
     *                  create=true below means any existing index is overwritten
     * @param analyzer  the analyzer used to tokenize the ANALYZED fields
     * @return the same {@code directory}, now containing the index
     * @throws CorruptIndexException     if the index is corrupt
     * @throws LockObtainFailedException if the write lock cannot be obtained
     * @throws IOException               on any other I/O failure
     */
    public static Directory getIndexDirectory(Directory directory,
            Analyzer analyzer) throws CorruptIndexException,
            LockObtainFailedException, IOException {
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
                new IndexWriter.MaxFieldLength(25000));
        try {
            // Index-time tuning knobs, intentionally left at their defaults:
            // iwriter.setMergeFactor(10);      // merge factor
            // iwriter.setMaxMergeDocs(2000);   // max documents per segment
            // iwriter.setMaxBufferedDocs(1);   // documents buffered in RAM

            // News document #1
            addNews(iwriter, "aaaa", "江苏常州曝疫苗造假大案7人被捕超百万人受害",
                    "2010/3/30", "网易新闻频道",
                    "据香港明报报道,江苏常州爆出疫苗造假大案。当地著名疫苗生产商江苏延申生物科技股份有限公司(简称“江苏延申”)被国家药监局查实在疫苗生产过程中长期故意造假,导致大量问题疫苗流向市场,受害者最少超过100万人。");
            // News document #2
            addNews(iwriter, "bbbb", "富士康一月内发生三起坠楼案", "2010/3/30", "广州日报",
                    "昨日凌晨3时左右,富士康科技集团龙华厂区的一名23岁湖南籍男性员工从宿舍楼上坠下,当场死亡");
            // News document #3 (a doc.setBoost(2) call was commented out here
            // in the original; index-time boosting could be re-enabled inside
            // addNews if ever needed)
            addNews(iwriter, "cccc", "普京称要消灭掉制造地铁爆炸案恐怖分子", "2010/3/30",
                    "网易新闻频道",
                    "据外电报道,俄罗斯总理普京29日表示,当天制造莫斯科地铁连环爆炸案的恐怖分子一定会被抓到,并被消灭掉。");
            // News document #4
            // NOTE(review): this id duplicates doc #3's "cccc" — looks like a
            // copy-paste slip (probably "dddd" was intended); kept as-is to
            // preserve behavior, confirm with the author.
            addNews(iwriter, "cccc", "最天使", "2009/3/30", "易", "长肥了");
        } finally {
            // Close even if addDocument throws, so the write lock is released.
            iwriter.close();
        }
        return directory;
    }

    /**
     * Adds one news document with the five standard fields.
     * newsId and publishDate are stored verbatim (NOT_ANALYZED);
     * the remaining fields are tokenized (ANALYZED). All are stored.
     */
    private static void addNews(IndexWriter writer, String id, String name,
            String date, String source, String summary)
            throws CorruptIndexException, IOException {
        Document doc = new Document();
        doc.add(new Field("newsId", id, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("newsName", name, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("publishDate", date, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("newsSource", source, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("newssummay", summary, Field.Store.YES,
                Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
}
SampleSearch.java
package baseSample;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
public class SampleSearch {

    /**
     * Demonstrates the basic Lucene query types (TermQuery, BooleanQuery,
     * QueryParser, MultiFieldQueryParser) against the sample index built by
     * {@link IndexDocument#getIndexDirectory}.
     */
    public static void main(String arg[]) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {
        // Store the index in memory:
        // Directory directory = new RAMDirectory();
        // To store an index on disk, use this instead.
        // NOTE(review): FSDirectory.open treats this path as a directory even
        // though it is named "cache.txt", and File.delete() on a non-empty
        // directory silently returns false — this cleanup step likely does
        // nothing once the index exists; confirm intent.
        File file = new File("D:/mapreduce-out/lucenetmp/cache.txt");
        if (file.exists()) {
            System.out.println("文件已存在,删除掉");
            file.delete();
        }
        Directory directory = FSDirectory.open(file);
        // CJKAnalyzer tokenizes CJK text as overlapping bigrams, which suits
        // the Chinese sample data. (The original also built a StandardAnalyzer
        // here and immediately discarded it — dead code, removed.)
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
        // Now search the index; building the searcher also writes the index files.
        IndexSearcher isearcher = new IndexSearcher(
                IndexDocument.getIndexDirectory(directory, analyzer), true);
        try {
            /**
             * Main IndexSearcher search methods:
             * isearcher.search(Query query, Collector results);
             * isearcher.search(Query query, int n);
             * isearcher.search(Query query, Filter filter, Collector results);
             */
            // Term is the basic unit of a query.
            // 1. TermQuery — matches a single exact term.
            Query termQuery = new TermQuery(new Term("newsSource", "网易"));
            System.out.println("--- termQuery : " + termQuery.toString());
            // 2. BooleanQuery — combines clauses; related types include
            //    RangeQuery (ranges), PrefixQuery (prefixes), FuzzyQuery (fuzzy), etc.
            Query a = new TermQuery(new Term("newsSource", "网"));
            Query b = new TermQuery(new Term("newsSource", "易"));
            BooleanQuery booleanQuery = new BooleanQuery();
            booleanQuery.add(a, BooleanClause.Occur.MUST);
            booleanQuery.add(b, BooleanClause.Occur.MUST);
            System.out.println("--- booleanQuery :" + booleanQuery.toString());
            // 3. QueryParser — tokenizes a free-text query string.
            System.out.println("lucene的当前版本 : " + Version.LUCENE_CURRENT);
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,
                    "newsSource", analyzer);
            // Default operator between terms is OR; switch to AND.
            parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query parserQuery = parser.parse("java lucene");
            System.out.println("--- parserQuery : " + parserQuery.toString());
            // 4. MultiFieldQueryParser — queries several fields at once.
            String[] fields = {"newsName", "newsSource"};
            MultiFieldQueryParser mparser = new MultiFieldQueryParser(
                    Version.LUCENE_CURRENT, fields, analyzer);
            Query mQuery = mparser.parse("江苏");
            System.out.println("---- mQuery :" + mQuery);

            ScoreDoc[] docs = isearcher.search(termQuery, 10).scoreDocs;
            for (int i = 0; i < docs.length; i++) {
                System.out.println(docs[i].doc);
                System.out.println("searcher score :" + docs[i].score);
                Document hitDoc = isearcher.doc(docs[i].doc);
                System.out.println("--- explain : "
                        + isearcher.explain(termQuery, docs[i].doc));
                // NOTE(review): getBoost() on a document retrieved from the
                // index always reports 1.0 in Lucene — index-time boosts are
                // folded into scoring and not stored; this line is informational only.
                System.out.println("boost:" + hitDoc.getBoost());
                System.out.println("newsId:" + hitDoc.get("newsId"));
                System.out.println("newsName:" + hitDoc.get("newsName"));
                System.out.println("publishDate:" + hitDoc.get("publishDate"));
                System.out.println("newsSource:" + hitDoc.get("newsSource"));
                System.out.println("newssummay:" + hitDoc.get("newssummay"));
                System.out.println("------------------------------------------");
            }
        } finally {
            // Release the searcher (and its underlying reader) even on failure.
            isearcher.close();
        }
    }
}
下面两个代码属于同一个工程,需要配合使用:
TextFileIndexer.java
package lighter.javaeye.com;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class TextFileIndexer {

    /**
     * Indexes every *.txt file found in the source directory into a Lucene
     * index stored in the target directory, then reports the elapsed time.
     */
    public static void main(String[] args) throws IOException {
        // Directory containing the files to be indexed
        File fileDir = new File("D:/mapreduce-out/lucenetmp/demo1");
        // Directory where the generated index files are written
        File indexDir = new File("D:/mapreduce-out/lucenetmp/demo2");
        Directory docx = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        IndexWriter.MaxFieldLength mf = new MaxFieldLength(100);
        IndexWriter indexWriter = new IndexWriter(docx, luceneAnalyzer, mf);
        // listFiles() returns null when the directory is missing or unreadable.
        File[] textFiles = fileDir.listFiles();
        long startTime = new Date().getTime();
        try {
            if (textFiles != null) {
                for (File textFile : textFiles) {
                    if (textFile.isFile() && textFile.getName().endsWith(".txt")) {
                        // (fixed typo in the original message: 呗 -> 被)
                        System.out.println("文件 " + textFile.getCanonicalPath() + "正在被索引");
                        String temp = fileReaderAll(textFile.getCanonicalPath(), "GBK");
                        System.out.println("temp = " + temp);
                        Document document = new Document();
                        // "path" is stored for display but not indexed (not searchable).
                        Field fieldPath = new Field("path", textFile.getPath(),
                                Field.Store.YES, Field.Index.NO);
                        // "body" is analyzed and keeps term vectors with
                        // positions/offsets (useful for highlighting).
                        Field fieldBody = new Field("body", temp, Field.Store.YES,
                                Field.Index.ANALYZED,
                                Field.TermVector.WITH_POSITIONS_OFFSETS);
                        document.add(fieldPath);
                        document.add(fieldBody);
                        indexWriter.addDocument(document);
                    }
                }
            }
            // optimize() merges all segments into one for faster searching.
            indexWriter.optimize();
        } finally {
            // Close the writer even if indexing fails, releasing the write lock.
            indexWriter.close();
        }
        long endTime = new Date().getTime();
        System.out.println("这花费了" + (endTime - startTime)
                + " 毫秒来把文档增加到索引里面去!" + fileDir.getPath());
    }

    /**
     * Reads the whole file into one string using the given charset.
     * Line separators are dropped (readLine strips them and they are not
     * re-added) — this matches the original behavior.
     *
     * @param fileName path of the file to read
     * @param charset  charset name used to decode the bytes (e.g. "GBK")
     * @return the concatenated lines of the file
     * @throws IOException if the file cannot be opened or read
     */
    public static String fileReaderAll(String fileName, String charset)
            throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), charset));
        try {
            // StringBuilder avoids the quadratic cost of += concatenation in a loop.
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } finally {
            // Close even when readLine throws, so the file handle never leaks.
            reader.close();
        }
    }
}
TestQuery.java
package lighter.javaeye.com;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class TestQuery {

    /**
     * Opens the index built by TextFileIndexer, parses a fixed query string
     * against the "body" field, and prints the top-100 hit statistics.
     */
    public static void main(String[] args) throws IOException {
        String queryString = "中华";
        Directory directory = FSDirectory.open(
                new File("D:/mapreduce-out/lucenetmp/demo2"));
        IndexSearcher search = new IndexSearcher(directory);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        Query query;
        try {
            QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer);
            query = qp.parse(queryString);
        } catch (ParseException e) {
            // The original fell through here with query == null and then threw
            // a NullPointerException inside search(); bail out explicitly instead.
            e.printStackTrace();
            search.close();
            return;
        }
        try {
            TopDocs topDoc = search.search(query, 100);
            // Check totalHits rather than getMaxScore(): with zero hits the
            // max score is NaN, which made the original comparison fragile.
            if (topDoc.totalHits > 0) {
                System.out.println("topDoc.totalHits" + topDoc.totalHits);
                System.out.println("topDoc.getMaxScore()" + topDoc.getMaxScore());
                System.out.println("topDoc.toString()" + topDoc.toString());
            } else {
                System.out.println("没有查询到结果");
            }
        } finally {
            // Release the searcher (and underlying reader) even on failure.
            search.close();
        }
    }
}