Lucene索引实战
——comaple.zhang
声明:本文为原创,转载请注明出处。
1.概述
Lucene 是用来建立索引和查询索引的利器,其查询速度很快。我本人测试的数据为125万条数据,每条200字节,返回结果要在1.7s内完成。而从结果集中取出120万数据并转换为pojo对象则需要13.8s。网络i/o达到16MB/s。而建立120万条这样的索引数据仅需要1.26m。那么现在就让我们一起来看一下具体的用法吧。
2.概念解释
2.1倒排索引
Lucene是高性能的基于java实现的全文检索工具,可以把他理解为全文检索的api集合。它使用的是倒排文件索引结构。下面举一个例子来说明倒排索引的概念。假如有两篇文章,
Doc1: Tom lives in Guangzhou, I live in Guangzhou too.
Doc2: He once lived in Shanghai.
a) 由于lucene是基于关键词索引和查询的,所以首先我们要对文章进行分词处理。为了简便我们这里只讨论英文分词,即以空格分隔的单词系列,再去掉小品词即可。
b) 单词分词以后统一大小写,还原英文单词原型。如lives lived还原为live
c) 去掉标点符号,此时文章变为:
Doc1:[tom] [live] [guangzhou][live] [guangzhou]
Doc2: [he] [live] [shanghai]
d) 接下来我们来进行倒排索引。一般索引以这样的格式存储:文章id : 关键字;而倒排索引则反过来,如:关键字 : 包含该关键字的所有文档id。如下所示:
关键字 文章id
Guangzhou doc1
He doc2
I doc1
Live doc1、doc2
Shanghai doc2
Tom doc1
2.2 Document 和 Field
对于要index的文档, 首先要生成抽象的document对象, 然后把需要index的内容加到各个fields中去.Document就是fields的集合, 而fields用来放文档本身和文档的元数据.
field由名字和值组成.如代码:
Document doc = new Document();
String pin = item.getPin();
doc.add(new Field("pin", pin,Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("sku",item.getSku(), Store.YES,Index.ANALYZED));
2.3写入文档
indexWriter.addDocument(doc);
2.4 创建indexwriter
IndexWriter writer = new IndexWriter(dir,getAnalyzer(), true);
3下面一个完整的demo
直接上代码了,这是一个可以实现创建,更新和查询的工具类,其中查询实现了分页读取的操作。
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import jd.data.migration.dao.BaseDao;
import jd.data.migration.model.UserOrderModel;
/**
*
* @author qt-zhangshengtao
*
*/
public class LuceneDaoImpl implements BaseDao<UserOrderModel>{
//索引文件存放的路径
private final String DBPATH = "e:\\newindex";
//indexwriter
private static IndexWriter indexWriter =null;
//indexwriter用到的存储对象
private static Directory indexDir =null;
//配置对象
private static IndexWriterConfig config =null;
public static Analyzer luceneAnalyzer =null;
private IndexReader reader = null;
private Query query = null;
private IndexSearcher searcher = null;
// 默认每页返回一万
private final int pageCount = 10000;
public static LuceneDaoImpl instance;
private static Lock lock = new ReentrantLock();
private LuceneDaoImpl() {
try {
luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_35);
config = new IndexWriterConfig(Version.LUCENE_35,luceneAnalyzer);
indexDir = new NIOFSDirectory(new File(DBPATH));
indexWriter = new IndexWriter(indexDir,config);
indexWriter.setRAMBufferSizeMB(512);
//indexWriter.setUseCompoundFile(false);
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
public static LuceneDaoImpl getInstance() {
lock.lock();
if (instance == null)
instance = new LuceneDaoImpl();
lock.unlock();
return instance;
}
/**
* 根据单个pin获取userOrder列表
*
* @param pin
* @return
*/
public List<UserOrderModel> getListByPin(String sku) {
try {
List<UserOrderModel> list = newArrayList<UserOrderModel>();
query = new TermQuery(new Term("sku", sku));
reader = IndexReader.open(indexDir);
searcher = new IndexSearcher(reader);
//TopScoreDocCollector result = TopScoreDocCollector.cr
TopDocs docs = searcher.search(query, pageCount);
int totalCount = docs.totalHits;
if (totalCount > pageCount) {
for (int i = 0; i < totalCount / pageCount; i++) {
ScoreDoc[] scoreDocs = docs.scoreDocs;
ScoreDoc lastdoc = null;
for (ScoreDoc scoreDoc : scoreDocs) {
Document document = searcher.doc(scoreDoc.doc);
list.add(new UserOrderModel(document.get("pin"),
document.get("sku")));
lastdoc = scoreDoc;
}
docs = searcher.searchAfter(lastdoc, query, pageCount);
if (docs.scoreDocs.length == 0)
break;
}
}
return list;
} catch (Exception e) {
System.out.println(e.getMessage());
return null;
}
}
/**
* 在lucene中根据传入的list创建索引
*
* @param list
*/
public void createIndex(List<UserOrderModel> list) {
try {
for (Iterator iterator = list.iterator(); iterator.hasNext();) {
UserOrderModel item = (UserOrderModel) iterator.next();
Document doc = new Document();
String pin = item.getPin();
doc.add(new Field("pin", pin, Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("sku", item.getSku(), Store.YES,
Index.ANALYZED));
indexWriter.addDocument(doc);
}
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
/**
* 更新索引
*/
public void updateIndex(List<UserOrderModel> list) {
try {
for (Iterator iterator = list.iterator();iterator.hasNext();) {
UserOrderModel item = (UserOrderModel) iterator.next();
Document doc = new Document();
doc.add(new Field("pin", item.getPin(), Store.YES,
Index.NOT_ANALYZED));
doc.add(new Field("sku", item.getSku().toString(), Store.YES,
Index.ANALYZED_NO_NORMS));
indexWriter.updateDocument(new Term("pin", item.getPin()), doc);
}
} catch (Exception e) {
e.printStackTrace();
}
}
public void close() {
try {
// TODO Auto-generatedmethod stub
indexWriter.commit();
indexWriter.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
public List<UserOrderModel> getListByPin(String[] pin) {
// TODO Auto-generatedmethod stub
return null;
}
}