Lucene 所需的 Maven 依赖配置如下:
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>4.10.1</version>
</dependency>
<!-- 注意:lucene-analyzers 3.6.2 与上面的 4.10.1 系列版本不一致;
     4.x 中该模块已拆分为 lucene-analyzers-common 等,保留此旧版依赖可能导致类冲突 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>3.6.2</version>
</dependency>
<dependency>
<groupId>org.wltea.analyzer</groupId>
<artifactId>IKAnalyzer</artifactId>
<version>2012FF_hf1</version>
</dependency>
<!-- Lucene 关键字高亮(highlighter)依赖 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.10.1</version>
</dependency>
下面的示例利用 IKAnalyzer 作为分词器(Analyzer)。
package com.xuzengqiang.design.common.utils.lucene;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * IKAnalyzer tokenizer demo. IKAnalyzer uses a proprietary "forward-iteration,
 * finest-granularity segmentation algorithm" and supports two segmentation
 * modes: fine-grained and smart.
 *
 * @author xuzengqiang
 * @since 2014-11-05
 */
@SuppressWarnings("all")
public class IKAnalyzerDemo {
    /**
     * Tokenizes the given text with IKAnalyzer in smart mode and prints the
     * tokens separated by " | ".
     *
     * @param text the text to tokenize
     */
    public static void example(String text) {
        // Build IKAnalyzer in smart segmentation mode (ctor arg true = smart).
        // try-with-resources guarantees the analyzer, reader and token stream
        // are closed even if tokenization throws; the original only closed
        // them on the happy path and never closed the analyzer at all.
        try (Analyzer analyzer = new IKAnalyzer(true);
                StringReader reader = new StringReader(text);
                TokenStream tokenStream = analyzer.tokenStream("", reader)) {
            /*
             * reset() must be called once before incrementToken(), otherwise
             * Lucene throws: "TokenStream contract violation: reset()/close()
             * call missing, reset() called multiple times, or subclass does
             * not call super.reset()."
             */
            tokenStream.reset();
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            // Iterate over the tokens produced by the analyzer.
            while (tokenStream.incrementToken()) {
                System.out.print(term.toString() + " | ");
            }
            System.out.println();
            // Per the TokenStream workflow, end() is called after the last
            // incrementToken() and before close().
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        String text = "小学教育";
        example(text);
    }
}
完整的 Lucene 建索引、删除、更新、查询与高亮示例:
package com.xuzengqiang.design.common.utils.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.vnetoo.redu.resource.bo.RepositoryData;
/**
 * Lucene demo. Lucene can be viewed as a database system with full-text
 * indexing: documents are rows, fields are columns, queries are selects.
 *
 * @author xuzengqiang
 * @since 2014-11-05
 */
@SuppressWarnings("all")
public class Lucene {
    /**
     * Builds an in-memory index, demonstrates delete and update, then searches
     * it and prints each hit with the matched keyword highlighted in HTML.
     *
     * @param fieldName the field to search, analogous to a database column
     * @param keyword the search keyword, analogous to that column's value
     */
    public static void example(String fieldName, String keyword) {
        // IKAnalyzer in smart segmentation mode.
        Analyzer analyzer = new IKAnalyzer(true);
        // Where the index lives: disk (FSDirectory) or memory (RAMDirectory).
        Directory directory = null;
        IndexWriter writer = null;
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            // In-memory index.
            directory = new RAMDirectory();
            // IndexWriter configuration: the Lucene version in use + analyzer.
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
            // Append to an existing index, or create one if absent.
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(directory, config);
            // Index documents — the equivalent of SQL INSERT; field values
            // could equally come from database rows.
            Document doc = new Document();
            doc.add(new StringField("id", "10000", Field.Store.YES));
            doc.add(new TextField("title", "小学教育", Field.Store.YES));
            writer.addDocument(doc);
            Document doc2 = new Document();
            doc2.add(new StringField("id", "10001", Field.Store.YES));
            doc2.add(new TextField("title", "中学教育", Field.Store.YES));
            writer.addDocument(doc2);
            /*
             * Deleting: IndexWriter.deleteDocuments(args) accepts either a
             * Query or a Term (a Term is an exact-match value). Deleted
             * documents are only marked deleted (a "recycle bin") and can be
             * recovered via rollback until the deletes are merged away.
             */
            // Delete the first document; only 10001-中学教育 remains searchable.
            writer.deleteDocuments(new Term("id", "10000"));
            // writer.rollback() restores uncommitted deletes; writer.commit() makes them permanent.
            // Updating: Lucene has no in-place update — updateDocument() is delete + add.
            Document doc3 = new Document();
            doc3.add(new StringField("id", "10001", Field.Store.YES));
            doc3.add(new TextField("title", "大学教育", Field.Store.YES));
            // Replace the document whose id is 10001.
            writer.updateDocument(new Term("id", "10001"), doc3);
            // Empty the "recycle bin": physically merge away deleted documents.
            writer.forceMergeDeletes();
            writer.close();
            // Search — the equivalent of SQL SELECT.
            reader = DirectoryReader.open(directory);
            searcher = new IndexSearcher(reader);
            // Build the Query via a QueryParser over the given field.
            QueryParser parser = new QueryParser(fieldName, analyzer);
            // Default operator controls how whitespace-separated keywords
            // combine: AND (all must match) vs OR (any may match).
            parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = parser.parse(keyword);
            System.out.println("Query = " + query);
            // Fetch the top 2 hits by relevance.
            TopDocs top = searcher.search(query, 2);
            System.out.println("实际查询出:" + top.totalHits + "条数据");
            ScoreDoc[] score = top.scoreDocs;
            // Highlighting (lucene-highlighter): wrap matched terms in red <font> tags.
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            // Scores fragments against the query's terms.
            QueryScorer scorer = new QueryScorer(query);
            Highlighter lighter = new Highlighter(formatter, scorer);
            // Iterate over the hits actually returned. Note: score.length, not
            // top.totalHits — totalHits may exceed the 2 docs returned and
            // would overflow the array.
            for (int i = 0; i < score.length; i++) {
                Document target = searcher.doc(score[i].doc);
                String title = target.get(fieldName);
                if (title != null && !"".equals(title)) {
                    // Fragment size = full field length so the whole value is
                    // shown. (The original called title.length() before the
                    // null check — NPE risk on unstored fields.)
                    Fragmenter frag = new SimpleFragmenter(title.length());
                    lighter.setTextFragmenter(frag);
                    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(title));
                    title = lighter.getBestFragment(tokenStream, title);
                }
                System.out.println(target.get("id") + "-" + title);
            }
            System.out.println("存储的文档数:" + reader.numDocs());
            System.out.println("总存储量:" + reader.maxDoc());
            System.out.println("被删除文档数:" + reader.numDeletedDocs());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Also close the writer in case an exception fired before its
            // normal close (closing an already-closed IndexWriter is a no-op).
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ignored) {
                    // best-effort cleanup
                }
            }
            destoryResource(reader, directory);
        }
    }

    /**
     * Closes the reader and directory. Each resource is closed in its own
     * try/catch so a failure closing the reader does not leak the directory.
     * (Method name kept as-is for existing callers.)
     *
     * @param reader the index reader to close, may be null
     * @param directory the index directory to close, may be null
     */
    public static void destoryResource(IndexReader reader, Directory directory) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) {
        // Search the "title" field with the keyword "教育".
        example("title", "教育");
    }
}
Lucene 与数据库交互的示例:每次调用都会重新从数据库取数并重建内存索引,仅作演示,实际使用时可以优化;方法返回命中记录的结果集。
/* Lucene操作 */
/*
 * Lucene search over database rows: builds an in-memory index from all
 * enabled repository records, runs a full-text query on the title field,
 * and returns the matching records with the keyword highlighted in HTML.
 */
@Override
public List<RepositoryData> search(String title) {
    String fileName = "title"; // the indexed field to query and highlight
    List<RepositoryData> result = new ArrayList<RepositoryData>();
    // IKAnalyzer in smart segmentation mode.
    Analyzer analyzer = new IKAnalyzer(true);
    Directory directory = null;
    IndexWriter writer = null;
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        directory = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        writer = new IndexWriter(directory, config);
        // Index every enabled repository record (enableFlag = 1).
        RepositoryData queryInfo = new RepositoryData();
        queryInfo.setEnableFlag(1);
        List<RepositoryData> repositoryList = getDao().query(queryInfo);
        for (RepositoryData repository : repositoryList) {
            Document doc = new Document();
            doc.add(new StringField("id", repository.getId().toString(), Field.Store.YES));
            doc.add(new TextField("title", repository.getTitle(), Field.Store.YES));
            doc.add(new TextField("content", repository.getContent(), Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.close();
        // Search the freshly-built index.
        reader = DirectoryReader.open(directory);
        searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser("title", analyzer);
        // OR semantics between whitespace-separated keywords: any may match.
        parser.setDefaultOperator(QueryParser.OR_OPERATOR);
        Query query = parser.parse(title);
        TopDocs top = searcher.search(query, 1000);
        System.out.println("实际查询出:" + top.totalHits + "条数据");
        // Highlighting (lucene-highlighter): wrap matches in red <font> tags.
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        QueryScorer scorer = new QueryScorer(query);
        Highlighter lighter = new Highlighter(formatter, scorer);
        ScoreDoc[] score = top.scoreDocs;
        // Iterate over the hits actually returned. Note: score.length, not
        // top.totalHits — totalHits can exceed the 1000 docs returned and
        // would overflow the array.
        for (int i = 0; i < score.length; i++) {
            Document target = searcher.doc(score[i].doc);
            RepositoryData data = new RepositoryData();
            data.setId(Integer.valueOf(target.get("id")));
            // Use a local instead of clobbering the `title` parameter, which
            // the original overwrote on every loop iteration.
            String hitTitle = target.get(fileName);
            if (hitTitle != null && !"".equals(hitTitle)) {
                // Fragment size = full field length so the whole title is
                // shown. (The original called length() before the null check
                // — NPE risk.)
                Fragmenter frag = new SimpleFragmenter(hitTitle.length());
                lighter.setTextFragmenter(frag);
                TokenStream tokenStream = analyzer.tokenStream(fileName, new StringReader(hitTitle));
                hitTitle = lighter.getBestFragment(tokenStream, hitTitle);
            }
            data.setTitle(hitTitle);
            data.setContent(target.get("content"));
            result.add(data);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Also close the writer in case an exception fired before its normal
        // close (closing an already-closed IndexWriter is a no-op).
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignored) {
                // best-effort cleanup
            }
        }
        destoryResource(reader, directory);
    }
    return result;
}
/**
 * Closes the Lucene reader and directory. Each resource is closed in its own
 * try/catch so a failure closing the reader does not leak the directory —
 * the original shared one try block and skipped directory.close() whenever
 * reader.close() threw. (Method name kept as-is for existing callers.)
 *
 * @param reader the index reader to close, may be null
 * @param directory the index directory to close, may be null
 */
public void destoryResource(IndexReader reader, Directory directory) {
    if (reader != null) {
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    if (directory != null) {
        try {
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}