1.什么是lucene
Lucene是一个开放源代码的全文检索引擎工具包,提供了简单而强大的应用程序接口(API),能够进行全文索引和搜索。
2.依赖
<!-- Lucene version properties: place these inside the <properties> section of pom.xml.
     All Lucene artifacts must share the same version (4.7.2 here). -->
<lucene-analyzers-common.version>4.7.2</lucene-analyzers-common.version>
<lucene-core.version>4.7.2</lucene-core.version>
<lucene-facet.version>4.7.2</lucene-facet.version>
<lucene-highlighter.version>4.7.2</lucene-highlighter.version>
<lucene-queries.version>4.7.2</lucene-queries.version>
<lucene-queryparser.version>4.7.2</lucene-queryparser.version>
<!-- lucene begin -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene-analyzers-common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene-core.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<version>${lucene-facet.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene-highlighter.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene-queries.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene-queryparser.version}</version>
</dependency>
<!-- lucene end -->
<!-- ikanalyzer: Chinese word-segmentation analyzer (IK Analyzer) for Lucene -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
<!-- Exclude the Lucene jars that ikanalyzer pulls in transitively, so that
     only the 4.7.2 versions declared above end up on the classpath and
     there is no version conflict. -->
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
</exclusion>
</exclusions>
</dependency>
3.使用demo
package com.tbtx.imaijia.controller;
import com.tbtx.imaijia.biz.ArticleBiz;
import com.tbtx.imaijia.domain.bo.ArticleBO;
import com.tbtx.imaijia.domain.query.ContentArticleQuery;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo of basic Lucene 4.7 usage with the IK Chinese analyzer:
 * building an index, clearing it, and running a multi-field query.
 *
 * @author chengxn
 * @date 2019/10/29
 */
public class LuceneTest {

    /**
     * Populates the index with 999 generated sample articles.
     *
     * <p>Each article is converted into a Lucene {@link Document}; the writer
     * is always closed so the index write lock is released even on failure
     * (the original version leaked the writer when addDocument threw).
     *
     * @throws Exception if the index directory cannot be opened or written
     */
    public static void index() throws Exception {
        // Generate sample data to index.
        List<ArticleBO> list = new ArrayList<>();
        for (int i = 1; i < 1000; i++) {
            ArticleBO bo = new ArticleBO();
            bo.setId(Long.valueOf(i));
            bo.setTitle("标题");
            bo.setSummary("摘要");
            bo.setContent("内容是我呀互联网技术更新速度高级管理员产品技术");
            list.add(bo);
        }
        // Convert each article into a Document.
        // Field.Store.YES keeps the raw value retrievable from search results.
        // LongField/StringField are indexed without analysis (no tokenizing);
        // TextField is analyzed by the writer's analyzer (IKAnalyzer, see
        // getIndexWriter()). Null values are replaced with "" because Lucene
        // field constructors reject null.
        List<Document> docList = new ArrayList<>();
        for (ArticleBO article : list) {
            Document document = new Document();
            LongField id = new LongField("id", article.getId(), Field.Store.YES);
            TextField title = new TextField("title", article.getTitle() == null ? "" : article.getTitle(), Field.Store.YES);
            TextField summary = new TextField("summary", article.getSummary() == null ? "" : article.getSummary(), Field.Store.YES);
            TextField content = new TextField("content", article.getContent() == null ? "" : article.getContent(), Field.Store.YES);
            StringField author = new StringField("author", article.getAuthor() == null ? "" : article.getAuthor(), Field.Store.YES);
            StringField tag = new StringField("tag", article.getTag() == null ? "" : article.getTag(), Field.Store.YES);
            StringField image = new StringField("image", article.getImage() == null ? "" : article.getImage(), Field.Store.YES);
            document.add(id);
            document.add(title);
            document.add(summary);
            document.add(content);
            document.add(author);
            document.add(tag);
            document.add(image);
            docList.add(document);
        }
        // Write all documents to the index; close in finally so the index
        // lock is always released.
        IndexWriter writer = getIndexWriter();
        try {
            for (Document doc : docList) {
                writer.addDocument(doc);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Creates an {@link IndexWriter} over the index directory, configured
     * with the IK Chinese analyzer.
     *
     * @throws Exception if the index directory cannot be opened
     */
    public static IndexWriter getIndexWriter() throws Exception {
        // The index-time analyzer must match the query-time analyzer (see main()).
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        File indexFile = new File(getIndexPath());
        Directory directory = FSDirectory.open(indexFile);
        return new IndexWriter(directory, cfg);
    }

    /**
     * Returns the index directory path, creating the directory if it does
     * not exist yet. NOTE(review): the path is hard-coded to a developer
     * machine; externalize it for real use (see LuceneBizImpl).
     */
    private static String getIndexPath() {
        String path = "/Users/tbtx/Desktop/Lucene索引库/cxn";
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        return path;
    }

    /**
     * Deletes every document from the index and commits the deletion.
     * Errors are logged rather than propagated.
     */
    public static void deleteAll() {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            writer.deleteAll();
            // numDocs() here reflects the state after deleteAll().
            int cnt = writer.numDocs();
            System.out.println("索引条数\t" + cnt);
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Closes the writer if non-null; close failures are logged, not thrown. */
    private static void closeWriter(IndexWriter writer) {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes the given query and prints the first 100 hits
     * (id, title, summary and content of each matching document).
     *
     * @param query the Lucene query to run
     */
    public static void excQuery(Query query) {
        IndexReader reader = null;
        try {
            reader = getIndexReader();
            IndexSearcher searcher = new IndexSearcher(reader);
            // Retrieve at most the top 100 hits.
            TopDocs topDocs = searcher.search(query, 100);
            int totalSize = topDocs.totalHits;
            System.out.println("总数" + totalSize + "条");
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = reader.document(scoreDoc.doc);
                System.out.println(doc.get("id") + ":" + doc.get("title") + ":" + doc.get("summary") + ":" + doc.get("content"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
    }

    /**
     * Opens a read-only {@link IndexReader} over the index directory.
     *
     * @throws Exception if the index directory cannot be opened
     */
    public static IndexReader getIndexReader() throws Exception {
        String path = getIndexPath();
        FSDirectory fs = FSDirectory.open(new File(path));
        return DirectoryReader.open(fs);
    }

    /** Closes the reader if non-null; close failures are logged, not thrown. */
    public static void closeReader(IndexReader reader) {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        // Clear then rebuild the index.
        deleteAll();
        index();
        // The query-time analyzer must be the same as the index-time analyzer
        // (IKAnalyzer). The String[] lists the default fields to search.
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47, new String[]{"title", "content"}, new IKAnalyzer());
        // Note: Lucene query *operators* (AND / OR / NOT) must be upper-case.
        Query query = parser.parse("互联网技术");
        excQuery(query);
    }
}
4.项目中使用会有分页及高亮需求
package com.tbtx.imaijia.biz.impl;
import com.tbtx.imaijia.biz.LuceneBiz;
import com.tbtx.imaijia.domain.bo.ArticleBO;
import com.tbtx.imaijia.domain.query.SearchQuery;
import com.tbtx.utils.model.PageResult;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Lucene-backed article search service: index maintenance plus paginated,
 * highlighted querying. The index directory is configurable via
 * {@code lucene.search.index.path}.
 *
 * @author chengxn
 * @date 2019/10/30
 */
@Service
public class LuceneBizImpl implements LuceneBiz {

    /** Filesystem path of the Lucene index directory (from configuration). */
    @Value("${lucene.search.index.path}")
    private String indexPath;

    /**
     * Converts the given articles into Lucene documents and appends them to
     * the index. The writer is always closed so the index write lock is
     * released even on failure (the original leaked it when addDocument threw).
     *
     * @param articleBOList articles to index
     * @throws Exception if the index cannot be opened or written
     */
    @Override
    public void addIndex(List<ArticleBO> articleBOList) throws Exception {
        List<Document> docList = new ArrayList<>();
        for (ArticleBO article : articleBOList) {
            Document document = new Document();
            // id: indexed numerically, stored
            LongField id = new LongField("id", article.getId(), Field.Store.YES);
            // title: analyzed, indexed, stored
            TextField title = new TextField("title", article.getTitle() == null ? "" : article.getTitle(), Field.Store.YES);
            // summary: analyzed, indexed, stored
            TextField summary = new TextField("summary", article.getSummary() == null ? "" : article.getSummary(), Field.Store.YES);
            // content: analyzed and indexed but NOT stored (bodies can be large;
            // excQuery() never reads it back from the document store)
            TextField content = new TextField("content", article.getContent() == null ? "" : article.getContent(), Field.Store.NO);
            // author / tag: analyzed, indexed, stored
            TextField author = new TextField("author", article.getAuthor() == null ? "" : article.getAuthor(), Field.Store.YES);
            TextField tag = new TextField("tag", article.getTag() == null ? "" : article.getTag(), Field.Store.YES);
            // image / columnName: indexed verbatim (not analyzed), stored
            StringField image = new StringField("image", article.getImage() == null ? "" : article.getImage(), Field.Store.YES);
            StringField columnName = new StringField("columnName", article.getColumnName() == null ? "" : article.getColumnName(), Field.Store.YES);
            // publishTime: second-resolution DateTools string, indexed verbatim
            // and stored so excQuery() can parse it back with stringToDate().
            Field publishTime = new Field("publishTime", DateTools.dateToString(article.getPublishTime(), DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.NOT_ANALYZED);
            document.add(id);
            document.add(title);
            document.add(summary);
            document.add(content);
            document.add(author);
            document.add(tag);
            document.add(image);
            document.add(columnName);
            document.add(publishTime);
            docList.add(document);
        }
        IndexWriter writer = getIndexWriter();
        try {
            for (Document doc : docList) {
                writer.addDocument(doc);
            }
        } finally {
            // Always release the index write lock.
            writer.close();
        }
    }

    /**
     * Full rebuild: wipes the index and re-indexes the given articles.
     *
     * @param articleBOList the complete replacement data set
     * @throws Exception if re-indexing fails (deleteAll itself logs errors)
     */
    @Override
    public void updateIndex(List<ArticleBO> articleBOList) throws Exception {
        deleteAll();
        addIndex(articleBOList);
    }

    /**
     * Runs a paginated, highlighted search over title/summary/content/tag.
     *
     * <p>Bug fix: the original loop condition
     * {@code i < totalHits || i < pageNum*pageSize} indexed past the end of
     * {@code scoreDocs} (ArrayIndexOutOfBoundsException) whenever the total
     * hit count or the requested page range exceeded the number of hits
     * actually returned; the bound is now clamped to {@code scores.length}.
     *
     * @param searchQuery keyword plus 1-based page number and page size
     * @return a page of matching articles with query terms highlighted in
     *         title and summary; empty page (total 0) on internal errors
     * @throws ParseException if the keyword cannot be parsed as a query
     */
    @Override
    public PageResult<ArticleBO> excQuery(SearchQuery searchQuery) throws ParseException {
        PageResult<ArticleBO> resultPage = new PageResult<>(new ArrayList<ArticleBO>());
        List<ArticleBO> articleBOList = new ArrayList<>();
        // Query-time analyzer must match the index-time analyzer (IKAnalyzer).
        Analyzer analyzer = new IKAnalyzer();
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47, new String[]{"title", "summary", "content", "tag"}, analyzer);
        Query query = parser.parse(searchQuery.getKeyWord());
        IndexReader reader = null;
        try {
            reader = getIndexReader();
            IndexSearcher indexSearcher = new IndexSearcher(reader);
            // Fetch all hits up to the end of the requested page; earlier
            // pages are skipped below via `start`.
            int requested = searchQuery.getPageNum() * searchQuery.getPageSize();
            TopDocs topDocs = indexSearcher.search(query, requested);
            ScoreDoc[] scores = topDocs.scoreDocs;
            int start = (searchQuery.getPageNum() - 1) * searchQuery.getPageSize();
            // Clamp to the hits actually returned (see Javadoc bug-fix note).
            int end = Math.min(scores.length, requested);
            // Highlight matched terms; default formatter would be <b></b>,
            // this one wraps matches in bold red.
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color=red>", "</font></b>");
            // Scores fragments against the query so the best one is chosen.
            QueryScorer scorer = new QueryScorer(query);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
            highlighter.setTextFragmenter(fragmenter);
            for (int i = start; i < end; i++) {
                Document doc = reader.document(scores[i].doc);
                TokenStream summaryStream = analyzer.tokenStream("summary", new StringReader(doc.get("summary")));
                String summary = highlighter.getBestFragment(summaryStream, doc.get("summary"));
                TokenStream titleStream = analyzer.tokenStream("title", new StringReader(doc.get("title")));
                String title = highlighter.getBestFragment(titleStream, doc.get("title"));
                ArticleBO articleBO = new ArticleBO();
                articleBO.setId(Long.parseLong(doc.get("id")));
                articleBO.setAuthor(doc.get("author"));
                articleBO.setTag(doc.get("tag"));
                articleBO.setImage(doc.get("image"));
                articleBO.setColumnName(doc.get("columnName"));
                articleBO.setPublishTime(DateTools.stringToDate(doc.get("publishTime")));
                // getBestFragment returns null when no query term occurs in
                // the field; fall back to the raw stored value.
                articleBO.setTitle(title == null ? doc.get("title") : title);
                articleBO.setSummary(summary == null ? doc.get("summary") : summary);
                articleBOList.add(articleBO);
            }
            resultPage.setTotal(Long.valueOf(topDocs.totalHits));
            resultPage.setDatas(articleBOList);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
        return resultPage;
    }

    /** Creates an IndexWriter over the configured index directory (IKAnalyzer). */
    private IndexWriter getIndexWriter() throws Exception {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        File indexFile = new File(getIndexPath());
        Directory directory = FSDirectory.open(indexFile);
        return new IndexWriter(directory, cfg);
    }

    /** Returns the configured index path, creating the directory if absent. */
    private String getIndexPath() {
        String path = indexPath;
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        return path;
    }

    /** Closes the writer if non-null; close failures are logged, not thrown. */
    private void closeWriter(IndexWriter writer) {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens a read-only IndexReader over the configured index directory. */
    private IndexReader getIndexReader() throws Exception {
        String path = getIndexPath();
        FSDirectory fs = FSDirectory.open(new File(path));
        return DirectoryReader.open(fs);
    }

    /** Closes the reader if non-null; close failures are logged, not thrown. */
    private void closeReader(IndexReader reader) {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Deletes every document from the index and commits; errors are logged. */
    private void deleteAll() {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            writer.deleteAll();
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }
}