Lucene初步应用

最新推荐文章于 2024-10-17 17:50:29 发布

eininotop

最新推荐文章于 2024-10-17 17:50:29 发布

阅读量478

点赞数

分类专栏： lucene 文章标签： eclipse lucene 全文检索搜索引擎索引

本文链接：https://blog.csdn.net/eininotop/article/details/70193799

版权

lucene 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

先来一段百度百科解答：

Lucene是apache软件基金会jakarta项目组的一个子项目，是一个开放源代码的全文检索引擎工具包，但它不是一个完整的全文检索引擎，而是一个全文检索引擎的架构，提供了完整的查询引擎和索引引擎，部分文本分析引擎（英文与德文两种西方语言）。Lucene的目的是为软件开发人员提供一个简单易用的工具包，以方便地在目标系统中实现全文检索的功能，或者是以此为基础建立起完整的全文检索引擎。Lucene是一套用于全文检索和搜寻的开源程式库，由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程式接口，能够做全文索引和搜寻。在Java开发环境里Lucene是一个成熟的免费开源工具。就其本身而言，Lucene是当前以及最近几年最受欢迎的免费Java信息检索程序库。人们经常提到信息检索程序库，虽然与搜索引擎有关，但不应该将信息检索程序库与搜索引擎相混淆。

再来开始该博文的主要内容咯—如何使用Lucene咯

1、所需资源：

a、lucene-4.4.0.zip下的三个jar包：lucene-core-4.4.0.jar、lucene-queryparser-4.4.0.jar、lucene-analyzers-common-4.4.0.jar

b、IK Analyzer 2012FF_hf1.zip（IK 中文分词器）下的一个包：IKAnalyzer2012FF_u1.jar

c、jdk运行环境

d、Eclipse运行环境

2、建立Java Project,并将所需的4个jar包导入

3、IK 分词器需要配置的三个文件：

a、IKAnalyzer.cfg.xml

文本内容：

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>

   <comment>IK Analyzer 扩展配置</comment>
   <!--在这里配置自己的扩展字典-->
   <entry key="ext_dict">mydict.dic;</entry>

   <!--在这里配置自己的扩展停止词字典-->
   <entry key="ext_stopwords">ext_stopword.dic</entry>

</properties>

b、mydict.dic

文本内容：

且行且珍惜

c、ext_stopword.dic（可自行添加）

文本内容：

也
了
仍
从
以
使
则
却
又
及

4、创建工具类（1）LuceneUtils，负责获取IndexWriter和IndexSearcher

首先需要创建接口类，存放常量

package top.einino.utils;

/**
 * Shared constants for the Lucene demo.
 */
public interface Constant {

    /** File-system directory in which the Lucene index files are stored. */
    public static final String INDEX_DIR = "G:\\java\\index";

}

package top.einino.utils;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class LuceneUtils {

   public static Directory directory = null;
   public static IndexWriterConfig indexWriterConfig = null;
   public static Version matchVersion = null;
   public static Analyzer analyzer = null;

   static {

try {

           //索引写入的目录
           directory = FSDirectory.open(new File(Constant.INDEX_DIR));
           //Lucene的版本号
           matchVersion = Version.LUCENE_44;
           //使用庖丁分词器
           analyzer = new IKAnalyzer();
           //索引写入的配置
           indexWriterConfig = new org.apache.lucene.index.IndexWriterConfig(matchVersion, analyzer);

       } catch (IOException e) {
           e.printStackTrace();
       }

   }

   public static IndexWriter getIndexWriter() throws IOException{

       //创建索引类
       IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
       return indexWriter;

   }


   public static IndexSearcher getIndexSearcher() throws IOException{

       //读取索引目录类
       IndexReader indexReader = DirectoryReader.open(directory);
       //创建索引类
       IndexSearcher indexSearcher = new IndexSearcher(indexReader);
       return indexSearcher;

   }

   //方便引用
   public static Version getMatchVersion() {

return matchVersion;

   }
   //方便引用
   public static Analyzer getAnalyzer() {

return analyzer;

}

5、创建工具类（2）BlogDocumentUtils，负责pojo对象与document对象的转换

首先需要创建一个pojo对象

package top.einino.bean;

import java.util.Date;

/**
 * Plain data holder describing one blog entry that is indexed and searched.
 * Every property is optional and defaults to {@code null}.
 */
public class Blog {

    /** Numeric identifier of the entry. */
    private Integer id;
    /** Name of the author. */
    private String author;
    /** Title of the entry. */
    private String title;
    /** Body text of the entry. */
    private String content;
    /** Date associated with the entry. */
    private Date date;

    // ---- id ----

    /** @return the identifier, or {@code null} when unset */
    public Integer getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(Integer id) {
        this.id = id;
    }

    // ---- author ----

    /** @return the author name, or {@code null} when unset */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    // ---- title ----

    /** @return the title, or {@code null} when unset */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    // ---- content ----

    /** @return the body text, or {@code null} when unset */
    public String getContent() {
        return this.content;
    }

    /** @param content the body text to store */
    public void setContent(String content) {
        this.content = content;
    }

    // ---- date ----

    /** @return the stored date instance, or {@code null} when unset */
    public Date getDate() {
        return this.date;
    }

    /** @param date the date instance to store (kept by reference) */
    public void setDate(Date date) {
        this.date = date;
    }

}

package top.einino.utils;

import java.text.ParseException;
import java.text.SimpleDateFormat;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.queryparser.flexible.core.util.StringUtils;

import top.einino.bean.Blog;

public class BlogDocumentUtils {

/**
* 将对象转换成document
* @param blog
* @return
*/
public static Document blogToDocument(Blog blog){

Document document = new Document();
//第一个参数是Blog属性，第二个参数是pojo属性值，第三个参数是是否作为索引存储
if(blog.getId() != null){
document.add(new IntField(“id”, blog.getId(), Store.YES));
}
//StringField不支持分词，表示不再进行分词
if(blog.getAuthor() != null){
document.add(new StringField(“author”, blog.getAuthor(), Store.YES));
}
//StringField支持分词
if(blog.getTitle() != null){
document.add(new TextField(“title”, blog.getTitle(), Store.YES));
}
if(blog.getContent() != null){
document.add(new TextField(“content”, blog.getContent(), Store.YES));
}
if(blog.getDate() != null){
document.add(new StringField(“date”, new SimpleDateFormat(“yyyy-MM-dd HH:mm:ss”).format(blog.getDate()), Store.YES));
}
return document;

}
/**
* 将document转换成pojo对象
* @param document
* @return
* @throws ParseException
*/
public static Blog documentToBlog(Document document) throws ParseException{

Blog blog = new Blog();
blog.setId(Integer.parseInt(document.get(“id”)));
blog.setAuthor(document.get(“author”));
blog.setTitle(document.get(“title”));
blog.setContent(document.get(“content”));
blog.setDate(new SimpleDateFormat(“yyyy-MM-dd HH:mm:ss”).parse(document.get(“date”)));
return blog;

}

6、创建LuceneDao,实现增删改查及分页操作

package top.einino.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import top.einino.bean.Blog;
import top.einino.utils.BlogDocumentUtils;
import top.einino.utils.LuceneUtils;

public class LuceneDao {

//添加索引
public void addIndex(Blog blog) throws IOException{

//使用工具类获得索引创建类
IndexWriter indexWriter = LuceneUtils.getIndexWriter();
//使用工具类将blog对象转换成document对象
Document document = BlogDocumentUtils.blogToDocument(blog);
//创建索引
indexWriter.addDocument(document);
//关闭
indexWriter.close();

}

//根据关键词多字段检索索引
public List<Blog> findBlogList(String keyword) throws Exception{

List<Blog> blogList = new ArrayList<Blog>();
//使用工具类创建索引
IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher();
//需要检索字段
String[] fields = {“title”, “content”};
//String[] fields = {“id”};
//创建多字段检索规则，并配置Lucene版本和分词器使用规则、需要跟创建索引时对应上！
QueryParser queryParser = new MultiFieldQueryParser(LuceneUtils.getMatchVersion(), fields, LuceneUtils.getAnalyzer());
//传入检索关键词
Query query = queryParser.parse(keyword);
//第一次检索：主要是检索出索引、获得检索结果，第一个参数是检索规则，第二个参数是获得前100条符合的结果
TopDocs topDocs = indexSearcher.search(query, 100);
//输出总记录数，主要是为了测试
System.out.println(“总记录数：”+topDocs.totalHits);
//获得检索命中结果集
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
if(scoreDocs != null && scoreDocs.length > 0){

for(ScoreDoc scoreDoc : scoreDocs){

//获得存储id
int docId = scoreDoc.doc;
//第二次检索,通过存储id获得document对象
Document document = indexSearcher.doc(docId);
//使用工具类将document对象转换成blog对象
Blog blog = BlogDocumentUtils.documentToBlog(document);
//将blog对象放入集合中
blogList.add(blog);

}

}
return blogList;

}

//根据关键词多字段检索索引并使用分页，startRow表示起始行，rows表示每页行数
public List<Blog> findBlogListWithPage(String keyword, int startRow, int rows) throws Exception{

List<Blog> blogList = new ArrayList<Blog>();
//使用工具类创建索引
IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher();
//需要检索字段
String[] fields = {“title”, “content”};
//创建多字段检索规则，并配置Lucene版本和分词器使用规则、需要跟创建索引时对应上！
QueryParser queryParser = new MultiFieldQueryParser(LuceneUtils.getMatchVersion(), fields, LuceneUtils.getAnalyzer());
//传入检索关键词
Query query = queryParser.parse(keyword);
//第一次检索：主要是检索出索引、获得检索结果，第一个参数是检索规则，第二个参数是获得前100条符合的结果
TopDocs topDocs = indexSearcher.search(query, 100);
//输出总记录数，主要是为了测试
System.out.println(“总记录数：”+topDocs.totalHits);
//获得检索命中结果集
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
//分页,能遍历到的最终行，但结果集可能没有这么多行
int finalRow = startRow + rows;
//取两者最小值，如果最终行大于结果集数，则只需要遍历到结果集，如果最终行小于结果集，则遍历到最终行
int min = Math.min(finalRow, scoreDocs.length);
if(scoreDocs != null && scoreDocs.length > 0){

for(int i=startRow; i<min; i++){

//获得存储id
int docId = scoreDocs[i].doc;
//第二次检索,通过存储id获得document对象
Document document = indexSearcher.doc(docId);
//使用工具类将document对象转换成blog对象
Blog blog = BlogDocumentUtils.documentToBlog(document);
//将blog对象放入集合中
blogList.add(blog);

}

}
return blogList;

}
//删除文档，第一个参数为删除的字段名，第二个参数为删除的符合的字段值
public void deleteIndex(String fileName, String fileValue) throws IOException{

IndexWriter indexWriter = LuceneUtils.getIndexWriter();
indexWriter.deleteDocuments(new Term(fileName, fileValue));
indexWriter.commit();
indexWriter.close();

}

//更新文档
public void updateIndex(String fileName, String fileValue, Blog blog) throws IOException {

IndexWriter indexWriter = LuceneUtils.getIndexWriter();
//执行过程：是先删除字段名为fileName,符合fileValue的文档，然后再创建索引文档
indexWriter.updateDocument(new Term(fileName, fileValue), BlogDocumentUtils.blogToDocument(blog));
indexWriter.commit();
indexWriter.close();

}

7、创建测试用例TestLuceneDao,可以一边实现方法，一边调测试用例

package top.einino.junit;

import java.io.IOException;
import java.util.Date;
import java.util.List;

import org.junit.Test;

import top.einino.bean.Blog;
import top.einino.lucene.LuceneDao;

public class TestLuceneDao {

private LuceneDao luceneDao = new LuceneDao();
//测试添加索引方法、调用该方法即在G:\\java\\index目录下会生成索引文件
@Test
public void testAddIndex() throws IOException{

//为测试分页
for(int i=1; i<35; i++){

Blog blog = new Blog();
blog.setId(i);
blog.setAuthor(“郑先生”);
blog.setTitle(“Lucene应用”);
blog.setContent(“该博文内容主要是讲如何使用Lucene创建索引，和检索索引”);
blog.setDate(new Date());
luceneDao.addIndex(blog);

}

//测试查询检索方法
@Test
public void testFindBlogIndex() throws Exception{

String keyword = “医药应用”;
List<Blog> blogList = luceneDao.findBlogList(keyword);
if(blogList != null && blogList.size() > 0){

for(Blog blog : blogList){

System.out.println(“id:”+blog.getId()
+” author:”+blog.getAuthor()
+” title:”+blog.getTitle()
+” content:”+blog.getContent()
+” date:”+blog.getDate());

}

//测试分页查询检索方法
@Test
public void testFindBlogIndexWithPage() throws Exception{

String keyword = “Lucene”;
List<Blog> blogList = luceneDao.findBlogListWithPage(keyword, 30, 10);
if(blogList != null && blogList.size() > 0){

for(Blog blog : blogList){

System.out.println(“id:”+blog.getId()
+” author:”+blog.getAuthor()
+” title:”+blog.getTitle()
+” content:”+blog.getContent()
+” date:”+blog.getDate());

}

//测试删除索引方法
@Test
public void testDeleteIndex() throws IOException{

luceneDao.deleteIndex(“author”, “郑先生”);

}

//测试更新方法
@Test
public void testUpdateIndex() throws IOException{

Blog blog = new Blog();
blog.setId(35);
blog.setAuthor(“张女士”);
blog.setTitle(“医药应用”);
blog.setContent(“医药应用”);
blog.setDate(new Date());
luceneDao.updateIndex(“author”, “郑先生”, blog);

}