1.什么是lucene
Lucene是一个开放源代码的全文检索引擎工具包,提供了简单而强大的应用程序接口(API),能够进行全文索引和搜索。
2.依赖
<!-- Lucene version properties: place these inside the <properties> section of pom.xml.
     All Lucene artifacts must share the same version (4.7.2 here). -->
<lucene-analyzers-common.version>4.7.2</lucene-analyzers-common.version>
<lucene-core.version>4.7.2</lucene-core.version>
<lucene-facet.version>4.7.2</lucene-facet.version>
<lucene-highlighter.version>4.7.2</lucene-highlighter.version>
<lucene-queries.version>4.7.2</lucene-queries.version>
<lucene-queryparser.version>4.7.2</lucene-queryparser.version>
<!-- lucene begin -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene-analyzers-common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene-core.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<version>${lucene-facet.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene-highlighter.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene-queries.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene-queryparser.version}</version>
</dependency>
<!-- lucene end -->
<!-- ikanalyzer: Chinese word-segmentation analyzer (IK Analyzer) for Lucene -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
<!-- Exclude the Lucene jars that ikanalyzer pulls in transitively, so that
     only the 4.7.2 versions declared above end up on the classpath and
     there is no version conflict. -->
<exclusions>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
</exclusion>
</exclusions>
</dependency>
3.使用demo
package com.tbtx.imaijia.controller;
import com.tbtx.imaijia.biz.ArticleBiz;
import com.tbtx.imaijia.domain.bo.ArticleBO;
import com.tbtx.imaijia.domain.query.ContentArticleQuery;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo of basic Lucene 4.7 usage with the IK Chinese analyzer:
 * building an index, clearing it, and running a multi-field query.
 *
 * @author chengxn
 * @date 2019/10/29
 */
public class LuceneTest {

    /**
     * Populates the index with 999 generated sample articles.
     *
     * <p>Each article is converted into a Lucene {@link Document}; the writer
     * is always closed so the index write lock is released even on failure
     * (the original version leaked the writer when addDocument threw).
     *
     * @throws Exception if the index directory cannot be opened or written
     */
    public static void index() throws Exception {
        // Generate sample data to index.
        List<ArticleBO> list = new ArrayList<>();
        for (int i = 1; i < 1000; i++) {
            ArticleBO bo = new ArticleBO();
            bo.setId(Long.valueOf(i));
            bo.setTitle("标题");
            bo.setSummary("摘要");
            bo.setContent("内容是我呀互联网技术更新速度高级管理员产品技术");
            list.add(bo);
        }
        // Convert each article into a Document.
        // Field.Store.YES keeps the raw value retrievable from search results.
        // LongField/StringField are indexed without analysis (no tokenizing);
        // TextField is analyzed by the writer's analyzer (IKAnalyzer, see
        // getIndexWriter()). Null values are replaced with "" because Lucene
        // field constructors reject null.
        List<Document> docList = new ArrayList<>();
        for (ArticleBO article : list) {
            Document document = new Document();
            LongField id = new LongField("id", article.getId(), Field.Store.YES);
            TextField title = new TextField("title", article.getTitle() == null ? "" : article.getTitle(), Field.Store.YES);
            TextField summary = new TextField("summary", article.getSummary() == null ? "" : article.getSummary(), Field.Store.YES);
            TextField content = new TextField("content", article.getContent() == null ? "" : article.getContent(), Field.Store.YES);
            StringField author = new StringField("author", article.getAuthor() == null ? "" : article.getAuthor(), Field.Store.YES);
            StringField tag = new StringField("tag", article.getTag() == null ? "" : article.getTag(), Field.Store.YES);
            StringField image = new StringField("image", article.getImage() == null ? "" : article.getImage(), Field.Store.YES);
            document.add(id);
            document.add(title);
            document.add(summary);
            document.add(content);
            document.add(author);
            document.add(tag);
            document.add(image);
            docList.add(document);
        }
        // Write all documents to the index; close in finally so the index
        // lock is always released.
        IndexWriter writer = getIndexWriter();
        try {
            for (Document doc : docList) {
                writer.addDocument(doc);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Creates an {@link IndexWriter} over the index directory, configured
     * with the IK Chinese analyzer.
     *
     * @throws Exception if the index directory cannot be opened
     */
    public static IndexWriter getIndexWriter() throws Exception {
        // The index-time analyzer must match the query-time analyzer (see main()).
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        File indexFile = new File(getIndexPath());
        Directory directory = FSDirectory.open(indexFile);
        return new IndexWriter(directory, cfg);
    }

    /**
     * Returns the index directory path, creating the directory if it does
     * not exist yet. NOTE(review): the path is hard-coded to a developer
     * machine; externalize it for real use (see LuceneBizImpl).
     */
    private static String getIndexPath() {
        String path = "/Users/tbtx/Desktop/Lucene索引库/cxn";
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        return path;
    }

    /**
     * Deletes every document from the index and commits the deletion.
     * Errors are logged rather than propagated.
     */
    public static void deleteAll() {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            writer.deleteAll();
            // numDocs() here reflects the state after deleteAll().
            int cnt = writer.numDocs();
            System.out.println("索引条数\t" + cnt);
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Closes the writer if non-null; close failures are logged, not thrown. */
    private static void closeWriter(IndexWriter writer) {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes the given query and prints the first 100 hits
     * (id, title, summary and content of each matching document).
     *
     * @param query the Lucene query to run
     */
    public static void excQuery(Query query) {
        IndexReader reader = null;
        try {
            reader = getIndexReader();
            IndexSearcher searcher = new IndexSearcher(reader);
            // Retrieve at most the top 100 hits.
            TopDocs topDocs = searcher.search(query, 100);
            int totalSize = topDocs.totalHits;
            System.out.println("总数" + totalSize + "条");
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = reader.document(scoreDoc.doc);
                System.out.println(doc.get("id") + ":" + doc.get("title") + ":" + doc.get("summary") + ":" + doc.get("content"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
    }

    /**
     * Opens a read-only {@link IndexReader} over the index directory.
     *
     * @throws Exception if the index directory cannot be opened
     */
    public static IndexReader getIndexReader() throws Exception {
        String path = getIndexPath();
        FSDirectory fs = FSDirectory.open(new File(path));
        return DirectoryReader.open(fs);
    }

    /** Closes the reader if non-null; close failures are logged, not thrown. */
    public static void closeReader(IndexReader reader) {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        // Clear then rebuild the index.
        deleteAll();
        index();
        // The query-time analyzer must be the same as the index-time analyzer
        // (IKAnalyzer). The String[] lists the default fields to search.
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47, new String[]{"title", "content"}, new IKAnalyzer());
        // Note: Lucene query *operators* (AND / OR / NOT) must be upper-case.
        Query query = parser.parse("互联网技术");
        excQuery(query);
    }
}
4.项目中使用会有分页及高亮需求
package com.tbtx.imaijia.biz.impl;
import com.tbtx.imaijia.biz.LuceneBiz;
import com.tbtx.imaijia.domain.bo.ArticleBO;
import com.tbtx.imaijia.domain.query.SearchQuery;
import com.tbtx.utils.model.PageResult;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Lucene-backed article search service: index maintenance plus paginated,
 * highlighted querying. The index directory is configurable via
 * {@code lucene.search.index.path}.
 *
 * @author chengxn
 * @date 2019/10/30
 */
@Service
public class LuceneBizImpl implements LuceneBiz {

    /** Filesystem path of the Lucene index directory (from configuration). */
    @Value("${lucene.search.index.path}")
    private String indexPath;

    /**
     * Converts the given articles into Lucene documents and appends them to
     * the index. The writer is always closed so the index write lock is
     * released even on failure (the original leaked it when addDocument threw).
     *
     * @param articleBOList articles to index
     * @throws Exception if the index cannot be opened or written
     */
    @Override
    public void addIndex(List<ArticleBO> articleBOList) throws Exception {
        List<Document> docList = new ArrayList<>();
        for (ArticleBO article : articleBOList) {
            Document document = new Document();
            // id: indexed numerically, stored
            LongField id = new LongField("id", article.getId(), Field.Store.YES);
            // title: analyzed, indexed, stored
            TextField title = new TextField("title", article.getTitle() == null ? "" : article.getTitle(), Field.Store.YES);
            // summary: analyzed, indexed, stored
            TextField summary = new TextField("summary", article.getSummary() == null ? "" : article.getSummary(), Field.Store.YES);
            // content: analyzed and indexed but NOT stored (bodies can be large;
            // excQuery() never reads it back from the document store)
            TextField content = new TextField("content", article.getContent() == null ? "" : article.getContent(), Field.Store.NO);
            // author / tag: analyzed, indexed, stored
            TextField author = new TextField("author", article.getAuthor() == null ? "" : article.getAuthor(), Field.Store.YES);
            TextField tag = new TextField("tag", article.getTag() == null ? "" : article.getTag(), Field.Store.YES);
            // image / columnName: indexed verbatim (not analyzed), stored
            StringField image = new StringField("image", article.getImage() == null ? "" : article.getImage(), Field.Store.YES);
            StringField columnName = new StringField("columnName", article.getColumnName() == null ? "" : article.getColumnName(), Field.Store.YES);
            // publishTime: second-resolution DateTools string, indexed verbatim
            // and stored so excQuery() can parse it back with stringToDate().
            Field publishTime = new Field("publishTime", DateTools.dateToString(article.getPublishTime(), DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.NOT_ANALYZED);
            document.add(id);
            document.add(title);
            document.add(summary);
            document.add(content);
            document.add(author);
            document.add(tag);
            document.add(image);
            document.add(columnName);
            document.add(publishTime);
            docList.add(document);
        }
        IndexWriter writer = getIndexWriter();
        try {
            for (Document doc : docList) {
                writer.addDocument(doc);
            }
        } finally {
            // Always release the index write lock.
            writer.close();
        }
    }

    /**
     * Full rebuild: wipes the index and re-indexes the given articles.
     *
     * @param articleBOList the complete replacement data set
     * @throws Exception if re-indexing fails (deleteAll itself logs errors)
     */
    @Override
    public void updateIndex(List<ArticleBO> articleBOList) throws Exception {
        deleteAll();
        addIndex(articleBOList);
    }

    /**
     * Runs a paginated, highlighted search over title/summary/content/tag.
     *
     * <p>Bug fix: the original loop condition
     * {@code i < totalHits || i < pageNum*pageSize} indexed past the end of
     * {@code scoreDocs} (ArrayIndexOutOfBoundsException) whenever the total
     * hit count or the requested page range exceeded the number of hits
     * actually returned; the bound is now clamped to {@code scores.length}.
     *
     * @param searchQuery keyword plus 1-based page number and page size
     * @return a page of matching articles with query terms highlighted in
     *         title and summary; empty page (total 0) on internal errors
     * @throws ParseException if the keyword cannot be parsed as a query
     */
    @Override
    public PageResult<ArticleBO> excQuery(SearchQuery searchQuery) throws ParseException {
        PageResult<ArticleBO> resultPage = new PageResult<>(new ArrayList<ArticleBO>());
        List<ArticleBO> articleBOList = new ArrayList<>();
        // Query-time analyzer must match the index-time analyzer (IKAnalyzer).
        Analyzer analyzer = new IKAnalyzer();
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_47, new String[]{"title", "summary", "content", "tag"}, analyzer);
        Query query = parser.parse(searchQuery.getKeyWord());
        IndexReader reader = null;
        try {
            reader = getIndexReader();
            IndexSearcher indexSearcher = new IndexSearcher(reader);
            // Fetch all hits up to the end of the requested page; earlier
            // pages are skipped below via `start`.
            int requested = searchQuery.getPageNum() * searchQuery.getPageSize();
            TopDocs topDocs = indexSearcher.search(query, requested);
            ScoreDoc[] scores = topDocs.scoreDocs;
            int start = (searchQuery.getPageNum() - 1) * searchQuery.getPageSize();
            // Clamp to the hits actually returned (see Javadoc bug-fix note).
            int end = Math.min(scores.length, requested);
            // Highlight matched terms; default formatter would be <b></b>,
            // this one wraps matches in bold red.
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color=red>", "</font></b>");
            // Scores fragments against the query so the best one is chosen.
            QueryScorer scorer = new QueryScorer(query);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
            highlighter.setTextFragmenter(fragmenter);
            for (int i = start; i < end; i++) {
                Document doc = reader.document(scores[i].doc);
                TokenStream summaryStream = analyzer.tokenStream("summary", new StringReader(doc.get("summary")));
                String summary = highlighter.getBestFragment(summaryStream, doc.get("summary"));
                TokenStream titleStream = analyzer.tokenStream("title", new StringReader(doc.get("title")));
                String title = highlighter.getBestFragment(titleStream, doc.get("title"));
                ArticleBO articleBO = new ArticleBO();
                articleBO.setId(Long.parseLong(doc.get("id")));
                articleBO.setAuthor(doc.get("author"));
                articleBO.setTag(doc.get("tag"));
                articleBO.setImage(doc.get("image"));
                articleBO.setColumnName(doc.get("columnName"));
                articleBO.setPublishTime(DateTools.stringToDate(doc.get("publishTime")));
                // getBestFragment returns null when no query term occurs in
                // the field; fall back to the raw stored value.
                articleBO.setTitle(title == null ? doc.get("title") : title);
                articleBO.setSummary(summary == null ? doc.get("summary") : summary);
                articleBOList.add(articleBO);
            }
            resultPage.setTotal(Long.valueOf(topDocs.totalHits));
            resultPage.setDatas(articleBOList);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
        return resultPage;
    }

    /** Creates an IndexWriter over the configured index directory (IKAnalyzer). */
    private IndexWriter getIndexWriter() throws Exception {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        File indexFile = new File(getIndexPath());
        Directory directory = FSDirectory.open(indexFile);
        return new IndexWriter(directory, cfg);
    }

    /** Returns the configured index path, creating the directory if absent. */
    private String getIndexPath() {
        String path = indexPath;
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        return path;
    }

    /** Closes the writer if non-null; close failures are logged, not thrown. */
    private void closeWriter(IndexWriter writer) {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens a read-only IndexReader over the configured index directory. */
    private IndexReader getIndexReader() throws Exception {
        String path = getIndexPath();
        FSDirectory fs = FSDirectory.open(new File(path));
        return DirectoryReader.open(fs);
    }

    /** Closes the reader if non-null; close failures are logged, not thrown. */
    private void closeReader(IndexReader reader) {
        try {
            if (reader != null) {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Deletes every document from the index and commits; errors are logged. */
    private void deleteAll() {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            writer.deleteAll();
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }
}