Lucene学习二：封装操作接口，实现删除、更新索引

最新推荐文章于 2024-03-11 21:30:51 发布

z781582206

最新推荐文章于 2024-03-11 21:30:51 发布

阅读量423

点赞数

分类专栏： lucene 文章标签： lucene 索引

本文链接：https://blog.csdn.net/z781582206/article/details/77865295

版权

lucene 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

索引文件可以理解为一个数据库，既然是数据库，那么就应该包含增删改查。在此我们先定义两个接口 IndexInterface 和 QueryInterface，分别表示索引相关的操作和查询相关的操作。

IndexInterface代码如下：

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;

/**
 * Write-side operations on the search index: adding, updating and deleting
 * indexed records.
 */
public interface IndexInterface {
  /**
   * Indexes the record with the given id and title.
   *
   * @param id    business id of the record
   * @param title text of the record
   * @throws IOException if the index cannot be written
   */
  void create(Long id, String title) throws IOException;
  
  /**
   * Updates the index entry of a single record, identified by its id.
   *
   * @param id    business id of the record to update
   * @param title new text of the record
   * @throws IOException if the index cannot be written
   */
  void updateOne(Long id, String title) throws IOException;
  
  /**
   * Deletes every entry from the index.
   *
   * @throws IOException if the index cannot be written
   */
  void delete() throws IOException;
  
  /**
   * Deletes the index entries that match {@code key} in {@code field}.
   *
   * @param field name of the field to match against
   * @param key   query text selecting the entries to delete
   * @throws IOException    if the index cannot be written
   * @throws ParseException if {@code key} cannot be parsed as a query
   */
  void delete(String field, String key) throws IOException, ParseException;
  
  /**
   * Deletes the index entries selected by the given queries.
   *
   * @param querys queries selecting the entries to delete
   * @throws IOException if the index cannot be written
   */
  void delete(Query[] querys) throws IOException;
  
  /**
   * Deletes the index entries selected by the given term.
   *
   * @param term term selecting the entries to delete
   * @throws IOException if the index cannot be written
   */
  void delete(Term term) throws IOException;
  
  /**
   * Deletes the index entries selected by the given terms.
   *
   * @param terms terms selecting the entries to delete
   * @throws IOException if the index cannot be written
   */
  void delete(Term[] terms) throws IOException;
}

创建索引的时候可以分两种情况：丢弃现有索引重新创建索引，或者在现有索引上追加新索引。追加可能会产生重复的索引记录，之后我会介绍如何防止重复添加（参见下文的 updateOne 方法）。

QueryInterface代码：

import java.io.IOException;

import org.apache.lucene.queryparser.classic.ParseException;

/**
 * Read-side operations on the search index.
 */
public interface QueryInterface {
  /**
   * Searches the index for {@code key} in {@code field}.
   * Note the argument order: the query text comes first, the field name second.
   *
   * @param key   query text to search for
   * @param field name of the field to search in
   * @return the search result; its concrete shape is left to the implementation
   * @throws IOException    if the index cannot be read
   * @throws ParseException if {@code key} cannot be parsed as a query
   */
  Object search(String key, String field) throws IOException, ParseException; // the return value has to be wrapped by the implementer
}






索引实现类：

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.hao.search.service.IndexInterface;

public class IndexService implements IndexInterface {

  private Analyzer analyzer = new IKAnalyzer();
  
  public IndexService(String path) {
    this.path = path;
  }
  
  private String path; // 索引文件目录
  
  public void create(Long id, String title) throws IOException {
    Directory dir = FSDirectory.open(new File(path));
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    
    Document document = new Document();
    document.add(new StringField("id", id.toString(), Field.Store.YES)); // 业务记录id，唯一，例如商品编号
    document.add(new TextField("title", title, Field.Store.YES)); // 待分词搜索字段
    
    indexWriter.addDocument(document); 
    indexWriter.close();

  }
  // 
  public void append(Long id, String title) throws IOException {
    Directory dir = FSDirectory.open(new File(path));
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    
    Document document = new Document();
    document.add(new StringField("id", id.toString(), Field.Store.YES));
    document.add(new TextField("title", title, Field.Store.YES));
    
    indexWriter.addDocument(document);
    indexWriter.close();

  }
  
  public void delete() throws IOException {
    Directory dir = FSDirectory.open(new File(path)); 
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    indexWriter.deleteAll(); // 删除全部索引
    indexWriter.close();
  }

  public void delete(Long id) throws IOException {
    delete(new Term("id", id.toString())); // 这里是按照id进行删除
  }
  
  public void delete(String field, String key) throws IOException, ParseException {
    Directory dir = FSDirectory.open(new File(path)); 
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    
    QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);  
    Query query = parser.parse(key);
    
    indexWriter.deleteDocuments(query);
    indexWriter.close();
    
  }

  public void delete(Query[] querys) throws IOException {
    // TODO Auto-generated method stub
    
  }

  public void delete(Term term) throws IOException {
    Directory dir = FSDirectory.open(new File(path)); 
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    indexWriter.deleteDocuments(term);
    indexWriter.close();
    
  }

  public void delete(Term[] terms) throws IOException {
    // TODO Auto-generated method stub
    
  }

  public void updateOne(Long id, String title) throws IOException {
    Directory dir = FSDirectory.open(new File(path));
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
    
    Document document = new Document();
    document.add(new StringField("id", id.toString(), Field.Store.YES));
    document.add(new TextField("title", title, Field.Store.YES));
    
    indexWriter.updateDocument(new Term("id", id.toString()), document);// 按照id进行更新，若不存在就会新创建，建议使用此方法追加索引
    indexWriter.close();

    
  }

}


StringField 与 TextField 都是 Field 的子类，两者都会被建立索引，但 StringField 不会被分词，适用于需要整体精确匹配的字段，例如商品系统中的图片地址、商品编号等；TextField 会被分词，适用于需要全文检索的字段，例如标题。




查询实现类：
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.hao.search.service.QueryInterface;

public class QueryService implements QueryInterface{

  Analyzer analyzer = new IKAnalyzer();
  
  public QueryService(String path) {
    this.path = path;
  }
  
  private String path = "d:/test/lucene";
  public Object search(String key, String field) throws IOException, ParseException {
    
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("d:/test/lucene"))));  
    QueryParser queryParser = new QueryParser(Version.LUCENE_47, field, analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.AND); // 设置分词结果集之间的关系，这里设置为全包含
    Query query = queryParser.parse(key);
    QueryScorer scorer = new QueryScorer(query);  
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);// 得到得分的片段，就是得到一段包含所查询的关键字的摘要
    SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
    "<b><font color='red'>", "</font></b>");// 对查询的数据格式化；无参构造器的默认是将关键字加粗
    Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);// 根据得分和格式化
    highlighter.setTextFragmenter(fragmenter);// 设置成高亮
    
    ScoreDoc after = null;
    TopDocs topDocs = searcher.searchAfter(after, query, 10);
    ScoreDoc[] pageDocs = topDocs.scoreDocs;
    for (ScoreDoc doc : pageDocs) {
      Document d = searcher.doc(doc.doc); 
      try {
        String str = highlighter.getBestFragment(analyzer, field, d.get(field)) ; // 对命中关键词加高亮，此处可以封装成对象列表返回
        System.out.println(str);
      } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
      }
      after = pageDocs[topDocs.scoreDocs.length - 1]; 
      System.out.println("id: "+ d.get(field) +"内容:"+d.get("title"));
    }
    
    return null;
  }

}
测试类
import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;

import com.hao.search.service.IndexInterface;
import com.hao.search.service.QueryInterface;
import com.hao.search.serviceImpl.IndexService;
import com.hao.search.serviceImpl.QueryService;

/**
 * Hello world!
 *
 */
/**
 * Small command-line driver for the index and query services: builds a sample
 * index, runs a highlighted search and deletes entries. Toggle the calls in
 * {@link #main(String[])} to pick the step to run.
 */
public class App {
    static String path = "d:/test/lucene";

    /** Sample titles; the title at position {@code i} is indexed with id {@code i + 1}. */
    private static final String[] SAMPLE_TITLES = {
        "字符串和三元运算符字符串相加优先级的问题",
        "eclipse下maven更新的意外bug总结",
        "如何使用搜索技巧来成为一名高效的程序员",
        "入门级Demo，创建索引和查询高亮显示",
        "批量爬QQ用户信息，利用QQ查询功能进行获取QQ用户信息",
        "Python爬虫开发（三）：数据存储以及多线程",
        "Python爬虫开发（一）：零基础入门",
        "Python爬虫开发（二）：整站爬虫与Web挖掘",
        "Python爬虫开发（三-续）：快速线程池爬虫",
        "Python爬虫开发（五）：反爬虫措施以及爬虫编写注意事项",
        "Python爬虫开发（四）：动态加载页面的解决方案与爬虫代理",
        "挖洞经验 | 价值1万美金的谷歌内部主机信息泄露漏洞",
        "如何确认Google用户的具体电子邮件地址（已提交Google漏洞奖励计划）",
        "欧洲国家电网的噩梦：攻击太阳能板就能导致大规模停电？",
        "Java反序列化危机已过，这次来的是.Net反序列化漏洞",
    };

    public static void main( String[] args ) {
//      delete();
//      create(); // build the index first, then run the query
        query("入门", "title");
//      delete(3L);
    }

    /**
     * Deletes the index entry whose {@code id} field equals the given id and
     * reports the outcome on standard output.
     *
     * @param id business id of the record to delete
     */
    public static void delete(Long id) {
        IndexInterface index = new IndexService(path);
        try {
            index.delete(new Term("id", id.toString()));
            System.out.println("delete success");
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.println("delete error");
        }
    }

    /**
     * Deletes every entry from the index and reports the outcome on standard
     * output.
     */
    public static void delete() {
        IndexInterface index = new IndexService(path);
        try {
            index.delete();
            System.out.println("delete success");
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.println("delete error");
        }
    }

    /**
     * Runs a search and hands back whatever the query service returns.
     *
     * @param id    the query text to search for (despite its name, not a record id)
     * @param field name of the field to search in
     * @return the result of {@link QueryInterface#search(String, String)}, or
     *         {@code null} if the search failed
     */
    public static Object query(String id, String field) {
        QueryInterface query = new QueryService(path);
        try {
            return query.search(id, field);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Indexes the sample titles under the ids 1..n. Uses {@code updateOne} so
     * that running it repeatedly does not duplicate entries. Stops at the first
     * title that fails to index.
     */
    public static void create() {
        IndexInterface index = new IndexService(path);
        try {
            for (int i = 0; i < SAMPLE_TITLES.length; i++) {
                index.updateOne(Long.valueOf(i + 1), SAMPLE_TITLES[i]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
输出结果：
<b><font color='red'>入门</font></b>级Demo，创建索引和查询高亮显示
id: 入门级Demo，创建索引和查询高亮显示内容:入门级Demo，创建索引和查询高亮显示
Python爬虫开发（一）：零基础<b><font color='red'>入门</font></b>
id: Python爬虫开发（一）：零基础入门内容:Python爬虫开发（一）：零基础入门