lucene4.3全文搜索引擎—索引的增删查改

最新推荐文章于 2022-06-26 12:08:10 发布

ws_zhh

最新推荐文章于 2022-06-26 12:08:10 发布

阅读量266

点赞数

分类专栏： lucene4.3学习笔记文章标签： lucene4.3 lucene全文搜索引擎 lucene4.3索引的增删查改

lucene4.3学习笔记专栏收录该内容

4 篇文章 0 订阅

订阅专栏

之前介绍了如何用lucene简单地建立索引，并根据索引实现简单的搜索功能。下面来探讨一下对lucene索引的管理，也就是对索引的增删查改，其它不多说，直接贴代码：

（ps：因为在学习lucene的时候，全部笔记都作为注释写到代码了，所以贴出代码后也不多说了，望谅解）

package lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Demo helper showing basic index management with Lucene 4.3: building an
 * index from a handful of in-memory sample records, reporting document counts,
 * deleting, purging deletes, merging segments and updating a document.
 *
 * <p>Every write operation opens its own {@link IndexWriter} on
 * {@link #getDirectory() the directory}. The writer is closed (which commits)
 * when the operation succeeds and rolled back when it fails, so the writer is
 * never left open after an exception.
 */
public class IndexUtil {

	// Parallel sample data: element i of ids/emails/contents/names describes sample document i.
	private String[] ids={"1","2","3","4","5","6"};
	
	private String[] emails={"sam@163.com","holiday@163.com","issac@163.com","summer@163.com","coco@163.com","roy@163.com"};
	
	private String[] contents={"hello,how are you","hi,I am fine!","what is your name","my name is summer","what is your number","I will tell you,just wait a minute"};
	
	private String [] names={"sam","holiday","issac","summer","coco","roy"};
	
	// Not read by any method of this class.
	private int[] attachs={2,3,2,4,5,7};
	
	private Directory directory=null;
	
	/**
	 * @return the directory all operations of this class work on
	 */
	public Directory getDirectory() {
		return directory;
	}

	/**
	 * @param directory the directory subsequent operations should work on
	 */
	public void setDirectory(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Opens the file-system directory {@code E:/lucene/index02} as the index location.
	 *
	 * @throws IOException if the directory cannot be opened
	 */
	public IndexUtil() throws IOException
	{
		directory=FSDirectory.open(new File("E:/lucene/index02"));
	}
	
	/**
	 * Opens a writer on the current directory using a {@link StandardAnalyzer}.
	 */
	private IndexWriter openWriter() throws IOException
	{
		return new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(Version.LUCENE_43)));
	}
	
	/**
	 * Builds one document with the four fields used throughout this class.
	 * id, email and name are stored and indexed without analysis; content is
	 * analyzed but not stored.
	 */
	private static Document buildDocument(String id,String email,String name,String content)
	{
		Document document=new Document();
		
		document.add(new Field("id",id,Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
		document.add(new Field("email",email,Field.Store.YES,Field.Index.NOT_ANALYZED));
		document.add(new Field("name",name,Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
		document.add(new Field("content",content,Field.Store.NO,Field.Index.ANALYZED));
		
		return document;
	}
	
	/**
	 * Builds the index: adds one document per entry of the sample arrays.
	 *
	 * <p>NOTE(review): the writer is opened with the default open mode, so
	 * calling this method repeatedly presumably appends another copy of the
	 * sample documents rather than replacing them -- confirm.
	 *
	 * @throws IOException if writing to the index fails; in that case the
	 *         writer is rolled back
	 */
	public void index() throws IOException 
	{
		IndexWriter indexWriter=openWriter();
		boolean closed=false;
		try
		{
			for(int i=0;i<ids.length;i++)
			{
				indexWriter.addDocument(buildDocument(ids[i],emails[i],names[i],contents[i]));
			}
			
			indexWriter.close();
			closed=true;
		}
		finally
		{
			// Without this a failure above would leave the writer open.
			if(!closed)
			{
				indexWriter.rollback();
			}
		}
	}
	
	/**
	 * Prints index statistics to standard output: the number of live documents
	 * ({@code numDocs}), the total including deleted ones ({@code maxDoc}) and
	 * the number of deleted documents ({@code numDeletedDocs}).
	 *
	 * @throws IOException if the index cannot be opened or read
	 */
	public void query() throws IOException
	{
		// NOTE(review): IndexReader.open(Directory) is believed to be deprecated in
		// Lucene 4.x in favour of DirectoryReader.open(Directory) -- confirm before migrating.
		IndexReader indexReader=IndexReader.open(directory);
		try
		{
			System.out.println("实际文档数numdocs："+indexReader.numDocs());
			System.out.println("全部文档数maxdoc（包括回收站的文档）："+indexReader.maxDoc());
			System.out.println("被删除的文档（被放到回收站里）："+indexReader.numDeletedDocs());
		}
		finally
		{
			indexReader.close();
		}
	}
	
	/**
	 * Deletes the documents whose {@code id} field is exactly {@code "1"}.
	 *
	 * <p>{@code deleteDocuments} accepts either a Query (every matching
	 * document is deleted) or a Term (an exact field/value match, as used
	 * here). Per the author's notes the documents are not physically removed
	 * at this point; they are only moved to a "recycle bin" and still count
	 * towards {@code maxDoc} until they are purged.
	 *
	 * @throws IOException if writing to the index fails; in that case the
	 *         writer is rolled back
	 */
	public void delete() throws IOException
	{
		IndexWriter indexWriter=openWriter();
		boolean closed=false;
		try
		{
			indexWriter.deleteDocuments(new Term("id","1"));
			
			// The change only takes effect once the writer commits or closes;
			// before that, rollback() would return to the previous state.
			indexWriter.close();
			closed=true;
		}
		finally
		{
			if(!closed)
			{
				indexWriter.rollback();
			}
		}
	}
	
	// There is intentionally no recoverDelete(): per the author's note, unDeleteAll()
	// is no longer available after Lucene 3.6, so deletes cannot be undone here.
	
	/**
	 * Forces the purge of deleted documents, i.e. empties the "recycle bin".
	 *
	 * @throws IOException if writing to the index fails; in that case the
	 *         writer is rolled back
	 */
	public void forceDelete() throws IOException
	{
		IndexWriter indexWriter=openWriter();
		boolean closed=false;
		try
		{
			indexWriter.forceMergeDeletes();
			
			indexWriter.close();
			closed=true;
		}
		finally
		{
			if(!closed)
			{
				indexWriter.rollback();
			}
		}
	}
	
	/**
	 * Forces the index segments to be merged, passing {@code 3} as the
	 * requested segment count.
	 *
	 * <p>Per the author's notes: merging also purges deleted documents from
	 * the merged segments, so the live document count stays the same while the
	 * total count shrinks; and since Lucene 3.5 forcing a merge is discouraged
	 * because it is expensive and Lucene merges segments on its own.
	 *
	 * @throws IOException if writing to the index fails; in that case the
	 *         writer is rolled back
	 */
	public void forceMerge() throws IOException
	{
		IndexWriter indexWriter=openWriter();
		boolean closed=false;
		try
		{
			indexWriter.forceMerge(3);
			
			indexWriter.close();
			closed=true;
		}
		finally
		{
			if(!closed)
			{
				indexWriter.rollback();
			}
		}
	}
	
	/**
	 * Updates the index: replaces the documents whose {@code id} is
	 * {@code "1"} with a new document built from the first sample record.
	 *
	 * <p>Note that the replacement document is written with id {@code "10"},
	 * not {@code "1"}.
	 *
	 * @throws IOException if writing to the index fails; in that case the
	 *         writer is rolled back
	 */
	public void update() throws IOException
	{
		IndexWriter indexWriter=openWriter();
		boolean closed=false;
		try
		{
			Document document=buildDocument("10",emails[0],names[0],contents[0]);
			
			// An update is really "delete, then add": documents matching the term
			// are deleted and the new document is added.
			indexWriter.updateDocument(new Term("id","1"), document);
			
			indexWriter.close();
			closed=true;
		}
		finally
		{
			if(!closed)
			{
				indexWriter.rollback();
			}
		}
	}
	
}

下面写个测试类：

package test;

import java.io.IOException;

import lucene.IndexUtil;


import org.junit.Test;

/**
 * Manual drivers for {@link IndexUtil}: each test creates a fresh instance and
 * runs exactly one index operation. Nothing is asserted; a test only fails if
 * the operation throws.
 */
public class TestIndex {

	/** Builds the sample index. */
	@Test
	public void testIndex() throws IOException
	{
		new IndexUtil().index();
	}

	/** Prints the document counts of the index. */
	@Test
	public void testQuery() throws IOException
	{
		new IndexUtil().query();
	}

	/** Runs the delete operation. */
	@Test
	public void testDelete() throws IOException
	{
		new IndexUtil().delete();
	}

	// testRecoverDelete() is disabled: the original author marked the
	// recoverDelete() method it called as no longer usable.

	/** Purges deleted documents. */
	@Test
	public void testForceDelete() throws IOException
	{
		new IndexUtil().forceDelete();
	}

	/** Forces a segment merge. */
	@Test
	public void testForceMerge() throws IOException
	{
		new IndexUtil().forceMerge();
	}

	/** Runs the update operation. */
	@Test
	public void testUpdate() throws IOException
	{
		new IndexUtil().update();
	}

}

因为方法有点多，所以测试结果就不贴出来了。OK，到这里，索引的增删查改也完了，整体来说还是比较简单容易理解的。

ps：下面增加一个小知识，希望对大家有用

Field.Store.YES或者NO(存储域选项)
设置为YES表示会把这个域中的内容完全存储到文件中，方便进行文本的还原
设置为NO表示把这个域的内容不存储到文件中，但是可以被索引，此时内容无法完全还原(doc.get)

Field.Index(索引选项)
Index.ANALYZED:进行分词和索引，适用于标题、内容等
Index.NOT_ANALYZED:进行索引，但是不进行分词，如身份证号，姓名，ID等，适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息，这个norms中包括了索引时的加权(boost)和域长度归一化等用于评分的信息
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引 (一个域除了能够被索引，还能够被存储，仅仅被存储的域是搜索不到的，但是能通过文档号查到，多用于不想被搜索到，但是在通过其它域能够搜索到的情况下，能够随着文档号返回给用户的域)

ws_zhh

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
lucene4.3全文搜索引擎—索引的增删查改

之前说到，lucene简单的建立索引，并根据索引进行简单的搜索功能。下面来探讨一下对lucene索引的管理，也就是对索引的增删查改，其它不多说，直接贴代码：（ps：因为在学习lucene的时候，全部笔记都作为注释写到代码了，所以贴出代码后也不多说了，望谅解）package lucene;import java.io.File;import java.io.IOExcepti...
复制链接

扫一扫

专栏目录