Lucene3.5 之索引删除和更新

最新推荐文章于 2023-05-22 21:35:28 发布

doymm2008

最新推荐文章于 2023-05-22 21:35:28 发布

阅读量3.4k

点赞数 1

分类专栏： Java技术文章标签： lucene null 文档 merge 存储 email

Java技术专栏收录该内容

68 篇文章 5 订阅

订阅专栏

package com.ethan.index;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Small demo of the Lucene 3.5 index life cycle: build an index from a fixed
 * in-memory data set, print document counts, delete / undelete documents,
 * purge deletions, merge segments and update a document.
 *
 * <p>Every public operation opens its own {@link IndexWriter} or
 * {@link IndexReader} on the shared {@link Directory} and closes it before
 * returning. I/O failures inside an operation are reported with
 * {@code printStackTrace()} and otherwise ignored, so the methods never throw
 * checked exceptions.
 */
public class IndexUtil {
    /** Index location used by the no-argument constructor. */
    private static final String DEFAULT_INDEX_PATH =
            "C:\\Users\\ETHAN\\workspace\\hellolucene\\index02";

    // Sample data: element i of each array belongs to the same logical record.
    private final String[] ids = {"1", "2", "3", "4", "5", "6"};
    private final String[] emails = {"11@qq.com", "22@qq.com", "33@126.com", "43@yahoo.cn", "54@gmail.com", "65@qq.com"};
    private final String[] contents = {
        "welcome to nba hot",
        "my name is ethan",
        "someone like you ",
        "rolling in the deep, you like",
        "i like fast........",
        "l like sports"
    };
    // Not indexed by any method of this class; kept as part of the sample data set.
    private final int[] attachs = {2, 3, 1, 5, 4, 6};
    private final String[] names = {"ethan", "sara", "michael", "wade", "lin", "paul"};

    /** Directory holding the index; shared by all operations of this instance. */
    private final Directory directory;

    /**
     * Opens the index at the default, hard-coded location.
     *
     * @throws IllegalStateException if the index directory cannot be opened
     */
    public IndexUtil() {
        this(new File(DEFAULT_INDEX_PATH));
    }

    /**
     * Opens the index stored in the given file-system directory.
     *
     * @param indexDir directory that holds (or will hold) the index files
     * @throws IllegalStateException if the index directory cannot be opened
     */
    public IndexUtil(File indexDir) {
        try {
            directory = FSDirectory.open(indexDir);
        } catch (IOException e) {
            // Fail fast: with no directory every later operation would only
            // fail with an unrelated NullPointerException.
            throw new IllegalStateException("Cannot open index directory: " + indexDir, e);
        }
    }

    /**
     * Adds one document per sample record to the index. Calling this method
     * several times adds the same records again; nothing is replaced.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            for (int i = 0; i < ids.length; i++) {
                writer.addDocument(buildDocument(ids[i], i));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Prints the live, maximum and deleted document counts of the index to standard output. */
    public void query() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(directory);
            // Number of live (not deleted) documents.
            System.out.println("numDocs: " + reader.numDocs());
            // One greater than the largest document number; deleted documents are still counted.
            System.out.println("maxDocs: " + reader.maxDoc());
            // Number of documents currently marked as deleted.
            System.out.println("deleteDocs: " + reader.numDeletedDocs());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The reader holds open index files and must always be released.
            closeReader(reader);
        }
    }

    /**
     * Marks every document whose {@code id} field is exactly {@code "1"} as deleted.
     *
     * <p>{@code deleteDocuments} accepts either a Query or a Term; a Term is an
     * exact-match value. The documents are only flagged (recorded in a
     * {@code .del} file, a kind of "recycle bin") and can still be restored
     * with {@link #undelete()} until the deletions are merged away.
     */
    public void delete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.deleteDocuments(new Term("id", "1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Restores every document that is marked as deleted but not yet merged away. */
    public void undelete() {
        IndexReader reader = null;
        try {
            // Un-deleting modifies the index, so the reader is opened with readOnly=false.
            reader = IndexReader.open(directory, false);
            reader.undeleteAll();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReader(reader);
        }
    }

    /**
     * Physically removes documents that are marked as deleted ("empties the
     * recycle bin"); afterwards the {@code .del} files are gone.
     */
    public void forceDelete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Manually merges the index down to at most two segments.
     *
     * <p>Each run of {@link #index()} adds more segment files (after five runs
     * there are five documents with id=1); merging combines them into fewer
     * segments. Deletions inside segments rewritten by the merge are dropped;
     * a segment that the merge leaves untouched may keep its pending deletions.
     * Forced merges are discouraged since Lucene 3.5 because they are
     * expensive; Lucene merges segments automatically when appropriate.
     */
    public void merge() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.forceMerge(2);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Replaces the documents whose {@code id} is {@code "1"} with a single new
     * document built from the first sample record and carrying id {@code "11"}.
     *
     * <p>Lucene has no in-place update: {@code updateDocument} deletes the
     * documents matching the term and then adds the new document.
     */
    public void update() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.updateDocument(new Term("id", "1"), buildDocument("11", 0));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /** Opens a writer on the shared directory using a Lucene 3.5 StandardAnalyzer. */
    private IndexWriter openWriter() throws IOException {
        return new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    }

    /** Builds the four-field document for sample record {@code i}, stored under the given id. */
    private Document buildDocument(String id, int i) {
        Document doc = new Document();
        doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
        return doc;
    }

    /** Closes the writer if it was opened; a failure is printed, not propagated. */
    private static void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the reader if it was opened; a failure is printed, not propagated. */
    private static void closeReader(IndexReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

[java] view plain copy

package com.ethan.test;
import org.junit.Test;
import com.ethan.index.IndexUtil;
/**
 * Manual JUnit drivers for {@link IndexUtil}. Each test runs exactly one index
 * operation on a fresh {@code IndexUtil}; nothing is asserted. The figures in
 * the comments are the output the author recorded from {@code query()} at
 * that point of the walkthrough.
 */
public class IndexTest {

    /** Runs the indexing step. */
    @Test
    public void testIndex() {
        new IndexUtil().index();
    }

    /**
     * Recorded output:
     * numDocs: 24
     * maxDocs: 24
     * deleteDocs: 0
     */
    @Test
    public void testQuery() {
        new IndexUtil().query();
    }

    /**
     * Recorded output:
     * numDocs: 20
     * maxDocs: 24
     * deleteDocs: 4 (the four documents with id=1)
     */
    @Test
    public void testDelete() {
        new IndexUtil().delete();
    }

    /**
     * Recorded output:
     * numDocs: 7
     * maxDocs: 7
     * deleteDocs: 0
     */
    @Test
    public void testUnDelete() {
        new IndexUtil().undelete();
    }

    /**
     * Recorded output:
     * numDocs: 6
     * maxDocs: 6(7)
     * deleteDocs: 0(1)
     */
    @Test
    public void testForceDelete() {
        new IndexUtil().forceDelete();
    }

    /**
     * Recorded output after the merge:
     * numDocs: 20
     * maxDocs: 21
     * deleteDocs: 1 (the index was force-merged into 2 segments, so _0_1.del
     * was not removed: _0 is the first segment and stays untouched, while the
     * later segments are merged into one)
     */
    @Test
    public void testMerge() {
        new IndexUtil().merge();
    }

    /**
     * Recorded output:
     * numDocs: 6
     * maxDocs: 7
     * deleteDocs: 1
     * (an update is a delete followed by an add)
     */
    @Test
    public void testUpdate() {
        new IndexUtil().update();
    }
}

索引文件中文件表示含义：

0.fnm: 保存的field的信息，有哪几个字段

0.fdt,0.fdx: Store.YES的对应字段的值

0.frq:单词出现的频率

0.nrm: 存储评分信息，权重

0.prx: 词项在文档中出现的位置信息（position）

0.tii,0.tis: 存储词典信息（.tis 为索引中所有词项的词典，.tii 为该词典的索引）

文档和域的概念：

文档相当于表中的一条记录，域相当于表中每一个字段

optimize() 已被弃用（3.5 起由 forceMerge() 取代），开销比较大
forceMergeDeletes() 强制把回收站的内容给删掉

当segment比较多时，lucene会自动优化处理

doymm2008

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Lucene3.5 之索引删除和更新

package com.ethan.index; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apach
复制链接

扫一扫