- package com.ethan.index;
- import java.io.File;
- import java.io.IOException;
- import org.apache.commons.io.FileUtils;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexReader.FieldOption;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.util.Version;
- public class IndexUtil {
- private String[] ids = {"1","2","3","4","5","6"};
- private String[] emails = {"11@qq.com","22@qq.com","33@126.com","43@yahoo.cn","54@gmail.com","65@qq.com"};
- private String[] contents = {
- "welcome to nba hot",
- "my name is ethan",
- "someone like you ",
- "rolling in the deep, you like",
- "i like fast........",
- "l like sports"
- };
- private int[] attachs = {2,3,1,5,4,6};
- private String[] names = {"ethan","sara","michael","wade","lin","paul"};
- private Directory directory = null;
- public IndexUtil() {
- try {
- directory = FSDirectory.open(new File("C:\\Users\\ETHAN\\workspace\\hellolucene\\index02"));
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void index() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)) );
- Document doc = null;
- for(int i=0;i<ids.length;i++) {
- doc = new Document();
- doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));
- writer.addDocument(doc);
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(writer!=null) {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- public void query() {
- try {
- IndexReader reader = IndexReader.open(directory);
- //被存储的
- System.out.println("numDocs: "+reader.numDocs());
- //文档总量
- System.out.println("maxDocs: "+reader.maxDoc());
- //删除的文档
- System.out.println("deleteDocs: "+reader.numDeletedDocs());;
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void delete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
- //这里删除id=1的文档,还会留在”回收站“。xxx.del
- writer.deleteDocuments(new Term("id","1"));
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(writer!=null) {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- public void undelete() {
- //使用IndexReader进行恢复
- IndexReader reader = null;
- try {
- //set readOnly=false
- reader = IndexReader.open(directory,false);
- reader.undeleteAll();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(reader!=null) {
- try {
- reader.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- public void forceDelete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //强制优化,del文件就没了,回收站清空
- writer.forceMergeDeletes();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(writer!=null) {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /*
- * 自己手动merge
- * 多次创建索引,文件会增多,
- * 比如 5次的话,5个id=1的
- *
- * merge后合并为n段
- */
- public void merge() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //将索引合并为2段,这两段中的del文件会被清空
- //3.5后不建议使用,开销大,lucene会根据情况自动处理
- writer.forceMerge(2);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(writer!=null) {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /*
- * 更新操作
- */
- public void update() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //lucene没有提供更新方法,这里操作分为两步
- //匹配后删除 和 添加新的
- Document doc = new Document();
- doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.ANALYZED_NO_NORMS));
- writer.updateDocument(new Term("id","1"),doc);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if(writer!=null) {
- try {
- writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- }
- package com.ethan.test;
- import org.junit.Test;
- import com.ethan.index.IndexUtil;
- public class IndexTest {
- @Test
- public void testIndex() {
- IndexUtil iu = new IndexUtil();
- iu.index();
- }
- /*
- * numDocs: 24
- maxDocs: 24
- deleteDocs: 0
- */
- @Test
- public void testQuery() {
- IndexUtil iu = new IndexUtil();
- iu.query();
- }
- /*
- * numDocs: 20
- maxDocs: 24
- deleteDocs: 4 (id=1 4条)
- */
- @Test
- public void testDelete() {
- IndexUtil iu = new IndexUtil();
- iu.delete();
- }
- /*
- * numDocs: 7
- maxDocs: 7
- deleteDocs: 0
- */
- @Test
- public void testUnDelete() {
- IndexUtil iu = new IndexUtil();
- iu.undelete();
- }
- /*
- * numDocs: 6
- maxDocs: 6(7)
- deleteDocs: 0(1)
- */
- @Test
- public void testForceDelete() {
- IndexUtil iu = new IndexUtil();
- iu.forceDelete();
- }
- /*
- * merge后:
- * numDocs: 20
- maxDocs: 21
- deleteDocs: 1(因为强制合并为2段,所以_0_1.del没删)
- _0为第一段,不动,把后边的合并为一段
- */
- @Test
- public void testMerge() {
- IndexUtil iu = new IndexUtil();
- iu.merge();
- }
- /*
- * numDocs: 6
- maxDocs: 7
- deleteDocs: 1
- 删除后 add
- */
- @Test
- public void testUpdate() {
- IndexUtil iu = new IndexUtil();
- iu.update();
- }
- }
索引目录中各文件的含义:
_0.fnm:保存 field 的信息,即有哪几个字段
_0.fdt、_0.fdx:保存 Store.YES 的字段的值
_0.frq:单词出现的频率
_0.nrm:存储评分信息(权重)
_0.tii、_0.tis:存储索引信息
文档和域的概念:
optimize() 已被弃用,开销比较大
forceMergeDeletes() 强制把回收站的内容给删掉
当segment比较多时,lucene会自动优化处理