边学边记(七) lucene索引结构详解四(_N.fdx,_N.fdt)

(_N.fdx,_N.fdt)

FieldData信息存储在_N.fdt文件中

Field的index信息存储在_N.fdx文件中

根据fdx文件中的存储的field 数据的指针在fdt文件中查找对应的field的数据

FieldIndex (.fdx) --> <FieldValuesPosition> SegSize

FieldValuesPosition --> UInt64

fdx文件存储了N个指针信息 N代表此段中所存储的document的数量,指针的值是一个long类型的数据

根据测试数据中创建的两个doc 此文件应该保存了 两个long值

 

在lucene2.9以后 fdx文件保存了format version信息,lucene写入fdx fdt文件的逻辑可以参看

org.apache.lucene.index.FieldsWriter 类和org.apache.lucene.index.FieldInfos类

fdt和fdx文件的header部分都写入了一个int类型的version信息

 

/**
 * Opens the stored-fields data (.fdt) and index (.fdx) outputs for the
 * given segment and writes the format version header (an int) into each.
 *
 * If creating/initializing either file fails, any partially created
 * files are deleted so the directory is left clean, and the original
 * exception is re-thrown (cleanup failures are suppressed).
 *
 * @param d       directory to create the output files in
 * @param segment segment name used as the file-name prefix
 * @param fn      field infos for this segment
 * @throws IOException if either output cannot be created or written
 */
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
        fieldInfos = fn;
        boolean success = false;
        final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION;
        try {
          fieldsStream = d.createOutput(fieldsName);
          fieldsStream.writeInt(FORMAT_CURRENT);
          success = true;
        } finally {
          if (!success) {
            try {
              close();
            } catch (Throwable t) {
              // Suppress so we keep throwing the original exception
            }
            try {
              d.deleteFile(fieldsName);
            } catch (Throwable t) {
              // Suppress so we keep throwing the original exception
            }
          }
        }
        success = false;
        final String indexName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
        try {
          indexStream = d.createOutput(indexName);
          indexStream.writeInt(FORMAT_CURRENT);
          success = true;
        } finally {
          if (!success) {
            try {
              close();
            } catch (Throwable t) {
              // Suppress so we keep throwing the original exception
              // (was an empty catch of IOException only — narrower and
              // inconsistent with the other cleanup paths above)
            }
            try {
              d.deleteFile(fieldsName);
            } catch (Throwable t) {
              // Suppress so we keep throwing the original exception
            }
            try {
              d.deleteFile(indexName);
            } catch (Throwable t) {
              // Suppress so we keep throwing the original exception
            }
          }
        }
        doClose = true;
    }

那么fdx文件的大小应该是4+N*8 N代表此段中保存的document的个数

 

读取fdx文件的内容如下:

 

/****************
 *
 *Create Class:ReadFieldIndex.java
 *Author:a276202460
 *Create at:2010-6-6
 */
package com.rich.lucene.io;
/**
 * Dumps the contents of a Lucene stored-fields index file (_N.fdx):
 * a 4-byte format version header followed by one long (8 bytes) per
 * document, each long being that document's start offset in the
 * corresponding .fdt file.
 */
public class ReadFieldIndex {
	/**
	 * @param args optional: args[0] = path to the .fdx file
	 *             (defaults to the sample path used in the article)
	 * @throws Exception on any I/O failure
	 */
	public static void main(String[] args) throws Exception {
		 String indexfile = args.length > 0 ? args[0] : "D:/lucenetest/indexs/txtindex/index4/_0.fdx";
		 IndexFileInput input = null;
		 try{
			 input = new IndexFileInput(indexfile);
			 long length = input.length();
			 System.out.println("Index File Format Version:"+input.readInt());
			 // Everything after the 4-byte header is N 8-byte pointers.
			 long docnum = (length - 4) / 8;
			 for(int i = 0 ;i < docnum;i++){
				 System.out.println("The Document["+i+"]'s position in fdt file is:"+input.readLong());
			 }
		 }finally{
			 // Guard against NPE: the constructor may have thrown
			 // before input was assigned.
			 if (input != null) {
				 input.close();
			 }
		 }
	}
}

读取结果:

Index File Format Version:2
The Document[0]'s position in fdt file is:4
The Document[1]'s position in fdt file is:43

查看fdt文件内容前 读下lucene的写入逻辑 比看文档解释应该清楚很多

 

/**
 * Appends one document's stored fields to the fields data stream (.fdt)
 * and records the document's starting offset in the index stream (.fdx).
 */
final void addDocument(Document doc) throws IOException {
        // Record where this document's field data begins in the .fdt file.
        indexStream.writeLong(fieldsStream.getFilePointer());

        final List<Fieldable> docFields = doc.getFields();

        // First pass: the stored-field count must precede the field data.
        int numStored = 0;
        for (int i = 0; i < docFields.size(); i++) {
            if (docFields.get(i).isStored()) {
                numStored++;
            }
        }
        fieldsStream.writeVInt(numStored);

        // Second pass: emit each stored field in document order.
        for (int i = 0; i < docFields.size(); i++) {
            final Fieldable f = docFields.get(i);
            if (f.isStored()) {
                writeField(fieldInfos.fieldInfo(f.name()), f);
            }
        }
    }

/**
 * Writes a single stored field: its field number (VInt), a flags byte
 * (tokenized/binary bits), then the value — a VInt length plus raw bytes
 * for binary fields, or a string otherwise.
 */
final void writeField(FieldInfo fi, Fieldable field) throws IOException {
      fieldsStream.writeVInt(fi.number);

      // Assemble the per-field flags byte.
      byte flags = 0;
      if (field.isTokenized()) {
        flags |= FieldsWriter.FIELD_IS_TOKENIZED;
      }
      if (field.isBinary()) {
        flags |= FieldsWriter.FIELD_IS_BINARY;
      }
      fieldsStream.writeByte(flags);

      if (field.isBinary()) {
        // Binary value: VInt length followed by the raw bytes.
        final byte[] value = field.getBinaryValue();
        final int length = field.getBinaryLength();
        final int start = field.getBinaryOffset();
        fieldsStream.writeVInt(length);
        fieldsStream.writeBytes(value, start, length);
      } else {
        fieldsStream.writeString(field.stringValue());
      }
    }

 

读取的代码:

 

/**
 * Loads the stored fields of document n: looks up the document's start
 * offset via the index stream (.fdx), seeks the fields stream (.fdt)
 * there, and materializes each field according to the FieldSelector
 * (a null selector loads every field).
 */
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    // Entry n of the .fdx stream holds this document's offset in .fdt.
    seekIndex(n);
    long position = indexStream.readLong();
    fieldsStream.seek(position);
    Document doc = new Document();
    // A document record starts with a VInt count of its stored fields.
    int numFields = fieldsStream.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      // null selector behaves as LOAD for every field.
      FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
      
      // Flags byte: compressed / tokenized / binary bits.
      byte bits = fieldsStream.readByte();
      assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
      // The compressed bit is only legal in pre-3.0 format indexes.
      assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
        : "compressed fields are only allowed in indexes of version <= 2.9";
      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
      //TODO: Find an alternative approach here if this list continues to grow beyond the
      //list of 5 or 6 currently here.  See Lucene 762 for discussion
      // Dispatch on the selector's decision for this field.
      if (acceptField.equals(FieldSelectorResult.LOAD)) {
        addField(doc, fi, binary, compressed, tokenize);
      }
      else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
        // Load this field, then stop reading the rest of the document.
        addField(doc, fi, binary, compressed, tokenize);
        break;//Get out of this loop
      }
      else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
        // Record the field's location only; value is read on demand.
        addFieldLazy(doc, fi, binary, compressed, tokenize);
      }
      else if (acceptField.equals(FieldSelectorResult.SIZE)){
        // Record only the field's size, then skip over its data.
        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
      }
      else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
        addFieldSize(doc, fi, binary, compressed);
        break;
      }
      else {
        // Field not wanted: advance the stream past its data.
        skipField(binary, compressed);
      }
    }
    return doc;
  }

fdt文件的格式:

FieldData (.fdt) --> <DocFieldData> SegSize

DocFieldData --> FieldCount, <FieldNum, Bits, Value> FieldCount

FieldCount --> VInt

FieldNum --> VInt

Bits --> Byte

 

 

 

读取第一个doc的信息:

 

/****************
 *
 *Create Class:ReadDocument.java
 *Author:a276202460
 *Create at:2010-6-6
 */
package com.rich.lucene.io;
import org.apache.lucene.document.Field;
public class ReadDocument {
	/**
	 * @param args
	 * @throws Exception 
	 */
	public static void main(String[] args) throws Exception {
		 long position = 4;
		 String datafile = "D:/lucenetest/indexs/txtindex/index4/_0.fdt";
		 IndexFileInput input = null;
		 try{
			 input = new IndexFileInput(datafile);
			 input.seek(position);
			 int fieldcount = input.readVInt();
			 System.out.println("doc's stored field count is :"+fieldcount);
			 for(int i = 0 ;i < fieldcount;i++){
				 /*
				  * fieldnum 对应的是fnm中保存的field的位置序号
				  * doc.add(new Field("title", title, Field.Store.YES,
						 Field.Index.ANALYZED));
		            doc.add(new Field("url", url, Field.Store.YES,
						 Field.Index.NOT_ANALYZED));
		            doc.add(new Field("content", content, Field.Store.NO,
				         Field.Index.ANALYZED));
				  */
				 System.out.println("fieldnumber:"+input.readVInt());
				 System.out.println("field's policy:"+Integer.toBinaryString(input.readByte()));
				 System.out.println("field value:"+input.readString());
			 }
 
		 }finally{
			 input.close();
		 }
	}
} 

 

运行结果:

doc's stored field count is :2
fieldnumber:0  fnm文件中保存的field信息中的第0个field
field's policy:1 分词存储的title
field value:百度搜索
fieldnumber:1 fnm文件中保存的field信息中的第1个field
field's policy:0 只是存储了url信息并未进行分词
field value:http://www.baidu.com

 

lucene在指定了doc的索引以后是很容易取到整个doc的信息的 所以在查询的时候lucene的result集合返回的都是结果集中的doc的编号

通过reader读取指定编号的文档

 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值