(_N.fdx, _N.fdt)
The field data itself is stored in the _N.fdt file, and the field index is stored in the _N.fdx file. The pointers stored in the fdx file are used to locate each document's field data within the fdt file.
FieldIndex (.fdx) --> <FieldValuesPosition> SegSize
FieldValuesPosition --> Uint64
The fdx file stores N pointers, where N is the number of documents stored in this segment; each pointer's value is a long. Since the test data created two docs, this file should hold two long values.
Since Lucene 2.9, the fdx file also records a format version. The logic Lucene uses to write the fdx and fdt files can be found in the org.apache.lucene.index.FieldsWriter and org.apache.lucene.index.FieldInfos classes. The headers of both the fdt and fdx files begin with an int version value:
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
    fieldInfos = fn;

    boolean success = false;
    final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION;
    try {
        fieldsStream = d.createOutput(fieldsName);
        fieldsStream.writeInt(FORMAT_CURRENT);
        success = true;
    } finally {
        if (!success) {
            try {
                close();
            } catch (Throwable t) {
                // Suppress so we keep throwing the original exception
            }
            try {
                d.deleteFile(fieldsName);
            } catch (Throwable t) {
                // Suppress so we keep throwing the original exception
            }
        }
    }

    success = false;
    final String indexName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
    try {
        indexStream = d.createOutput(indexName);
        indexStream.writeInt(FORMAT_CURRENT);
        success = true;
    } finally {
        if (!success) {
            try {
                close();
            } catch (IOException ioe) {
                // Suppress so we keep throwing the original exception
            }
            try {
                d.deleteFile(fieldsName);
            } catch (Throwable t) {
                // Suppress so we keep throwing the original exception
            }
            try {
                d.deleteFile(indexName);
            } catch (Throwable t) {
                // Suppress so we keep throwing the original exception
            }
        }
    }

    doClose = true;
}
So the size of the fdx file should be 4 + N * 8 bytes, where N is the number of documents stored in the segment. The code below reads the fdx file's contents:
/****************
 *
 *Create Class:ReadFieldIndex.java
 *Author:a276202460
 *Create at:2010-6-6
 */
package com.rich.lucene.io;

public class ReadFieldIndex {

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        String indexfile = "D:/lucenetest/indexs/txtindex/index4/_0.fdx";
        IndexFileInput input = null;
        try {
            input = new IndexFileInput(indexfile);
            long length = input.length();
            System.out.println("Index File Format Version:" + input.readInt());
            // a 4-byte version header, then one 8-byte pointer per document
            long docnum = (length - 4) / 8;
            for (int i = 0; i < docnum; i++) {
                System.out.println("The Document[" + i + "]'s position in fdt file is:" + input.readLong());
            }
        } finally {
            if (input != null) { // guard against a failed constructor
                input.close();
            }
        }
    }
}
The output:
Index File Format Version:2
The Document[0]'s position in fdt file is:4
The Document[1]'s position in fdt file is:43
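Document[0] starts at offset 4 because the fdt file, like the fdx file, begins with a 4-byte version int. The 39-byte gap to Document[1] can be reproduced from the first doc's two stored fields (shown in the ReadDocument output further below). A minimal sketch of the arithmetic, assuming Lucene's standard string encoding of a VInt byte length followed by the UTF-8 bytes (FdtSizeCheck is a hypothetical class name):

public class FdtSizeCheck {
    public static void main(String[] args) throws Exception {
        String title = "百度搜索";             // field 0: stored and tokenized
        String url = "http://www.baidu.com";   // field 1: stored, not tokenized
        int size = 1;                          // FieldCount: VInt(2) = 1 byte
        size += 1 + 1 + vIntLen(utf8Len(title)) + utf8Len(title); // FieldNum + Bits + String
        size += 1 + 1 + vIntLen(utf8Len(url)) + utf8Len(url);
        // prints: doc[0] occupies 39 bytes, so doc[1] starts at 4 + 39 = 43
        System.out.println("doc[0] occupies " + size + " bytes, so doc[1] starts at " + (4 + size));
    }

    static int utf8Len(String s) throws Exception {
        return s.getBytes("UTF-8").length;
    }

    static int vIntLen(int v) { // a VInt spends 7 payload bits per byte
        int n = 1;
        while ((v >>>= 7) != 0) n++;
        return n;
    }
}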
Before examining the fdt file's contents, it is worth reading Lucene's write logic; it makes things much clearer than the file-format documentation alone:
final void addDocument(Document doc) throws IOException {
    indexStream.writeLong(fieldsStream.getFilePointer());

    int storedCount = 0;
    List<Fieldable> fields = doc.getFields();
    for (Fieldable field : fields) {
        if (field.isStored())
            storedCount++;
    }
    fieldsStream.writeVInt(storedCount);

    for (Fieldable field : fields) {
        if (field.isStored())
            writeField(fieldInfos.fieldInfo(field.name()), field);
    }
}
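The VInt values above are Lucene's variable-length integers: 7 payload bits per byte, with the high bit set on every byte except the last, so small counts like a field count or field number cost a single byte. A minimal standalone sketch of the encoding (this writeVInt is an illustration, not the Lucene method itself):

// Low 7 bits of the value per byte; the high bit marks "more bytes follow".
static void writeVInt(java.io.OutputStream out, int i) throws java.io.IOException {
    while ((i & ~0x7F) != 0) {
        out.write((i & 0x7F) | 0x80);
        i >>>= 7;
    }
    out.write(i);
}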
final void writeField(FieldInfo fi, Fieldable field) throws IOException {
    fieldsStream.writeVInt(fi.number);

    byte bits = 0;
    if (field.isTokenized())
        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
    if (field.isBinary())
        bits |= FieldsWriter.FIELD_IS_BINARY;
    fieldsStream.writeByte(bits);

    if (field.isBinary()) {
        final byte[] data;
        final int len;
        final int offset;
        data = field.getBinaryValue();
        len = field.getBinaryLength();
        offset = field.getBinaryOffset();
        fieldsStream.writeVInt(len);
        fieldsStream.writeBytes(data, offset, len);
    } else {
        fieldsStream.writeString(field.stringValue());
    }
}
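For reference, the flags packed into that bits byte are defined as constants in FieldsWriter (values as in the Lucene 2.9/3.0 source):

static final byte FIELD_IS_TOKENIZED  = 0x1;
static final byte FIELD_IS_BINARY     = 0x2;
static final byte FIELD_IS_COMPRESSED = 0x4; // writing compressed fields was dropped in 3.0

These values explain the "field's policy" output further below: 1 means the field was tokenized, 0 means a plain stored string.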
The read-side code:
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    seekIndex(n);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    for (int i = 0; i < numFields; i++) {
        int fieldNumber = fieldsStream.readVInt();
        FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
        FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);

        byte bits = fieldsStream.readByte();
        assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;

        boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
        assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
            : "compressed fields are only allowed in indexes of version <= 2.9";
        boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
        boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;

        //TODO: Find an alternative approach here if this list continues to grow beyond the
        //list of 5 or 6 currently here. See Lucene 762 for discussion
        if (acceptField.equals(FieldSelectorResult.LOAD)) {
            addField(doc, fi, binary, compressed, tokenize);
        } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)) {
            addField(doc, fi, binary, compressed, tokenize);
            break; // Get out of this loop
        } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
            addFieldLazy(doc, fi, binary, compressed, tokenize);
        } else if (acceptField.equals(FieldSelectorResult.SIZE)) {
            skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
        } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)) {
            addFieldSize(doc, fi, binary, compressed);
            break;
        } else {
            skipField(binary, compressed);
        }
    }
    return doc;
}
The fdt file's format:
FieldData (.fdt) --> <DocFieldData> SegSize
DocFieldData --> FieldCount, <FieldNum, Bits, Value> FieldCount
FieldCount --> VInt
FieldNum --> VInt
Bits --> Byte
Value --> String | BinaryValue (a string for text fields; a VInt length plus raw bytes for binary fields, as writeField above shows)
Reading the first doc's data:
/****************
 *
 *Create Class:ReadDocument.java
 *Author:a276202460
 *Create at:2010-6-6
 */
package com.rich.lucene.io;

public class ReadDocument {

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        long position = 4; // Document[0]'s pointer, as read from the fdx file
        String datafile = "D:/lucenetest/indexs/txtindex/index4/_0.fdt";
        IndexFileInput input = null;
        try {
            input = new IndexFileInput(datafile);
            input.seek(position);
            int fieldcount = input.readVInt();
            System.out.println("doc's stored field count is :" + fieldcount);
            for (int i = 0; i < fieldcount; i++) {
                /*
                 * fieldnum is the field's position number as saved in the fnm file.
                 * The test index was built with:
                 *   doc.add(new Field("title", title, Field.Store.YES,
                 *       Field.Index.ANALYZED));
                 *   doc.add(new Field("url", url, Field.Store.YES,
                 *       Field.Index.NOT_ANALYZED));
                 *   doc.add(new Field("content", content, Field.Store.NO,
                 *       Field.Index.ANALYZED));
                 */
                System.out.println("fieldnumber:" + input.readVInt());
                System.out.println("field's policy:" + Integer.toBinaryString(input.readByte()));
                System.out.println("field value:" + input.readString());
            }
        } finally {
            if (input != null) { // guard against a failed constructor
                input.close();
            }
        }
    }
}
The output:
doc's stored field count is :2
fieldnumber:0            (the 0th field in the field list saved in the fnm file)
field's policy:1         (the title field: stored and tokenized)
field value:百度搜索
fieldnumber:1            (the 1st field in the field list saved in the fnm file)
field's policy:0         (the url field: stored but not tokenized)
field value:http://www.baidu.com
Once a doc's index entry has been established, Lucene can retrieve the whole document's data very cheaply. This is why the result sets Lucene returns at query time contain only the doc numbers of the hits; the full documents are then read through a reader by their specified numbers.
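A minimal sketch of that last step, using the standard IndexReader API of Lucene 2.9/3.x (the index path matches the test index above; ReadDocByNumber is a hypothetical class name):

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class ReadDocByNumber {
    public static void main(String[] args) throws Exception {
        // open the same test index directory used above
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("D:/lucenetest/indexs/txtindex/index4")));
        try {
            // document(n) follows the fdx pointer into the fdt data, exactly as traced above
            Document doc = reader.document(0);
            System.out.println("title: " + doc.get("title"));
            System.out.println("url: " + doc.get("url"));
        } finally {
            reader.close();
        }
    }
}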