The .fdt and .fdx File Formats in CLucene

xxx.fdt mainly stores the content of each stored Field. The per-document write path looks like this:

void DocumentsWriter::ThreadState::writeDocument() {

  // If we hit an exception while appending to the
  // stored fields or term vectors files, we have to
  // abort all documents since we last flushed because
  // it means those files are possibly inconsistent.
  try {
    _parent->numDocsInStore++;

    // Append stored fields to the real FieldsWriter:
    _parent->fieldsWriter->flushDocument(numStoredFields, fdtLocal);     // number of stored fields in this document
    fdtLocal->reset();

    // ... (remainder of the try block and its matching catch handler omitted)
}

The content written for each field:

 if (field->isStored()) {
        threadState->numStoredFields++;
        bool success = false;
        try {
          threadState->localFieldsWriter->writeField(fieldInfo, field);
          success = true;
        } _CLFINALLY(
          // ... (cleanup body of the macro, run whether or not writeField succeeded, omitted)
        )
      }

The field-writing function:

void FieldsWriter::writeField(FieldInfo* fi, CL_NS(document)::Field* field)
{
    // if the field is an instanceof FieldsReader.FieldForMerge, we're in merge mode
    // and field.binaryValue() already returns the compressed value for a field
    // with isCompressed()==true, so we disable compression in that case
    bool disableCompression = (field->instanceOf(FieldsReader::FieldForMerge::getClassName()));

    fieldsStream->writeVInt(fi->number);
    uint8_t bits = 0;
    if (field->isTokenized())
        bits |= FieldsWriter::FIELD_IS_TOKENIZED;
    if (field->isBinary())
        bits |= FieldsWriter::FIELD_IS_BINARY;
    if (field->isCompressed())
        bits |= FieldsWriter::FIELD_IS_COMPRESSED;

    fieldsStream->writeByte(bits);

    if ( field->isCompressed() ){
        // compression is enabled for the current field
        CL_NS(util)::ValueArray<uint8_t> dataB;
        const CL_NS(util)::ValueArray<uint8_t>* data = &dataB;

        if (disableCompression) {
            // optimized case for merging, the data
            // is already compressed
            data = field->binaryValue();
        } else {
            // check if it is a binary field
            if (field->isBinary()) {
                compress(*field->binaryValue(), dataB);
            }else if ( field->stringValue() == NULL ){ //we must be using readerValue
                CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
                Reader* r = field->readerValue();

                int32_t sz = r->size();
                if ( sz < 0 )
                    sz = 10000000; //todo: we should warn the developer here....

                //read the entire string
                const TCHAR* rv = NULL;
                int64_t rl = r->read(rv, sz, 1);
                if ( rl > LUCENE_INT32_MAX_SHOULDBE )
                    _CLTHROWA(CL_ERR_Runtime,"Field length too long");
                else if ( rl < 0 )
                    rl = 0;

                string str = lucene_wcstoutf8string(rv, rl);
                CL_NS(util)::ValueArray<uint8_t> utfstr;
                utfstr.length = str.length();
                utfstr.values = (uint8_t*)str.c_str();
                compress(utfstr, dataB);
                utfstr.values = NULL;
            }else if ( field->stringValue() != NULL ){
                string str = lucene_wcstoutf8string(field->stringValue(), LUCENE_INT32_MAX_SHOULDBE);
                CL_NS(util)::ValueArray<uint8_t> utfstr;
                utfstr.length = str.length();
                utfstr.values = (uint8_t*)str.c_str();
                compress(utfstr, dataB);
                utfstr.values = NULL;
            }
        }
        fieldsStream->writeVInt(data->length);
        fieldsStream->writeBytes(data->values, data->length);

    }else{

        //FEATURE: this problem in Java Lucene too, if using Reader, data is not stored.
        //todo: this is a logic bug...
        //if the field is stored, and indexed, and is using a reader the field wont get indexed
        //
        //if we could write zero prefixed vints (therefore static length), then we could
        //write a reader directly to the field indexoutput and then go back and write the data
        //length. however this is not supported in lucene yet...
        //if this is ever implemented, then it would make sense to also be able to combine the
        //FieldsWriter and DocumentWriter::invertDocument process, and use a streamfilter to
        //write the field data while the documentwrite analyses the document! how cool would
        //that be! it would cut out all these buffers!!!

        // compression is disabled for the current field
        if (field->isBinary()) {
            const CL_NS(util)::ValueArray<uint8_t>* data = field->binaryValue();
            fieldsStream->writeVInt(data->length);
            fieldsStream->writeBytes(data->values, data->length);

        }else if ( field->stringValue() == NULL ){ //we must be using readerValue
            CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
            Reader* r = field->readerValue();

            int32_t sz = r->size();
            if ( sz < 0 )
                sz = 10000000; //todo: we should warn the developer here....

            //read the entire string
            const TCHAR* rv;
            int64_t rl = r->read(rv, sz, 1);
            if ( rl > LUCENE_INT32_MAX_SHOULDBE )
                _CLTHROWA(CL_ERR_Runtime,"Field length too long");
            else if ( rl < 0 )
                rl = 0;

            fieldsStream->writeString( rv, (int32_t)rl);
        }else if ( field->stringValue() != NULL ){
            fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));
        }else
            _CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
    }
}

Each field entry in xxx.fdt therefore consists of:

number: the field's index within the document, written as a VInt (fi->number); the first field is 0, the second is 1 (1-2 bytes for typical field counts)

bits: the attribute flag byte (1 byte)

value: the field content itself (binary data, compressed data, or a string)
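
To make the layout concrete, here is a small standalone sketch of decoding one such field entry. It is not CLucene's actual FieldsReader code: the flag values (0x1 / 0x2 / 0x4) are assumed to match the FieldsWriter::FIELD_IS_* constants used above, and readVInt is rewritten from scratch for the example.

#include <cstdint>
#include <iostream>
#include <vector>

// Lucene-style variable-length int: 7 data bits per byte, high bit means "more bytes follow".
static int32_t readVInt(const uint8_t*& p) {
    int32_t value = 0;
    int shift = 0;
    uint8_t b;
    do {
        b = *p++;
        value |= (int32_t)(b & 0x7F) << shift;
        shift += 7;
    } while (b & 0x80);
    return value;
}

int main() {
    // Assumed flag values, matching the FieldsWriter::FIELD_IS_* constants referenced above.
    const uint8_t FIELD_IS_TOKENIZED  = 0x1;
    const uint8_t FIELD_IS_BINARY     = 0x2;
    const uint8_t FIELD_IS_COMPRESSED = 0x4;

    // A hand-made entry: field number 1, flags 0, value "abc"
    // (VInt field number, one flag byte, VInt length, then the value bytes).
    std::vector<uint8_t> entry = { 0x01, 0x00, 0x03, 'a', 'b', 'c' };

    const uint8_t* p = entry.data();
    int32_t fieldNumber = readVInt(p);   // the VInt field number (fi->number)
    uint8_t  bits       = *p++;          // the attribute byte written by writeByte(bits)
    int32_t  length     = readVInt(p);   // length prefix of the value

    std::cout << "field #" << fieldNumber
              << " tokenized="  << ((bits & FIELD_IS_TOKENIZED) != 0)
              << " binary="     << ((bits & FIELD_IS_BINARY) != 0)
              << " compressed=" << ((bits & FIELD_IS_COMPRESSED) != 0)
              << " length="     << length << "\n";

    // For binary/compressed fields the next `length` bytes are the raw payload;
    // for text fields writeString() stores a character count followed by the
    // encoded characters (identical for the ASCII value used here).
    return 0;
}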

xxx.fdx is an index into xxx.fdt, with one entry per document: it stores the position (file offset) at which each document's record begins in xxx.fdt.

The main code is:

void FieldsWriter::flushDocument(int32_t numStoredFields, CL_NS(store)::RAMOutputStream* buffer) {
    indexStream->writeLong(fieldsStream->getFilePointer());  // .fdx: offset where this document starts in .fdt
    fieldsStream->writeVInt(numStoredFields);                 // .fdt: number of stored fields in this document
    buffer->writeTo(fieldsStream);                            // .fdt: the buffered per-field entries
}
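
Reading goes in the opposite direction. The following is a minimal sketch (not CLucene's FieldsReader; the segment file names "_0.fdx" / "_0.fdt" are placeholders) of how document n's stored fields can be located: seek to n * 8 in xxx.fdx, read the 8-byte offset written by writeLong above, then seek to that offset in xxx.fdt, where the VInt field count written by flushDocument begins.

#include <cstdint>
#include <fstream>
#include <iostream>

// Read an 8-byte big-endian integer, the byte order used by IndexOutput::writeLong.
static uint64_t readBigEndianLong(std::istream& in) {
    uint64_t v = 0;
    for (int i = 0; i < 8; ++i)
        v = (v << 8) | (uint8_t)in.get();
    return v;
}

// Lucene-style VInt, as in the previous sketch.
static int32_t readVInt(std::istream& in) {
    int32_t value = 0;
    int shift = 0;
    uint8_t b;
    do {
        b = (uint8_t)in.get();
        value |= (int32_t)(b & 0x7F) << shift;
        shift += 7;
    } while (b & 0x80);
    return value;
}

int main() {
    const int32_t docId = 5;                       // document we want to look up
    std::ifstream fdx("_0.fdx", std::ios::binary); // placeholder segment names
    std::ifstream fdt("_0.fdt", std::ios::binary);
    if (!fdx || !fdt) {
        std::cerr << "segment files not found\n";
        return 1;
    }

    // Each .fdx entry is a fixed 8 bytes, so document n lives at byte n * 8.
    fdx.seekg((std::streamoff)docId * 8);
    uint64_t fdtPos = readBigEndianLong(fdx);

    // Jump to that position in .fdt; the document record starts with the
    // number of stored fields (the VInt written by flushDocument), followed
    // by that many field entries in the layout described earlier.
    fdt.seekg((std::streamoff)fdtPos);
    int32_t numStoredFields = readVInt(fdt);
    std::cout << "doc " << docId << " starts at .fdt offset " << fdtPos
              << " with " << numStoredFields << " stored fields\n";
    return 0;
}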
