The .fdt and .fdx File Formats in CLucene

xxx.fdt mainly stores the content of each stored Field. The per-document write path looks like this:

void DocumentsWriter::ThreadState::writeDocument() {

  // If we hit an exception while appending to the
  // stored fields or term vectors files, we have to
  // abort all documents since we last flushed because
  // it means those files are possibly inconsistent.
  try {
    _parent->numDocsInStore++;

    // Append stored fields to the real FieldsWriter:
    _parent->fieldsWriter->flushDocument(numStoredFields, fdtLocal);     // number of stored fields in this document
    fdtLocal->reset();

    // ... (remainder of the try block and its matching catch handler omitted)
}

The content written for each field:

 if (field->isStored()) {
        threadState->numStoredFields++;
        bool success = false;
        try {
          threadState->localFieldsWriter->writeField(fieldInfo, field);
          success = true;
        } _CLFINALLY(
          // ... (cleanup body of the macro, run whether or not writeField succeeded, omitted)
        )
      }

The field-writing function:

void FieldsWriter::writeField(FieldInfo* fi, CL_NS(document)::Field* field)
{
    // if the field is an instanceof FieldsReader.FieldForMerge, we're in merge mode
    // and field.binaryValue() already returns the compressed value for a field
    // with isCompressed()==true, so we disable compression in that case
    bool disableCompression = (field->instanceOf(FieldsReader::FieldForMerge::getClassName()));

    fieldsStream->writeVInt(fi->number);
    uint8_t bits = 0;
    if (field->isTokenized())
        bits |= FieldsWriter::FIELD_IS_TOKENIZED;
    if (field->isBinary())
        bits |= FieldsWriter::FIELD_IS_BINARY;
    if (field->isCompressed())
        bits |= FieldsWriter::FIELD_IS_COMPRESSED;

    fieldsStream->writeByte(bits);

    if ( field->isCompressed() ){
        // compression is enabled for the current field
        CL_NS(util)::ValueArray<uint8_t> dataB;
        const CL_NS(util)::ValueArray<uint8_t>* data = &dataB;

        if (disableCompression) {
            // optimized case for merging, the data
            // is already compressed
            data = field->binaryValue();
        } else {
            // check if it is a binary field
            if (field->isBinary()) {
                compress(*field->binaryValue(), dataB);
            }else if ( field->stringValue() == NULL ){ //we must be using readerValue
                CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
                Reader* r = field->readerValue();

                int32_t sz = r->size();
                if ( sz < 0 )
                    sz = 10000000; //todo: we should warn the developer here....

                //read the entire string
                const TCHAR* rv = NULL;
                int64_t rl = r->read(rv, sz, 1);
                if ( rl > LUCENE_INT32_MAX_SHOULDBE )
                    _CLTHROWA(CL_ERR_Runtime,"Field length too long");
                else if ( rl < 0 )
                    rl = 0;

                string str = lucene_wcstoutf8string(rv, rl);
                CL_NS(util)::ValueArray<uint8_t> utfstr;
                utfstr.length = str.length();
                utfstr.values = (uint8_t*)str.c_str();
                compress(utfstr, dataB);
                utfstr.values = NULL;
            }else if ( field->stringValue() != NULL ){
                string str = lucene_wcstoutf8string(field->stringValue(), LUCENE_INT32_MAX_SHOULDBE);
                CL_NS(util)::ValueArray<uint8_t> utfstr;
                utfstr.length = str.length();
                utfstr.values = (uint8_t*)str.c_str();
                compress(utfstr, dataB);
                utfstr.values = NULL;
            }
        }
        fieldsStream->writeVInt(data->length);
        fieldsStream->writeBytes(data->values, data->length);

    }else{

        //FEATURE: this problem in Java Lucene too, if using Reader, data is not stored.
        //todo: this is a logic bug...
        //if the field is stored, and indexed, and is using a reader the field wont get indexed
        //
        //if we could write zero prefixed vints (therefore static length), then we could
        //write a reader directly to the field indexoutput and then go back and write the data
        //length. however this is not supported in lucene yet...
        //if this is ever implemented, then it would make sense to also be able to combine the
        //FieldsWriter and DocumentWriter::invertDocument process, and use a streamfilter to
        //write the field data while the documentwrite analyses the document! how cool would
        //that be! it would cut out all these buffers!!!

        // compression is disabled for the current field
        if (field->isBinary()) {
            const CL_NS(util)::ValueArray<uint8_t>* data = field->binaryValue();
            fieldsStream->writeVInt(data->length);
            fieldsStream->writeBytes(data->values, data->length);

        }else if ( field->stringValue() == NULL ){ //we must be using readerValue
            CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
            Reader* r = field->readerValue();

            int32_t sz = r->size();
            if ( sz < 0 )
                sz = 10000000; //todo: we should warn the developer here....

            //read the entire string
            const TCHAR* rv;
            int64_t rl = r->read(rv, sz, 1);
            if ( rl > LUCENE_INT32_MAX_SHOULDBE )
                _CLTHROWA(CL_ERR_Runtime,"Field length too long");
            else if ( rl < 0 )
                rl = 0;

            fieldsStream->writeString( rv, (int32_t)rl);
        }else if ( field->stringValue() != NULL ){
            fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));
        }else
            _CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
    }
}

Each field entry in xxx.fdt therefore consists of:

number: the field's index within the document, written as a VInt (fi->number); the first field is 0, the second is 1 (1-2 bytes for typical field counts)

bits: the attribute flag byte (1 byte)

value: the field content itself (binary data, compressed data, or a string)
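
To make the layout concrete, here is a small standalone sketch of decoding one such field entry. It is not CLucene's actual FieldsReader code: the flag values (0x1 / 0x2 / 0x4) are assumed to match the FieldsWriter::FIELD_IS_* constants used above, and readVInt is rewritten from scratch for the example.

#include <cstdint>
#include <iostream>
#include <vector>

// Lucene-style variable-length int: 7 data bits per byte, high bit means "more bytes follow".
static int32_t readVInt(const uint8_t*& p) {
    int32_t value = 0;
    int shift = 0;
    uint8_t b;
    do {
        b = *p++;
        value |= (int32_t)(b & 0x7F) << shift;
        shift += 7;
    } while (b & 0x80);
    return value;
}

int main() {
    // Assumed flag values, matching the FieldsWriter::FIELD_IS_* constants referenced above.
    const uint8_t FIELD_IS_TOKENIZED  = 0x1;
    const uint8_t FIELD_IS_BINARY     = 0x2;
    const uint8_t FIELD_IS_COMPRESSED = 0x4;

    // A hand-made entry: field number 1, flags 0, value "abc"
    // (VInt field number, one flag byte, VInt length, then the value bytes).
    std::vector<uint8_t> entry = { 0x01, 0x00, 0x03, 'a', 'b', 'c' };

    const uint8_t* p = entry.data();
    int32_t fieldNumber = readVInt(p);   // the VInt field number (fi->number)
    uint8_t  bits       = *p++;          // the attribute byte written by writeByte(bits)
    int32_t  length     = readVInt(p);   // length prefix of the value

    std::cout << "field #" << fieldNumber
              << " tokenized="  << ((bits & FIELD_IS_TOKENIZED) != 0)
              << " binary="     << ((bits & FIELD_IS_BINARY) != 0)
              << " compressed=" << ((bits & FIELD_IS_COMPRESSED) != 0)
              << " length="     << length << "\n";

    // For binary/compressed fields the next `length` bytes are the raw payload;
    // for text fields writeString() stores a character count followed by the
    // encoded characters (identical for the ASCII value used here).
    return 0;
}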

xxx.fdx is an index into xxx.fdt, with one entry per document: it stores the position (file offset) at which each document's record begins in xxx.fdt.

The main code is:

void FieldsWriter::flushDocument(int32_t numStoredFields, CL_NS(store)::RAMOutputStream* buffer) {
    indexStream->writeLong(fieldsStream->getFilePointer());  // .fdx: offset where this document starts in .fdt
    fieldsStream->writeVInt(numStoredFields);                 // .fdt: number of stored fields in this document
    buffer->writeTo(fieldsStream);                            // .fdt: the buffered per-field entries
}
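
Reading goes in the opposite direction. The following is a minimal sketch (not CLucene's FieldsReader; the segment file names "_0.fdx" / "_0.fdt" are placeholders) of how document n's stored fields can be located: seek to n * 8 in xxx.fdx, read the 8-byte offset written by writeLong above, then seek to that offset in xxx.fdt, where the VInt field count written by flushDocument begins.

#include <cstdint>
#include <fstream>
#include <iostream>

// Read an 8-byte big-endian integer, the byte order used by IndexOutput::writeLong.
static uint64_t readBigEndianLong(std::istream& in) {
    uint64_t v = 0;
    for (int i = 0; i < 8; ++i)
        v = (v << 8) | (uint8_t)in.get();
    return v;
}

// Lucene-style VInt, as in the previous sketch.
static int32_t readVInt(std::istream& in) {
    int32_t value = 0;
    int shift = 0;
    uint8_t b;
    do {
        b = (uint8_t)in.get();
        value |= (int32_t)(b & 0x7F) << shift;
        shift += 7;
    } while (b & 0x80);
    return value;
}

int main() {
    const int32_t docId = 5;                       // document we want to look up
    std::ifstream fdx("_0.fdx", std::ios::binary); // placeholder segment names
    std::ifstream fdt("_0.fdt", std::ios::binary);
    if (!fdx || !fdt) {
        std::cerr << "segment files not found\n";
        return 1;
    }

    // Each .fdx entry is a fixed 8 bytes, so document n lives at byte n * 8.
    fdx.seekg((std::streamoff)docId * 8);
    uint64_t fdtPos = readBigEndianLong(fdx);

    // Jump to that position in .fdt; the document record starts with the
    // number of stored fields (the VInt written by flushDocument), followed
    // by that many field entries in the layout described earlier.
    fdt.seekg((std::streamoff)fdtPos);
    int32_t numStoredFields = readVInt(fdt);
    std::cout << "doc " << docId << " starts at .fdt offset " << fdtPos
              << " with " << numStoredFields << " stored fields\n";
    return 0;
}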
