Clucene索引建立剖析

  下面,我们将结合代码,对 CLucene 建立索引的过程进行剖析。

  (一). main函数中调用建立索引的过程

(1).void IndexFiles()方法:

//参数:索引文件路径,索引后的目标路径

void IndexFiles(char* path, char* target, const bool clearIndex)

{

    IndexWriter* writer = NULL;

    lucene::analysis::standard::StandardAnalyzer an;

    if (!clearIndex && IndexReader::indexExists(target)){

        if (IndexReader::isLocked(target) ){  //在函数调用里面执行了创建了索引的目录

            printf("Index was locked... unlocking it./n");

            IndexReader::unlock(target);

        }

        writer = _CLNEW IndexWriter( target, &an, false);

    }else{

        writer = _CLNEW IndexWriter( target ,&an, true);

    }

    writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);

    writer->setUseCompoundFile(false);  //设置不使用复合索引

    uint64_t str = lucene::util::Misc::currentTimeMillis();

    indexDocs(writer, path);

    writer->optimize();

    writer->close();

    _CLDELETE(writer);

    printf("Indexing took: %d ms./n/n", lucene::util::Misc::currentTimeMillis() - str);

}

   (2).void IndexFiles()方法中调用indexDocs(writer, path)方法:

void indexDocs(IndexWriter* writer, char* directory)

{

    DIR* dir = opendir(directory);

    if ( dir != NULL ){

        struct dirent* fl;

        struct fileStat buf;

        char path[CL_MAX_DIR];

        strcpy(path,directory);

        strcat(path,PATH_DELIMITERA);

        char* pathP = path + strlen(path);

        fl = readdir(dir);

        while ( fl != NULL ){

            if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {

                pathP[0]=0;

                strcat(pathP,fl->d_name);

                int32_t ret = fileStat(path,&buf);

                if ( buf.st_mode & S_IFDIR ) {

                    indexDocs(writer, path );

                }else{

                    //处理目录下面的每个文档

                    Document* doc = FileDocument( path );

                    writer->addDocument(doc);

                    _CLDELETE(doc);

                }

            }

            fl = readdir(dir);

        }

        closedir(dir);

    }else{

        printf( "adding: %s/n", directory);

        Document* doc = FileDocument( directory );

        writer->addDocument( doc );

        _CLDELETE(doc);

    }

   (3). Document* FileDocument(const char* f)方法:

//先将字段加入到文档,在将文档加入到IndexWriter

Document* FileDocument(const char* f)

{

    Document* doc = _CLNEW Document();

    TCHAR tf[CL_MAX_DIR];

    STRCPY_AtoT(tf,f,CL_MAX_DIR);

    doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );

    FILE* fh = fopen(f,"r");

    if ( fh != NULL ){

        StringBuffer str;

        int fn = fileno(fh);

        struct stat filestat;

        fstat(fn, &filestat);

        str.reserve(filestat.st_size);

        char abuf[1024];

        TCHAR tbuf[1024];

        size_t r;

        //每次读取1023字节

        do{

            r = fread(abuf,1,1023,fh);

            abuf[r]=0;

            STRCPY_AtoT(tbuf,abuf,r);

            tbuf[r]=0;

            str.append(tbuf);

        }while(r>0);

        fclose(fh);

        doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) );

    }

    return doc;

}

 (二). 进入建立索引的细节

   (1). Document类与Field

/**
 * Adds a field to this document.
 *
 * A new list node is created with the current head passed as its second
 * constructor argument, and that node becomes the new head. The first field
 * added therefore receives a NULL second argument.
 */
void Document::add(Field& field)
{
    DocumentFieldEnumeration::DocumentFieldList* const newHead =
        _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList);
    fieldList = newHead;
}

      文档字段迭代器DocumentFieldEnumeration

// Enumerator over the fields of a document. It walks a singly linked list of
// DocumentFieldList nodes.

class DocumentFieldEnumeration :LUCENE_BASE{  

    // One node of the field list: a field pointer plus a link to another node.
    class DocumentFieldList :LUCENE_BASE{

    public:

        DocumentFieldList(Field* f, DocumentFieldList* n);  // constructor

        ~DocumentFieldList();

        Field* field;

        DocumentFieldList* next;    // Document::add passes the old head here, so this links to the field added before this one

    };

    // Document builds and edits the node list directly.
    friend class Document;

private:

    const DocumentFieldList* fields;

public:

    DocumentFieldEnumeration(const DocumentFieldList* fl);

    ~DocumentFieldEnumeration();

    bool hasMoreElements() const;

    Field* nextElement();

};

     void Document::removeFields()方法:

//从链表中删除多个重名的字段

void Document::removeFields(const TCHAR* name)

{

    CND_PRECONDITION(name != NULL, "name is NULL");

    DocumentFieldEnumeration::DocumentFieldList* previous = NULL;

    DocumentFieldEnumeration::DocumentFieldList* current = fieldList;

    while (current != NULL) {

        if ( _tcscmp(current->field->name(),name) == 0 ){

            if (previous){

                previous->next = current->next;  //删除当前指针,修改指针指向

            }else

                fieldList = current->next;

            current->next=NULL;

            _CLDELETE(current);

            if ( previous )

                current = previous->next;  //重新设置当前指针

            else

                current = fieldList;

        }else{

            previous = current;

            current = current->next;

        }

    }

}

(2). void IndexWriter::addDocument()方法

void IndexWriter::addDocument(Document* doc, Analyzer* analyzer)

{

    CND_PRECONDITION(ramDirectory != NULL,"ramDirectory is NULL");

    if ( analyzer == NULL )

    {

        analyzer = this->analyzer;

    }

    ramDirectory->transStart();

    try {

        //每加入一个文档,就得到新的段名

        char* segmentName = newSegmentName();

        CND_CONDITION(segmentName != NULL, "segmentName is NULL");

        try {

            // ramDirectory:带事务的内存文件目录

            DocumentWriter* dw = _CLNEW DocumentWriter(ramDirectory, analyzer, this );

            CND_CONDITION(dw != NULL, "dw is NULL");

            try {

                dw->addDocument(segmentName, doc);

            } _CLFINALLY(

                _CLDELETE(dw);

            );

            //建立索引时加入一个文档,就生成一个新的段信息

            SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);

            CND_CONDITION(si != NULL, "Si is NULL");

            {

                SCOPED_LOCK_MUTEX(THIS_LOCK)

                segmentInfos->add(si);

                //合并段

                maybeMergeSegments();

            }

        } _CLFINALLY(

            _CLDELETE_CaARRAY(segmentName);

        );

    } catch (...) {

        ramDirectory->transAbort();

        throw;

    }

    ramDirectory->transCommit();

}

(3).调用到的void DocumentWriter::addDocument()方法

//将文档加入到新段里面

void DocumentWriter::addDocument(const char* segment, Document* doc)

{

    CND_PRECONDITION(fieldInfos==NULL, "fieldInfos!=NULL")

        // write field names

        fieldInfos = _CLNEW FieldInfos();

    fieldInfos->add(doc);

    //.fnm 写入字段名称的文件

    const char* buf = Misc::segmentname(segment, ".fnm");

    fieldInfos->write(directory, buf);

    _CLDELETE_CaARRAY(buf);

    // write field values

    FieldsWriter fieldsWriter(directory, segment, fieldInfos);

    try {

        fieldsWriter.addDocument(doc);

    } _CLFINALLY(fieldsWriter.close());

    //invert doc into postingTable

    clearPostingTable();              // clear postingTable

    //文档中字段的个数

    size_t size = fieldInfos->size();

    fieldLengths = _CL_NEWARRAY(int32_t,size);    // init fieldLengths

    fieldPositions = _CL_NEWARRAY(int32_t,size);  // init fieldPositions

    fieldOffsets = _CL_NEWARRAY(int32_t,size);    // init fieldOffsets

    memset(fieldPositions, 0, sizeof(int32_t) * size);

    //initialise fieldBoost array with default boost

    int32_t fbl = fieldInfos->size();

    float_t fbd = doc->getBoost();  //初始是.0f;

    fieldBoosts = _CL_NEWARRAY(float_t,fbl);      // init fieldBoosts

    {

        for ( int32_t i=0;i<fbl;i++ )

            fieldBoosts[i] = fbd;

    }

    {

        for ( int32_t i=0;i<fieldInfos->size();i++ )

            fieldLengths[i] = 0;

    }

    //进行倒排处理

    invertDocument(doc);

    // sort postingTable into an array

    Posting** postings = NULL;

    int32_t postingsLength = 0;

    //postingTable中的词条进行排序,返回一个排序的Posting[]数组

    sortPostingTable(postings,postingsLength);

    //write postings

    //将经过排序的Posting[]数组写入到索引段文件中(segmentsv.frq文件和segments.prx文件)

    writePostings(postings,postingsLength, segment);

    //write norms of indexed fields

    //写入被索引的Fieldnorm信息

    writeNorms(segment);

    _CLDELETE_ARRAY( postings );

}

(4). void FieldInfos::write()方法

/**
 * Serialises the field infos to `output`.
 *
 * Layout: the number of fields as a VInt, then for each field its name
 * followed by one flag byte.
 *
 * @param output Stream that receives the data.
 */
void FieldInfos::write(IndexOutput* output) const
{
    // The field count comes first.
    output->writeVInt(size());

    for (int32_t idx = 0; idx < size(); ++idx) {
        FieldInfo* info = fieldInfo(idx);

        // OR the per-field options together into a single flag byte.
        uint8_t flags = 0x0;
        if (info->isIndexed) {
            flags |= IS_INDEXED;
        }
        if (info->storeTermVector) {
            flags |= STORE_TERMVECTOR;
        }
        if (info->storePositionWithTermVector) {
            flags |= STORE_POSITIONS_WITH_TERMVECTOR;
        }
        if (info->storeOffsetWithTermVector) {
            flags |= STORE_OFFSET_WITH_TERMVECTOR;
        }
        if (info->omitNorms) {
            flags |= OMIT_NORMS;
        }

        output->writeString(info->name,_tcslen(info->name));  // field name and its length
        output->writeByte(flags);                             // the flag byte
    }
}

 

(5). void FieldsWriter::addDocument()方法

//写入字段值

void FieldsWriter::addDocument(Document* doc)

{

    CND_PRECONDITION(indexStream != NULL,"indexStream is NULL");

    CND_PRECONDITION(fieldsStream != NULL,"fieldsStream is NULL");

    printf("%s=%d","fieldsStream->getFilePointer()",fieldsStream->getFilePointer());

    //索引流写入字段流的位置指针

    indexStream->writeLong(fieldsStream->getFilePointer());

    int32_t storedCount = 0;

    DocumentFieldEnumeration* fields = doc->fields();

    while (fields->hasMoreElements()) {

        Field* field = fields->nextElement();

        if (field->isStored())

        {

            storedCount++;

        }

    }

    _CLDELETE(fields);

    //字段流写入存储索引的字段个数

    fieldsStream->writeVInt(storedCount);

    fields = doc->fields();

    while (fields->hasMoreElements())

    {

        Field* field = fields->nextElement();

        if (field->isStored())

        {

            //写入字段序号

            fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name())); 

            uint8_t bits = 0;

            if (field->isTokenized())

                bits |= FieldsWriter::FIELD_IS_TOKENIZED;

            if (field->isBinary())

                bits |= FieldsWriter::FIELD_IS_BINARY;

            if (field->isCompressed())

                bits |= FieldsWriter::FIELD_IS_COMPRESSED;

            //写入一个字节:是否分词,是否是字节,是否压缩

            fieldsStream->writeByte(bits);

            if ( field->isCompressed() ){

                _CLTHROWA(CL_ERR_Runtime, "CLucene does not directly support compressed fields. Write a compressed byte array instead");

            }else{

                if (field->isBinary()) {

                    jstreams::StreamBase<char>* stream = field->streamValue();

                    const char* sd;

                    //去读取

                    int32_t rl = stream->read(sd,10000000,0);

                    if ( rl < 0 ){

                        fieldsStream->writeVInt(0);                 }else{

                        fieldsStream->writeVInt(rl);

                        fieldsStream->writeBytes((uint8_t*)sd, rl);

                    }

                }else if ( field->stringValue() == NULL ){                  CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")

                        Reader* r = field->readerValue();

                    const TCHAR* rv;

                    int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE);

                    if ( rl > LUCENE_INT32_MAX_SHOULDBE )

                        _CLTHROWA(CL_ERR_Runtime,"Field length too long");

                    else if ( rl < 0 )

                        rl = 0;

                    fieldsStream->writeString( rv, (int32_t)rl);

                }else if ( field->stringValue() != NULL ){

                    //写入读取的字符串

            fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));

                }else

                    _CLTHROWA(CL_ERR_Runtime, "No values are set for the field");

            }

        }

    }

    _CLDELETE(fields);

}

(6). void DocumentWriter::invertDocument方法

//进行倒排处理

void DocumentWriter::invertDocument(const Document* doc)

{

    DocumentFieldEnumeration* fields = doc->fields();

    try {

        while (fields->hasMoreElements())

        {

            Field* field = (Field*)fields->nextElement();

            const TCHAR*  fieldName = field->name();

            const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);

            //初始时都是

            int32_t length = fieldLengths[fieldNumber];     // length of field  // 根据每个Field的编号,设置每个Field的长度

            int32_t position = fieldPositions[fieldNumber]; // position in field // 根据每个Field的编号,设置每个Field的位置

            if (length>0)

            {

                position+=analyzer->getPositionIncrementGap(fieldName);

            }

            int32_t offset = fieldOffsets[fieldNumber];       // offset field  // 根据每个Field的编号,设置每个Fieldoffset

            if (field->isIndexed())

            {   // 如果Field被索引

                if (!field->isTokenized())

                {   // 如果Field没有进行分词

                    const TCHAR* charBuf = NULL;

                    int64_t dataLen = 0;

                    if (field->stringValue() == NULL && !field->isStored() )

                    {

                        CL_NS(util)::Reader* r = field->readerValue();

                        dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);

                        if (dataLen == -1)

                            dataLen = 0;

                    } else {

                        charBuf = field->stringValue();

                        dataLen = _tcslen(charBuf);

                    }

                    // 是否把整个Field的数据作为一个词条存储到postingTable

                    if(field->isStoreOffsetWithTermVector()){

                        TermVectorOffsetInfo tio;

                        tio.setStartOffset(offset);

                        tio.setEndOffset(offset + dataLen);

                        addPosition(fieldName, charBuf, position++, &tio );

                    }

                    else

                    {

                        addPosition(fieldName, charBuf, position++, NULL);

                    }

                    offset += dataLen;  //偏移量在加上数据长度

                    length++;

                } else { // field must be tokenized  // 需要对Field进行分词

                    CL_NS(util)::Reader* reader; // find or make Reader

                    bool delReader = false;

                    if (field->readerValue() != NULL) {  // 如果从Field获取的Reader数据不为null

                        reader = field->readerValue();

                    } else if (field->stringValue() != NULL) {  //   根据从Field获取的字符串数据构造一个Reader输入流

                        reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);

                        delReader = true;

                    } else {

                        _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");

                    }

 

                    try {

                        // Tokenize field and add to postingTable.

                        // 把经过分词处理的Field加入到postingTable

                        CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);

                        try

                        {

                            CL_NS(analysis)::Token t;

                            int32_t lastTokenEndOffset = -1;  //上一个分词的终止位置

                            while (stream->next(&t))

                            {

                                position += (t.getPositionIncrement() - 1);  //每次切出一个词,就将position加上这个词的长度

                                // 如果指定了Field的词条向量的偏移量,则存储该此条向量

                                if(field->isStoreOffsetWithTermVector()){

                                    TermVectorOffsetInfo tio;

                                    tio.setStartOffset(offset + t.startOffset());

                                    tio.setEndOffset(offset + t.endOffset());

                                    addPosition(fieldName, t.termText(), position++, &tio);

                                }

                                else

                                {

                                    addPosition(fieldName, t.termText(), position++, NULL);

                                }

                                lastTokenEndOffset = t.endOffset();

                                length++;

                                // Apply field truncation policy.

                                // length:切出的字段的长度

                                if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {

                                    if ( length > maxFieldLength) {  // 如果当前切出的词条数已经达到了该Field的最大长度

                                        break;

                                    }

                                } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {

                                    const TCHAR* errMsgBase =

                                        _T("Indexing a huge number of tokens from a single")

                                        _T(" field (/"%s/", in this case) can cause CLucene")

                                        _T(" to use memory excessively.")

                                        _T("  By default, CLucene will accept only %s tokens")

                                        _T(" tokens from a single field before forcing the")

                                        _T(" client programmer to specify a threshold at")

                                        _T(" which to truncate the token stream.")

                                        _T("  You should set this threshold via")

                                        _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")

                                        _T(" to disable truncation, or a value to specify maximum number of fields).");

                                    TCHAR defaultMaxAsChar[34];

                                    _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,defaultMaxAsChar, 10);

                                    int32_t errMsgLen = _tcslen(errMsgBase)+ _tcslen(fieldName)+ _tcslen(defaultMaxAsChar);

                                    TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);

                                    _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);

                                    _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);

                                }

                            } // while token->next

                            if(lastTokenEndOffset != -1 )

                            {

                                offset += lastTokenEndOffset + 1;

                            }

                        } _CLFINALLY (

                            stream->close();

                        _CLDELETE(stream);

                        );

                    } _CLFINALLY (

                        if (delReader) {

                            _CLDELETE(reader);

                        }

                        );

                } // if/else field is to be tokenized

                // 位置信息,偏移量信息,长度信息

                fieldLengths[fieldNumber] = length;       // save field length

                fieldPositions[fieldNumber] = position;   // save field position

                fieldBoosts[fieldNumber] *= field->getBoost();

                fieldOffsets[fieldNumber] = offset;  //实际上是这个字段的终止偏移位置

            } // if field is to beindexed

        } // while more fields available

    } _CLFINALLY (

        _CLDELETE(fields);

    );

}

(7). void DocumentWriter::addPosition()方法

void DocumentWriter::addPosition(const TCHAR* field,const TCHAR* text,const int32_t position,TermVectorOffsetInfo* offset)

{

    //设置词条

    //typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare, Term::Equals> PostingTableType;

    termBuffer->set(field,text,false);

    Posting* ti = postingTable.get(termBuffer);

    if (ti != NULL)

    {          

        int32_t freq = ti->freq;

        if (ti->positions.length == freq) {

            // positions array is full, realloc its size

            // 扩充数组:初始添加时频率为positions.values[0] = position; positions.length = 1;

            ti->positions.length = freq*2;

            ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t));

        }

        ti->positions.values[freq] = position;        // add new position

        if (offset != NULL)

        {

            if (ti->offsets.length == freq)

            {

                //存储偏移量信息时跟存储位置采用相同的方法

                ti->offsets.length = freq*2;

                ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo));

            }

            ti->offsets[freq] = *offset;

        }

        ti->freq = freq + 1;      // 更新词条频率

    } else {                      // word not seen before

        Term* term = _CLNEW Term( field, text, false);

        postingTable.put(term, _CLNEW Posting(term, position, offset));

    }

}

     (8). DocumentWriter::Posting::Posting()构造函数

/**
 * Creates the posting for a term's first occurrence.
 *
 * The frequency starts at 1 and the position array holds exactly one entry.
 * The offset array is set up only when `offset` is not NULL.
 *
 * @param t        Term this posting belongs to.
 * @param position Position of the first occurrence.
 * @param offset   Offset info for the first occurrence, or NULL.
 */
DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset)
{
    term = _CL_POINTER(t);  // presumably takes a counted reference - confirm
    freq = 1;

    // One-element position array.
    positions.length = 1;
    positions.values = (int32_t*)malloc(sizeof(int32_t));
    positions.values[0] = position;

    if ( offset == NULL )
        return;

    // One-element offset array.
    this->offsets.length = 1;
    this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
    this->offsets.values[0] = *offset;
}

   (9). DocumentWriter::writePostings()方法

void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment)

{

#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }

    IndexOutput* freq = NULL;

    IndexOutput* prox = NULL;

    TermInfosWriter* tis = NULL;

    TermVectorsWriter* termVectorWriter = NULL;

    try {

        //open files for inverse index storage

        //.frq: 频率信息文件

        const char* buf = Misc::segmentname( segment, ".frq");

        freq = directory->createOutput( buf );

        _CLDELETE_CaARRAY( buf );

        //.prx: 位置信息文件

        buf = Misc::segmentname( segment, ".prx");

        prox = directory->createOutput( buf );

        _CLDELETE_CaARRAY( buf );

        //TermInfosWriter类的构造函数,termIndexInterval:词条分组间隔

        tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,termIndexInterval);

        TermInfo* ti = _CLNEW TermInfo();

        const TCHAR* currentField = NULL;

        for (int32_t i = 0; i < postingsLength; i++) {

            Posting* posting = postings[i];

            // 写入字典文件以及快表文件

            ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);

            tis->add(posting->term, ti);

            int32_t postingFreq = posting->freq;

            if (postingFreq == 1)                 // optimize freq=1

                freq->writeVInt(1);           // set low bit of doc num.

            else {

                freq->writeVInt(0);           // the document number

                freq->writeVInt(postingFreq); // frequency in doc

            }

            int32_t lastPosition = 0;             // write positions

            //使用差别法写入位置信息

            for (int32_t j = 0; j < postingFreq; ++j) {       // use delta-encoding

                prox->writeVInt(posting->positions.values[j] - lastPosition);

                lastPosition = posting->positions.values[j];

            }

            // check to see if we switched to a new field

            const TCHAR* termField = posting->term->field();

            //对字段包含的词条的处理

            //对不同字段的处理

            if (currentField==NULL||_tcscmp(currentField,termField)!= 0) { //todo, can we do an intern'd check?

                // changing field - see if there is something to save

                currentField = termField;

                FieldInfo* fi = fieldInfos->fieldInfo(currentField);

                //field中以StoreTermVector方式保存的posting信息需要TermVectorsWriter类来写入

                if (fi->storeTermVector)

                {

                    if (termVectorWriter == NULL) {

                        //TermVectorsWriter类的构造函数

                        termVectorWriter =_CLNEW TermVectorsWriter(directory, segment, fieldInfos);

                        termVectorWriter->openDocument();

                    }

                    termVectorWriter->openField(currentField);

                } else if (termVectorWriter != NULL) {

                    termVectorWriter->closeField();

                }

            }

            if (termVectorWriter != NULL && termVectorWriter->isFieldOpen())

            {

                termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);

            }

        }

        if (termVectorWriter != NULL)

        {

            termVectorWriter->closeDocument();

        }

        _CLDELETE(ti);

    }_CLFINALLY (

        const char* err=NULL;

    int32_t ierr=0;

    __DOCLOSE(freq);

    __DOCLOSE(prox);

    __DOCLOSE(tis);

    __DOCLOSE(termVectorWriter);

    if ( err != NULL )

        _CLTHROWA(ierr,err);

    );

}

   (10). TermInfosWriter::TermInfosWriter()构造函数

// Creates the writer for a segment's term dictionary. This instance is
// initialised with isIndex=false; a second writer, built with a trailing
// `true` argument, is stored in `other`, and the two are linked to each other.
TermInfosWriter::TermInfosWriter(Directory* directory, const char* segment, FieldInfos* fis, int32_t interval):

fieldInfos(fis)

{

    CND_PRECONDITION(segment != NULL, "segment is NULL");

    // Must run before `other` is assigned: initialise() resets `other` to NULL.
    initialise(directory,segment,interval, false);

    // `other` is the writer that fills in the .tii file
    // (presumably via a five-argument constructor not shown here - confirm).

    other = _CLNEW TermInfosWriter(directory, segment,fieldInfos, interval, true);

    CND_CONDITION(other != NULL, "other is NULL");

    // Point the companion back at this writer.
    other->other = this;

}

   (11). void TermInfosWriter::initialise()方法

void TermInfosWriter::initialise(Directory* directory, const char* segment, int32_t interval, bool IsIndex)

{

//字典文件由term信息组成,.tis 文件表示term信息文件.tii文件代表快表文件

   //.tis文件中term个数计数,每到一个分组跨度(比如计数到,256),便把分组信息点term信息保存到.tii文件中

    lastTerm = _CLNEW Term;

    CND_CONDITION(lastTerm != NULL, "Could not allocate memory for lastTerm");

    lastTi  = _CLNEW TermInfo();

    CND_CONDITION(lastTi != NULL, "Could not allocate memory for lastTi");

    lastIndexPointer = 0;

    size             = 0;

    isIndex          = IsIndex;

    indexInterval = interval;

    skipInterval = LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL;

    //other: isIndex=true 本身自己是false

    const char* buf = Misc::segmentname(segment, (isIndex ? ".tii" : ".tis"));

    output = directory->createOutput(buf);

    _CLDELETE_CaARRAY(buf);

    output->writeInt(FORMAT);                      // write format

    output->writeLong(0);                          // leave space for size

    output->writeInt(indexInterval);// write indexInterval

    output->writeInt(skipInterval); // write skipInterval

    //Set other to NULL by Default

    other = NULL;

}

(12). void TermInfosWriter::add()方法

void TermInfosWriter::add(Term* term, const TermInfo* ti)

{

    CND_PRECONDITION(isIndex || (!isIndex  && term->compareTo(lastTerm) > 0),"term out of order");

    CND_PRECONDITION(ti->freqPointer >= lastTi->freqPointer,"freqPointer out of order");

    CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer,"proxPointer out of order");

    if (!isIndex && size % indexInterval == 0){

        //本身是isIndex=false 然后达到了词条分组间隔

        other->add(lastTerm, lastTi);

    }

    //写入词条

    writeTerm(term);                     

    // write doc freq 写入文档频率

    output->writeVInt(ti->docFreq);      

    //write pointers  写入词条频率差值,位置信息差值,初始时都是

    output->writeVLong(ti->freqPointer - lastTi->freqPointer);

    output->writeVLong(ti->proxPointer - lastTi->proxPointer);

    if (ti->docFreq >= skipInterval)

    {

        output->writeVInt(ti->skipOffset);

    }

    //other 快表的写入处理

    if (isIndex)

    {

        output->writeVLong(other->output->getFilePointer() - lastIndexPointer);

        lastIndexPointer = other->output->getFilePointer(); // write pointer

    }

    lastTi->set(ti); //设置上一次TermInfo* ti信息

    size++;

}

(13). void TermInfosWriter::writeTerm()方法

/**
 * Writes one term relative to the previously written term.
 *
 * Output: the length of the prefix shared with the last term, the length of
 * the remaining text, the remaining characters, and the field number.
 * Afterwards `lastTerm` is updated to `term`.
 *
 * @param term Term to write.
 */
void TermInfosWriter::writeTerm(Term* term)
{
    // Only the part that differs from the last term is stored.
    const int32_t sharedPrefix = Misc::stringDifference(lastTerm->text(),lastTerm->textLength(), term->text(),term->textLength());
    const int32_t suffixLength = term->textLength() - sharedPrefix;

    output->writeVInt(sharedPrefix);                           // shared prefix length
    output->writeVInt(suffixLength);                           // differing length
    output->writeChars(term->text(), sharedPrefix, suffixLength);  // differing characters

    const int32_t fieldNumber = fieldInfos->fieldNumber(term->field());
    CND_PRECONDITION(fieldNumber>=-1&&fieldNumber<fieldInfos->size(),"Fieldnum is out of range");
    output->writeVInt(fieldNumber); // write field num

    if ( lastTerm->__cl_refcount != 1 ){
        // The old lastTerm is referenced elsewhere: release it and take `term`.
        _CLDECDELETE(lastTerm);
        lastTerm = _CL_POINTER(term);
    }else{
        // Sole reference: update the existing object in place.
        lastTerm->set(term,term->text());
    }
}

(14). TermVectorsWriter::TermVectorsWriter()构造函数

TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory,

                                     const char* segment,FieldInfos* fieldInfos)

{

    //.tvx: 保存了指针信息,指针指向.tvddocument数据位置

    //.tvd: 保存表指针,表内的指针指向.tvf文件中的field信息

    //.tvf: 保存fieldterm,频率,位置与偏移信息

    char fbuf[CL_MAX_NAME];

    strcpy(fbuf,segment);

    char* fpbuf=fbuf+strlen(fbuf);

    strcpy(fpbuf,LUCENE_TVX_EXTENSION);

    tvx = directory->createOutput(fbuf);

    tvx->writeInt(FORMAT_VERSION);

    strcpy(fpbuf,LUCENE_TVD_EXTENSION);

    tvd = directory->createOutput(fbuf);

    tvd->writeInt(FORMAT_VERSION);

    strcpy(fpbuf,LUCENE_TVF_EXTENSION);

    tvf = directory->createOutput(fbuf);

    tvf->writeInt(FORMAT_VERSION);

    this->fieldInfos = fieldInfos;

    currentField = NULL;      //字段是否打开的判断

    currentDocPointer = -1;   //文档是否打开的判断

}

(15). void TermVectorsWriter::writeField()方法

void TermVectorsWriter::writeField() 

{

    //.tvx: 保存了指针信息,指针指向.tvddocument数据位置

    //.tvd: 保存表指针,表内的指针指向.tvf文件中的field信息

    //.tvf: 保存fieldterm,频率,位置与偏移信息

    currentField->tvfPointer = tvf->getFilePointer();

    //System.out.println("Field Pointer: " + currentField.tvfPointer);

    //写入词条个数

    int32_t size = terms.size();

    tvf->writeVInt(size);

    //是否以TermVector方式保存位置信息,是否以TermVector方式保存偏移量信息

    bool storePositions = currentField->storePositions;

    bool storeOffsets = currentField->storeOffsets;

    uint8_t bits = 0x0;

    if (storePositions)

        bits |= STORE_POSITIONS_WITH_TERMVECTOR;

    if (storeOffsets)

        bits |= STORE_OFFSET_WITH_TERMVECTOR;

    tvf->writeByte(bits);

    const TCHAR* lastTermText = LUCENE_BLANK_STRING;  //一个空串""

    int32_t lastTermTextLen = 0;

    for (int32_t i = 0; i < size; ++i)

    {

        TVTerm* term = terms[i];

        int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen, term->getTermText(),term->getTermTextLen());

        int32_t length = term->getTermTextLen() - start;

        tvf->writeVInt(start);            // 写入共有前缀字符长度

        tvf->writeVInt(length);           // 写入不同的字符长度

        tvf->writeChars(term->getTermText(), start, length);  //写入不同的字符值

        tvf->writeVInt(term->freq);  //写入词条的频率

        lastTermText = term->getTermText();

        lastTermTextLen = term->getTermTextLen();

        //位置信息与偏移量的差别在于:位置信息保存的是term之间相隔term的个数,偏移量保存

        //term之间相隔的字符数

        if(storePositions){

            //TermVector方式保存位置信息

            if(term->positions == NULL)

            {

                _CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!");

            }

            // use delta encoding for positions

            int32_t position = 0;

            for (int32_t j = 0; j < term->freq; ++j){

                tvf->writeVInt((*term->positions)[j] - position); //只保存位置差值

                position = (*term->positions)[j];

            }

        }

        if(storeOffsets){

            //TermVector方式保存偏移量信息

            if(term->offsets == NULL)

            {

                _CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!");

            }

            int32_t position = 0;

            for (int32_t j = 0; j < term->freq; ++j) {

                tvf->writeVInt((*term->offsets)[j].getStartOffset() - position);

                tvf->writeVInt((*term->offsets)[j].getEndOffset() - (*term->offsets)[j].getStartOffset()); //Save the diff between the two.

                position = (*term->offsets)[j].getEndOffset();

            }

        }

    }

}

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值