下面,我们将结合代码,对Clucene建立索引的过程进行剖析.
(一). main函数中调用建立索引的过程
(1).void IndexFiles()方法:
//参数:索引文件路径,索引后的目标路径 void IndexFiles(char* path, char* target, const bool clearIndex) { IndexWriter* writer = NULL; lucene::analysis::standard::StandardAnalyzer an; if (!clearIndex && IndexReader::indexExists(target)){ if (IndexReader::isLocked(target) ){ //在函数调用里面执行了创建了索引的目录 printf("Index was locked... unlocking it./n"); IndexReader::unlock(target); } writer = _CLNEW IndexWriter( target, &an, false); }else{ writer = _CLNEW IndexWriter( target ,&an, true); } writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH); writer->setUseCompoundFile(false); //设置不使用复合索引 uint64_t str = lucene::util::Misc::currentTimeMillis(); indexDocs(writer, path); writer->optimize(); writer->close(); _CLDELETE(writer); printf("Indexing took: %d ms./n/n", lucene::util::Misc::currentTimeMillis() - str); } |
(2).void IndexFiles()方法中调用indexDocs(writer, path)方法:
// Recursively index every regular file below `directory`.
// If `directory` cannot be opened as a directory, it is treated as a single
// file and indexed directly.
void indexDocs(IndexWriter* writer, char* directory) {
    DIR* dir = opendir(directory);
    if (dir != NULL) {
        struct dirent* fl;
        struct fileStat buf;  // NOTE(review): type name kept from original; verify against the project's stat typedef
        char path[CL_MAX_DIR];

        // Build "<directory><separator>" once; pathP marks where each entry
        // name is appended on every iteration.
        strcpy(path, directory);
        strcat(path, PATH_DELIMITERA);
        char* pathP = path + strlen(path);

        fl = readdir(dir);
        while (fl != NULL) {
            // Skip the "." and ".." pseudo-entries.
            if ((strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, ".."))) {
                pathP[0] = 0;
                strcat(pathP, fl->d_name);
                int32_t ret = fileStat(path, &buf);
                (void)ret;  // stat failure leaves buf unspecified; the original ignored it too
                if (buf.st_mode & S_IFDIR) {
                    indexDocs(writer, path);  // recurse into subdirectory
                } else {
                    // Regular file: convert to a Document and add it to the index.
                    Document* doc = FileDocument(path);
                    writer->addDocument(doc);
                    _CLDELETE(doc);
                }
            }
            fl = readdir(dir);
        }
        closedir(dir);
    } else {
        // Not a directory -- index it as a single file.
        printf("adding: %s\n", directory);
        Document* doc = FileDocument(directory);
        writer->addDocument(doc);
        _CLDELETE(doc);
    }
}  // BUGFIX: the transcription dropped this closing brace of the function
(3). Document* FileDocument(const char* f)方法:
//先将字段加入到文档,在将文档加入到IndexWriter中 Document* FileDocument(const char* f) { Document* doc = _CLNEW Document(); TCHAR tf[CL_MAX_DIR]; STRCPY_AtoT(tf,f,CL_MAX_DIR); doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) ); FILE* fh = fopen(f,"r"); if ( fh != NULL ){ StringBuffer str; int fn = fileno(fh); struct stat filestat; fstat(fn, &filestat); str.reserve(filestat.st_size); char abuf[1024]; TCHAR tbuf[1024]; size_t r; //每次读取1023字节 do{ r = fread(abuf,1,1023,fh); abuf[r]=0; STRCPY_AtoT(tbuf,abuf,r); tbuf[r]=0; str.append(tbuf); }while(r>0); fclose(fh); doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) ); } return doc; } |
(二).进入建立索引的细节
(1). Document类与Field类
// Prepend `field` to the document's singly linked field list.
// fieldList starts out NULL; each add wraps the current head in a new
// DocumentFieldList node, so the most recently added field sits at the front
// and the `next` chain runs back through earlier fields.
void Document::add(Field& field) {
    fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList);
}
文档字段迭代器DocumentFieldEnumeration类
//文档字段迭代器 class DocumentFieldEnumeration :LUCENE_BASE{ class DocumentFieldList :LUCENE_BASE{ public: DocumentFieldList(Field* f, DocumentFieldList* n); //构造函数 ~DocumentFieldList(); Field* field; DocumentFieldList* next; //应该叫做之前的pre指针 }; friend class Document; private: const DocumentFieldList* fields; public: DocumentFieldEnumeration(const DocumentFieldList* fl); ~DocumentFieldEnumeration(); bool hasMoreElements() const; Field* nextElement(); }; |
void Document::removeFields()方法:
//从链表中删除多个重名的字段 void Document::removeFields(const TCHAR* name) { CND_PRECONDITION(name != NULL, "name is NULL"); DocumentFieldEnumeration::DocumentFieldList* previous = NULL; DocumentFieldEnumeration::DocumentFieldList* current = fieldList; while (current != NULL) { if ( _tcscmp(current->field->name(),name) == 0 ){ if (previous){ previous->next = current->next; //删除当前指针,修改指针指向 }else fieldList = current->next; current->next=NULL; _CLDELETE(current); if ( previous ) current = previous->next; //重新设置当前指针 else current = fieldList; }else{ previous = current; current = current->next; } } } |
(2). void IndexWriter::addDocument()方法
// Add one document to the index inside a RAM-directory transaction.
// Each added document becomes its own single-document segment; segments are
// then merged opportunistically. On any failure the transaction is aborted
// and the exception rethrown; on success it is committed.
void IndexWriter::addDocument(Document* doc, Analyzer* analyzer) {
    CND_PRECONDITION(ramDirectory != NULL, "ramDirectory is NULL");
    if (analyzer == NULL) {
        analyzer = this->analyzer;  // fall back to the writer's default analyzer
    }
    ramDirectory->transStart();
    try {
        // Every document added gets a fresh segment name.
        char* segmentName = newSegmentName();
        CND_CONDITION(segmentName != NULL, "segmentName is NULL");
        try {
            // ramDirectory is a transactional in-memory directory.
            DocumentWriter* dw = _CLNEW DocumentWriter(ramDirectory, analyzer, this);
            CND_CONDITION(dw != NULL, "dw is NULL");
            try {
                dw->addDocument(segmentName, doc);
            } _CLFINALLY(
                _CLDELETE(dw);
            );
            // Record the new single-document segment.
            SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);
            CND_CONDITION(si != NULL, "Si is NULL");
            {
                SCOPED_LOCK_MUTEX(THIS_LOCK)
                segmentInfos->add(si);
                // Merge small segments when the merge policy triggers.
                maybeMergeSegments();
            }
        } _CLFINALLY(
            _CLDELETE_CaARRAY(segmentName);
        );
    } catch (...) {
        ramDirectory->transAbort();
        throw;
    }
    ramDirectory->transCommit();
}
(3).调用到的void DocumentWriter::addDocument()方法
//将文档加入到新段里面 void DocumentWriter::addDocument(const char* segment, Document* doc) { CND_PRECONDITION(fieldInfos==NULL, "fieldInfos!=NULL") // write field names fieldInfos = _CLNEW FieldInfos(); fieldInfos->add(doc); //.fnm 写入字段名称的文件 const char* buf = Misc::segmentname(segment, ".fnm"); fieldInfos->write(directory, buf); _CLDELETE_CaARRAY(buf); // write field values FieldsWriter fieldsWriter(directory, segment, fieldInfos); try { fieldsWriter.addDocument(doc); } _CLFINALLY(fieldsWriter.close()); //invert doc into postingTable clearPostingTable(); // clear postingTable //文档中字段的个数 size_t size = fieldInfos->size(); fieldLengths = _CL_NEWARRAY(int32_t,size); // init fieldLengths fieldPositions = _CL_NEWARRAY(int32_t,size); // init fieldPositions fieldOffsets = _CL_NEWARRAY(int32_t,size); // init fieldOffsets memset(fieldPositions, 0, sizeof(int32_t) * size); //initialise fieldBoost array with default boost int32_t fbl = fieldInfos->size(); float_t fbd = doc->getBoost(); //初始是.0f; fieldBoosts = _CL_NEWARRAY(float_t,fbl); // init fieldBoosts { for ( int32_t i=0;i<fbl;i++ ) fieldBoosts[i] = fbd; } { for ( int32_t i=0;i<fieldInfos->size();i++ ) fieldLengths[i] = 0; } //进行倒排处理 invertDocument(doc); // sort postingTable into an array Posting** postings = NULL; int32_t postingsLength = 0; //对postingTable中的词条进行排序,返回一个排序的Posting[]数组 sortPostingTable(postings,postingsLength); //write postings //将经过排序的Posting[]数组写入到索引段文件中(segmentsv.frq文件和segments.prx文件) writePostings(postings,postingsLength, segment); //write norms of indexed fields //写入被索引的Field的norm信息 writeNorms(segment); _CLDELETE_ARRAY( postings ); } |
(4). void FieldInfos::write()方法
// Serialize the field infos: first the field count, then for each field its
// name and a single flags byte describing how it is indexed/stored.
void FieldInfos::write(IndexOutput* output) const {
    // Field count first.
    output->writeVInt(size());
    FieldInfo* fi;
    uint8_t bits;
    for (int32_t i = 0; i < size(); ++i) {
        fi = fieldInfo(i);
        // Pack the per-field options into one byte; each |= sets one flag
        // bit (a bit ends up 1 if either operand has it set, 0 otherwise).
        bits = 0x0;
        if (fi->isIndexed) bits |= IS_INDEXED;
        if (fi->storeTermVector) bits |= STORE_TERMVECTOR;
        if (fi->storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
        if (fi->storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
        if (fi->omitNorms) bits |= OMIT_NORMS;
        output->writeString(fi->name, _tcslen(fi->name));  // field name (with length)
        output->writeByte(bits);                           // flags byte
    }
}
(5). void FieldsWriter::addDocument()方法
//写入字段值 void FieldsWriter::addDocument(Document* doc) { CND_PRECONDITION(indexStream != NULL,"indexStream is NULL"); CND_PRECONDITION(fieldsStream != NULL,"fieldsStream is NULL"); printf("%s=%d","fieldsStream->getFilePointer()",fieldsStream->getFilePointer()); //索引流写入字段流的位置指针 indexStream->writeLong(fieldsStream->getFilePointer()); int32_t storedCount = 0; DocumentFieldEnumeration* fields = doc->fields(); while (fields->hasMoreElements()) { Field* field = fields->nextElement(); if (field->isStored()) { storedCount++; } } _CLDELETE(fields); //字段流写入存储索引的字段个数 fieldsStream->writeVInt(storedCount); fields = doc->fields(); while (fields->hasMoreElements()) { Field* field = fields->nextElement(); if (field->isStored()) { //写入字段序号 fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name())); uint8_t bits = 0; if (field->isTokenized()) bits |= FieldsWriter::FIELD_IS_TOKENIZED; if (field->isBinary()) bits |= FieldsWriter::FIELD_IS_BINARY; if (field->isCompressed()) bits |= FieldsWriter::FIELD_IS_COMPRESSED; //写入一个字节:是否分词,是否是字节,是否压缩 fieldsStream->writeByte(bits); if ( field->isCompressed() ){ _CLTHROWA(CL_ERR_Runtime, "CLucene does not directly support compressed fields. 
Write a compressed byte array instead"); }else{ if (field->isBinary()) { jstreams::StreamBase<char>* stream = field->streamValue(); const char* sd; //去读取 int32_t rl = stream->read(sd,10000000,0); if ( rl < 0 ){ fieldsStream->writeVInt(0); }else{ fieldsStream->writeVInt(rl); fieldsStream->writeBytes((uint8_t*)sd, rl); } }else if ( field->stringValue() == NULL ){ CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too") Reader* r = field->readerValue(); const TCHAR* rv; int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE); if ( rl > LUCENE_INT32_MAX_SHOULDBE ) _CLTHROWA(CL_ERR_Runtime,"Field length too long"); else if ( rl < 0 ) rl = 0; fieldsStream->writeString( rv, (int32_t)rl); }else if ( field->stringValue() != NULL ){ //写入读取的字符串 fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue())); }else _CLTHROWA(CL_ERR_Runtime, "No values are set for the field"); } } } _CLDELETE(fields); } |
(6). void DocumentWriter::invertDocument方法
//进行倒排处理 void DocumentWriter::invertDocument(const Document* doc) { DocumentFieldEnumeration* fields = doc->fields(); try { while (fields->hasMoreElements()) { Field* field = (Field*)fields->nextElement(); const TCHAR* fieldName = field->name(); const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName); //初始时都是 int32_t length = fieldLengths[fieldNumber]; // length of field // 根据每个Field的编号,设置每个Field的长度 int32_t position = fieldPositions[fieldNumber]; // position in field // 根据每个Field的编号,设置每个Field的位置 if (length>0) { position+=analyzer->getPositionIncrementGap(fieldName); } int32_t offset = fieldOffsets[fieldNumber]; // offset field // 根据每个Field的编号,设置每个Field的offset if (field->isIndexed()) { // 如果Field被索引 if (!field->isTokenized()) { // 如果Field没有进行分词 const TCHAR* charBuf = NULL; int64_t dataLen = 0; if (field->stringValue() == NULL && !field->isStored() ) { CL_NS(util)::Reader* r = field->readerValue(); dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE); if (dataLen == -1) dataLen = 0; } else { charBuf = field->stringValue(); dataLen = _tcslen(charBuf); } // 是否把整个Field的数据作为一个词条存储到postingTable中 if(field->isStoreOffsetWithTermVector()){ TermVectorOffsetInfo tio; tio.setStartOffset(offset); tio.setEndOffset(offset + dataLen); addPosition(fieldName, charBuf, position++, &tio ); } else { addPosition(fieldName, charBuf, position++, NULL); } offset += dataLen; //偏移量在加上数据长度 length++; } else { // field must be tokenized // 需要对Field进行分词 CL_NS(util)::Reader* reader; // find or make Reader bool delReader = false; if (field->readerValue() != NULL) { // 如果从Field获取的Reader数据不为null reader = field->readerValue(); } else if (field->stringValue() != NULL) { // 根据从Field获取的字符串数据构造一个Reader输入流 reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false); delReader = true; } else { _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value"); }
try { // Tokenize field and add to postingTable. // 把经过分词处理的Field加入到postingTable中 CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader); try { CL_NS(analysis)::Token t; int32_t lastTokenEndOffset = -1; //上一个分词的终止位置 while (stream->next(&t)) { position += (t.getPositionIncrement() - 1); //每次切出一个词,就将position加上这个词的长度 // 如果指定了Field的词条向量的偏移量,则存储该此条向量 if(field->isStoreOffsetWithTermVector()){ TermVectorOffsetInfo tio; tio.setStartOffset(offset + t.startOffset()); tio.setEndOffset(offset + t.endOffset()); addPosition(fieldName, t.termText(), position++, &tio); } else { addPosition(fieldName, t.termText(), position++, NULL); } lastTokenEndOffset = t.endOffset(); length++; // Apply field truncation policy. // length:切出的字段的长度 if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) { if ( length > maxFieldLength) { // 如果当前切出的词条数已经达到了该Field的最大长度 break; } } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) { const TCHAR* errMsgBase = _T("Indexing a huge number of tokens from a single") _T(" field (/"%s/", in this case) can cause CLucene") _T(" to use memory excessively.") _T(" By default, CLucene will accept only %s tokens") _T(" tokens from a single field before forcing the") _T(" client programmer to specify a threshold at") _T(" which to truncate the token stream.") _T(" You should set this threshold via") _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX") _T(" to disable truncation, or a value to specify maximum number of fields)."); TCHAR defaultMaxAsChar[34]; _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,defaultMaxAsChar, 10); int32_t errMsgLen = _tcslen(errMsgBase)+ _tcslen(fieldName)+ _tcslen(defaultMaxAsChar); TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1); _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar); _CLTHROWT_DEL(CL_ERR_Runtime,errMsg); } } // while token->next if(lastTokenEndOffset != -1 ) { offset += lastTokenEndOffset + 1; } } _CLFINALLY ( stream->close(); _CLDELETE(stream); ); } _CLFINALLY ( 
if (delReader) { _CLDELETE(reader); } ); } // if/else field is to be tokenized // 位置信息,偏移量信息,长度信息 fieldLengths[fieldNumber] = length; // save field length fieldPositions[fieldNumber] = position; // save field position fieldBoosts[fieldNumber] *= field->getBoost(); fieldOffsets[fieldNumber] = offset; //实际上是这个字段的终止偏移位置 } // if field is to beindexed } // while more fields available } _CLFINALLY ( _CLDELETE(fields); ); } |
(7). void DocumentWriter::addPosition()方法
void DocumentWriter::addPosition(const TCHAR* field,const TCHAR* text,const int32_t position,TermVectorOffsetInfo* offset) { //设置词条 //typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare, Term::Equals> PostingTableType; termBuffer->set(field,text,false); Posting* ti = postingTable.get(termBuffer); if (ti != NULL) { int32_t freq = ti->freq; if (ti->positions.length == freq) { // positions array is full, realloc its size // 扩充数组:初始添加时频率为positions.values[0] = position; positions.length = 1; ti->positions.length = freq*2; ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t)); } ti->positions.values[freq] = position; // add new position if (offset != NULL) { if (ti->offsets.length == freq) { //存储偏移量信息时跟存储位置采用相同的方法 ti->offsets.length = freq*2; ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo)); } ti->offsets[freq] = *offset; } ti->freq = freq + 1; // 更新词条频率 } else { // word not seen before Term* term = _CLNEW Term( field, text, false); postingTable.put(term, _CLNEW Posting(term, position, offset)); } } |
(8). DocumentWriter::Posting::Posting()构造函数
/* Posting constructor: initialise the entry for a term's first occurrence. */
DocumentWriter::Posting::Posting(Term* t, const int32_t position,
        TermVectorOffsetInfo* offset) {
    freq = 1;               // first occurrence -> frequency starts at 1
    term = _CL_POINTER(t);  // take a reference-counted pointer to the term
    positions.values = (int32_t*)malloc(sizeof(int32_t));  // positions array, capacity 1
    positions.values[0] = position;
    positions.length = 1;   // capacity/length both start at 1
    if (offset != NULL) {
        // Offsets are only tracked when the caller supplies offset info.
        this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
        this->offsets.values[0] = *offset;  // store the single offset entry
        this->offsets.length = 1;           // capacity/length both start at 1
    }
}
(9). DocumentWriter::writePostings()方法
// Write the sorted postings of one segment to disk: .frq (doc frequencies),
// .prx (positions), the term dictionary/index via TermInfosWriter, and --
// for fields that store term vectors -- the term-vector files via
// TermVectorsWriter. All writers are closed in _CLFINALLY, recording the
// first close error and rethrowing it after cleanup.
void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment) {
// Close one writer, remembering the first error instead of throwing from cleanup.
#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }
    IndexOutput* freq = NULL;
    IndexOutput* prox = NULL;
    TermInfosWriter* tis = NULL;
    TermVectorsWriter* termVectorWriter = NULL;
    try {
        // Open files for inverse index storage.
        // .frq: frequency data
        const char* buf = Misc::segmentname(segment, ".frq");
        freq = directory->createOutput(buf);
        _CLDELETE_CaARRAY(buf);
        // .prx: position data
        buf = Misc::segmentname(segment, ".prx");
        prox = directory->createOutput(buf);
        _CLDELETE_CaARRAY(buf);
        // termIndexInterval: spacing between entries mirrored into the term index.
        tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
        TermInfo* ti = _CLNEW TermInfo();
        const TCHAR* currentField = NULL;
        for (int32_t i = 0; i < postingsLength; i++) {
            Posting* posting = postings[i];
            // Add an entry to the term dictionary (and periodically the term index).
            ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
            tis->add(posting->term, ti);
            // Doc-frequency entry; a frequency of 1 is folded into the low bit.
            int32_t postingFreq = posting->freq;
            if (postingFreq == 1)  // optimize freq=1
                freq->writeVInt(1);  // set low bit of doc num.
            else {
                freq->writeVInt(0);  // the document number
                freq->writeVInt(postingFreq);  // frequency in doc
            }
            int32_t lastPosition = 0;  // write positions
            // Positions are delta-encoded against the previous position.
            for (int32_t j = 0; j < postingFreq; ++j) {  // use delta-encoding
                prox->writeVInt(posting->positions.values[j] - lastPosition);
                lastPosition = posting->positions.values[j];
            }
            // Check to see if we switched to a new field (postings are sorted
            // by field, then term).
            const TCHAR* termField = posting->term->field();
            if (currentField == NULL || _tcscmp(currentField, termField) != 0) {
                // todo, can we do an intern'd check?
                // Changing field - see if there is something to save.
                currentField = termField;
                FieldInfo* fi = fieldInfos->fieldInfo(currentField);
                // Postings of fields stored with term vectors also go through
                // TermVectorsWriter.
                if (fi->storeTermVector) {
                    if (termVectorWriter == NULL) {
                        termVectorWriter = _CLNEW TermVectorsWriter(directory, segment, fieldInfos);
                        termVectorWriter->openDocument();
                    }
                    termVectorWriter->openField(currentField);
                } else if (termVectorWriter != NULL) {
                    termVectorWriter->closeField();
                }
            }
            if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
                termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);
            }
        }
        if (termVectorWriter != NULL) {
            termVectorWriter->closeDocument();
        }
        _CLDELETE(ti);
    } _CLFINALLY(
        const char* err = NULL;
        int32_t ierr = 0;
        __DOCLOSE(freq);
        __DOCLOSE(prox);
        __DOCLOSE(tis);
        __DOCLOSE(termVectorWriter);
        if (err != NULL)
            _CLTHROWA(ierr, err);
    );
}
(10). TermInfosWriter::TermInfosWriter()构造函数
// Build the writer for the term dictionary (.tis). A paired internal writer
// (`other`) is constructed with isIndex=true to produce the term index
// (.tii); the two writers point at each other.
TermInfosWriter::TermInfosWriter(Directory* directory, const char* segment,
        FieldInfos* fis, int32_t interval):
    fieldInfos(fis)
{
    CND_PRECONDITION(segment != NULL, "segment is NULL");
    initialise(directory, segment, interval, false);
    // `other` fills in the .tii (term index) file.
    other = _CLNEW TermInfosWriter(directory, segment, fieldInfos, interval, true);
    CND_CONDITION(other != NULL, "other is NULL");
    other->other = this;
}
(11). void TermInfosWriter::initialise()方法
// Shared set-up for both the term-dictionary (.tis) and term-index (.tii)
// writers. The dictionary file holds the term entries; every indexInterval-th
// entry is mirrored into the index file so lookups can seek quickly.
// Allocates the "previous entry" state, opens the output file matching this
// writer's role, and writes the file header.
void TermInfosWriter::initialise(Directory* directory, const char* segment,
        int32_t interval, bool IsIndex) {
    lastTerm = _CLNEW Term;
    CND_CONDITION(lastTerm != NULL, "Could not allocate memory for lastTerm");
    lastTi = _CLNEW TermInfo();
    CND_CONDITION(lastTi != NULL, "Could not allocate memory for lastTi");

    isIndex = IsIndex;
    indexInterval = interval;
    skipInterval = LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL;
    lastIndexPointer = 0;
    size = 0;

    // The index writer (isIndex=true) produces .tii; the dictionary writer
    // itself (isIndex=false) produces .tis.
    const char* filename = Misc::segmentname(segment, (isIndex ? ".tii" : ".tis"));
    output = directory->createOutput(filename);
    _CLDELETE_CaARRAY(filename);

    output->writeInt(FORMAT);        // file format tag
    output->writeLong(0);            // placeholder for the entry count
    output->writeInt(indexInterval); // dictionary entries per index entry
    output->writeInt(skipInterval);  // term-docs skip interval

    // The paired writer is wired up by the caller; default to none.
    other = NULL;
}
(12). void TermInfosWriter::add()方法
// Append one (term, term-info) entry. The dictionary writer mirrors every
// indexInterval-th previous entry into the paired index writer; pointer
// fields are written as deltas against the preceding entry (all zero at the
// start).
void TermInfosWriter::add(Term* term, const TermInfo* ti) {
    // `isIndex || (!isIndex && X)` is logically just `isIndex || X`.
    CND_PRECONDITION(isIndex || term->compareTo(lastTerm) > 0, "term out of order");
    CND_PRECONDITION(ti->freqPointer >= lastTi->freqPointer, "freqPointer out of order");
    CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer, "proxPointer out of order");

    // Dictionary writer only: when the group interval is reached, push the
    // previous entry into the index (.tii) writer.
    if (!isIndex && size % indexInterval == 0)
        other->add(lastTerm, lastTi);

    // Term text (prefix-compressed) plus field number.
    writeTerm(term);
    // Document frequency.
    output->writeVInt(ti->docFreq);
    // Frequency/position file pointers, delta-encoded.
    output->writeVLong(ti->freqPointer - lastTi->freqPointer);
    output->writeVLong(ti->proxPointer - lastTi->proxPointer);

    if (ti->docFreq >= skipInterval)
        output->writeVInt(ti->skipOffset);

    // Index writer only: record how far the dictionary writer has advanced.
    if (isIndex) {
        output->writeVLong(other->output->getFilePointer() - lastIndexPointer);
        lastIndexPointer = other->output->getFilePointer();  // write pointer
    }

    // Remember this entry for the next delta computation.
    lastTi->set(ti);
    size++;
}
(13). void TermInfosWriter::writeTerm()方法
// Write one term using prefix compression: terms arrive in sorted order, so
// only the suffix that differs from the previously written term is stored,
// followed by the term's field number.
void TermInfosWriter::writeTerm(Term* term) {
    int32_t start = Misc::stringDifference(lastTerm->text(), lastTerm->textLength(),
        term->text(), term->textLength());
    int32_t length = term->textLength() - start;
    output->writeVInt(start);   // length of the prefix shared with the previous term
    output->writeVInt(length);  // length of the differing suffix
    output->writeChars(term->text(), start, length);  // the differing characters
    int32_t fieldnum = fieldInfos->fieldNumber(term->field());  // field number of the term
    CND_PRECONDITION(fieldnum >= -1 && fieldnum < fieldInfos->size(), "Fieldnum is out of range");
    output->writeVInt(fieldnum);  // write field num
    // Remember this term as the new "previous" term: mutate the object in
    // place when we hold the only reference, otherwise swap references.
    if (lastTerm->__cl_refcount == 1) {
        lastTerm->set(term, term->text());
    } else {
        _CLDECDELETE(lastTerm);
        lastTerm = _CL_POINTER(term);
    }
}
(14). TermVectorsWriter::TermVectorsWriter()构造函数
// Open the three term-vector output files for `segment` and stamp each with
// the format version:
//   .tvx -- per-document pointers into .tvd
//   .tvd -- per-document tables whose pointers lead into .tvf
//   .tvf -- per-field term text, frequencies, positions and offsets
TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory,
        const char* segment, FieldInfos* fieldInfos) {
    // Compose "<segment><extension>" in one buffer, rewriting only the
    // extension portion for each file.
    char name[CL_MAX_NAME];
    strcpy(name, segment);
    char* ext = name + strlen(name);

    strcpy(ext, LUCENE_TVX_EXTENSION);
    tvx = directory->createOutput(name);
    tvx->writeInt(FORMAT_VERSION);

    strcpy(ext, LUCENE_TVD_EXTENSION);
    tvd = directory->createOutput(name);
    tvd->writeInt(FORMAT_VERSION);

    strcpy(ext, LUCENE_TVF_EXTENSION);
    tvf = directory->createOutput(name);
    tvf->writeInt(FORMAT_VERSION);

    this->fieldInfos = fieldInfos;
    currentField = NULL;     // sentinel: no field currently open
    currentDocPointer = -1;  // sentinel: no document currently open
}
(15). void TermVectorsWriter::writeField()方法
// Flush the current field's term vector to .tvf: the term count, a flags
// byte, then each term (prefix-compressed) with its frequency and, when
// enabled, delta-encoded positions and offsets.
void TermVectorsWriter::writeField() {
    // .tvx: pointers to per-document data in .tvd
    // .tvd: per-document tables whose pointers lead into .tvf
    // .tvf: term text, frequency, position and offset data
    currentField->tvfPointer = tvf->getFilePointer();
    //System.out.println("Field Pointer: " + currentField.tvfPointer);
    // Number of terms in this field.
    int32_t size = terms.size();
    tvf->writeVInt(size);
    // Whether positions / offsets are stored with the term vector.
    bool storePositions = currentField->storePositions;
    bool storeOffsets = currentField->storeOffsets;
    uint8_t bits = 0x0;
    if (storePositions)
        bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    if (storeOffsets)
        bits |= STORE_OFFSET_WITH_TERMVECTOR;
    tvf->writeByte(bits);
    const TCHAR* lastTermText = LUCENE_BLANK_STRING;  // the empty string ""
    int32_t lastTermTextLen = 0;
    for (int32_t i = 0; i < size; ++i) {
        TVTerm* term = terms[i];
        // Prefix-compress the term text against the previously written term.
        int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen,
            term->getTermText(), term->getTermTextLen());
        int32_t length = term->getTermTextLen() - start;
        tvf->writeVInt(start);   // length of the shared prefix
        tvf->writeVInt(length);  // length of the differing suffix
        tvf->writeChars(term->getTermText(), start, length);  // the differing characters
        tvf->writeVInt(term->freq);  // term frequency
        lastTermText = term->getTermText();
        lastTermTextLen = term->getTermTextLen();
        // Positions count tokens between occurrences; offsets count the
        // characters between them.
        if (storePositions) {  // positions stored with the term vector
            if (term->positions == NULL) {
                _CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!");
            }
            // use delta encoding for positions
            int32_t position = 0;
            for (int32_t j = 0; j < term->freq; ++j) {
                tvf->writeVInt((*term->positions)[j] - position);  // store only the delta
                position = (*term->positions)[j];
            }
        }
        if (storeOffsets) {  // offsets stored with the term vector
            if (term->offsets == NULL) {
                _CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!");
            }
            int32_t position = 0;
            for (int32_t j = 0; j < term->freq; ++j) {
                // Start offset as a delta from the previous end offset, then
                // the length (end - start): save the diff between the two.
                tvf->writeVInt((*term->offsets)[j].getStartOffset() - position);
                tvf->writeVInt((*term->offsets)[j].getEndOffset() - (*term->offsets)[j].getStartOffset());
                position = (*term->offsets)[j].getEndOffset();
            }
        }
    }
}