lucene-2.9.0 索引过程(二) FreqProxTermsWriter

FreqProxTermsWriter

一、类功能

将内存中的索引写入相应的索引文件(tis/tii/frq/prx)

二、成员函数

2.1 createPostings函数

TermsHashPerThread::morePostings() 会调用

FreqProxTermsWriter::createPostings 申请更多的 Postings 内存;该函数只负责分配内存,并不做任何缓冲

 

// 简单代码

void createPostings(RawPostingList[] postings, int start, int count)

TermsHashPerField::add()

{

         // Refill?

      if (0 == perThread.freePostingsCount)

        perThread.morePostings(); // TermsHashPerThread类型

}

 

2.2 flush函数

  public void flush(Map threadsAndFields, final SegmentWriteState state)

{

    final int numAllFields = allFields.size(); // 所有字段

 

    // TODO: allow Lucene user to customize this consumer:

final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);

 

    int start = 0;

    while(start < numAllFields) {

      final FieldInfo fieldInfo = ((FreqProxTermsWriterPerField) allFields.get(start)).fieldInfo;

 

      final String fieldName = fieldInfo.name;

 

      int end = start+1;

      while(end < numAllFields && ((FreqProxTermsWriterPerField) allFields.get(end)).fieldInfo.name.equals(fieldName))// 相同字段

        end++;

     

      FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];

      for(int i=start;i<end;i++) {

        fields[i-start] = (FreqProxTermsWriterPerField) allFields.get(i);

 

        // Aggregate the storePayload as seen by the same

        // field across multiple threads

        fieldInfo.storePayloads |= fields[i-start].hasPayloads;

      }

 

      // If this field has postings then add them to the

      // segment

      // 合并索引并写入磁盘 frq/prx/tis

      appendPostings(fields, consumer); //

 

 

      start = end;

}

 

// FormatPostingsFieldsWriter::finish()写入tii文件

    consumer.finish();

  }

 

// 索引合并且写入磁盘

 

void appendPostings(FreqProxTermsWriterPerField[] fields,

                      FormatPostingsFieldsConsumer consumer)

    throws CorruptIndexException, IOException {

 

    int numFields = fields.length;

 

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

 

    for(int i=0;i<numFields;i++) {

      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

 

      assert fms.field.fieldInfo == fields[0].fieldInfo;

 

    

// 将该词项索引从TermsHashPerField读取到FreqProxFieldMergeState

      // final ByteSliceReader freq= new ByteSliceReader(); 词频

  // final ByteSliceReader prox = new ByteSliceReader(); 位置

  // 中调用TermsHashPerField::initReader(ByteSliceReader reader,

// RawPostingList p, int stream)

      boolean result = fms.nextTerm();

      assert result;

    }

 

    final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);

 

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

 

    final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;

 

    while(numFields > 0) { // 遍历所有字段

 

      // Get the next term to merge

      termStates[0] = mergeStates[0]; // 按词项字符串已序的字段

      int numToMerge = 1;

 

     // 取词项(字符串)最小值,按字符串大小合并postings

      for(int i=1;i<numFields;i++) {

        final char[] text = mergeStates[i].text;

        final int textOffset = mergeStates[i].textOffset;

        final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

 

        if (cmp < 0) {

          termStates[0] = mergeStates[i];

          numToMerge = 1;

        } else if (cmp == 0)

          termStates[numToMerge++] = mergeStates[i];

      }

 

      final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);

 

      // Now termStates has numToMerge FieldMergeStates

      // which all share the same term.  Now we must

      // interleave the docID streams.

      while(numToMerge > 0) { // 同字段同名不同segemnts?

       

        FreqProxFieldMergeState minState = termStates[0];

                 

       // 如果有多个segment则选文档号最小-按文档号排序

        for(int i=1;i<numToMerge;i++)

          if (termStates[i].docID < minState.docID)

            minState = termStates[i];

 

        final int termDocFreq = minState.termFreq; // tf

 

        final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);

 

        // 已经从TermHashPerField中取得

        final ByteSliceReader prox = minState.prox; 

 

        // 如果需要存储词频和位置

        if (!currentFieldOmitTermFreqAndPositions) {

          // omitTermFreqAndPositions == false so we do write positions &

          // payload         

          int position = 0;

          for(int j=0;j<termDocFreq;j++) { // 遍历所有的位置

            final int code = prox.readVInt();

            position += code >> 1;

 

            final int payloadLength;

            if ((code & 1) != 0) {

              // 如果有payload数据

              payloadLength = prox.readVInt();

 

if (payloadBuffer == null || payloadBuffer.length < payloadLength)

                payloadBuffer = new byte[payloadLength];

 

              prox.readBytes(payloadBuffer, 0, payloadLength);

 

            } else

              payloadLength = 0;

 

            posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);

          } //End for

 

          posConsumer.finish();

        }

 

        if (!minState.nextDoc()) {//该词项的postings未完

 

          // Remove from termStates

          int upto = 0;

          for(int i=0;i<numToMerge;i++)

            if (termStates[i] != minState)

              termStates[upto++] = termStates[i];

          numToMerge--;

         

 

          // Advance this state to the next term

 

          if (!minState.nextTerm()) { //如果词项表未扫描完

            // OK, no more terms, so remove from mergeStates

            // as well

            upto = 0;

            for(int i=0;i<numFields;i++)

              if (mergeStates[i] != minState)

                mergeStates[upto++] = mergeStates[i];

            numFields--;

           

          }

        }

      }

 

// 写入IndexOutput缓存中,如果缓冲满则写入磁盘

// BufferedIndexOutput::BUFFER_SIZE = 16384

      docConsumer.finish(); // FormatPostingsDocsWriter:: finish()

    }

 

    termsConsumer.finish();

  }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值