The meaning of several Lucene classes (2)

What pattern do the file names in an index directory follow?

[quote]
_9.cfs
_9.cfx
segments_k
segments.gen
[/quote]

private final synchronized String newSegmentName() {
  return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}


newSegmentName() takes the current value of segmentInfos.counter, renders it in base 36 (Character.MAX_RADIX is 36), prefixes an underscore, and only then increments the counter (post-increment). So segmentInfos.counter records how many segment names have been handed out so far, i.e. how many segments have ever been created in this index; the _9 and segments_k names quoted above are simply base-36 numbers.
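
As a quick illustration (this snippet is not from the original post; the counter values are only assumed examples), the base-36 rendering explains the names quoted above:

// Hypothetical demo of the base-36 naming scheme (not part of the Lucene source).
public class SegmentNameDemo {
  public static void main(String[] args) {
    // Character.MAX_RADIX is 36, so segment names run _0.._9, _a.._z, _10, _11, ...
    System.out.println("_" + Integer.toString(9, Character.MAX_RADIX));   // "_9"  -> _9.cfs, _9.cfx
    System.out.println("_" + Integer.toString(35, Character.MAX_RADIX));  // "_z"
    System.out.println("_" + Integer.toString(36, Character.MAX_RADIX));  // "_10"
    // The generation suffix of segments_k is written with the same base-36 encoding,
    // but it comes from a separate counter (the segments-file generation).
    System.out.println("segments_" + Long.toString(20, Character.MAX_RADIX)); // "segments_k"
  }
}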

Document inversion. This is the most memory-hungry phase of building the index: besides the terms themselves, term positions, frequencies and other per-term information have to be stored as well.
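
To make the "invert" step concrete, here is a toy sketch (plain Java, illustration only, not the Lucene implementation) of the per-term data the inverter has to buffer: for every term, its frequency and the positions at which it occurs.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy inversion of a single field: term -> list of positions.
public class TinyInvertDemo {
  public static void main(String[] args) {
    String[] tokens = "to be or not to be".split(" ");
    Map<String, List<Integer>> postings = new LinkedHashMap<String, List<Integer>>();
    for (int pos = 0; pos < tokens.length; pos++) {
      List<Integer> positions = postings.get(tokens[pos]);
      if (positions == null) {
        positions = new ArrayList<Integer>();
        postings.put(tokens[pos], positions);
      }
      positions.add(pos);
    }
    for (Map.Entry<String, List<Integer>> e : postings.entrySet()) {
      // term -> frequency and positions; offsets would be buffered here too
      System.out.println(e.getKey() + "  freq=" + e.getValue().size() + "  positions=" + e.getValue());
    }
  }
}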


package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document. This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField). It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */

final class DocInverterPerField extends DocFieldConsumerPerField {

  final private DocInverterPerThread perThread;
  final private FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
    this.perThread = perThread;
    this.fieldInfo = fieldInfo;
    docState = perThread.docState;
    fieldState = perThread.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
  }

  void abort() {
    consumer.abort();
    endConsumer.abort();
  }

  public void processFields(final Fieldable[] fields,
                            final int count) throws IOException {

    fieldState.reset(docState.doc.getBoost());

    final int maxFieldLength = docState.maxFieldLength;

    final boolean doInvert = consumer.start(fields, count);

    for(int i=0;i<count;i++) {

      final Fieldable field = fields[i];

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (field.isIndexed() && doInvert) {

        if (fieldState.length > 0)
          fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);

        if (!field.isTokenized()) {       // un-tokenized field
          String stringValue = field.stringValue();
          final int valueLength = stringValue.length();
          perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
          fieldState.attributeSource = perThread.singleTokenTokenStream;
          perThread.localTokenStream.reset();
          consumer.start(field);

          boolean success = false;
          try {
            consumer.add();
            success = true;
          } finally {
            if (!success)
              docState.docWriter.setAborting();
          }
          fieldState.offset += valueLength;
          fieldState.length++;
          fieldState.position++;
        } else {                          // tokenized field
          final TokenStream stream;
          final TokenStream streamValue = field.tokenStreamValue();

          if (streamValue != null)
            stream = streamValue;
          else {
            // the field does not have a TokenStream,
            // so we have to obtain one from the analyzer
            final Reader reader;          // find or make Reader
            final Reader readerValue = field.readerValue();

            if (readerValue != null)
              reader = readerValue;
            else {
              String stringValue = field.stringValue();
              if (stringValue == null)
                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
              perThread.stringReader.init(stringValue);
              reader = perThread.stringReader;
            }

            // Tokenize field and add to postingTable
            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
          }

          // reset the TokenStream to the first token
          stream.reset();

          try {
            int offsetEnd = fieldState.offset-1;

            boolean useNewTokenStreamAPI = stream.useNewAPI();
            Token localToken = null;

            if (useNewTokenStreamAPI) {
              fieldState.attributeSource = stream;
            } else {
              fieldState.attributeSource = perThread.localTokenStream;
              localToken = perThread.localToken;
            }

            consumer.start(field);

            OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

            for(;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID
              Token token = null;

              /*
               * token.termText    - the term text produced by analysis
               * token.startOffset - start offset of the term in the original text
               * token.endOffset   - end offset of the term in the original text
               */

              if (useNewTokenStreamAPI) {
                if (!stream.incrementToken()) break;
              } else {
                token = stream.next(localToken);
                if (token == null) break;
                perThread.localTokenStream.set(token);
              }

              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr - 1;
              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
              offsetEnd = fieldState.offset + offsetAttribute.endOffset();
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }
            }
            fieldState.offset = offsetEnd+1;
          } finally {
            stream.close();
          }
        }

        fieldState.boost *= field.getBoost();
      }
    }

    consumer.finish();
    endConsumer.finish();
  }
}
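
For reference, the token-consuming loop above can be reproduced outside of DocumentsWriter. The sketch below is my own assumption of a typical caller using the 2.9-era attribute API (the field name "body", the sample text and StandardAnalyzer are placeholders, and the exact API of the snapshot shown above may differ); it shows how the position accumulates from getPositionIncrement(), the same way fieldState.position does in processFields.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenWalkDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();              // placeholder analyzer
    TokenStream stream = analyzer.reusableTokenStream("body", new StringReader("hello token stream"));

    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    int position = -1;                                       // like fieldState.position before the first token
    while (stream.incrementToken()) {
      // processFields does "position += posIncr - 1" then "position++"; the net effect is the same
      position += posIncrAtt.getPositionIncrement();
      System.out.println(position + "  " + termAtt.term()
          + "  [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    stream.close();
  }
}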

