Lucene buffers the index it is building in memory. Once memory usage reaches a certain threshold, or when the user calls IndexWriter.flush, the in-memory index structures are flushed to disk.
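From the caller's side the trigger looks roughly like this; a minimal sketch assuming an 8.x-era Lucene (where IndexWriter.flush() is public), with an illustrative index path and field name:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class FlushDemo {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
    cfg.setRAMBufferSizeMB(64);  // auto-flush once the in-memory structures reach ~64 MB
    try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/idx"));
         IndexWriter writer = new IndexWriter(dir, cfg)) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello lucene", Field.Store.NO));
      writer.addDocument(doc);
      writer.flush();  // or flush explicitly; either path ends up in DefaultIndexChain.flush
    }
  }
}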
DefaultIndexChain.flush collects every field that has an inverted state, then hands them to termsHash:

Map<String,TermsHashPerField> fieldsToFlush = new HashMap<>();
for (int i=0;i<fieldHash.length;i++) {
  PerField perField = fieldHash[i];
  while (perField != null) {
    if (perField.invertState != null) {
      fieldsToFlush.put(perField.fieldInfo.name, perField.termsHashPerField);
    }
    perField = perField.next;
  }
}

// readState is created earlier in flush() as a SegmentReadState over
// state.directory / state.segmentInfo / state.fieldInfos
try (NormsProducer norms = readState.fieldInfos.hasNorms()
    ? state.segmentInfo.getCodec().normsFormat().normsProducer(readState)
    : null) {
  NormsProducer normsMergeInstance = null;
  if (norms != null) {
    // Use the merge instance in order to reuse the same IndexInput for all terms
    normsMergeInstance = norms.getMergeInstance();
  }
  termsHash.flush(fieldsToFlush, state, sortMap, normsMergeInstance);
}
termsHash is responsible for flushing the per-field contents to disk. Take org.apache.lucene.index.FreqProxTermsWriter as an example:
public void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state,
                  Sorter.DocMap sortMap, NormsProducer norms) throws IOException {
  super.flush(fieldsToFlush, state, sortMap, norms);

  // Gather all fields that saw any postings:
  List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();

  for (TermsHashPerField f : fieldsToFlush.values()) {
    final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
    if (perField.bytesHash.size() > 0) {
      perField.sortPostings();
      assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
      allFields.add(perField);
    }
  }

  // Sort by field name
  CollectionUtil.introSort(allFields);

  Fields fields = new FreqProxFields(allFields);
  applyDeletes(state, fields);
  if (sortMap != null) {
    fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
  }

  FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
  boolean success = false;
  try {
    consumer.write(fields, norms);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(consumer);
    } else {
      IOUtils.closeWhileHandlingException(consumer);
    }
  }
}
In the end, the index information is looked up through the FreqProxFields created above (Fields fields = new FreqProxFields(allFields)). It is read as follows.
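Before looking at the enum itself, it helps to see the consumption pattern: consumer.write(fields, norms) above walks the Fields view through the standard Fields/Terms/TermsEnum/PostingsEnum contract, roughly like the sketch below (FieldsWalker is illustrative, not Lucene code):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

class FieldsWalker {
  static void consume(Fields fields) throws IOException {
    for (String field : fields) {
      Terms terms = fields.terms(field);
      TermsEnum termsEnum = terms.iterator();
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        // over FreqProxFields this returns the FreqProxPostingsEnum shown below
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        for (int doc = postings.nextDoc();
             doc != DocIdSetIterator.NO_MORE_DOCS;
             doc = postings.nextDoc()) {
          int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            int position = postings.nextPosition();
            // a real codec serializes doc, freq, position, payload, offsets here
          }
        }
      }
    }
  }
}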
private static class FreqProxPostingsEnum extends PostingsEnum {

  final FreqProxTermsWriterPerField terms;
  final FreqProxPostingsArray postingsArray;
  // reader for the docID/freq stream
  final ByteSliceReader reader = new ByteSliceReader();
  // reader for the position/payload stream
  final ByteSliceReader posReader = new ByteSliceReader();
  final boolean readOffsets;
  int docID = -1;
  int freq;
  int pos;
  int startOffset;
  int endOffset;
  int posLeft;
  int termID;
  boolean ended;
  boolean hasPayload;
  BytesRefBuilder payload = new BytesRefBuilder();

  public FreqProxPostingsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
    this.terms = terms;
    this.postingsArray = postingsArray;
    this.readOffsets = terms.hasOffsets;
    assert terms.hasProx;
    assert terms.hasFreq;
  }

  public void reset(int termID) {
    this.termID = termID;
    // init the docID/freq reader; 0 selects the first byte stream
    terms.initReader(reader, termID, 0);
    // init the position/payload reader; 1 selects the second byte stream
    terms.initReader(posReader, termID, 1);
    ended = false;
    docID = -1;
    posLeft = 0;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int freq() {
    return freq;
  }

  @Override
  public int nextDoc() throws IOException {
    if (docID == -1) {
      docID = 0;
    }
    // skip any positions left over from the previous doc
    while (posLeft != 0) {
      nextPosition();
    }

    if (reader.eof()) {
      if (ended) {
        return NO_MORE_DOCS;
      } else {
        // the entry for the last doc still sits in postingsArray, not in the slices
        ended = true;
        docID = postingsArray.lastDocIDs[termID];
        freq = postingsArray.termFreqs[termID];
      }
    } else {
      int code = reader.readVInt();
      docID += code >>> 1;  // high bits carry the doc delta
      if ((code & 1) != 0) {
        freq = 1;           // low bit set: freq == 1, no separate VInt
      } else {
        freq = reader.readVInt();
      }
      assert docID != postingsArray.lastDocIDs[termID];
    }

    posLeft = freq;
    pos = 0;
    startOffset = 0;
    return docID;
  }

  @Override
  public int advance(int target) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long cost() {
    throw new UnsupportedOperationException();
  }

  @Override
  public int nextPosition() throws IOException {
    assert posLeft > 0;
    // consume one position
    posLeft--;
    int code = posReader.readVInt();
    pos += code >>> 1;  // high bits carry the position delta
    if ((code & 1) != 0) {
      // low bit set: a payload follows
      hasPayload = true;
      payload.setLength(posReader.readVInt());
      payload.grow(payload.length());
      posReader.readBytes(payload.bytes(), 0, payload.length());
    } else {
      hasPayload = false;
    }

    if (readOffsets) {
      startOffset += posReader.readVInt();
      endOffset = startOffset + posReader.readVInt();
    }
    return pos;
  }

  @Override
  public int startOffset() {
    if (!readOffsets) {
      throw new IllegalStateException("offsets were not indexed");
    }
    return startOffset;
  }

  @Override
  public int endOffset() {
    if (!readOffsets) {
      throw new IllegalStateException("offsets were not indexed");
    }
    return endOffset;
  }

  @Override
  public BytesRef getPayload() {
    if (hasPayload) {
      return payload.get();
    } else {
      return null;
    }
  }
}
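nextDoc() and nextPosition() decode what the write side packed into the two byte streams: each entry is a delta shifted left one bit, with the low bit used as a flag (freq == 1 on the doc stream, "payload follows" on the position stream). A standalone sketch of that encoding (PostingCodec is a hypothetical helper; the real write side is FreqProxTermsWriterPerField):

import java.io.ByteArrayOutputStream;

class PostingCodec {
  // stream 0: docDelta shifted left one bit, low bit set when freq == 1
  static void writeDocFreq(ByteArrayOutputStream out, int docDelta, int freq) {
    if (freq == 1) {
      writeVInt(out, (docDelta << 1) | 1);  // freq folded into the flag bit
    } else {
      writeVInt(out, docDelta << 1);
      writeVInt(out, freq);
    }
  }

  // stream 1: position delta shifted left one bit, low bit set when a payload follows
  static void writePos(ByteArrayOutputStream out, int posDelta, byte[] payload) {
    if (payload != null && payload.length > 0) {
      writeVInt(out, (posDelta << 1) | 1);
      writeVInt(out, payload.length);
      out.write(payload, 0, payload.length);
    } else {
      writeVInt(out, posDelta << 1);
    }
  }

  // 7 payload bits per byte, high bit as continuation flag
  static void writeVInt(ByteArrayOutputStream out, int v) {
    while ((v & ~0x7F) != 0) {
      out.write((v & 0x7F) | 0x80);
      v >>>= 7;
    }
    out.write(v);
  }
}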
ByteSliceReader hides the details of reading across slices:
final class ByteSliceReader extends DataInput {
  ByteBlockPool pool;
  int bufferUpto;
  byte[] buffer;
  public int upto;
  int limit;
  int level;
  public int bufferOffset;

  public int endIndex;

  public void init(ByteBlockPool pool, int startIndex, int endIndex) {

    assert endIndex-startIndex >= 0;
    assert startIndex >= 0;
    assert endIndex >= 0;

    this.pool = pool;
    this.endIndex = endIndex;

    level = 0;
    bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
    buffer = pool.buffers[bufferUpto];
    upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK;

    final int firstSize = ByteBlockPool.LEVEL_SIZE_ARRAY[0];

    // computing limit:
    // if startIndex + firstSize < endIndex, more slices follow this one,
    // so stop 4 bytes early to leave room for the forwarding address
    if (startIndex+firstSize >= endIndex) {
      // There is only this one slice to read
      limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK;
    } else
      limit = upto+firstSize-4;
  }
  public boolean eof() {
    assert upto + bufferOffset <= endIndex;
    return upto + bufferOffset == endIndex;
  }

  @Override
  public byte readByte() {
    assert !eof();
    assert upto <= limit;
    // limit was set to upto+size-4: when we reach it, the next 4 bytes are
    // the forwarding address, so jump to the next slice
    if (upto == limit)
      nextSlice();
    return buffer[upto++];
  }
  public long writeTo(DataOutput out) throws IOException {
    long size = 0;
    while(true) {
      if (limit + bufferOffset == endIndex) {
        assert endIndex - bufferOffset >= upto;
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        break;
      } else {
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        nextSlice();
      }
    }
    return size;
  }

  public void nextSlice() {

    // Skip to our next slice
    final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff);

    level = ByteBlockPool.NEXT_LEVEL_ARRAY[level];
    final int newSize = ByteBlockPool.LEVEL_SIZE_ARRAY[level];

    bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;

    buffer = pool.buffers[bufferUpto];
    upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK;

    if (nextIndex + newSize >= endIndex) {
      // We are advancing to the final slice
      assert endIndex - nextIndex > 0;
      limit = endIndex - bufferOffset;
    } else {
      // This is not the final slice (subtract 4 for the
      // forwarding address at the end of this new slice)
      limit = upto+newSize-4;
    }
  }

  @Override
  public void readBytes(byte[] b, int offset, int len) {
    while(len > 0) {
      final int numLeft = limit-upto;
      if (numLeft < len) {
        // Read entire slice
        System.arraycopy(buffer, upto, b, offset, numLeft);
        offset += numLeft;
        len -= numLeft;
        nextSlice();
      } else {
        // This slice is the last one
        System.arraycopy(buffer, upto, b, offset, len);
        upto += len;
        break;
      }
    }
  }
}
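The 4-byte forwarding address that nextSlice() decodes is written by ByteBlockPool.allocSlice when a slice fills up; slice sizes grow through the pool's level arrays (LEVEL_SIZE_ARRAY = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}, with NEXT_LEVEL_ARRAY capping growth at the largest level, in the version discussed here). A minimal sketch of the big-endian address encode/decode pair (SliceAddress is a hypothetical helper mirroring the shifts in nextSlice above):

class SliceAddress {
  // write side: store the absolute pool address of the next slice, big-endian,
  // in the 4 bytes where the previous slice ends
  static void write(byte[] buffer, int offset, int nextIndex) {
    buffer[offset]     = (byte) (nextIndex >>> 24);
    buffer[offset + 1] = (byte) (nextIndex >>> 16);
    buffer[offset + 2] = (byte) (nextIndex >>> 8);
    buffer[offset + 3] = (byte)  nextIndex;
  }

  // read side: the same computation ByteSliceReader.nextSlice performs at buffer[limit]
  static int read(byte[] buffer, int offset) {
    return ((buffer[offset]     & 0xff) << 24)
         | ((buffer[offset + 1] & 0xff) << 16)
         | ((buffer[offset + 2] & 0xff) << 8)
         |  (buffer[offset + 3] & 0xff);
  }
}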
On the write side, TermsHashPerField.add() hashes the incoming term bytes into bytesHash; a non-negative termID means this is the first time the term is seen (a new posting), while a negative value encodes an already-known termID:
if (termID >= 0) {  // new posting
  bytesHash.byteStart(termID);
  // Init stream slices
  if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
    intPool.nextBuffer();
  }

  if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
    bytePool.nextBuffer();
  }

  intUptos = intPool.buffer;
  intUptoStart = intPool.intUpto;
  intPool.intUpto += streamCount;

  postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

  // allocate one first-level byte slice per stream and record its absolute
  // address in the int pool
  for(int i=0;i<streamCount;i++) {
    final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
    intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
  }
  postingsArray.byteStarts[termID] = intUptos[intUptoStart];

  newTerm(termID);
} else {
  // existing posting: recover the int-pool pointers saved for this term
  termID = (-termID)-1;
  // look up intStart from postingsArray
  int intStart = postingsArray.intStarts[termID];
  // locate the backing "array" in intPool
  intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
  // and the current offset within it
  intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
  addTerm(termID);
}
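Note how streamCount ties back to reset() in FreqProxPostingsEnum above: when positions are indexed, FreqProxTermsWriterPerField uses two streams (stream 0 for docID/freq, stream 1 for positions/payloads/offsets), so each term reserves streamCount consecutive ints in intPool, one write pointer per byte stream.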
Both newTerm and addTerm eventually append their encoded bytes through writeByte:

void writeByte(int stream, byte b) {
  // fetch this stream's current write address (an absolute offset into bytePool)
  int upto = intUptos[intUptoStart+stream];
  // resolve the address to the backing byte buffer
  byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
  assert bytes != null;
  // and to the offset within that buffer
  int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
  // a non-zero byte here is the slice's end marker, i.e. the slice is full
  if (bytes[offset] != 0) {
    // End of slice; allocate a new one
    offset = bytePool.allocSlice(bytes, offset);
    bytes = bytePool.buffer;
    intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
  }
  // write the byte and advance this stream's address in the int pool
  bytes[offset] = b;
  (intUptos[intUptoStart+stream])++;
}
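Every multi-byte value goes through writeByte; TermsHashPerField layers writeVInt on top of it, using the same 7-bits-plus-continuation-flag shape that the readers above decode:

public void writeVInt(int stream, int i) {
  assert stream < streamCount;
  while ((i & ~0x7F) != 0) {
    writeByte(stream, (byte)((i & 0x7f) | 0x80));
    i >>>= 7;
  }
  writeByte(stream, (byte) i);
}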
The figure below is not complete: the yellow docID/freq portion may also span multiple slices.

Note that TermVectorsConsumerPerField is not the same thing as FreqProxTermsWriterPerField: the former writes per-document term vectors, while the latter accumulates the segment-wide inverted postings discussed here.