Solr.IndexWriter源码分析.1

最新推荐文章于 2024-10-10 17:25:53 发布

Wild__Child

最新推荐文章于 2024-10-10 17:25:53 发布

阅读量116

点赞数

文章标签： solr

本文链接：https://blog.csdn.net/Wild__Child/article/details/121843601

版权

2021SC@SDUSC

概述
代码分析

概述

(来自官方文档)
IndexWriter 创建和维护索引。

构造函数的 create 参数决定是创建新索引还是打开现有索引。请注意，即使有 IndexReader 正在使用该索引，也可以用 create=true 打开它：已经打开的旧 IndexReader 会继续在其打开时的“时间点”快照上搜索，只有重新打开之后才能看到新创建的索引。另外还有一些不带 create 参数的构造函数：如果给定路径上尚不存在索引，则创建新索引；否则打开现有索引。

在任何一种情况下，文档都使用 addDocument 添加并使用 deleteDocuments(Term) 或 deleteDocuments(Query) 删除。可以使用 updateDocument 更新文档（它只是删除然后添加整个文档）。当完成添加、删除和更新文档时，应调用 close。

这些更改先缓存在内存中，并在上述方法调用期间定期刷新（flush）到目录中。当缓冲的删除操作足够多（请参阅 setMaxBufferedDeleteTerms(int)），或自上次刷新以来添加的文档足够多时，就会触发刷新，以先满足者为准。对于添加的文档，刷新由文档占用的 RAM 大小（请参阅 setRAMBufferSizeMB(double)）或已添加文档的数量触发；默认在 RAM 使用量达到 16 MB 时刷新。为了获得最佳索引速度，应使用较大的 RAM 缓冲区，并按 RAM 使用量来触发刷新。请注意，刷新只是把 IndexWriter 内部缓冲的状态写入索引，在调用 commit() 或 close() 之前，这些更改对 IndexReader 并不可见。刷新还可能触发一次或多次段合并；默认情况下，这些合并在后台线程中运行，以免阻塞 addDocument 调用（请参阅下文关于更换 MergeScheduler 的说明）。

打开 IndexWriter 会为正在使用的目录创建一个锁定文件。尝试在同一目录上打开另一个 IndexWriter 将导致 LockObtainFailedException。如果使用同一目录上的 IndexReader 从索引中删除文档，也会抛出 LockObtainFailedException。

代码分析

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.BooleanSupplier;
import java.util.function.IntPredicate;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.LockValidatingDirectoryWrapper;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.MergeInfo;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;

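/**
 * Clarification: check points (and commits).
 *
 * IndexWriter writes new index files to the directory without writing a new
 * segments_N file that references these new files. This also means that the
 * state of the in-memory SegmentInfos object differs from the most recent
 * segments_N file written to the directory.
 *
 * Each time the SegmentInfos is changed and matches the (possibly modified)
 * directory files, we have a new "check point". If the modified/new
 * SegmentInfos is written to disk - as a new (generation of) segments_N
 * file - this check point is also an index commit.
 *
 * A new check point always replaces the previous check point and becomes the
 * new "front" of the index. This allows the IndexFileDeleter to delete files
 * that are referenced only by stale check points (files that were created
 * since the last commit, but are no longer referenced by the "front" of the
 * index). For this, IndexFileDeleter keeps track of the last non-commit
 * check point.
 */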
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
    MergePolicy.MergeContext {

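  /**
   * Hard limit on the maximum number of documents that may be added to the
   * index. If you try to add more than this you will hit an
   * {@code IllegalArgumentException}.
   */
  // We defensively subtract 128 to be well below the lowest
  // ArrayUtil.MAX_ARRAY_LENGTH on "typical" JVMs. We don't just use
  // ArrayUtil.MAX_ARRAY_LENGTH here because it can vary across JVMs: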
  public static final int MAX_DOCS = Integer.MAX_VALUE - 128;

  /** Maximum value of the token position in an indexed field. */
  public static final int MAX_POSITION = Integer.MAX_VALUE - 128;

  // Mutable static copy of the limit, initialised to MAX_DOCS. It is changed
  // through the package-private setMaxDocs(int) and read through
  // getActualMaxDocs(), so that testing can use less electricity:
  private static int actualMaxDocs = MAX_DOCS;

  /**
   * Replaces the document limit reported by {@link #getActualMaxDocs()}. Used only for testing.
   *
   * @param maxDocs the new limit; may not be larger than {@link #MAX_DOCS}
   * @throws IllegalArgumentException if {@code maxDocs} is greater than {@link #MAX_DOCS}
   */
  static void setMaxDocs(int maxDocs) {
    if (maxDocs <= MAX_DOCS) {
      actualMaxDocs = maxDocs;
      return;
    }
    // Cannot go higher than the hard max:
    final String message = "maxDocs must be <= IndexWriter.MAX_DOCS=" + MAX_DOCS + "; got: " + maxDocs;
    throw new IllegalArgumentException(message);
  }

  /**
   * Returns the document limit currently in effect, i.e. the value of the static
   * {@code actualMaxDocs} field.
   *
   * @return the current value of {@code actualMaxDocs}
   */
  static int getActualMaxDocs() {
    final int currentLimit = actualMaxDocs;
    return currentLimit;
  }
  
  /** Used only for testing. */
  private final boolean enableTestPoints;

  // Sentinel value (-1).
  // NOTE(review): presumably means "no upper bound" for mergeMaxNumSegments - confirm at the use sites.
  private static final int UNBOUNDED_MAX_MERGE_SEGMENTS = -1;

  /**
   * Name of the write lock in the index.
   */
  public static final String WRITE_LOCK_NAME = "write.lock";

  /** Key for the source of a segment in the {@link SegmentInfo#getDiagnostics() diagnostics}. */
  public static final String SOURCE = "source";
  /** Source of a segment which results from a merge of other segments. */
  public static final String SOURCE_MERGE = "merge";
  /** Source of a segment which results from a flush. */
  public static final String SOURCE_FLUSH = "flush";
  /** Source of a segment which results from a call to {@link #addIndexes(CodecReader...)}. */
  public static final String SOURCE_ADDINDEXES_READERS = "addIndexes(CodecReader...)";

  /**
   * Absolute hard maximum length for a term, in bytes once
   * encoded as UTF8.  If a term arrives from the analyzer
   * longer than this length, an
   * <code>IllegalArgumentException</code> is thrown
   * and a message is printed to infoStream, if set (see {@link
   * IndexWriterConfig#setInfoStream(InfoStream)}).
   */
  public final static int MAX_TERM_LENGTH =  BYTE_BLOCK_SIZE-2;

  /**
   * Maximum length string for a stored field. Computed as
   * {@code ArrayUtil.MAX_ARRAY_LENGTH} divided by {@code UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR}.
   */
  public final static int MAX_STORED_STRING_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH / UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;

  // when unrecoverable disaster strikes, we populate this with the reason that we had to close IndexWriter
  private final AtomicReference<Throwable> tragedy = new AtomicReference<>(null);

  private final Directory directoryOrig;       // original user directory
  private final Directory directory;           // wrapped with additional checks

  private final AtomicLong changeCount = new AtomicLong(); // increments every time a change is completed
  private volatile long lastCommitChangeCount; // last changeCount that was committed

  private List<SegmentCommitInfo> rollbackSegments;      // list of segmentInfo we will fallback to if the commit fails

  private volatile SegmentInfos pendingCommit;            // set when a commit is pending (after prepareCommit() & before commit())
  // NOTE(review): the next two fields presumably belong to the pending commit above - confirm where they are assigned
  private volatile long pendingSeqNo;
  private volatile long pendingCommitChangeCount;

  private Collection<String> filesToCommit;

  private final SegmentInfos segmentInfos;
  final FieldNumbers globalFieldNumberMap;

  final DocumentsWriter docWriter;
  // events added to this queue are later processed against this writer (see EventQueue)
  private final EventQueue eventQueue = new EventQueue(this);
  private final MergeScheduler.MergeSource mergeSource = new IndexWriterMergeSource(this);

  private final ReentrantLock writeDocValuesLock = new ReentrantLock();

  /**
   * Queue of {@link Event}s that are processed against the {@link IndexWriter} given at
   * construction time.
   *
   * <p>Every operation first takes one permit from an internal semaphore and fails with
   * {@link AlreadyClosedException} once the queue has been closed. {@link #close()} takes all
   * permits at once before draining the remaining events.
   */
  static final class EventQueue implements Closeable {
    private volatile boolean closed;

    // A semaphore is used here instead of simply synchronized methods to allow
    // events to be processed concurrently by multiple threads, such that all events
    // for a certain thread are processed once that thread returns from IW.
    private final Semaphore permits = new Semaphore(Integer.MAX_VALUE);
    private final Queue<Event> queue = new ConcurrentLinkedQueue<>();
    private final IndexWriter writer;

    EventQueue(IndexWriter writer) {
      this.writer = writer;
    }

    /**
     * Takes a single permit, or throws if none is available or the queue is closed.
     *
     * @throws AlreadyClosedException if no permit could be obtained or the queue was closed
     */
    private void acquire() {
      final boolean gotPermit = permits.tryAcquire();
      if (gotPermit && closed == false) {
        return;
      }
      if (gotPermit) {
        // the queue was closed in the meantime: hand the permit back before failing
        permits.release();
      }
      throw new AlreadyClosedException("queue is closed");
    }

    /**
     * Appends an event to the queue while holding a permit.
     *
     * @param event the event to enqueue
     * @return the result of adding the event to the underlying queue
     * @throws AlreadyClosedException if the queue is closed
     */
    boolean add(Event event) {
      acquire();
      final boolean added;
      try {
        added = queue.add(event);
      } finally {
        permits.release();
      }
      return added;
    }

    /**
     * Processes all currently queued events while holding a permit.
     *
     * @throws IOException if processing an event fails
     * @throws AlreadyClosedException if the queue is closed
     */
    void processEvents() throws IOException {
      acquire();
      try {
        processEventsInternal();
      } finally {
        permits.release();
      }
    }

    /** Polls events until the queue is empty; the caller must already hold at least one permit. */
    private void processEventsInternal() throws IOException {
      assert Integer.MAX_VALUE - permits.availablePermits() > 0 : "must acquire a permit before processing events";
      for (Event next = queue.poll(); next != null; next = queue.poll()) {
        next.process(writer);
      }
    }

    /**
     * Marks the queue as closed. If the writer has a tragic exception the pending events are
     * discarded; otherwise all permits are acquired and the remaining events are processed.
     *
     * @throws IOException if processing a remaining event fails
     * @throws ThreadInterruptedException if interrupted while waiting for the permits
     */
    @Override
    public synchronized void close() throws IOException { // synced to prevent double closing
      assert closed == false : "we should never close this twice";
      closed = true;

      // it is possible that this queue gets closed while we are inside a processEvents call
      if (writer.getTragicException() != null) {
        // we are already handling a tragic exception let's drop it all on the floor and return
        queue.clear();
        return;
      }

      // now we acquire all the permits to ensure we are the only one processing the queue
      try {
        permits.acquire(Integer.MAX_VALUE);
      } catch (InterruptedException interrupted) {
        throw new ThreadInterruptedException(interrupted);
      }
      try {
        processEventsInternal();
      } finally {
        permits.release(Integer.MAX_VALUE);
      }
    }
  }

  private final IndexFileDeleter deleter;

  // used by forceMerge to note those segments that need merging
  private final Map<SegmentCommitInfo,Boolean> segmentsToMerge = new HashMap<>();
  private int mergeMaxNumSegments;

  // NOTE(review): presumably the lock obtained under WRITE_LOCK_NAME - confirm in the constructor
  private Lock writeLock;

  private volatile boolean closed;
  private volatile boolean closing;

  private final AtomicBoolean maybeMerge = new AtomicBoolean();

  private Iterable<Map.Entry<String,String>> commitUserData;

  // Holds all SegmentInfo instances currently involved in
  // merges
  private final HashSet<SegmentCommitInfo> mergingSegments = new HashSet<>();
  private final MergeScheduler mergeScheduler;
  private final Set<SegmentMerger> runningAddIndexesMerges = new HashSet<>();
  private final LinkedList<MergePolicy.OneMerge> pendingMerges = new LinkedList<>();
  private final Set<MergePolicy.OneMerge> runningMerges = new HashSet<>();
  private final List<MergePolicy.OneMerge> mergeExceptions = new ArrayList<>();
  private long mergeGen;
  private Merges merges = new Merges();
  private boolean didMessageState;
  private final AtomicInteger flushCount = new AtomicInteger();
  private final AtomicInteger flushDeletesCount = new AtomicInteger();
  private final ReaderPool readerPool;
  private final BufferedUpdatesStream bufferedUpdatesStream;

  /** Counts how many merges have completed; this is used by {@link #forceApply(FrozenBufferedUpdates)}
   *  to handle concurrently apply deletes/updates with merges completing. */
  private final AtomicLong mergeFinishedGen = new AtomicLong();

  // The instance that was passed to the constructor. It is saved only in order
  // to allow users to query an IndexWriter's settings.
  private final LiveIndexWriterConfig config;

  /** System.nanoTime() when commit started; used to write
   *  an infoStream message about how long commit took. */
  private long startCommitTime;

  /**
   * How many documents are in the index, or are in the process of being
   * added (reserved). E.g., operations like addIndexes will first reserve
   * the right to add N docs, before they actually change the index,
   * much like how hotels place an "authorization hold" on your credit
   * card to make sure they can later charge you when you check out. */
  private final AtomicLong pendingNumDocs = new AtomicLong();
  private final boolean softDeletesEnabled;

  /**
   * This writer's implementation of {@link DocumentsWriter.FlushNotifications}. Most callbacks only
   * enqueue an event on {@code eventQueue}; {@code afterSegmentsFlushed} and
   * {@code onTragicEvent} call into this writer directly.
   */
  private final DocumentsWriter.FlushNotifications flushNotifications = new DocumentsWriter.FlushNotifications() {
    @Override
    public void deleteUnusedFiles(Collection<String> files) {
      // deferred: the deletion runs when the queued event is processed
      eventQueue.add(writer -> {
        writer.deleteNewFiles(files);
      });
    }

    @Override
    public void flushFailed(SegmentInfo info) {
      // deferred: the failure is handled when the queued event is processed
      eventQueue.add(writer -> {
        writer.flushFailed(info);
      });
    }

    @Override
    public void afterSegmentsFlushed() throws IOException {
      // not queued: runs immediately on the calling thread
      publishFlushedSegments(false);
    }

    @Override
    public void onTragicEvent(Throwable event, String message) {
      // qualified call so that the enclosing writer's method is invoked rather than this callback
      IndexWriter.this.onTragicEvent(event, message);
    }

    @Override
    public void onDeletesApplied() {
      eventQueue.add(writer -> {
        try {
          writer.publishFlushedSegments(true);
        } finally {
          // counted even when publishing throws
          flushCount.incrementAndGet();
        }
      });
    }

    @Override
    public void onTicketBacklog() {
      eventQueue.add(writer -> {
        writer.publishFlushedSegments(true);
      });
    }
  };

  /**
   * Convenience overload that delegates to {@code getReader(true, false)}.
   *
   * @return the reader produced by the two-argument overload
   * @throws IOException if the delegate call fails with an I/O error
   */
  DirectoryReader getReader() throws IOException {
    final DirectoryReader reader = getReader(true, false);
    return reader;
  }