2021SC@SDUSC
概述
(来自官方文档)
IndexWriter 创建和维护索引。
构造函数的 create 参数决定是创建新索引还是打开现有索引。请注意,即使有 IndexReader 正在使用该索引,您也可以使用 create=true 打开它。已经打开的旧 IndexReader 会继续在它们打开时的“时间点”快照上搜索,直到重新打开之后才会看到新创建的索引。另外还有一些不带 create 参数的构造函数:如果给定路径上还没有索引,它们会创建一个新索引,否则打开现有索引。
在任何一种情况下,文档都使用 addDocument 添加并使用 deleteDocuments(Term) 或 deleteDocuments(Query) 删除。可以使用 updateDocument 更新文档(它只是删除然后添加整个文档)。当完成添加、删除和更新文档时,应调用 close。
这些更改先缓存在内存中,并定期刷新到目录中(在上述方法调用期间)。当缓冲的删除操作足够多(请参阅 setMaxBufferedDeleteTerms(int))或自上次刷新以来添加的文档足够多时,会触发刷新,以先满足者为准。对于添加的文档,刷新由文档占用的 RAM 大小(请参阅 setRAMBufferSizeMB(double))或已添加文档的数量触发。默认是在 RAM 使用量达到 16 MB 时刷新。为了获得最佳索引速度,您应该按 RAM 使用量触发刷新,并使用较大的 RAM 缓冲区。请注意,刷新只是将 IndexWriter 内部缓冲的状态移动到索引中,但在调用 commit() 或 close() 之前,这些更改对 IndexReader 不可见。刷新还可能触发一个或多个段合并,默认情况下,这些合并在后台线程中运行,以免阻塞 addDocument 调用(请参阅下文关于更改 MergeScheduler 的说明)。
打开 IndexWriter 会为正在使用的目录创建一个锁定文件。尝试在同一目录上打开另一个 IndexWriter 将导致 LockObtainFailedException。如果使用同一目录上的 IndexReader 从索引中删除文档,也会抛出 LockObtainFailedException。
代码分析
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.BooleanSupplier;
import java.util.function.IntPredicate;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.LockValidatingDirectoryWrapper;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.MergeInfo;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
/**
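* Clarification: check points (and commits).
* IndexWriter writes new index files to the directory without writing a new segments_N
* file which references these new files. It also means that the state of
* the in-memory SegmentInfos object is different from the most recent
* segments_N file written to the directory.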
*
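* Each time the SegmentInfos is changed, and matches the (possibly
* modified) directory files, we have a new "check point".
* If the modified/new SegmentInfos is written to disk - as a new
* (generation of) segments_N file - this check point is also an
* IndexCommit.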
*
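* A new checkpoint always replaces the previous checkpoint and
* becomes the new "front" of the index. This allows the IndexFileDeleter
* to delete files that are referenced only by stale checkpoints
* (files that were created since the last commit, but are no longer
* referenced by the "front" of the index). For this, IndexFileDeleter
* keeps track of the last non-commit checkpoint.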
*/
public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable,
MergePolicy.MergeContext {
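/**
* Hard limit on the maximum number of documents that may be added to the
* index. If you try to add more than this you'll hit {@code IllegalArgumentException}. */
// We defensively subtract 128 to be well below the lowest
// ArrayUtil.MAX_ARRAY_LENGTH on "typical" JVMs. We don't just use
// ArrayUtil.MAX_ARRAY_LENGTH here because this can vary across JVMs: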
public static final int MAX_DOCS = Integer.MAX_VALUE - 128;
/** Maximum value of the token position in an indexed field. */
public static final int MAX_POSITION = Integer.MAX_VALUE - 128;
// The limit is kept in this private static field (initialised to MAX_DOCS) rather than
// read straight from the constant, so that testing can lower it through setMaxDocs(int)
// and use less electricity:
private static int actualMaxDocs = MAX_DOCS;
/**
 * Test-only hook that overwrites the document limit held in {@code actualMaxDocs}.
 *
 * @param maxDocs the new limit; must not be greater than {@link #MAX_DOCS}
 * @throws IllegalArgumentException if {@code maxDocs} is greater than {@link #MAX_DOCS}
 */
static void setMaxDocs(int maxDocs) {
  final boolean withinHardLimit = maxDocs <= MAX_DOCS;
  if (!withinHardLimit) {
    // The hard maximum can never be raised:
    throw new IllegalArgumentException("maxDocs must be <= IndexWriter.MAX_DOCS=" + MAX_DOCS + "; got: " + maxDocs);
  }
  IndexWriter.actualMaxDocs = maxDocs;
}
/**
 * Returns the current value of the static {@code actualMaxDocs} field, which starts out as
 * {@link #MAX_DOCS} and is overwritten by {@link #setMaxDocs(int)}.
 */
static int getActualMaxDocs() {
return IndexWriter.actualMaxDocs;
}
/** Used only for testing. */
private final boolean enableTestPoints;
// NOTE(review): presumably a sentinel meaning "no limit on the number of merge segments" -
// confirm against its uses, which are outside this section.
private static final int UNBOUNDED_MAX_MERGE_SEGMENTS = -1;
/**
* Name of the write lock in the index.
*/
public static final String WRITE_LOCK_NAME = "write.lock";
/** Key for the source of a segment in the {@link SegmentInfo#getDiagnostics() diagnostics}. */
public static final String SOURCE = "source";
/** Source of a segment which results from a merge of other segments. */
public static final String SOURCE_MERGE = "merge";
/** Source of a segment which results from a flush. */
public static final String SOURCE_FLUSH = "flush";
/** Source of a segment which results from a call to {@link #addIndexes(CodecReader...)}. */
public static final String SOURCE_ADDINDEXES_READERS = "addIndexes(CodecReader...)";
/**
* Absolute hard maximum length for a term, in bytes once
* encoded as UTF8. If a term arrives from the analyzer
* longer than this length, an
* <code>IllegalArgumentException</code> is thrown
* and a message is printed to infoStream, if set (see {@link
* IndexWriterConfig#setInfoStream(InfoStream)}).
*/
public final static int MAX_TERM_LENGTH = BYTE_BLOCK_SIZE-2;
/**
* Maximum length string for a stored field.
*/
public final static int MAX_STORED_STRING_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH / UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
// when unrecoverable disaster strikes, we populate this with the reason that we had to close IndexWriter
private final AtomicReference<Throwable> tragedy = new AtomicReference<>(null);
private final Directory directoryOrig; // original user directory
private final Directory directory; // wrapped with additional checks
private final AtomicLong changeCount = new AtomicLong(); // increments every time a change is completed
private volatile long lastCommitChangeCount; // last changeCount that was committed
private List<SegmentCommitInfo> rollbackSegments; // list of segmentInfo we will fallback to if the commit fails
private volatile SegmentInfos pendingCommit; // set when a commit is pending (after prepareCommit() & before commit())
private volatile long pendingSeqNo;
private volatile long pendingCommitChangeCount;
private Collection<String> filesToCommit;
private final SegmentInfos segmentInfos;
final FieldNumbers globalFieldNumberMap;
final DocumentsWriter docWriter;
// Deferred events; each queued event is run with this writer as its argument.
private final EventQueue eventQueue = new EventQueue(this);
private final MergeScheduler.MergeSource mergeSource = new IndexWriterMergeSource(this);
private final ReentrantLock writeDocValuesLock = new ReentrantLock();
/**
 * Queue of {@link Event}s that are later run against the owning {@link IndexWriter}.
 *
 * <p>A semaphore created with {@code Integer.MAX_VALUE} permits guards the queue: {@link #add}
 * and {@link #processEvents} each hold a single permit while they work, whereas {@link #close}
 * takes every permit before it drains the queue one last time.
 */
static final class EventQueue implements Closeable {
  private volatile boolean closed;
  // A semaphore is used here instead of simply synchronized methods so that events can be
  // processed concurrently by multiple threads, such that all events for a certain thread
  // are processed once that thread returns from IW.
  private final Semaphore permits = new Semaphore(Integer.MAX_VALUE);
  private final Queue<Event> queue = new ConcurrentLinkedQueue<>();
  private final IndexWriter writer;

  EventQueue(IndexWriter writer) {
    this.writer = writer;
  }

  /**
   * Takes one permit, or fails if the queue can no longer be used.
   *
   * @throws AlreadyClosedException if no permit is available, or if the queue has been closed
   *     (in the latter case the permit is handed back before throwing)
   */
  private void acquire() {
    final boolean gotPermit = permits.tryAcquire();
    if (!gotPermit) {
      throw new AlreadyClosedException("queue is closed");
    }
    if (!closed) {
      return;
    }
    // Closed after we obtained the permit: give it back before reporting the failure.
    permits.release();
    throw new AlreadyClosedException("queue is closed");
  }

  /**
   * Appends an event to the queue while holding one permit.
   *
   * @param event the event to enqueue
   * @return the result of {@link Queue#add(Object)} on the underlying queue
   * @throws AlreadyClosedException if the queue is closed
   */
  boolean add(Event event) {
    acquire();
    try {
      return queue.add(event);
    } finally {
      permits.release();
    }
  }

  /**
   * Runs all currently queued events while holding one permit.
   *
   * @throws IOException if processing an event fails
   * @throws AlreadyClosedException if the queue is closed
   */
  void processEvents() throws IOException {
    acquire();
    try {
      processEventsInternal();
    } finally {
      permits.release();
    }
  }

  /** Polls events until the queue is empty; the caller must already hold a permit. */
  private void processEventsInternal() throws IOException {
    assert Integer.MAX_VALUE - permits.availablePermits() > 0 : "must acquire a permit before processing events";
    for (Event next = queue.poll(); next != null; next = queue.poll()) {
      next.process(writer);
    }
  }

  /**
   * Marks the queue as closed and then either drops the remaining events (when the writer
   * reports a tragic exception) or processes them while holding all permits.
   *
   * @throws IOException if processing a remaining event fails
   * @throws ThreadInterruptedException if interrupted while waiting for the permits
   */
  @Override
  public synchronized void close() throws IOException { // synced to prevent double closing
    assert !closed : "we should never close this twice";
    closed = true;
    // it's possible that we close this queue while we are in a processEvents call
    if (writer.getTragicException() != null) {
      // we are already handling a tragic exception let's drop it all on the floor and return
      queue.clear();
      return;
    }
    // now we acquire all the permits to ensure we are the only one processing the queue
    try {
      permits.acquire(Integer.MAX_VALUE);
    } catch (InterruptedException e) {
      throw new ThreadInterruptedException(e);
    }
    try {
      processEventsInternal();
    } finally {
      permits.release(Integer.MAX_VALUE);
    }
  }
}
private final IndexFileDeleter deleter;
// used by forceMerge to note those needing merging
private final Map<SegmentCommitInfo,Boolean> segmentsToMerge = new HashMap<>();
private int mergeMaxNumSegments;
private Lock writeLock;
private volatile boolean closed;
private volatile boolean closing;
private final AtomicBoolean maybeMerge = new AtomicBoolean();
private Iterable<Map.Entry<String,String>> commitUserData;
// Holds all SegmentInfo instances currently involved in
// merges
private final HashSet<SegmentCommitInfo> mergingSegments = new HashSet<>();
private final MergeScheduler mergeScheduler;
private final Set<SegmentMerger> runningAddIndexesMerges = new HashSet<>();
private final LinkedList<MergePolicy.OneMerge> pendingMerges = new LinkedList<>();
private final Set<MergePolicy.OneMerge> runningMerges = new HashSet<>();
private final List<MergePolicy.OneMerge> mergeExceptions = new ArrayList<>();
private long mergeGen;
private Merges merges = new Merges();
private boolean didMessageState;
private final AtomicInteger flushCount = new AtomicInteger();
private final AtomicInteger flushDeletesCount = new AtomicInteger();
private final ReaderPool readerPool;
private final BufferedUpdatesStream bufferedUpdatesStream;
/** Counts how many merges have completed; this is used by {@link #forceApply(FrozenBufferedUpdates)}
* to handle concurrently apply deletes/updates with merges completing. */
private final AtomicLong mergeFinishedGen = new AtomicLong();
// The instance that was passed to the constructor. It is saved only in order
// to allow users to query an IndexWriter's settings.
private final LiveIndexWriterConfig config;
/** System.nanoTime() when commit started; used to write
* an infoStream message about how long commit took. */
private long startCommitTime;
/**
* How many documents are in the index, or are in the process of being
* added (reserved). E.g., operations like addIndexes will first reserve
* the right to add N docs, before they actually change the index,
* much like how hotels place an "authorization hold" on your credit
* card to make sure they can later charge you when you check out. */
private final AtomicLong pendingNumDocs = new AtomicLong();
private final boolean softDeletesEnabled;
/**
 * {@link DocumentsWriter.FlushNotifications} implementation bound to this writer. Most callbacks
 * do not act immediately; they add an event to {@code eventQueue} that is run against the
 * writer when the queue is processed.
 */
private final DocumentsWriter.FlushNotifications flushNotifications = new DocumentsWriter.FlushNotifications() {
  /** Queues an event that calls {@code deleteNewFiles(files)} on the writer. */
  @Override
  public void deleteUnusedFiles(Collection<String> files) {
    eventQueue.add(writer -> {
      writer.deleteNewFiles(files);
    });
  }

  /** Queues an event that calls {@code flushFailed(info)} on the writer. */
  @Override
  public void flushFailed(SegmentInfo info) {
    eventQueue.add(writer -> {
      writer.flushFailed(info);
    });
  }

  /** Publishes flushed segments right away (argument {@code false}), bypassing the event queue. */
  @Override
  public void afterSegmentsFlushed() throws IOException {
    publishFlushedSegments(false);
  }

  /** Forwards the tragic event to the enclosing writer. */
  @Override
  public void onTragicEvent(Throwable event, String message) {
    IndexWriter.this.onTragicEvent(event, message);
  }

  /**
   * Queues an event that publishes flushed segments (argument {@code true}) and then bumps
   * {@code flushCount}, even when publishing throws.
   */
  @Override
  public void onDeletesApplied() {
    eventQueue.add(writer -> {
      try {
        writer.publishFlushedSegments(true);
      } finally {
        flushCount.incrementAndGet();
      }
    });
  }

  /** Queues an event that publishes flushed segments (argument {@code true}). */
  @Override
  public void onTicketBacklog() {
    eventQueue.add(writer -> {
      writer.publishFlushedSegments(true);
    });
  }
};
/**
 * Convenience overload that delegates to the two-argument {@code getReader} with the
 * arguments {@code (true, false)}.
 * NOTE(review): the meaning of the two flags is defined by that overload, which is not
 * visible in this section - confirm there.
 *
 * @return the reader returned by the two-argument overload
 * @throws IOException if the two-argument overload throws it
 */
DirectoryReader getReader() throws IOException {
return getReader(true, false);
}