Lucene buffers the index it is building in memory. Once memory usage reaches a certain threshold, or when the user calls IndexWriter.flush, the in-memory index structures are flushed to disk.
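From the caller's side the trigger looks roughly like this; a minimal sketch assuming an 8.x-era Lucene (where IndexWriter.flush() is public), with an illustrative index path and field name:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class FlushDemo {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
    cfg.setRAMBufferSizeMB(64);  // auto-flush once the in-memory structures reach ~64 MB
    try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/idx"));
         IndexWriter writer = new IndexWriter(dir, cfg)) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello lucene", Field.Store.NO));
      writer.addDocument(doc);
      writer.flush();  // or flush explicitly; either path ends up in DefaultIndexChain.flush
    }
  }
}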
DefaultIndexChain.flush collects every field that has an inverted state, then hands them to termsHash:

Map<String,TermsHashPerField> fieldsToFlush = new HashMap<>();
for (int i=0;i<fieldHash.length;i++) {
  PerField perField = fieldHash[i];
  while (perField != null) {
    if (perField.invertState != null) {
      fieldsToFlush.put(perField.fieldInfo.name, perField.termsHashPerField);
    }
    perField = perField.next;
  }
}

// readState is created earlier in flush() as a SegmentReadState over
// state.directory / state.segmentInfo / state.fieldInfos
try (NormsProducer norms = readState.fieldInfos.hasNorms()
    ? state.segmentInfo.getCodec().normsFormat().normsProducer(readState)
    : null) {
  NormsProducer normsMergeInstance = null;
  if (norms != null) {
    // Use the merge instance in order to reuse the same IndexInput for all terms
    normsMergeInstance = norms.getMergeInstance();
  }
  termsHash.flush(fieldsToFlush, state, sortMap, normsMergeInstance);
}
termsHash is responsible for flushing the per-field contents to disk. Take org.apache.lucene.index.FreqProxTermsWriter as an example:
public void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state,
                  Sorter.DocMap sortMap, NormsProducer norms) throws IOException {
  super.flush(fieldsToFlush, state, sortMap, norms);

  // Gather all fields that saw any postings:
  List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();

  for (TermsHashPerField f : fieldsToFlush.values()) {
    final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
    if (perField.bytesHash.size() > 0) {
      perField.sortPostings();
      assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
      allFields.add(perField);
    }
  }

  // Sort by field name
  CollectionUtil.introSort(allFields);

  Fields fields = new FreqProxFields(allFields);
  applyDeletes(state, fields);
  if (sortMap != null) {
    fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
  }

  FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
  boolean success = false;
  try {
    consumer.write(fields, norms);
    success = true;
  } finally {
    if (success) {
      IOUtils.close(consumer);
    } else {
      IOUtils.closeWhileHandlingException(consumer);
    }
  }
}
In the end, the index information is looked up through the FreqProxFields created above (Fields fields = new FreqProxFields(allFields)). It is read as follows.
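Before looking at the enum itself, it helps to see the consumption pattern: consumer.write(fields, norms) above walks the Fields view through the standard Fields/Terms/TermsEnum/PostingsEnum contract, roughly like the sketch below (FieldsWalker is illustrative, not Lucene code):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

class FieldsWalker {
  static void consume(Fields fields) throws IOException {
    for (String field : fields) {
      Terms terms = fields.terms(field);
      TermsEnum termsEnum = terms.iterator();
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        // over FreqProxFields this returns the FreqProxPostingsEnum shown below
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        for (int doc = postings.nextDoc();
             doc != DocIdSetIterator.NO_MORE_DOCS;
             doc = postings.nextDoc()) {
          int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            int position = postings.nextPosition();
            // a real codec serializes doc, freq, position, payload, offsets here
          }
        }
      }
    }
  }
}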
private static class FreqProxPostingsEnum extends PostingsEnum {

  final FreqProxTermsWriterPerField terms;
  final FreqProxPostingsArray postingsArray;
  // reader for the docID/freq stream
  final ByteSliceReader reader = new ByteSliceReader();
  // reader for the position/payload stream
  final ByteSliceReader posReader = new ByteSliceReader();
  final boolean readOffsets;
  int docID = -1;
  int freq;
  int pos;
  int startOffset;
  int endOffset;
  int posLeft;
  int termID;
  boolean ended;
  boolean hasPayload;
  BytesRefBuilder payload = new BytesRefBuilder();

  public FreqProxPostingsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
    this.terms = terms;
    this.postingsArray = postingsArray;
    this.readOffsets = terms.hasOffsets;
    assert terms.hasProx;
    assert terms.hasFreq;
  }

  public void reset(int termID) {
    this.termID = termID;
    // init the docID/freq reader; 0 selects the first byte stream
    terms.initReader(reader, termID, 0);
    // init the position/payload reader; 1 selects the second byte stream
    terms.initReader(posReader, termID, 1);
    ended = false;
    docID = -1;
    posLeft = 0;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int freq() {
    return freq;
  }

  @Override
  public int nextDoc() throws IOException {
    if (docID == -1) {
      docID = 0;
    }
    // skip any positions left over from the previous doc
    while (posLeft != 0) {
      nextPosition();
    }

    if (reader.eof()) {
      if (ended) {
        return NO_MORE_DOCS;
      } else {
        // the entry for the last doc still sits in postingsArray, not in the slices
        ended = true;
        docID = postingsArray.lastDocIDs[termID];
        freq = postingsArray.termFreqs[termID];
      }
    } else {
      int code = reader.readVInt();
      docID += code >>> 1;  // high bits carry the doc delta
      if ((code & 1) != 0) {
        freq = 1;           // low bit set: freq == 1, no separate VInt
      } else {
        freq = reader.readVInt();
      }
      assert docID != postingsArray.lastDocIDs[termID];
    }

    posLeft = freq;
    pos = 0;
    startOffset = 0;
    return docID;
  }

  @Override
  public int advance(int target) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long cost() {
    throw new UnsupportedOperationException();
  }

  @Override
  public int nextPosition() throws IOException {
    assert posLeft > 0;
    // consume one position
    posLeft--;
    int code = posReader.readVInt();
    pos += code >>> 1;  // high bits carry the position delta
    if ((code & 1) != 0) {
      // low bit set: a payload follows
      hasPayload = true;
      payload.setLength(posReader.readVInt());
      payload.grow(payload.length());
      posReader.readBytes(payload.bytes(), 0, payload.length());
    } else {
      hasPayload = false;
    }

    if (readOffsets) {
      startOffset += posReader.readVInt();
      endOffset = startOffset + posReader.readVInt();
    }
    return pos;
  }

  @Override
  public int startOffset() {
    if (!readOffsets) {
      throw new IllegalStateException("offsets were not indexed");
    }
    return startOffset;
  }

  @Override
  public int endOffset() {
    if (!readOffsets) {
      throw new IllegalStateException("offsets were not indexed");
    }
    return endOffset;
  }

  @Override
  public BytesRef getPayload() {
    if (hasPayload) {
      return payload.get();
    } else {
      return null;
    }
  }
}
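nextDoc() and nextPosition() decode what the write side packed into the two byte streams: each entry is a delta shifted left one bit, with the low bit used as a flag (freq == 1 on the doc stream, "payload follows" on the position stream). A standalone sketch of that encoding (PostingCodec is a hypothetical helper; the real write side is FreqProxTermsWriterPerField):

import java.io.ByteArrayOutputStream;

class PostingCodec {
  // stream 0: docDelta shifted left one bit, low bit set when freq == 1
  static void writeDocFreq(ByteArrayOutputStream out, int docDelta, int freq) {
    if (freq == 1) {
      writeVInt(out, (docDelta << 1) | 1);  // freq folded into the flag bit
    } else {
      writeVInt(out, docDelta << 1);
      writeVInt(out, freq);
    }
  }

  // stream 1: position delta shifted left one bit, low bit set when a payload follows
  static void writePos(ByteArrayOutputStream out, int posDelta, byte[] payload) {
    if (payload != null && payload.length > 0) {
      writeVInt(out, (posDelta << 1) | 1);
      writeVInt(out, payload.length);
      out.write(payload, 0, payload.length);
    } else {
      writeVInt(out, posDelta << 1);
    }
  }

  // 7 payload bits per byte, high bit as continuation flag
  static void writeVInt(ByteArrayOutputStream out, int v) {
    while ((v & ~0x7F) != 0) {
      out.write((v & 0x7F) | 0x80);
      v >>>= 7;
    }
    out.write(v);
  }
}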
ByteSliceReader hides the details of reading across slices:
final class ByteSliceReader extends DataInput {
  ByteBlockPool pool;
  int bufferUpto;
  byte[] buffer;
  public int upto;
  int limit;
  int level;
  public int bufferOffset;

  public int endIndex;

  public void init(ByteBlockPool pool, int startIndex, int endIndex) {

    assert endIndex-startIndex >= 0;
    assert startIndex >= 0;
    assert endIndex >= 0;

    this.pool = pool;
    this.endIndex = endIndex;

    level = 0;
    bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
    buffer = pool.buffers[bufferUpto];
    upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK;

    final int firstSize = ByteBlockPool.LEVEL_SIZE_ARRAY[0];

    // computing limit:
    // if startIndex + firstSize < endIndex, more slices follow this one,
    // so stop 4 bytes early to leave room for the forwarding address
    if (startIndex+firstSize >= endIndex) {
      // There is only this one slice to read
      limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK;
    } else
      limit = upto+firstSize-4;
  }
  public boolean eof() {
    assert upto + bufferOffset <= endIndex;
    return upto + bufferOffset == endIndex;
  }

  @Override
  public byte readByte() {
    assert !eof();
    assert upto <= limit;
    // limit was set to upto+size-4: when we reach it, the next 4 bytes are
    // the forwarding address, so jump to the next slice
    if (upto == limit)
      nextSlice();
    return buffer[upto++];
  }
  public long writeTo(DataOutput out) throws IOException {
    long size = 0;
    while(true) {
      if (limit + bufferOffset == endIndex) {
        assert endIndex - bufferOffset >= upto;
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        break;
      } else {
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        nextSlice();
      }
    }
    return size;
  }

  public void nextSlice() {

    // Skip to our next slice
    final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff);

    level = ByteBlockPool.NEXT_LEVEL_ARRAY[level];
    final int newSize = ByteBlockPool.LEVEL_SIZE_ARRAY[level];

    bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;

    buffer = pool.buffers[bufferUpto];
    upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK;

    if (nextIndex + newSize >= endIndex) {
      // We are advancing to the final slice
      assert endIndex - nextIndex > 0;
      limit = endIndex - bufferOffset;
    } else {
      // This is not the final slice (subtract 4 for the
      // forwarding address at the end of this new slice)
      limit = upto+newSize-4;
    }
  }

  @Override
  public void readBytes(byte[] b, int offset, int len) {
    while(len > 0) {
      final int numLeft = limit-upto;
      if (numLeft < len) {
        // Read entire slice
        System.arraycopy(buffer, upto, b, offset, numLeft);
        offset += numLeft;
        len -= numLeft;
        nextSlice();
      } else {
        // This slice is the last one
        System.arraycopy(buffer, upto, b, offset, len);
        upto += len;
        break;
      }
    }
  }
}
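The 4-byte forwarding address that nextSlice() decodes is written by ByteBlockPool.allocSlice when a slice fills up; slice sizes grow through the pool's level arrays (LEVEL_SIZE_ARRAY = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}, with NEXT_LEVEL_ARRAY capping growth at the largest level, in the version discussed here). A minimal sketch of the big-endian address encode/decode pair (SliceAddress is a hypothetical helper mirroring the shifts in nextSlice above):

class SliceAddress {
  // write side: store the absolute pool address of the next slice, big-endian,
  // in the 4 bytes where the previous slice ends
  static void write(byte[] buffer, int offset, int nextIndex) {
    buffer[offset]     = (byte) (nextIndex >>> 24);
    buffer[offset + 1] = (byte) (nextIndex >>> 16);
    buffer[offset + 2] = (byte) (nextIndex >>> 8);
    buffer[offset + 3] = (byte)  nextIndex;
  }

  // read side: the same computation ByteSliceReader.nextSlice performs at buffer[limit]
  static int read(byte[] buffer, int offset) {
    return ((buffer[offset]     & 0xff) << 24)
         | ((buffer[offset + 1] & 0xff) << 16)
         | ((buffer[offset + 2] & 0xff) << 8)
         |  (buffer[offset + 3] & 0xff);
  }
}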
On the write side, TermsHashPerField.add() hashes the incoming term bytes into bytesHash; a non-negative termID means this is the first time the term is seen (a new posting), while a negative value encodes an already-known termID:
if (termID >= 0) {  // new posting
  bytesHash.byteStart(termID);
  // Init stream slices
  if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
    intPool.nextBuffer();
  }

  if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
    bytePool.nextBuffer();
  }

  intUptos = intPool.buffer;
  intUptoStart = intPool.intUpto;
  intPool.intUpto += streamCount;

  postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

  // allocate one first-level byte slice per stream and record its absolute
  // address in the int pool
  for(int i=0;i<streamCount;i++) {
    final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
    intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
  }
  postingsArray.byteStarts[termID] = intUptos[intUptoStart];

  newTerm(termID);
} else {
  // existing posting: recover the int-pool pointers saved for this term
  termID = (-termID)-1;
  // look up intStart from postingsArray
  int intStart = postingsArray.intStarts[termID];
  // locate the backing "array" in intPool
  intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
  // and the current offset within it
  intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
  addTerm(termID);
}
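Note how streamCount ties back to reset() in FreqProxPostingsEnum above: when positions are indexed, FreqProxTermsWriterPerField uses two streams (stream 0 for docID/freq, stream 1 for positions/payloads/offsets), so each term reserves streamCount consecutive ints in intPool, one write pointer per byte stream.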
Both newTerm and addTerm eventually append their encoded bytes through writeByte:

void writeByte(int stream, byte b) {
  // fetch this stream's current write address (an absolute offset into bytePool)
  int upto = intUptos[intUptoStart+stream];
  // resolve the address to the backing byte buffer
  byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
  assert bytes != null;
  // and to the offset within that buffer
  int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
  // a non-zero byte here is the slice's end marker, i.e. the slice is full
  if (bytes[offset] != 0) {
    // End of slice; allocate a new one
    offset = bytePool.allocSlice(bytes, offset);
    bytes = bytePool.buffer;
    intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
  }
  // write the byte and advance this stream's address in the int pool
  bytes[offset] = b;
  (intUptos[intUptoStart+stream])++;
}
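Every multi-byte value goes through writeByte; TermsHashPerField layers writeVInt on top of it, using the same 7-bits-plus-continuation-flag shape that the readers above decode:

public void writeVInt(int stream, int i) {
  assert stream < streamCount;
  while ((i & ~0x7F) != 0) {
    writeByte(stream, (byte)((i & 0x7f) | 0x80));
    i >>>= 7;
  }
  writeByte(stream, (byte) i);
}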
The figure below is not complete: the yellow docID/freq portion may also span multiple slices.

Note that TermVectorsConsumerPerField is not the same thing as FreqProxTermsWriterPerField: the former writes per-document term vectors, while the latter accumulates the segment-wide inverted postings discussed here.