本文的代码以lucene-core 6.3.0为准,包含Lucene50PostingsWriter存储倒排索引的方式等整个类所有代码的解析。转载请注明出处。
Lucene50PostingsWriter类将倒排索引存到磁盘,先了解下这个类的基本信息:
-
倒排表是以Field为单位构建的,也就是说在文档集中,每个Field只要设置了不为NONE的IndexOptions,都会写倒排表。
-
了解倒排索引的基本格式可以看下这一篇Lucene源码分析 - Lucene50PostingsFormat 倒排索引简介。
-
docDeltaBuffer
,freqBuffer
,分别是docId的delta,词频;posDeltaBuffer
,offsetStartDeltaBuffer
,offsetLengthBuffer
,分别是term在某个doc中的position的delta,startOffset的delta,endOffset - startOffset。 -
这个类中的方法的调用顺序是:
init
->setField
-> 父类的writeTerm
->startTerm
->startDoc
->addPosition
->finishDoc
->finishTerm
。
setField
函数主要是设置Field的一些属性,writeTerm
函数将同一个Field中的term的倒排信息写入磁盘,代码如下:
/**
 * Writes the postings of a single term by walking its postings enumeration and
 * driving the startTerm / startDoc / addPosition / finishDoc / finishTerm callbacks.
 *
 * @param term      the term being written (not read by this method itself)
 * @param termsEnum enumeration positioned on the term; used to obtain its postings
 * @param docsSeen  bit set in which every document ID visited for this term is marked
 * @return the filled-in term state, or {@code null} when the term has no documents
 * @throws IOException if reading the postings or writing them fails
 */
public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
  // Per-term initialisation, then obtain (or reuse) the postings iterator for this term.
  startTerm();
  postingsEnum = termsEnum.postings(postingsEnum, enumFlags);

  int numDocs = 0;       // document frequency of the term
  long occurrences = 0;  // sum of the per-document frequencies

  for (int doc = postingsEnum.nextDoc(); doc != PostingsEnum.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
    numDocs++;
    docsSeen.set(doc);

    // -1 stands for "frequencies are not indexed for this field".
    final int termFreq = writeFreqs ? postingsEnum.freq() : -1;
    if (writeFreqs) {
      occurrences += termFreq;
    }
    startDoc(doc, termFreq);

    if (writePositions) {
      // One position (plus optional payload / offsets) per occurrence in this document.
      for (int n = 0; n < termFreq; n++) {
        final int position = postingsEnum.nextPosition();
        final BytesRef payload = writePayloads ? postingsEnum.getPayload() : null;
        final int start = writeOffsets ? postingsEnum.startOffset() : -1;
        final int end = writeOffsets ? postingsEnum.endOffset() : -1;
        addPosition(position, payload, start, end);
      }
    }

    finishDoc();
  }

  if (numDocs == 0) {
    return null;
  }

  // Record the term statistics and let the subclass finish writing the term.
  final BlockTermState state = newTermState();
  state.docFreq = numDocs;
  state.totalTermFreq = writeFreqs ? occurrences : -1;
  finishTerm(state);
  return state;
}
startTerm
函数记录三个文件的起始位置,重置跳表。
/**
 * Prepares for a new term: remembers the current file pointer of each output
 * that will be written for this field and resets the per-term doc ID / skip state.
 */
public void startTerm() {
  // Start of this term's data in the doc output.
  docStartFP = docOut.getFilePointer();

  // The pos output is only consulted when positions are written; the pay output
  // additionally requires payloads or offsets.
  if (writePositions) {
    posStartFP = posOut.getFilePointer();
  }
  if (writePositions && (writePayloads || writeOffsets)) {
    payStartFP = payOut.getFilePointer();
  }

  lastDocID = 0;
  lastBlockDocID = -1; // -1 marks "no full block written yet for this term"
  skipWriter.resetSkip();
}
startDoc
函数将docId的delta和Freq存到docDeltaBuffer
和freqBuffer
中,在docId的数量达到BLOCK_SIZE的时候,用当前block构建跳跃点(skip point)。finishDoc
函数是当docId的数量达到BLOCK_SIZE时更新用于构建下一个skip point需要的变量。
这里可以看到构建skip point需要存lastBlockDocID
,lastBlockPosFP
,lastBlockPayFP
,lastBlockPosBufferUpto
和lastBlockPayloadByteUpto
。doc文件只需要存偏移量和lastBlockDocID
,但是pos文件和pay文件不能只存偏移量,因为term在一个文档中可能会有多个position,docId数量达到BLOCK_SIZE的时候,position的数量通常不是BLOCK_SIZE的整数倍,不足一个block的那部分还留在内存缓冲区中没有写入磁盘,所以pos和payload还需要存lastBlockPosBufferUpto
和lastBlockPayloadByteUpto
。
/**
 * Buffers the doc ID delta (and frequency, when frequencies are written) of a new
 * document for the current term, flushing a packed block once the buffer is full.
 *
 * @param docID       document ID; must be non-negative and greater than the previous one of this term
 * @param termDocFreq frequency of the term in the document; stored only when frequencies are written
 * @throws IOException if the doc IDs are out of order or writing a block fails
 */
public void startDoc(int docID, int termDocFreq) throws IOException {
  // A full block was completed by the previous document (finishDoc reset the buffer
  // index and recorded lastBlockDocID), so buffer the skip entry for that block now.
  final boolean previousBlockCompleted = lastBlockDocID != -1 && docBufferUpto == 0;
  if (previousBlockCompleted) {
    skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockPayloadByteUpto);
  }

  final int gap = docID - lastDocID;
  final boolean notIncreasing = docCount > 0 && gap <= 0;
  if (docID < 0 || notIncreasing) {
    throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
  }

  // Store the delta (and frequency) in the current buffer slot.
  docDeltaBuffer[docBufferUpto] = gap;
  if (writeFreqs) {
    freqBuffer[docBufferUpto] = termDocFreq;
  }
  docBufferUpto += 1;
  docCount += 1;

  // Buffer full: write it out as a packed block. docBufferUpto is deliberately left
  // at BLOCK_SIZE here; finishDoc checks for that value and resets it.
  if (docBufferUpto == BLOCK_SIZE) {
    forUtil.writeBlock(docDeltaBuffer, encoded, docOut);
    if (writeFreqs) {
      forUtil.writeBlock(freqBuffer, encoded, docOut);
    }
  }

  lastDocID = docID;
  // Position and start-offset deltas restart from 0 for every document.
  lastPosition = 0;
  lastStartOffset = 0;
}
/**
 * Called after all positions of a document were added. When the document just
 * completed a full doc block, snapshots the state that the next skip entry needs
 * and resets the doc buffer index.
 *
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
public void finishDoc() throws IOException {
  if (docBufferUpto != BLOCK_SIZE) {
    return; // block not full yet, nothing to record
  }

  lastBlockDocID = lastDocID;

  if (posOut != null) {
    if (payOut != null) {
      lastBlockPayFP = payOut.getFilePointer();
    }
    lastBlockPosFP = posOut.getFilePointer();
    // How far into the not-yet-flushed position / payload buffers this block ends.
    lastBlockPosBufferUpto = posBufferUpto;
    lastBlockPayloadByteUpto = payloadByteUpto;
  }

  docBufferUpto = 0;
}
addPosition
函数将position的delta,payload,startOffset的delta,endOffset-startOffset先存入对应的缓冲区,当缓冲的position数量达到BLOCK_SIZE时,再以Packed Blocks的形式写入对应的文件。
/**
 * Buffers one occurrence of the current term in the current document and flushes
 * the position / payload / offset buffers as packed blocks once BLOCK_SIZE
 * occurrences have been collected.
 *
 * @param position    position of the occurrence; must be in [0, IndexWriter.MAX_POSITION]
 * @param payload     payload of the occurrence, may be {@code null}; used only when payloads are written
 * @param startOffset start offset of the occurrence; used only when offsets are written
 * @param endOffset   end offset of the occurrence; used only when offsets are written
 * @throws IOException if the position is out of range or writing a block fails
 */
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
  if (position > IndexWriter.MAX_POSITION) {
    throw new CorruptIndexException("position=" + position + " is too large (> IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + ")", docOut);
  }
  if (position < 0) {
    throw new CorruptIndexException("position=" + position + " is < 0", docOut);
  }

  // Positions are stored as deltas from the previous position in the same document.
  posDeltaBuffer[posBufferUpto] = position - lastPosition;

  if (writePayloads) {
    // A missing payload is recorded as length 0.
    final int payloadLength = (payload == null) ? 0 : payload.length;
    payloadLengthBuffer[posBufferUpto] = payloadLength;
    if (payloadLength != 0) {
      final int requiredSize = payloadByteUpto + payloadLength;
      if (requiredSize > payloadBytes.length) {
        payloadBytes = ArrayUtil.grow(payloadBytes, requiredSize);
      }
      // Append the payload bytes to the shared payload byte buffer.
      System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payloadLength);
      payloadByteUpto = requiredSize;
    }
  }

  if (writeOffsets) {
    // Start offset as a delta from the previous occurrence, plus the offset span.
    offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
    offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
    lastStartOffset = startOffset;
  }

  posBufferUpto += 1;
  lastPosition = position;

  if (posBufferUpto != BLOCK_SIZE) {
    return; // buffers not full yet
  }

  // Buffers are full: flush them as packed blocks.
  forUtil.writeBlock(posDeltaBuffer, encoded, posOut);
  if (writePayloads) {
    forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
    // Payload bytes of the block: total byte count followed by the raw bytes.
    payOut.writeVInt(payloadByteUpto);
    payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
    payloadByteUpto = 0;
  }
  if (writeOffsets) {
    forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
    forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
  }
  posBufferUpto = 0;
}
finishTerm
函数将缓冲区中尚未写入磁盘的docId的delta、freq,以及position的delta,payload,startOffset的delta,endOffset-startOffset等内容写入对应的文件,这部分内容的数量都不足BLOCK_SIZE,所以都会以VInt Blocks的方式存储;之后在docCount超过BLOCK_SIZE时写入跳表,并把各文件的起始位置等信息记录到BlockTermState中。
/**
 * Finishes the current term: writes the buffered doc deltas / freqs and the buffered
 * positions, payloads and offsets that did not fill a whole block as VInts, writes the
 * skip data when the term has more than BLOCK_SIZE documents, records the resulting
 * file pointers in the term state and resets the per-term counters.
 *
 * @param _state term state to fill in; it is cast to IntBlockTermState, and its docFreq
 *               (and totalTermFreq when positions are written) is read here
 * @throws IOException if writing to one of the outputs fails
 */
public void finishTerm(BlockTermState _state) throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
assert state.docFreq > 0;
// docCount is incremented once per startDoc call, so it must match the caller's docFreq.
assert state.docFreq == docCount: state.docFreq + " vs " + docCount;
final int singletonDocID;
if (state.docFreq == 1) {
// Single document: nothing is written to docOut; the buffered value is kept in the term state.
// (startTerm resets lastDocID to 0, so the first buffered delta equals the doc ID itself.)
singletonDocID = docDeltaBuffer[0];
} else {
singletonDocID = -1;
// Write the doc deltas (and freqs) still sitting in the buffer as VInts.
for(int i=0;i<docBufferUpto;i++) {
final int docDelta = docDeltaBuffer[i];
final int freq = freqBuffer[i];
if (!writeFreqs) {
docOut.writeVInt(docDelta); // freqs not written: store the plain doc delta
} else if (freqBuffer[i] == 1) {
docOut.writeVInt((docDelta<<1)|1); // freq == 1: low bit set, no freq value follows
} else {
docOut.writeVInt(docDelta<<1); // freq != 1: low bit clear, the freq value follows
docOut.writeVInt(freq);
}
}
}
final long lastPosBlockOffset;
if (writePositions) {
assert state.totalTermFreq != -1;
if (state.totalTermFreq > BLOCK_SIZE) {
// Distance from the term's first position byte to the current end of posOut,
// i.e. to where the VInt-encoded tail written below begins.
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
int lastPayloadLength = -1;
int lastOffsetLength = -1;
int payloadBytesReadUpto = 0;
// Write the positions (with payloads / offsets) still sitting in the buffers as VInts.
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
if (writePayloads) {
final int payloadLength = payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) { // length changed: low bit set, new length follows
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta<<1)|1);
posOut.writeVInt(payloadLength); // the new payload length
} else {
posOut.writeVInt(posDelta<<1); // same length as the previously written one: low bit clear
}
if (payloadLength != 0) {
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); // the payload bytes themselves
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta); // no payloads: plain position delta, no flag bit
}
if (writeOffsets) {
int delta = offsetStartDeltaBuffer[i];
int length = offsetLengthBuffer[i];
if (length == lastOffsetLength) {
posOut.writeVInt(delta << 1); // same (endOffset - startOffset) as the previously written one: low bit clear
} else {
posOut.writeVInt(delta << 1 | 1);
posOut.writeVInt(length); // (endOffset - startOffset) changed: low bit set, new length follows
lastOffsetLength = length;
}
}
}
if (writePayloads) {
// Every buffered payload byte must have been consumed by the loop above.
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
} else {
lastPosBlockOffset = -1;
}
long skipOffset;
// Skip data is written only when the term has more than BLOCK_SIZE documents;
// the stored offset is relative to docStartFP. -1 means "no skip data".
if (docCount > BLOCK_SIZE) {
skipOffset = skipWriter.writeSkip(docOut) - docStartFP;
} else {
skipOffset = -1;
}
// Fill in the term state handed in by the caller.
state.docStartFP = docStartFP;
state.posStartFP = posStartFP;
state.payStartFP = payStartFP;
state.singletonDocID = singletonDocID;
state.skipOffset = skipOffset;
state.lastPosBlockOffset = lastPosBlockOffset;
// Reset the per-term buffers and counters.
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
以上就是Lucene50PostingsWriter的所有源码及我的解读,如果有错误的地方,请联系我修改,谢谢!(企鹅号308384127)