FreqProxTermsWriter类
一、类功能
将内存中的索引写入相应的索引文件(tis/tii/frq/prx)
二、成员函数
2.1 createPostings函数
TermsHashPerThread::morePostings() 会调用
FreqProxTermsWriter::createPostings 申请更多的 Postings 内存;createPostings 只负责分配,本身并没有缓冲的含义。
// 简单代码
void createPostings(RawPostingList[] postings, int start, int count)
当 TermsHashPerField::add()
{
// Refill?
if (0 == perThread.freePostingsCount)
perThread.morePostings(); // TermsHashPerThread类型
}
2.2 flush函数
// Flushes all in-memory per-field postings to the new segment's index
// files (tis/tii/frq/prx). The per-thread per-field writers in allFields
// are grouped by field name (allFields is assumed ordered so that equal
// field names are adjacent — TODO confirm), each group is merged and
// written by appendPostings(), then consumer.finish() completes the output.
public void flush(Map threadsAndFields, final SegmentWriteState state)
{
final int numAllFields = allFields.size(); // total (thread, field) entries across all threads
// TODO: allow Lucene user to customize this consumer:
final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
int start = 0;
while(start < numAllFields) {
final FieldInfo fieldInfo = ((FreqProxTermsWriterPerField) allFields.get(start)).fieldInfo;
final String fieldName = fieldInfo.name;
// Extend [start, end) over the run of entries for this same field name
int end = start+1;
while(end < numAllFields && ((FreqProxTermsWriterPerField) allFields.get(end)).fieldInfo.name.equals(fieldName))// same field
end++;
FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
for(int i=start;i<end;i++) {
fields[i-start] = (FreqProxTermsWriterPerField) allFields.get(i);
// Aggregate the storePayload as seen by the same
// field across multiple threads
fieldInfo.storePayloads |= fields[i-start].hasPayloads;
}
// If this field has postings then add them to the
// segment
// merge this field's postings and write them to disk (frq/prx/tis)
appendPostings(fields, consumer);
start = end;
}
// FormatPostingsFieldsWriter::finish() writes the term-index (tii) data
consumer.finish();
}
// 索引合并且写入磁盘
// Merges the postings for one field, accumulated by multiple threads
// (one FreqProxTermsWriterPerField per thread), and feeds them to the
// consumer in term order with docs in docID order; the consumer writes
// the frq/prx/tis outputs.
void appendPostings(FreqProxTermsWriterPerField[] fields,
FormatPostingsFieldsConsumer consumer)
throws CorruptIndexException, IOException {
int numFields = fields.length;
final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];
for(int i=0;i<numFields;i++) {
FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);
assert fms.field.fieldInfo == fields[0].fieldInfo; // all states belong to the same field
// nextTerm() positions the state on its first term: it loads that term's
// postings from TermsHashPerField into the state's slice readers
//   final ByteSliceReader freq = new ByteSliceReader();  (term frequencies)
//   final ByteSliceReader prox = new ByteSliceReader();  (positions)
// via TermsHashPerField::initReader(ByteSliceReader reader,
// RawPostingList p, int stream)
boolean result = fms.nextTerm();
assert result; // each per-thread field is expected to contain at least one term
}
final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
while(numFields > 0) { // until every state has run out of terms
// Get the next term to merge
termStates[0] = mergeStates[0]; // each state's terms arrive already sorted
int numToMerge = 1;
// Collect into termStates every state currently positioned on the
// smallest term (string order); postings are merged term by term
for(int i=1;i<numFields;i++) {
final char[] text = mergeStates[i].text;
final int textOffset = mergeStates[i].textOffset;
final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
if (cmp < 0) {
termStates[0] = mergeStates[i];
numToMerge = 1;
} else if (cmp == 0)
termStates[numToMerge++] = mergeStates[i];
}
final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
while(numToMerge > 0) { // same term seen by multiple threads' states
FreqProxFieldMergeState minState = termStates[0];
// pick the state with the smallest docID so docs are emitted in docID order
for(int i=1;i<numToMerge;i++)
if (termStates[i].docID < minState.docID)
minState = termStates[i];
final int termDocFreq = minState.termFreq; // tf: occurrences of the term in this doc
final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
// reader already positioned on this doc's data (filled from TermsHashPerField)
final ByteSliceReader prox = minState.prox;
// write positions & payloads unless the field omits term freq/positions
if (!currentFieldOmitTermFreqAndPositions) {
// omitTermFreqAndPositions == false so we do write positions &
// payload
int position = 0;
for(int j=0;j<termDocFreq;j++) { // one entry per occurrence of the term
final int code = prox.readVInt();
position += code >> 1; // positions are delta-encoded; low bit flags a payload
final int payloadLength;
if ((code & 1) != 0) {
// payload present: vint length followed by the payload bytes
payloadLength = prox.readVInt();
if (payloadBuffer == null || payloadBuffer.length < payloadLength)
payloadBuffer = new byte[payloadLength];
prox.readBytes(payloadBuffer, 0, payloadLength);
} else
payloadLength = 0;
posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
} // end positions loop
posConsumer.finish();
}
if (!minState.nextDoc()) { // no more docs for this term in minState
// Remove from termStates
int upto = 0;
for(int i=0;i<numToMerge;i++)
if (termStates[i] != minState)
termStates[upto++] = termStates[i];
numToMerge--;
// Advance this state to the next term
if (!minState.nextTerm()) { // no more terms in minState either
// OK, no more terms, so remove from mergeStates
// as well
upto = 0;
for(int i=0;i<numFields;i++)
if (mergeStates[i] != minState)
mergeStates[upto++] = mergeStates[i];
numFields--;
}
}
}
// output is buffered in IndexOutput and flushed to disk when full
// (BufferedIndexOutput::BUFFER_SIZE = 16384)
docConsumer.finish(); // FormatPostingsDocsWriter::finish()
}
termsConsumer.finish();
}