The meaning of several Lucene classes (2)

What pattern do the file names in an index directory follow?

[quote]
_9.cfs
_9.cfx
segments_k
segments.gen
[/quote]

private final synchronized String newSegmentName() {
  return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}


newSegmentName() takes the current value of segmentInfos.counter, renders it in base 36 (Character.MAX_RADIX is 36), prefixes an underscore, and only then increments the counter (post-increment). So segmentInfos.counter records how many segment names have been handed out so far, i.e. how many segments have ever been created in this index; the _9 and segments_k names quoted above are simply base-36 numbers.
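
As a quick illustration (this snippet is not from the original post; the counter values are only assumed examples), the base-36 rendering explains the names quoted above:

// Hypothetical demo of the base-36 naming scheme (not part of the Lucene source).
public class SegmentNameDemo {
  public static void main(String[] args) {
    // Character.MAX_RADIX is 36, so segment names run _0.._9, _a.._z, _10, _11, ...
    System.out.println("_" + Integer.toString(9, Character.MAX_RADIX));   // "_9"  -> _9.cfs, _9.cfx
    System.out.println("_" + Integer.toString(35, Character.MAX_RADIX));  // "_z"
    System.out.println("_" + Integer.toString(36, Character.MAX_RADIX));  // "_10"
    // The generation suffix of segments_k is written with the same base-36 encoding,
    // but it comes from a separate counter (the segments-file generation).
    System.out.println("segments_" + Long.toString(20, Character.MAX_RADIX)); // "segments_k"
  }
}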

Document inversion. This is the most memory-hungry phase of building the index: besides the terms themselves, term positions, frequencies and other per-term information have to be stored as well.
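
To make the "invert" step concrete, here is a toy sketch (plain Java, illustration only, not the Lucene implementation) of the per-term data the inverter has to buffer: for every term, its frequency and the positions at which it occurs.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy inversion of a single field: term -> list of positions.
public class TinyInvertDemo {
  public static void main(String[] args) {
    String[] tokens = "to be or not to be".split(" ");
    Map<String, List<Integer>> postings = new LinkedHashMap<String, List<Integer>>();
    for (int pos = 0; pos < tokens.length; pos++) {
      List<Integer> positions = postings.get(tokens[pos]);
      if (positions == null) {
        positions = new ArrayList<Integer>();
        postings.put(tokens[pos], positions);
      }
      positions.add(pos);
    }
    for (Map.Entry<String, List<Integer>> e : postings.entrySet()) {
      // term -> frequency and positions; offsets would be buffered here too
      System.out.println(e.getKey() + "  freq=" + e.getValue().size() + "  positions=" + e.getValue());
    }
  }
}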


package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document. This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField). It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */

final class DocInverterPerField extends DocFieldConsumerPerField {

  final private DocInverterPerThread perThread;
  final private FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
    this.perThread = perThread;
    this.fieldInfo = fieldInfo;
    docState = perThread.docState;
    fieldState = perThread.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
  }

  void abort() {
    consumer.abort();
    endConsumer.abort();
  }

  public void processFields(final Fieldable[] fields,
                            final int count) throws IOException {

    fieldState.reset(docState.doc.getBoost());

    final int maxFieldLength = docState.maxFieldLength;

    final boolean doInvert = consumer.start(fields, count);

    for(int i=0;i<count;i++) {

      final Fieldable field = fields[i];

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (field.isIndexed() && doInvert) {

        if (fieldState.length > 0)
          fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);

        if (!field.isTokenized()) {       // un-tokenized field
          String stringValue = field.stringValue();
          final int valueLength = stringValue.length();
          perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
          fieldState.attributeSource = perThread.singleTokenTokenStream;
          perThread.localTokenStream.reset();
          consumer.start(field);

          boolean success = false;
          try {
            consumer.add();
            success = true;
          } finally {
            if (!success)
              docState.docWriter.setAborting();
          }
          fieldState.offset += valueLength;
          fieldState.length++;
          fieldState.position++;
        } else {                          // tokenized field
          final TokenStream stream;
          final TokenStream streamValue = field.tokenStreamValue();

          if (streamValue != null)
            stream = streamValue;
          else {
            // the field does not have a TokenStream,
            // so we have to obtain one from the analyzer
            final Reader reader;          // find or make Reader
            final Reader readerValue = field.readerValue();

            if (readerValue != null)
              reader = readerValue;
            else {
              String stringValue = field.stringValue();
              if (stringValue == null)
                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
              perThread.stringReader.init(stringValue);
              reader = perThread.stringReader;
            }

            // Tokenize field and add to postingTable
            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
          }

          // reset the TokenStream to the first token
          stream.reset();

          try {
            int offsetEnd = fieldState.offset-1;

            boolean useNewTokenStreamAPI = stream.useNewAPI();
            Token localToken = null;

            if (useNewTokenStreamAPI) {
              fieldState.attributeSource = stream;
            } else {
              fieldState.attributeSource = perThread.localTokenStream;
              localToken = perThread.localToken;
            }

            consumer.start(field);

            OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

            for(;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID
              Token token = null;

              /*
               * token.termText    - the term text produced by analysis
               * token.startOffset - start offset of the term in the original text
               * token.endOffset   - end offset of the term in the original text
               */

              if (useNewTokenStreamAPI) {
                if (!stream.incrementToken()) break;
              } else {
                token = stream.next(localToken);
                if (token == null) break;
                perThread.localTokenStream.set(token);
              }

              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr - 1;
              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
              offsetEnd = fieldState.offset + offsetAttribute.endOffset();
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }
            }
            fieldState.offset = offsetEnd+1;
          } finally {
            stream.close();
          }
        }

        fieldState.boost *= field.getBoost();
      }
    }

    consumer.finish();
    endConsumer.finish();
  }
}
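
For reference, the token-consuming loop above can be reproduced outside of DocumentsWriter. The sketch below is my own assumption of a typical caller using the 2.9-era attribute API (the field name "body", the sample text and StandardAnalyzer are placeholders, and the exact API of the snapshot shown above may differ); it shows how the position accumulates from getPositionIncrement(), the same way fieldState.position does in processFields.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenWalkDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();              // placeholder analyzer
    TokenStream stream = analyzer.reusableTokenStream("body", new StringReader("hello token stream"));

    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

    stream.reset();
    int position = -1;                                       // like fieldState.position before the first token
    while (stream.incrementToken()) {
      // processFields does "position += posIncr - 1" then "position++"; the net effect is the same
      position += posIncrAtt.getPositionIncrement();
      System.out.println(position + "  " + termAtt.term()
          + "  [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    stream.close();
  }
}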

