/**
 * Finishes processing of this field for the current document: unless the field
 * omits norms, a norm value is recorded for the document, and afterwards the
 * terms hash for the field is finished.
 *
 * @throws IllegalStateException if the similarity returns a norm of 0 for a field
 *     that has a non-zero length
 * @throws IOException declared by the signature; which of the invoked calls raises
 *     it is not visible here
 */
public void finish() throws IOException {
  if (!fieldInfo.omitsNorms()) {
    // Default case: the field exists in this document but produced no indexed
    // tokens, so its norm is zero.
    long norm = 0;
    if (invertState.length != 0) {
      // Non-empty field: let the similarity compute the norm.
      norm = similarity.computeNorm(invertState);
      if (norm == 0) {
        throw new IllegalStateException("Similarity " + similarity + " return 0 for non-empty field");
      }
    }
    norms.addValue(docState.docID, norm);
  }
  termsHashPerField.finish();
}
BM25 的计算方式:
/**
 * Computes the norm for a field by picking a term count and encoding it with
 * {@code SmallFloat.intToByte4}.
 *
 * <p>The term count is chosen as follows:
 * <ul>
 *   <li>the unique term count, when the field is indexed with
 *       {@code IndexOptions.DOCS} and the index was created by major version 8 or later;
 *   <li>otherwise the field length minus the overlap count, when
 *       {@code discountOverlaps} is set;
 *   <li>otherwise the plain field length.
 * </ul>
 *
 * @param state inversion state of the field being indexed
 * @return the encoded term count
 */
@Override
public final long computeNorm(FieldInvertState state) {
  final boolean countUniqueTerms =
      state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8;
  final int numTerms;
  if (countUniqueTerms) {
    numTerms = state.getUniqueTermCount();
  } else {
    final int length = state.getLength();
    numTerms = discountOverlaps ? length - state.getNumOverlap() : length;
  }
  return SmallFloat.intToByte4(numTerms);
}
/**
 * Builds a {@code BM25Scorer} for the given boost and statistics.
 *
 * @param boost boost passed through to the scorer
 * @param collectionStats collection-level statistics, used for the idf explanation
 *     and the average field length
 * @param termStats statistics of the term(s); a single entry uses the single-term
 *     {@code idfExplain} overload, anything else the multi-term overload
 * @return a scorer holding the idf explanation, the average field length and a
 *     256-entry table of precomputed inverse length-normalization factors
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }
  final float avgdl = avgFieldLength(collectionStats);

  // Precompute the factor for every table slot once and cache it, so that
  // scoring only needs an array lookup.
  final float[] cache = new float[256];
  for (int slot = 0; slot < cache.length; slot++) {
    cache[slot] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[slot] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
计算分数(BM25Scorer):
BM25Scorer(float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.k1 = k1;
this.b = b;
this.cache = cache;
this.weight = boost * idf.getValue().floatValue();
}
/**
 * Scores a document from its term frequency and encoded norm.
 *
 * @param freq term frequency
 * @param encodedNorm encoded norm; only its low 8 bits are used, as an index
 *     into the precomputed {@code cache} table
 * @return {@code weight - weight / (1 + freq * cache[index])}
 */
@Override
public float score(float freq, long encodedNorm) {
  // To stay monotonic in both freq and norm without promoting to doubles,
  // freq / (freq + norm) is rewritten as 1 - 1 / (1 + freq * 1/norm).
  // freq * 1/norm is monotonic in both arguments because multiplication and
  // division round to the nearest float, and composing with x -> 1 + x and
  // x -> 1 - 1/x preserves that monotonicity.
  // Finally weight * (1 - 1 / (1 + freq * 1/norm)) is expanded to
  // weight - weight / (1 + freq * 1/norm), which runs slightly faster.
  final int slot = (int) (encodedNorm & 0xFFL);
  final float inverseNorm = cache[slot];
  return weight - weight / (1f + freq * inverseNorm);
}
/**
 * Scores the given document, assuming the supplied term frequency within it.
 * Callers must invoke this method on non-decreasing sequences of doc ids.
 *
 * @param doc id of the document to score
 * @param freq term frequency to assume for the document
 * @return the score produced by the wrapped scorer for {@code freq} and the
 *     document's norm value
 * @throws IOException if looking up the norm value fails
 * @see SimScorer#score(float, long)
 */
public float score(int doc, float freq) throws IOException {
  final long norm = getNormValue(doc);
  return scorer.score(freq, norm);
}
/**
 * Returns the norm value of the given document, or {@code 1} when no norms
 * are available.
 *
 * @param doc id of the document whose norm is requested
 * @return the document's norm, or the default norm {@code 1L} if {@code norms} is null
 * @throws IOException declared by the norms iterator calls
 */
private long getNormValue(int doc) throws IOException {
  if (norms == null) {
    return 1L; // default norm
  }
  // Position the iterator on the document; it is expected to have a value.
  final boolean found = norms.advanceExact(doc);
  assert found;
  return norms.longValue();
}
---------------
/** Norm values of the scored field; {@code null} when scores are not needed. */
private final NumericDocValues norms;

/**
 * Sole constructor: scores documents of {@code reader} with {@code scorer}.
 *
 * @param scorer scorer to delegate to; must not be null
 * @param reader leaf reader from which the field's norms are read
 * @param field name of the field whose norms are looked up
 * @param needsScores whether scores are needed; when false, norms are not loaded
 * @throws NullPointerException if {@code scorer} is null
 * @throws IOException declared by the norms lookup on the reader
 */
public LeafSimScorer(SimScorer scorer, LeafReader reader, String field, boolean needsScores) throws IOException {
  this.scorer = Objects.requireNonNull(scorer);
  if (needsScores) {
    this.norms = reader.getNormValues(field);
  } else {
    this.norms = null;
  }
}
/**
 * Returns the norm values of the given field.
 *
 * @param field name of the field
 * @return the norms obtained from the norms reader, or {@code null} when the
 *     field does not exist or does not index norms
 * @throws IOException declared by the norms reader lookup
 */
@Override
public final NumericDocValues getNormValues(String field) throws IOException {
  ensureOpen();
  final FieldInfo info = getFieldInfos().fieldInfo(field);
  // Norms are only available for a known field that indexes them.
  final boolean normsIndexed = info != null && info.hasNorms();
  return normsIndexed ? getNormsReader().getNorms(info) : null;
}
// A norms producer is only opened when the field infos report norms;
// otherwise there is nothing to read and the producer stays null.
if (!coreFieldInfos.hasNorms()) {
  normsProducer = null;
} else {
  normsProducer = codec.normsFormat().normsProducer(segmentReadState);
  assert normsProducer != null;
}
/** Norms format instance handed out by {@link #normsFormat()}. */
private final NormsFormat normsFormat = new Lucene80NormsFormat();

/**
 * Returns the norms format.
 *
 * @return the {@code Lucene80NormsFormat} instance created with this object
 */
@Override
public final NormsFormat normsFormat() {
  return this.normsFormat;
}
/**
 * Norms format that creates {@code Lucene80NormsConsumer} and
 * {@code Lucene80NormsProducer} instances, using the {@code nvd} extension for
 * data and the {@code nvm} extension for metadata.
 */
public class Lucene80NormsFormat extends NormsFormat {

  /** Codec name passed for the norms data. */
  private static final String DATA_CODEC = "Lucene80NormsData";

  /** File extension passed for the norms data. */
  private static final String DATA_EXTENSION = "nvd";

  /** Codec name passed for the norms metadata. */
  private static final String METADATA_CODEC = "Lucene80NormsMetadata";

  /** File extension passed for the norms metadata. */
  private static final String METADATA_EXTENSION = "nvm";

  /** First version of the format. */
  static final int VERSION_START = 0;

  /** Current version of the format; identical to {@link #VERSION_START}. */
  static final int VERSION_CURRENT = VERSION_START;

  /** Sole Constructor */
  public Lucene80NormsFormat() {}

  /**
   * Creates the consumer used to write norms.
   *
   * @param state write state handed to the consumer
   * @return a new {@code Lucene80NormsConsumer}
   * @throws IOException declared by the consumer's construction
   */
  @Override
  public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
    return new Lucene80NormsConsumer(
        state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }

  /**
   * Creates the producer used to read norms.
   *
   * @param state read state handed to the producer
   * @return a new {@code Lucene80NormsProducer}
   * @throws IOException declared by the producer's construction
   */
  @Override
  public NormsProducer normsProducer(SegmentReadState state) throws IOException {
    return new Lucene80NormsProducer(
        state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
  }
}