计算单个doc的相似性评分
/**
 * Scores a single document for one term using the BM25 formula.
 * The query-dependent factors are precomputed in {@link BM25Stats}; only the
 * per-document length normalization is resolved here.
 */
private class BM25DocScorer extends SimScorer {
    private final BM25Stats stats;
    // Document-independent factor: boost * idf * (k1 + 1).
    private final float weightValue;
    // Per-document encoded length norms; may be null if norms were omitted.
    private final NumericDocValues norms;
    /** Decoded document length for each of the 256 possible norm bytes. */
    private final float[] lengthCache;
    /** Precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl). */
    private final float[] cache;

    BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
        this.stats = stats;
        // k1 (default 1.2) saturates the term-frequency contribution, limiting
        // the influence of very high term frequencies.
        this.weightValue = stats.weight * (k1 + 1);
        this.norms = norms;
        // Indexes created with Lucene 7+ use the newer norm encoding.
        final boolean modernEncoding = indexCreatedVersionMajor >= 7;
        this.lengthCache = modernEncoding ? LENGTH_TABLE : OLD_LENGTH_TABLE;
        this.cache = modernEncoding ? stats.cache : stats.oldCache;
    }

    /**
     * Computes the BM25 score for one document.
     *
     * @param doc document id within the inverted index segment
     * @param freq sloppy term frequency
     * @return the BM25 score contribution of this term for {@code doc}
     * @throws IOException if reading the norms fails
     */
    @Override
    public float score(int doc, float freq) throws IOException {
        final float norm;
        if (norms == null) {
            // No norms stored: act as if b = 0, i.e. ignore document length.
            norm = k1;
        } else if (norms.advanceExact(doc)) {
            // The low byte of the stored norm indexes the precomputed
            // k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl) cache.
            norm = cache[((byte) norms.longValue()) & 0xFF];
        } else {
            // Document has no norm value: use the entry for the smallest length.
            norm = cache[0];
        }
        // weightValue = idf * boost * (k1 + 1); the freq / (freq + norm) term
        // is the saturating tf component of BM25.
        return weightValue * freq / (freq + norm);
    }
}
static {
for (int i = 1; i < 256; i++) {
float f = SmallFloat.byte315ToFloat((byte)i);
OLD_LENGTH_TABLE[i] = 1.0f / (f * f);
}
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
for (int i = 0; i < 256; i++) {
// LENGTH_TABLE 存储 的值, 在前40个数字是递增的,之后就逐渐增加,且幅度越来越大
// 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 19.0
// ......
//35.0 36.0 37.0 38.0 39.0 40.0 42.0 44.0 46.0 48.0 50.0 52.0 54.0 56.0 60.0
//64.0 68.0 72.0 76.0 80.0 84.0 88.0 96.0 104.0 112.0 120.0 128.0 136.0 144.0 152.0
//168.0 184.0 200.0 216.0 232.0 248.0 264.0 280.0 312.0 344.0 376.0 408.0 440.0 472.0 504.0
//......
//1.61061274E9 1.74483046E9 1.87904819E9 2.01326592E9
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte)i);
}
}
计算原始相似性权重
/**
 * Builds the query-level BM25 weight: the idf explanation plus per-byte
 * length-normalization caches consulted later when scoring each document.
 */
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    // Single-term and multi-term (phrase) queries explain idf differently.
    Explanation idf = (termStats.length == 1)
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    // Average number of terms per document for this field.
    float avgdl = avgFieldLength(collectionStats);
    // Precompute k1 * ((1 - b) + b * dl / avgdl) for every possible norm byte.
    // k1 (default 1.2) saturates the tf contribution, bounding it by k1 + 1;
    // b (default 0.75) controls how strongly document length affects the score:
    // larger b means length matters more, smaller b means it matters less.
    float[] oldCache = new float[256];
    float[] cache = new float[256];
    for (int i = 0; i < 256; i++) {
        oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl); // pre-7.0 indexes
        cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);       // current indexes
    }
    return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
}
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
    // Document-independent part of the score: idf * query boost.
    this.weight = idf.getValue() * boost;
    this.idf = idf;
    this.boost = boost;
    this.field = field;
    this.avgdl = avgdl;
    // Precomputed length-normalization tables (legacy and current encodings).
    this.oldCache = oldCache;
    this.cache = cache;
}
计算单个term的idf
/**
 * Explains the idf of a single term, falling back to maxDoc when the
 * per-field docCount statistic is unavailable (reported as -1).
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        docCount = collectionStats.maxDoc();
    }
    final float idf = idf(df, docCount);
    return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
            Explanation.match(df, "docFreq"),
            Explanation.match(docCount, "docCount"));
}
/**
 * Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>.
 */
protected float idf(long docFreq, long docCount) {
    final double numerator = docCount - docFreq + 0.5D;
    final double denominator = docFreq + 0.5D;
    return (float) Math.log(1 + numerator / denominator);
}
计算多个term的idf之和
/**
 * Explains the combined idf of several terms (e.g. a phrase match): the
 * per-term idf values are summed, accumulating in a double before the final
 * cast to float to limit rounding error.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
    double idfSum = 0d;
    final List<Explanation> details = new ArrayList<>(termStats.length);
    for (int i = 0; i < termStats.length; i++) {
        final Explanation termIdf = idfExplain(collectionStats, termStats[i]);
        details.add(termIdf);
        idfSum += termIdf.getValue();
    }
    return Explanation.match((float) idfSum, "idf(), sum of:", details);
}
计算符合条件的所有doc的平均term长度
/**
 * Computes the average field length: total term occurrences in the field
 * divided by the number of documents carrying the field, with fallbacks for
 * missing statistics (reported as -1).
 * The default implementation computes the average as
 * <code>sumTotalTermFreq / docCount</code>.
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    long totalTermOccurrences = collectionStats.sumTotalTermFreq();
    if (totalTermOccurrences == -1) {
        // Term frequencies were omitted at index time (tf = 1), so fall back
        // to the number of postings: each term counts once per document.
        totalTermOccurrences = collectionStats.sumDocFreq();
        if (totalTermOccurrences == -1) {
            // theoretical case only: remove!
            return 1f;
        }
    }
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        // Per-field docCount unavailable: use the total document count.
        docCount = collectionStats.maxDoc();
    }
    return (float) (totalTermOccurrences / (double) docCount);
}
代码比较复杂,靠寥寥几段代码和注释是说不清楚的,不过通读代码还是能够对评分计算有一些了解,希望能对熟悉ES的打分机制有所帮助,文笔不佳,敬请见谅
可以看看大神写的总结博客:
elasticSearch(5.3.0)的评分机制的研究