Elasticsearch(Lucene)检索关联性匹配算法(BM25)的源码解析

最新推荐文章于 2024-06-27 08:28:44 发布

低调的JVM

最新推荐文章于 2024-06-27 08:28:44 发布

阅读量1.1k

点赞数

分类专栏： lucene elasticsearch

本文链接：https://blog.csdn.net/qq_27529917/article/details/103658051

版权

elasticsearch 同时被 2 个专栏收录

9 篇文章 2 订阅

订阅专栏

lucene

8 篇文章 2 订阅

订阅专栏

计算单个doc的相似性评分


/**
 * Scores individual documents for one query, using the weight and the
 * length-normalization caches precomputed in {@code BM25Stats}.
 */
private class BM25DocScorer extends SimScorer {

    /** Query-level statistics this scorer was built from. */
    private final BM25Stats stats;
    /** {@code stats.weight * (k1 + 1)}, i.e. boost * idf * (k1 + 1). */
    private final float weightValue;
    /** Encoded per-document norms for the field; {@code null} is handled as "no norms". */
    private final NumericDocValues norms;
    /** Decoded-length table that matches the index version (LENGTH_TABLE or OLD_LENGTH_TABLE). */
    private final float[] lengthCache;
    /** Precomputed k1 * ((1 - b) + b * dl / avgdl), one entry per possible norm byte. */
    private final float[] cache;

    /**
     * @param stats                    precomputed weight and normalization caches
     * @param indexCreatedVersionMajor major version the index was created with;
     *                                 7 and above selects the current tables, anything lower the old ones
     * @param norms                    encoded norms for the field, may be {@code null}
     * @throws IOException declared by the signature
     */
    BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
        final boolean currentEncoding = indexCreatedVersionMajor >= 7;
        this.stats = stats;
        this.norms = norms;
        // k1 dampens the effect of very high term frequencies (default 1.2 per the original notes).
        this.weightValue = stats.weight * (k1 + 1);
        this.lengthCache = currentEncoding ? LENGTH_TABLE : OLD_LENGTH_TABLE;
        this.cache = currentEncoding ? stats.cache : stats.oldCache;
    }

    /**
     * Scores a single document.
     *
     * @param doc  document id within the inverted index segment
     * @param freq sloppy term frequency
     * @return {@code weightValue * freq / (freq + norm)}, where {@code norm} is the
     *         cached length-normalization factor for this document
     * @throws IOException propagated from the norms lookup
     */
    @Override
    public float score(int doc, float freq) throws IOException {
        final float norm;
        if (norms == null) {
            // Without norms we act as if b = 0, so the factor collapses to k1.
            norm = k1;
        } else {
            // The low byte of the stored norm selects the cache slot; each slot holds
            // k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl) (or the OLD_LENGTH_TABLE variant).
            // Documents without a norm value fall back to slot 0.
            final int slot = norms.advanceExact(doc) ? ((byte) norms.longValue()) & 0xFF : 0;
            norm = cache[slot];
        }
        // weightValue = stats.weight * (k1 + 1)
        //             = idf * boost * (k1 + 1)
        //             = log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) * boost * (k1 + 1)   (single term)
        return weightValue * freq / (freq + norm);
    }
}


static {
    for (int i = 1; i < 256; i++) {
        float f = SmallFloat.byte315ToFloat((byte)i);
        OLD_LENGTH_TABLE[i] = 1.0f / (f * f);
    }
    OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf

    for (int i = 0; i < 256; i++) {
    	// LENGTH_TABLE 存储 的值, 在前40个数字是递增的，之后就逐渐增加，且幅度越来越大
    	// 0.0	1.0	2.0	3.0	4.0	5.0	6.0	7.0	8.0	9.0	10.0	11.0	12.0	13.0	14.0	15.0	16.0	17.0	18.0	19.0	
    	// ......	
    	//35.0	36.0	37.0	38.0	39.0	40.0	42.0	44.0	46.0	48.0	50.0	52.0	54.0	56.0	60.0	
    	//64.0	68.0	72.0	76.0	80.0	84.0	88.0	96.0	104.0	112.0	120.0	128.0	136.0	144.0	152.0	
    	//168.0	184.0	200.0	216.0	232.0	248.0	264.0	280.0	312.0	344.0	376.0	408.0	440.0	472.0	504.0	
    	//......
    	//1.61061274E9	1.74483046E9	1.87904819E9	2.01326592E9
        LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte)i);
    }
}

计算原始相似性权重

/**
 * Computes the query-level similarity weight for a field.
 *
 * <p>Builds the idf explanation (single-term or summed multi-term variant), the
 * average field length, and two 256-entry caches of the length-normalization
 * factor {@code k1 * ((1 - b) + b * length / avgdl)}: one based on
 * {@code LENGTH_TABLE} and one based on {@code OLD_LENGTH_TABLE}.
 *
 * @param boost           query boost
 * @param collectionStats statistics of the field across the collection
 * @param termStats       statistics of the query term(s); more than one for e.g. a phrase
 * @return a {@code BM25Stats} holding field name, boost, idf, avgdl and both caches
 */
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats,TermStatistics... termStats) {
    // A single term uses its own idf; several terms use the summed variant.
    final Explanation idf;
    if (termStats.length == 1) {
        idf = idfExplain(collectionStats, termStats[0]);
    } else {
        idf = idfExplain(collectionStats, termStats);
    }
    // Average number of terms per document for this field.
    final float avgdl = avgFieldLength(collectionStats);

    // Length-normalization factors, one per norm byte.
    // k1: dampens very high term frequencies (default 1.2 per the original notes).
    // b:  how strongly document length influences the score; larger means
    //     length matters more (default 0.75 per the original notes).
    final float[] cache = new float[256];
    final float[] oldCache = new float[256];
    for (int slot = 0; slot < cache.length; slot++) {
        // Used by the current encoding.
        cache[slot] = k1 * ((1 - b) + b * LENGTH_TABLE[slot] / avgdl);
        // Used by the older encoding.
        oldCache[slot] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[slot] / avgdl);
    }
    return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
}

/**
 * Bundles the per-query statistics and derives the raw weight.
 *
 * @param field    name of the field the statistics belong to
 * @param boost    query boost
 * @param idf      idf explanation; its value feeds the raw weight
 * @param avgdl    average field length
 * @param oldCache normalization factors based on the old length table
 * @param cache    normalization factors based on the current length table
 */
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
    this.field = field;
    this.avgdl = avgdl;
    this.boost = boost;
    this.idf = idf;
    this.cache = cache;
    this.oldCache = oldCache;
    // Raw weight = idf * boost; the (k1 + 1) factor is applied later by the scorer.
    this.weight = idf.getValue() * boost;
}

计算单个term的idf

/**
 * Computes the idf of a single term together with an explanation of its inputs.
 *
 * @param collectionStats statistics of the field across the collection
 * @param termStats       statistics of the term
 * @return an explanation whose value is {@code idf(docFreq, docCount)} and whose
 *         details are the docFreq and docCount that were used
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    // A docCount of -1 is treated as unavailable; fall back to maxDoc.
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        docCount = collectionStats.maxDoc();
    }
    final float idfValue = idf(docFreq, docCount);
    final Explanation docFreqDetail = Explanation.match(docFreq, "docFreq");
    final Explanation docCountDetail = Explanation.match(docCount, "docCount");
    return Explanation.match(
        idfValue,
        "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
        docFreqDetail,
        docCountDetail);
}

/**
 * Inverse document frequency, implemented as
 * <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>.
 *
 * @param docFreq  number of documents containing the term
 * @param docCount number of documents considered for the field
 * @return the idf, computed in double precision and narrowed to float
 */
 protected float idf(long docFreq, long docCount) {
     final double numerator = docCount - docFreq + 0.5D;
     final double denominator = docFreq + 0.5D;
     return (float) Math.log(1 + numerator / denominator);
 }

计算多个term的idf之和

/**
 * Computes the combined idf of several terms: each term's idf is computed
 * separately and the values are added up.
 *
 * @param collectionStats statistics of the field across the collection
 * @param termStats       statistics of every term
 * @return an explanation whose value is the sum of the per-term idf values and
 *         whose details are the per-term explanations, in input order
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
    final List<Explanation> perTerm = new ArrayList<>();
    // Accumulate in a double and narrow to float only once at the end.
    double total = 0d;
    for (int i = 0; i < termStats.length; i++) {
        final Explanation termIdf = idfExplain(collectionStats, termStats[i]);
        perTerm.add(termIdf);
        total += termIdf.getValue();
    }
    return Explanation.match((float) total, "idf(), sum of:", perTerm);
}

计算符合条件的所有doc的平均field长度（即平均term个数）

/**
 * Computes the average field length, measured in terms per document.
 * The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>.
 *
 * <p>Each statistic uses -1 as an "unavailable" marker: sumTotalTermFreq falls
 * back to sumDocFreq, docCount falls back to maxDoc, and when neither term
 * total is available the result is 1.
 *
 * @param collectionStats statistics of the field across the collection
 * @return the average number of terms per document
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    // Total occurrences of all terms in the field (a term may occur many times per doc).
    long termTotal = collectionStats.sumTotalTermFreq();
    if (termTotal == -1) {
        // frequencies are omitted (tf=1), so use the number of postings instead:
        // each term counts once per document it occurs in.
        termTotal = collectionStats.sumDocFreq();
        if (termTotal == -1) {
            // theoretical case only: remove!
            return 1f;
        }
    }
    // Prefer the count of documents reported for the field; otherwise use all documents.
    long docCount = collectionStats.docCount();
    if (docCount == -1) {
        docCount = collectionStats.maxDoc();
    }
    return (float) (termTotal / (double) docCount);
}