lucene 源码中tfidf打分模块解析随笔

最新推荐文章于 2020-08-11 23:12:43 发布

a925907195

最新推荐文章于 2020-08-11 23:12:43 发布

阅读量1.4k

点赞数

分类专栏： java solr+lucene 搜索引擎资料

本文链接：https://blog.csdn.net/a925907195/article/details/50390221

版权

java 同时被 3 个专栏收录

144 篇文章 0 订阅

订阅专栏

solr+lucene

22 篇文章 0 订阅

订阅专栏

搜索引擎资料

5 篇文章 0 订阅

订阅专栏

Lucene的文档打分的公式：

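$$\mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \mathrm{queryNorm}(q) \cdot \sum_{t \in q} \Big( \mathrm{tf}(t \in d) \cdot \mathrm{idf}(t)^{2} \cdot t.\mathrm{getBoost}() \cdot \mathrm{norm}(t,d) \Big)$$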

Document Boost和Field Boost影响的是norm(t, d)，其公式如下：

$$\mathrm{norm}(t,d) = \mathrm{doc.getBoost}() \cdot \mathrm{lengthNorm}(\mathrm{field}) \cdot \prod_{\text{field } f \in d \text{ named as } t} f.\mathrm{getBoost}()$$

它包括三个参数：

Document boost：此值越大，说明此文档越重要。
Field boost：此值越大，说明此域越重要。
lengthNorm(field) = (1.0 / Math.sqrt(numTerms))：一个域中包含的Term总数越多，也即文档越长，此值越小，文档越短，此值越大。

打分部分测试代码

package com;

import java.io.File;

import java.util.ArrayList;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.Term;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class NormsDocBoostTest {

private IndexWriter writer;

private Analyzer analyzer;

List<Document> documents = new ArrayList<Document>();

private static final Version VERSION = Version.LUCENE_47;// lucene版本

public static void main(String[] args) {

NormsDocBoostTest normsDocBoostTest=new NormsDocBoostTest();

try {

normsDocBoostTest.testNormsDocBoost();

} catch (Exception e) {

throw new RuntimeException();

}

public void testNormsDocBoost() throws Exception {

System.out.println(Math.log(0.75)+1);

// String indexDir = "D:/luceneTest/NormsDocBoost";

File indexDir = new File("D:/luceneTest/NormsDocBoost");

// analyzer = new IKAnalyzer();

analyzer = new StandardAnalyzer(VERSION.LUCENE_47);

IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);

indexWriterConfig.setMaxBufferedDocs(10000);

indexWriterConfig.setRAMBufferSizeMB(64);

// /设置索引的打开模式创建或者添加索引

indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),indexWriterConfig);

// writer.setUseCompoundFile(false);

Document doc1 = new Document();

Field f1 = new Field("contents", "common hello hello", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS);

// f1.setBoost(100);

doc1.add(f1);

//lucene4.x 以后不能给文档加分值，如果要提高需要给每个文档加权重

// doc1.setBoost(100);

writer.addDocument(doc1);

Document doc2 = new Document();

Field f2 = new Field("contents", "common common hello", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS);

doc2.add(f2);

writer.addDocument(doc2);

Document doc3 = new Document();

Field f3 = new Field("contents", "common common common", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS);

doc3.add(f3);

writer.addDocument(doc3);

writer.close();

IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));

IndexSearcher searcher = new IndexSearcher(reader);

TopDocs docs = searcher.search(new TermQuery(new Term("contents", "common")), 10);

for (ScoreDoc doc : docs.scoreDocs) {

System.out.println("docid : " + doc.doc + " score : " + doc.score);

}

/* docid : 2 score : 1.2337708

docid : 1 score : 1.0073696

docid : 0 score : 0.71231794*/

}

主要的打分部分代码（其中将引用的部分函数以及计算中的分值做记录）

/**
 * Rewrites {@code query}, creates its {@link Weight}, and normalizes that weight
 * with the query norm obtained from the similarity. The comments record the
 * values observed while tracing a single-term query over three documents.
 *
 * @param query the query to build a weight for
 * @return the normalized weight
 * @throws IOException if rewriting the query or creating the weight fails
 */
public Weight createNormalizedWeight(Query query) throws IOException {
    final Query rewritten = rewrite(query);

    /*
     * Step 1 - raw weight. In the traced run doc = 3 and maxDoc = 3.
     *
     * public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
     *   final Explanation idf = termStats.length == 1
     *       ? idfExplain(collectionStats, termStats[0])
     *       : idfExplain(collectionStats, termStats);
     *   return new IDFStats(collectionStats.field(), idf, queryBoost);
     * }
     *
     * The idf itself is computed as
     *   return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); // here numDocs = 3, docFreq = 3
     *
     * The weight is then built from it, with
     * idf = 0.71231794 = idf(docFreq=3, maxDocs=3) and queryBoost = 1.0f:
     *   IDFStats(collectionStats.field(), idf, queryBoost);
     *
     * public IDFStats(String field, Explanation idf, float queryBoost) {
     *   // TODO: Validate?
     *   this.field = field;
     *   this.idf = idf;
     *   this.queryBoost = queryBoost;
     *   this.queryWeight = idf.getValue() * queryBoost; // compute query weight
     *   // queryWeight = 0.71231794 * 1.0f
     * }
     *
     * The steps above yield the raw (un-normalized) weight.
     */
    final Weight weight = rewritten.createWeight(this);

    // Value used for normalization: weight * weight, i.e. idf * idf.
    // 0.71231794 * 0.71231794 = 0.5073969
    final float sumOfSquaredWeights = weight.getValueForNormalization();

    // queryNorm = (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
    // with sumOfSquaredWeights = 0.5073969 this gives 1.4038675
    final float computedNorm = getSimilarity().queryNorm(sumOfSquaredWeights);
    final boolean unusable = Float.isInfinite(computedNorm) || Float.isNaN(computedNorm);
    final float norm = unusable ? 1.0f : computedNorm;

    /*
     * Step 2 - normalization. Here queryNorm = 1.4038675, topLevelBoost = 1.0 and
     * queryWeight = 0.71231794 (the idf).
     *
     * @Override
     * public void normalize(float queryNorm, float topLevelBoost) {
     *   this.queryNorm = queryNorm * topLevelBoost;
     *   queryWeight *= this.queryNorm;         // normalize query weight
     *   value = queryWeight * idf.getValue();  // idf for document
     *   // the final weight value is 0.71231794
     * }
     */
    weight.normalize(norm, 1.0f);
    return weight;
}

a925907195

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
lucene 源码中tfidf打分模块解析随笔

Lucene的文档打分的公式：score(q,d) = coord(q,d) · queryNorm(q) · ∑( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
复制链接

扫一扫

专栏目录