Lucene的文档打分的公式:
score(q,d) = coord(q,d) · queryNorm(q) · ∑_{t in q} ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
Document Boost和Field Boost影响的是norm(t, d),其公式如下:
norm(t,d) = doc.getBoost() · lengthNorm(field) · ∏_{field f in d named as t} f.getBoost()
它包括三个参数:
- Document boost:此值越大,说明此文档越重要。
- Field boost:此域的boost值越大,说明此域越重要。
- lengthNorm(field) = (1.0 / Math.sqrt(numTerms)):一个域中包含的Term总数越多,也即文档越长,此值越小,文档越短,此值越大。
打分部分测试代码
package com;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Demonstrates Lucene 4.7 TF-IDF scoring: indexes three documents whose
 * "contents" field contains 1/2/3 occurrences of the term "common"
 * (norms disabled), then searches for "common" and prints each hit's score.
 *
 * Expected output (tf grows with term frequency, idf is constant):
 *   docid : 2 score : 1.2337708
 *   docid : 1 score : 1.0073696
 *   docid : 0 score : 0.71231794
 */
public class NormsDocBoostTest {
    private IndexWriter writer;
    private Analyzer analyzer;
    List<Document> documents = new ArrayList<Document>();
    // Lucene version used for both the analyzer and the writer config.
    private static final Version VERSION = Version.LUCENE_47;

    public static void main(String[] args) {
        NormsDocBoostTest normsDocBoostTest = new NormsDocBoostTest();
        try {
            normsDocBoostTest.testNormsDocBoost();
        } catch (Exception e) {
            // Preserve the original exception as the cause instead of
            // throwing a bare RuntimeException that hides the failure.
            throw new RuntimeException("scoring demo failed", e);
        }
    }

    /**
     * Builds the index, runs the TermQuery, and prints doc ids with scores.
     *
     * @throws Exception on any index or search failure (demo code)
     */
    public void testNormsDocBoost() throws Exception {
        // idf for docFreq=3, numDocs=3: log(3/4) + 1 — printed for comparison
        // with the scores reported below.
        System.out.println(Math.log(0.75) + 1);
        File indexDir = new File("D:/luceneTest/NormsDocBoost");
        // analyzer = new IKAnalyzer();
        analyzer = new StandardAnalyzer(VERSION); // was VERSION.LUCENE_47 (static via instance)
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);
        indexWriterConfig.setMaxBufferedDocs(10000);
        indexWriterConfig.setRAMBufferSizeMB(64);
        // Open mode: create the index or append to an existing one.
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), indexWriterConfig);
        try {
            // Since Lucene 4.x documents can no longer be boosted directly
            // (doc.setBoost was removed); boost individual fields instead.
            addDoc(writer, "common hello hello");
            addDoc(writer, "common common hello");
            addDoc(writer, "common common common");
        } finally {
            writer.close();
        }

        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(new TermQuery(new Term("contents", "common")), 10);
            for (ScoreDoc doc : docs.scoreDocs) {
                System.out.println("docid : " + doc.doc + " score : " + doc.score);
            }
        } finally {
            reader.close(); // was leaked in the original
        }
        /* docid : 2 score : 1.2337708
        docid : 1 score : 1.0073696
        docid : 0 score : 0.71231794*/
    }

    /** Adds one document with the given "contents" text (analyzed, norms off). */
    private void addDoc(IndexWriter writer, String contents) throws Exception {
        Document doc = new Document();
        doc.add(new Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS));
        writer.addDocument(doc);
    }
}
主要的打分部分代码(其中将引用的部分函数以及计算中的分值做记录)
// Annotated copy of Lucene's IndexSearcher.createNormalizedWeight with the
// runtime values observed for the 3-document test index recorded inline.
public Weight createNormalizedWeight(Query query) throws IOException {
query = rewrite(query);
/* Here numDocs=3, maxDoc=3.
 * public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
return new IDFStats(collectionStats.field(), idf, queryBoost);
}
idf= return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); // here numDocs=3, docFreq=3
This computes the final weight: idf=0.71231794 = idf(docFreq=3, maxDocs=3), queryBoost=1.0f
IDFStats(collectionStats.field(), idf, queryBoost);
public IDFStats(String field, Explanation idf, float queryBoost) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.queryBoost = queryBoost;
this.queryWeight = idf.getValue() * queryBoost; // compute query weight
queryWeight = 0.71231794 * 1.0f
}
The steps above produce the raw (un-normalized) weight.
*/
Weight weight = query.createWeight(this);
// The value returned here is weight*weight, i.e. idf*idf.
float v = weight.getValueForNormalization(); //weight*weight(0.71231794)=0.5073969
// (float)(1.0 / Math.sqrt(sumOfSquaredWeights));=1.4038675
//v=sumOfSquaredWeights=0.5073969
float norm = getSimilarity().queryNorm(v);//v=0.5073969, result=1.4038675
if (Float.isInfinite(norm) || Float.isNaN(norm)) {
norm = 1.0f;
}
/* @Override
 * Here queryNorm=1.4038675, topLevelBoost=1.0, queryWeight=0.71231794 (idf).
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; // normalize query weight
value = queryWeight * idf.getValue(); // idf for document
Final weight value is 0.71231794.
}*/
weight.normalize(norm, 1.0f);
return weight;
}
// NOTE(review): the lines below are a copy/paste duplicate of the body of
// createNormalizedWeight shown earlier in this document, with no enclosing
// method signature — almost certainly an accidental repetition in the notes.
query = rewrite(query);
/* Here numDocs=3, maxDoc=3.
 * public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
return new IDFStats(collectionStats.field(), idf, queryBoost);
}
idf= return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); // here numDocs=3, docFreq=3
This computes the final weight: idf=0.71231794 = idf(docFreq=3, maxDocs=3), queryBoost=1.0f
IDFStats(collectionStats.field(), idf, queryBoost);
public IDFStats(String field, Explanation idf, float queryBoost) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.queryBoost = queryBoost;
this.queryWeight = idf.getValue() * queryBoost; // compute query weight
queryWeight = 0.71231794 * 1.0f
}
The steps above produce the raw (un-normalized) weight.
*/
Weight weight = query.createWeight(this);
// The value returned here is weight*weight, i.e. idf*idf.
float v = weight.getValueForNormalization(); //weight*weight(0.71231794)=0.5073969
// (float)(1.0 / Math.sqrt(sumOfSquaredWeights));=1.4038675
//v=sumOfSquaredWeights=0.5073969
float norm = getSimilarity().queryNorm(v);//v=0.5073969, result=1.4038675
if (Float.isInfinite(norm) || Float.isNaN(norm)) {
norm = 1.0f;
}
/* @Override
 * Here queryNorm=1.4038675, topLevelBoost=1.0, queryWeight=0.71231794 (idf).
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; // normalize query weight
value = queryWeight * idf.getValue(); // idf for document
Final weight value is 0.71231794.
}*/
weight.normalize(norm, 1.0f);
return weight;
}