相似度 (Similarity Score) 经常用于信息检索中. 相似度在信息检索中的作用是衡量文档与查询中的字和词的共性大小; 在字符匹配中, 相似度T定义为被识字符块与标准字模板某类点的共有数分别占两图块该类点总数的比率, 其取值范围为0~1. 前文我用JAVA代码介绍了TF-IDF的计算. 这里我将用余弦相似度公式来介绍在信息检索中如何计算文档与查询之间的相似度.
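公式:
$$\mathrm{sim}(d, q) = \frac{\sum_{t} w_{t,d}\, w_{t,q}}{\sqrt{\sum_{t} w_{t,d}^{2}}\ \sqrt{\sum_{t} w_{t,q}^{2}}}$$
其中 $w_{t,d}$ 是词 $t$ 在文档 $d$ 中的TF-IDF权重, $w_{t,q}$ 是词 $t$ 在查询 $q$ 中的TF-IDF权重.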
在计算信息检索的相似度时, 需要先计算文档中各词的TF-IDF (参见"TF-IDF的计算"中的JAVA代码) 以及查询中各词的TF-IDF.
代码:
/**
 * Scores every document against a query using cosine similarity over
 * TF-IDF weights.
 *
 * <p>The document weights and the IDF table are supplied by the caller; this
 * class only derives the query's TF-IDF vector and computes
 * {@code dot(d, q) / (|d| * |q|)} for each document.
 */
class SimilarityScore
{
    /** Result map: document index (position in the document list) to similarity. */
    HashMap<Integer,Double> similarityscore = new HashMap<Integer,Double>();
    /** Per-document TF-IDF weights, index-aligned with {@link #MainFileListSS}. */
    ArrayList<HashMap<String,Double>> TFIDFListSS = new ArrayList<HashMap<String,Double>>();
    /** Per-document token lists; its size determines how many documents are scored. */
    ArrayList<ArrayList<String>> MainFileListSS = new ArrayList<ArrayList<String>>();
    /** IDF value per term. */
    HashMap<String,Double> IDFListSS = new HashMap<String,Double>();
    /** Raw query tokens (may contain repeats). */
    String[] InputWords;

    /**
     * Stores the inputs by reference; nothing is copied or computed here.
     *
     * @param MFL    token list of each document
     * @param TFIDFl TF-IDF weights of each document, index-aligned with {@code MFL}
     * @param IDFL   IDF value per term
     * @param iw     query tokens
     */
    public SimilarityScore(ArrayList<ArrayList<String>> MFL, ArrayList<HashMap<String,Double>> TFIDFl, HashMap<String,Double> IDFL, String[] iw)
    {
        MainFileListSS = TFIDFlGuard(MFL);
        TFIDFListSS = TFIDFl;
        IDFListSS = IDFL;
        InputWords = iw;
    }

    /**
     * Computes the cosine similarity between the query and every document.
     *
     * <p>Each distinct term contributes exactly once to the dot product and to
     * both vector lengths. A document or query whose vector has zero length
     * scores {@code 0.0} rather than {@code NaN}.
     *
     * @return the {@link #similarityscore} map, keyed by document index
     */
    public HashMap<Integer,Double> PrintSimilarityScore()
    {
        HashMap<String,Double> queryWeights = buildQueryWeights();
        // The query vector is the same for every document, so measure it once.
        double queryLength = vectorLength(queryWeights);

        for (int docIndex = 0; docIndex < MainFileListSS.size(); docIndex++)
        {
            HashMap<String,Double> docWeights = TFIDFListSS.get(docIndex);

            // Only terms present in both vectors contribute to the dot product.
            double dotProduct = 0.0;
            for (String term : queryWeights.keySet())
            {
                Double docWeight = docWeights.get(term);
                if (docWeight != null)
                {
                    dotProduct += queryWeights.get(term) * docWeight;
                }
            }

            double denominator = vectorLength(docWeights) * queryLength;
            double sim = (denominator == 0.0) ? 0.0 : dotProduct / denominator;
            similarityscore.put(docIndex, sim);
        }
        return similarityscore;
    }

    /**
     * Builds the query's TF-IDF vector: term frequency normalised by the most
     * frequent query term, multiplied by the term's IDF. Terms missing from
     * the IDF table get weight {@code 0.0}.
     */
    private HashMap<String,Double> buildQueryWeights()
    {
        HashMap<String,Double> weights = new HashMap<String,Double>();
        if (InputWords == null)
        {
            return weights;
        }

        HashMap<String,Double> termCounts = new HashMap<String,Double>();
        double maxCount = 0.0;
        for (String term : InputWords)
        {
            Double previous = termCounts.get(term);
            double count = (previous == null) ? 1.0 : previous + 1.0;
            termCounts.put(term, count);
            if (count > maxCount)
            {
                maxCount = count;
            }
        }

        for (String term : termCounts.keySet())
        {
            double tf = termCounts.get(term) / maxCount;
            Double idf = IDFListSS.get(term);
            weights.put(term, (idf == null) ? 0.0 : tf * idf);
        }
        return weights;
    }

    /** Returns the Euclidean length of a weight vector, ignoring null entries. */
    private static double vectorLength(HashMap<String,Double> weights)
    {
        double sumOfSquares = 0.0;
        for (Double weight : weights.values())
        {
            if (weight != null)
            {
                sumOfSquares += weight * weight;
            }
        }
        return Math.sqrt(sumOfSquares);
    }

    /** Identity pass-through kept separate so the constructor reads as plain assignments. */
    private static ArrayList<ArrayList<String>> TFIDFlGuard(ArrayList<ArrayList<String>> documents)
    {
        return documents;
    }
}
最后需要在主函数中执行程序.
代码:
import java.util.*;
import java.io.*;
/**
 * Command-line driver: reads a query from standard input, runs the
 * TF, IDF, TF-IDF and similarity stages in order, and prints one similarity
 * score per document.
 */
class VectorSpaceModel
{
    /**
     * Prompts for a query line, scores every document against it and prints
     * each score on its own line, in document-index order.
     *
     * @param args unused
     */
    public static void main(String args[])
    {
        // Read the query line from the command window.
        Scanner input = new Scanner(System.in);
        System.out.print("Enter Query words: "); //prompt
        // Treat end-of-input as an empty query instead of throwing.
        String Terms = input.hasNextLine() ? input.nextLine().trim() : "";
        input.close();
        // Split on runs of whitespace so repeated spaces do not yield empty terms.
        String[] InputTerm = (Terms.length() == 0) ? new String[0] : Terms.split("\\s+");
        //===============================================
        // Load the documents (presumably one token list per document; see ReadSource).
        ReadSource source = new ReadSource();
        ArrayList<ArrayList<String>> sourcelist = source.readsource();
        // Term frequencies per document.
        TFs tf = new TFs(sourcelist);
        ArrayList<HashMap<String,Double>> tfList = tf.PrintTFs();
        // Inverse document frequency per term.
        IDFs idf = new IDFs(sourcelist);
        HashMap<String,Double> idfList = idf.PrintIDFs();
        // TF-IDF weights per document.
        TFIDF tfidf = new TFIDF(sourcelist,idfList,tfList);
        ArrayList<HashMap<String,Double>> tfidfList = tfidf.PrintTFIDF();
        // Similarity of each document to the query.
        SimilarityScore ss = new SimilarityScore(sourcelist,tfidfList,idfList,InputTerm);
        HashMap<Integer,Double> similarityscore = ss.PrintSimilarityScore();
        // Print each score once; keys are the document indexes 0..size-1.
        for(int i=0; i<similarityscore.size(); i++)
        {
            System.out.println(similarityscore.get(i));
        }
    }
}