相似度 (Similarity Score) 的计算

相似度 (Similarity Score) 经常用在信息检索中. 在信息检索中, 相似度用来衡量文档与查询之间字和词的共性大小. 例如在字符匹配中, 可以把相似度T定义为被识字符块与标准字模板某类点的共有数分别占两图块该类点总数的比率, 其取值范围为0~1. 之前我用JAVA代码介绍了TF-IDF的计算. 这里我将用余弦相似度公式来介绍在信息检索中如何计算文档与查询的相似度.

 

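公式:

$$\mathrm{sim}(d,q)=\frac{\sum_{t} w_{t,d}\,w_{t,q}}{\sqrt{\sum_{t} w_{t,d}^{2}}\;\sqrt{\sum_{t} w_{t,q}^{2}}}$$ 其中 $w_{t,d}$ 和 $w_{t,q}$ 分别是词 $t$ 在文档 $d$ 和查询 $q$ 中的 TF-IDF 值.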

在计算相似度之前, 需要先计算文档中每个词的TF-IDF (参见TF-IDF的计算中的JAVA代码), 以及查询中每个词的TF-IDF.

 

代码:

 

/**
 * Scores every document against a query with cosine similarity over TF-IDF weights:
 * sim(d, q) = dot(d, q) / (|d| * |q|).
 *
 * The query weights are built here from the query words and the supplied IDF table;
 * the document weights are taken from the supplied per-document TF-IDF maps.
 * Both vectors are indexed by distinct term, so a word repeated in a document or in
 * the query contributes exactly one component.
 */
class SimilarityScore
{
 HashMap<Integer,Double> similarityscore = new HashMap<Integer,Double>();
 ArrayList<HashMap<String,Double>> TFIDFListSS = new ArrayList<HashMap<String,Double>>();
 ArrayList<ArrayList<String>> MainFileListSS = new ArrayList<ArrayList<String>>();
 HashMap<String,Double> IDFListSS = new HashMap<String,Double>();
 String[] InputWords;

 /**
  * @param MFL    token list of each document; the list index is the document id
  * @param TFIDFl TF-IDF weight per term for each document, parallel to {@code MFL}
  * @param IDFL   IDF value per term
  * @param iw     the query words (may contain repeats)
  */
 public SimilarityScore(ArrayList<ArrayList<String>> MFL, ArrayList<HashMap<String,Double>> TFIDFl, HashMap<String,Double> IDFL, String[] iw)
 {
  MainFileListSS = MFL;
  TFIDFListSS = TFIDFl;
  IDFListSS = IDFL;
  InputWords = iw;
 }

 /**
  * Computes the cosine similarity between the query and every document.
  *
  * @return map from document index (position in the document list) to its score;
  *         the score is 0.0 when either vector has zero length, never NaN
  */
 public HashMap<Integer,Double> PrintSimilarityScore()
 {
  // Start from a clean map so a repeated call cannot return stale entries.
  similarityscore.clear();

  // The query vector and its norm do not depend on the document: compute them once.
  HashMap<String,Double> queryVector = buildQueryVector();
  double queryNorm = norm(queryVector);

  for(int i=0; i<MainFileListSS.size(); i++)
  {
   HashMap<String,Double> docVector = buildDocumentVector(MainFileListSS.get(i), TFIDFListSS.get(i));

   // Numerator: sum of d*q over the terms the document shares with the query.
   double dotProduct = 0.0;
   for(String term : docVector.keySet())
   {
    Double queryWeight = queryVector.get(term);
    if(queryWeight != null)
    {
     dotProduct += queryWeight * docVector.get(term);
    }
   }

   double denominator = norm(docVector) * queryNorm;

   // A zero-length vector has no direction; report "no similarity" instead of 0/0 = NaN.
   double sim = (denominator == 0.0) ? 0.0 : dotProduct / denominator;

   similarityscore.put(i,sim);
  }
  return similarityscore;
 }

 /** Builds the query's TF-IDF weight per distinct query word. */
 private HashMap<String,Double> buildQueryVector()
 {
  // Raw occurrence count of each query word, tracking the largest count seen.
  HashMap<String,Double> counts = new HashMap<String,Double>();
  double maxFreq = 0.0;
  if(InputWords != null)
  {
   for(String word : InputWords)
   {
    Double previous = counts.get(word);
    double freq = (previous == null) ? 1.0 : previous + 1.0;
    counts.put(word, freq);
    if(freq > maxFreq)
    {
     maxFreq = freq;
    }
   }
  }

  HashMap<String,Double> vector = new HashMap<String,Double>();
  for(String word : counts.keySet())
  {
   // TF is the count normalised by the most frequent query word.
   double tf = counts.get(word) / maxFreq;
   // A word with no IDF entry gets weight 0 rather than a NullPointerException.
   Double idf = IDFListSS.get(word);
   vector.put(word, tf * (idf == null ? 0.0 : idf));
  }
  return vector;
 }

 /** Collects one TF-IDF weight per distinct term of a document, skipping terms without a weight. */
 private static HashMap<String,Double> buildDocumentVector(ArrayList<String> tokens, HashMap<String,Double> weights)
 {
  HashMap<String,Double> vector = new HashMap<String,Double>();
  for(String term : tokens)
  {
   Double weight = weights.get(term);
   if(weight != null)
   {
    // Re-putting a repeated term stores the same value, so duplicates collapse.
    vector.put(term, weight);
   }
  }
  return vector;
 }

 /** Returns the Euclidean length of a weight vector. */
 private static double norm(HashMap<String,Double> vector)
 {
  double sumOfSquares = 0.0;
  for(double weight : vector.values())
  {
   sumOfSquares += weight * weight;
  }
  return Math.sqrt(sumOfSquares);
 }
}

 

最后需要在主函数中执行程序.

 

代码:

 

import java.util.*;
import java.io.*;

class VectorSpaceModel
{
 public static void main(String args[])
 {
  //Create scanner to obtain input from comman window
  Scanner input = new Scanner(System.in);  
  System.out.print("Enter Query words: "); //prompt
  String Terms;//terms to add string
  Terms = input.nextLine();//read first number from user
  
  String[] InputTerm = Terms.split(" ");
  
  //===============================================
  //print main file
  ArrayList<ArrayList<String>> sourcelist = new ArrayList<ArrayList<String>>();
  ReadSource source = new ReadSource();
  sourcelist = source.readsource();
  
  //check TF
  ArrayList<HashMap<String,Double>> tfList = new ArrayList<HashMap<String, Double>>();
  TFs tf = new TFs(sourcelist);
  tfList = tf.PrintTFs();
  
  //check IDF
  HashMap<String,Double> idfList = new HashMap<String,Double>();
  IDFs idf = new IDFs(sourcelist);
  idfList = idf.PrintIDFs();

  //check TF_IDF
  ArrayList<HashMap<String,Double>> tfidfList = new ArrayList<HashMap<String,Double>>();
  TFIDF tfidf = new TFIDF(sourcelist,idfList,tfList);
  tfidfList = tfidf.PrintTFIDF();
  
  //check similarity Score
  HashMap<Integer,Double> similarityscore = new HashMap<Integer,Double>();
  SimilarityScore ss = new SimilarityScore(sourcelist,tfidfList,idfList,InputTerm);
  similarityscore = ss.PrintSimilarityScore();
  
  for(int j=0; j<100; j++)
  {
   for(int i=0; i<similarityscore.size(); i++)
   {
    System.out.println(similarityscore.get(i));
   }
  } 
 }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值