相似度 (Similarity Score) 的计算

最新推荐文章于 2025-03-04 22:43:31 发布

天空上尉

最新推荐文章于 2025-03-04 22:43:31 发布

阅读量1w

点赞数

文章标签： query string import 文档 input class

本文链接：https://blog.csdn.net/dawei1980/article/details/6741392

版权

相似度 (Similarity Score) 经常用于信息检索. 它在信息检索中的作用是衡量文档与查询在字、词上的共性大小, 取值范围为 0~1 (在字符识别中, 相似度 T 也类似地定义为被识字符块与标准字模板某类点的共有数分别占两图块该类点总数的比率). 在之前的文章中我用 JAVA 代码介绍了 TF-IDF 的计算. 这里我将用余弦相似度公式来介绍在信息检索中如何计算文档与查询的相似度.

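公式:

$$\mathrm{sim}(d, q) = \frac{\sum_{i} w_{i,d}\, w_{i,q}}{\sqrt{\sum_{i} w_{i,d}^{2}}\,\sqrt{\sum_{i} w_{i,q}^{2}}}$$

其中 $w_{i,d}$ 和 $w_{i,q}$ 分别是词 $i$ 在文档 $d$ 和查询 $q$ 中的 TF-IDF 权重.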

在计算相似度之前, 需要先得到文档中各个词的 TF-IDF (见《TF-IDF的计算》中的 JAVA 代码), 以及查询中各个词的 TF-IDF.

代码:

/**
 * Scores every document against a query with cosine similarity over TF-IDF weights:
 *
 *   sim(d, q) = sum(w_d * w_q) / (sqrt(sum(w_d^2)) * sqrt(sum(w_q^2)))
 *
 * Each distinct term contributes exactly once to the dot product and to each norm,
 * so the score of a document with non-negative weights stays within [0, 1].
 */
class SimilarityScore
{
  // Document index -> similarity score; filled (and returned) by PrintSimilarityScore().
  HashMap<Integer,Double> similarityscore = new HashMap<Integer,Double>();
  // Per-document map of term -> TF-IDF weight, index-aligned with MainFileListSS.
  ArrayList<HashMap<String,Double>> TFIDFListSS = new ArrayList<HashMap<String,Double>>();
  // Per-document token lists (tokens may repeat).
  ArrayList<ArrayList<String>> MainFileListSS = new ArrayList<ArrayList<String>>();
  // Term -> IDF value, used to weight the query terms.
  HashMap<String,Double> IDFListSS = new HashMap<String,Double>();
  // Query tokens as entered (tokens may repeat).
  String[] InputWords;

  /**
   * @param MFL    token list of every document
   * @param TFIDFl TF-IDF weights of every document, in the same order as MFL
   * @param IDFL   IDF value of every known term
   * @param iw     query tokens
   */
  public SimilarityScore(ArrayList<ArrayList<String>> MFL, ArrayList<HashMap<String,Double>> TFIDFl, HashMap<String,Double> IDFL, String[] iw)
  {
    MainFileListSS = TFIDFListSafe(MFL);
    TFIDFListSS = TFIDFl;
    IDFListSS = IDFL;
    InputWords = iw;
  }

  // Identity helper kept trivial: the document list is stored by reference, as before.
  private static ArrayList<ArrayList<String>> TFIDFListSafe(ArrayList<ArrayList<String>> list)
  {
    return list;
  }

  /**
   * Computes the cosine similarity between the query and each document.
   *
   * @return map from document index (position in the document list) to its score;
   *         the score is 0.0 when the query or the document has no non-zero weight
   *         (instead of the NaN a 0/0 division would produce)
   */
  public HashMap<Integer,Double> PrintSimilarityScore()
  {
    HashMap<String,Double> queryWeights = buildQueryWeights();

    // The query norm does not depend on the document, so compute it once.
    double querySquares = 0.0;
    for (Double weight : queryWeights.values())
    {
      querySquares += weight * weight;
    }
    double queryNorm = Math.sqrt(querySquares);

    for (int i = 0; i < MainFileListSS.size(); i++)
    {
      HashMap<String,Double> docWeights = TFIDFListSS.get(i);
      // De-duplicate tokens: a term that occurs several times in the document is
      // still a single dimension of the document vector.
      HashSet<String> docTerms = new HashSet<String>(MainFileListSS.get(i));

      double dotProduct = 0.0;
      double docSquares = 0.0;
      for (String term : docTerms)
      {
        Double docWeight = docWeights.get(term);
        if (docWeight == null)
        {
          // No weight recorded for this token: it contributes nothing.
          continue;
        }
        docSquares += docWeight * docWeight;

        Double queryWeight = queryWeights.get(term);
        if (queryWeight != null)
        {
          dotProduct += queryWeight * docWeight;
        }
      }

      double denominator = Math.sqrt(docSquares) * queryNorm;
      double sim = (denominator == 0.0) ? 0.0 : dotProduct / denominator;
      similarityscore.put(i, sim);
    }
    return similarityscore;
  }

  /**
   * Builds the query vector: for each distinct query term,
   * (count / highest count in the query) * IDF. Terms without an IDF entry get weight 0.
   */
  private HashMap<String,Double> buildQueryWeights()
  {
    HashMap<String,Double> weights = new HashMap<String,Double>();
    if (InputWords == null || InputWords.length == 0)
    {
      return weights;
    }

    // Count occurrences of each query term and track the highest count.
    HashMap<String,Double> counts = new HashMap<String,Double>();
    double maxCount = 0.0;
    for (String word : InputWords)
    {
      Double seen = counts.get(word);
      double updated = (seen == null) ? 1.0 : seen + 1.0;
      counts.put(word, updated);
      if (updated > maxCount)
      {
        maxCount = updated;
      }
    }

    for (String word : counts.keySet())
    {
      double tf = counts.get(word) / maxCount;
      Double idf = IDFListSS.get(word);
      weights.put(word, (idf == null) ? 0.0 : tf * idf);
    }
    return weights;
  }
}

最后需要在主函数中执行程序.

代码:

import java.util.*;
import java.io.*;

/**
 * Command-line entry point: reads a query from standard input, builds the TF, IDF and
 * TF-IDF tables for the source documents, and prints one similarity score per document.
 */
class VectorSpaceModel
{
  public static void main(String args[])
  {
    // Read the query line from the command window.
    Scanner input = new Scanner(System.in);
    System.out.print("Enter Query words: "); //prompt
    String Terms = input.nextLine();

    // Split on runs of whitespace so leading/trailing/repeated spaces do not
    // produce empty query terms.
    String trimmed = Terms.trim();
    String[] InputTerm = (trimmed.length() == 0) ? new String[0] : trimmed.split("\\s+");

    // Load the tokenised source documents.
    ReadSource source = new ReadSource();
    ArrayList<ArrayList<String>> sourcelist = source.readsource();

    // Term frequencies per document.
    TFs tf = new TFs(sourcelist);
    ArrayList<HashMap<String,Double>> tfList = tf.PrintTFs();

    // Inverse document frequencies.
    IDFs idf = new IDFs(sourcelist);
    HashMap<String,Double> idfList = idf.PrintIDFs();

    // TF-IDF weights per document.
    TFIDF tfidf = new TFIDF(sourcelist,idfList,tfList);
    ArrayList<HashMap<String,Double>> tfidfList = tfidf.PrintTFIDF();

    // Similarity of the query against every document.
    SimilarityScore ss = new SimilarityScore(sourcelist,tfidfList,idfList,InputTerm);
    HashMap<Integer,Double> similarityscore = ss.PrintSimilarityScore();

    // Print each document's score once, in document order.
    for(int i=0; i<similarityscore.size(); i++)
    {
      System.out.println(similarityscore.get(i));
    }
  }
}