相对熵（KL距离）的java实现

最新推荐文章于 2023-05-15 17:32:31 发布

随风媛55555

最新推荐文章于 2023-05-15 17:32:31 发布

阅读量1.2k

点赞数

分类专栏：大数据　文章标签：机器学习 KL-距离文本相似性

大数据专栏收录该内容

3 篇文章 0 订阅

订阅专栏

相对熵（relative entropy或 Kullback-Leibler divergence,KL距离）的java实现（二）

实验中，我们采用两种方法计算概率。一：以字符为单位计算概率；二：以汉语词为单位计算概率。在第二种情况下，我们采用Jeasy分词组件进行分词处理，该分词组件为基于前向最大匹配的分词方法，分词结果在绝大多数情况下是正确的。

 
   /**
    * One token of a text (a single character or a segmented word) together with
    * the numeric value attached to it.
    *
    * <p>The fields are intentionally package-visible and mutable: the counting and
    * relative-entropy routines that use this class read and write them directly.
    *
    * @author liuyu
    */
   public class Entity {
       /** The character or word this entry stands for; the empty string when unset. */
       String word = "";

       /** The probability value associated with {@link #word}; zero when unset. */
       float pValue = 0;

       /** Creates an entry with an empty word and a probability of zero. */
       public Entity() {
       }
   }

2.读取文件模块

 
     
   
 
    public 
      
    static 
     String GetFileText(String path)  
    throws 
      FileNotFoundException,IOException
     {   
         InputStreamReader inStreamReader 
    = 
    new 
     InputStreamReader( 
    new 
     FileInputStream(path), 
    " 
    UTF-8 
    " 
    );
          
    // 
    String strFile1= 
    
  
            BufferedReader bufReader 
    = 
    new 
     BufferedReader(inStreamReader);
         String line;
         StringBuilder sb 
    = 
    new 
     StringBuilder();
          
    while 
    ((line 
    = 
    bufReader.readLine()) 
    != 
    null 
    )
         {
             sb.append(line 
    + 
    " 
    　 
    " 
    );
         }
         inStreamReader.close();
         bufReader.close();
         String strFile 
    = 
    sb.toString();
       
         
         
          
    return 
     strFile;
         
     }
  
   
 
     
   

3.分割字符

（1）分词

 
     
   
 
    public 
      
    static 
     String CutText(String path) 
    throws 
     FileNotFoundException,IOException
     {
         
       String fileText 
    = 
    GetFileText(path);
      
         
         MMAnalyzer analyzer 
    = 
    new 
     MMAnalyzer();
         String result  
    = 
    null 
    ;
         String spliter 
    = 
    " 
    | 
    " 
    ;
          
    try 
          
         {
             result  
    = 
     analyzer.segment(fileText, spliter);    
         }      
          
    catch 
     (IOException e)      
         {     
             e.printStackTrace();     
         }     
          
    // 
    System.out.print(result); 
    
  
             
    return 
     result;
         
     }
      
   
 
     
   

（2）分出单个字符

 
     
   
 
    public 
      
    static 
     String CutTextSingleCharacter(String path) 
    throws 
     FileNotFoundException,IOException
     {   String text 
    = 
    GetFileText(path);
         String proText 
    = 
    null 
    ;
         Pattern pattern 
    = 
    Pattern.compile( 
    " 
    [\\u4E00-\\u9FA5\\uF900-\\uFA2D] 
    " 
    );
         Matcher m 
    = 
    pattern.matcher(text);
         StringBuffer sb 
    = 
    new 
     StringBuffer();
         Boolean flag 
    = 
    m.find();
          
    while 
    (flag)
         {
              
    int 
     start 
    = 
    m.start();
              
    int 
     end 
    = 
    m.end();
             sb.append(text.substring(start, end) 
    + 
    " 
    | 
    " 
    );
              
    // 
    System.out.println(text.substring(start,end)); 
    
  
                flag 
    = 
    m.find();
         }
        proText 
    = 
    sb.toString();
          
    return 
     proText;
 }
 
      
   
 
     
   

4.计算字符的概率

 
     
   
 
    public 
      
    static 
     ArrayList 
    < 
    Entity 
    > 
     CalcuP(String path)  
    throws 
     IOException
     {     
    // 
    以词为单位计算相对熵
          
    // 
    String result=CutText(path);
          
    // 
    以字为单位计算相对熵 
    
  
            String result 
    = 
    CutTextSingleCharacter(path);
         String []words 
    = 
    result.split( 
    " 
    \\| 
    " 
    );
         
       
         ArrayList 
    < 
    Entity 
    > 
     enList 
    = 
    new 
     ArrayList();
          
    for 
    (String w: words)
         {  w 
    = 
    w.trim();
             Entity en 
    = 
    new 
     Entity();
             en.word 
    = 
    w;
             en.pValue 
    = 
    1 
    ;
             enList.add(en);
              
    // 
    System.out.println(w); 
    
  
            }
     
          
    float 
     total 
    = 
    enList.size();
          
    for 
    ( 
    int 
     i 
    = 
    0 
    ;i 
    < 
    enList.size() 
    - 
    1 
    ;i 
    ++ 
    )
         { 
             
              
    if 
    ( 
    ! 
    enList.get(i).word.isEmpty())
             {
                  
    for 
    ( 
    int 
     j 
    = 
    i 
    + 
    1 
    ;j 
    < 
    enList.size();j 
    ++ 
    )
                 {
                      
    if 
    (enList.get(i).word.equals(enList.get(j).word))
                     {
                         enList.get(i).pValue 
    ++ 
    ;
                         enList.get(j).pValue 
    = 
    0 
    ;
                         enList.get(j).word 
    = 
    "" 
    ;
                     }
                 }
             }
         }
          
    for 
    ( 
    int 
     i 
    = 
    enList.size() 
    - 
    1 
    ;i 
    >= 
    0 
    ;i 
    -- 
    )
         {
              
    if 
    (enList.get(i).pValue 
    < 
    1.0 
    )
                 enList.remove(i);
         }
          
    for 
    ( 
    int 
     i 
    = 
    0 
    ;i 
    < 
    enList.size();i 
    ++ 
    )
         {
             enList.get(i).pValue 
    = 
    enList.get(i).pValue 
    / 
    total;
         }
         
      
    return 
     enList;
     } 
   
 
     
   

5.计算相对熵

 
     
   
 
    /* 
    用于计算两段文本的相对熵 
    */ 
    
  
    public 
      
    static 
      
    float 
     CalKL(ArrayList 
    < 
    Entity 
    > 
    p,ArrayList 
    < 
    Entity 
    > 
    q)
 {  
      
    float 
     kl 
    = 
    0 
    ;
     
      
    float 
     infinity 
    = 
    10000000 
    ; 
    // 
    无穷大 
    
  
         
    double 
     accretion 
    = 
    infinity; 
    // 
    设置熵增加量的初始值为无穷大。
      
    // 
    从q中找出与p中相对应词的概率，如果找到了，就将accretion的值更新，并累加到相对熵上面；如果没找到，则增加了为无穷大 
    
  
         
    for 
    ( 
    int 
     i 
    = 
    0 
    ;i 
    < 
    p.size();i 
    ++ 
    )
     {   
          
    if 
    (q.size() 
    != 
    0 
    )
         {    
    for 
    ( 
    int 
     j 
    = 
    q.size() 
    - 
    1 
    ;j 
    >= 
    0 
    ;j 
    -- 
    )
             {    
                  
    if 
    (p.get(i).word.equals(q.get(j).word))
                 {  accretion 
    = 
    p.get(i).pValue 
    * 
    Math.log(p.get(i).pValue 
    / 
    q.get(j).pValue);
                      
    // 
    q.remove(j); 
    
  
                         
    break 
    ;
                     
                 }
         }
         
         kl 
    += 
    accretion;
         accretion 
    = 
    infinity;
         }
         
         
     }
 
     
      
    return 
     kl;
     
 }
 
 }
  
   
 
     
   
 
    
  
   
 
    转载自：http://www.cnblogs.com/finallyliuyu/archive/2010/03/12/1684032.html 
   

随风媛55555

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
相对熵（KL距离）的java实现

相对熵（relative entropy或 Kullback-Leibler divergence,KL距离）的java实现（二）实验中，我们采用两种方法计算概率。一：以字符为单位计算概率；二：以汉语词为单位计算概率在第二种情况下，我们采用Jeasy分词组件进行分词处理，该分词组件为基于前向最大匹配的分词方法，分词结果在绝大多数情况下是正确的。 /**
复制链接

扫一扫