php文章相似度计算不用similar_text()函数

最新推荐文章于 2023-02-22 10:09:43 发布

Mir_憨豆先生

最新推荐文章于 2023-02-22 10:09:43 发布

阅读量5k

点赞数

分类专栏： php

php 专栏收录该内容

114 篇文章 0 订阅

订阅专栏

http://enenba.com/?post=303

PHP 内置了 similar_text() 函数，用于计算两个字符串的相似度：它返回两个字符串中匹配的字符数（按字节计），也可以通过第三个引用参数得到以百分比表示的相似度。不过这个函数是按字节比较的，对中文的计算结果很容易让人误解，比如：

 
   // similar_text() returns the number of matching bytes (42 for this pair), not a
   // percentage; the percentage is only available through its optional third,
   // by-reference argument.
   echo similar_text("吉林禽业公司火灾已致112人遇难","吉林宝源丰禽业公司火灾已致112人遇难");

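这两个新闻标题其实说的是同一件事。需要注意的是，similar_text() 的返回值是两个字符串中匹配的字节数，而不是百分比：这里返回 42，表示第一个标题的 42 个字节（UTF-8 下每个汉字占 3 个字节）全部匹配上了；百分比要通过第三个引用参数获取，约为 90.32%。由于它按字节而不是按字符比较，结果对中文来说并不直观，所以感觉不太靠谱。今天刚好收集到一段 PHP 代码，也是用于比较两个字符串的相似度，直接贴出代码：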

 
   01<?php 
 
   /**
    * Longest-common-subsequence (LCS) helper for scoring how similar two strings are.
    *
    * Strings are compared character by character when both are valid UTF-8, so a
    * multi-byte character is never split; if either string is not valid UTF-8 both
    * are compared byte by byte instead.
    */
   class LCS {
       /** @var string|null First string given to the most recent getLCS()/getSimilar() call. */
       public $str1;

       /** @var string|null Second string given to the most recent getLCS()/getSimilar() call. */
       public $str2;

       /**
        * Table built by the most recent computation: $c[$i][$j] is the LCS length of
        * the first $i units of $str1 and the first $j units of $str2 (row and column
        * 0 stand for the empty prefix).
        *
        * @var array
        */
       public $c = array();

       /**
        * Returns the longest common subsequence of two strings.
        *
        * @param string $str1 First string.
        * @param string $str2 Second string.
        * @param int    $len1 Number of leading units (characters, or bytes in the
        *                     byte fallback) of $str1 to consider; 0 or any value
        *                     beyond the string's length means the whole string.
        * @param int    $len2 Same limit for $str2.
        * @return string The common subsequence; "" when nothing is shared.
        */
       public function getLCS($str1, $str2, $len1 = 0, $len2 = 0) {
           $this->str1 = $str1;
           $this->str2 = $str2;
           list($a, $b) = $this->splitPair($str1, $str2);
           if ($len1 > 0) {
               $a = array_slice($a, 0, (int) $len1);
           }
           if ($len2 > 0) {
               $b = array_slice($b, 0, (int) $len2);
           }
           $this->c = $this->buildTable($a, $b);
           return implode('', $this->traceBack($this->c, $a, $b, count($a), count($b)));
       }

       /**
        * Returns the similarity of two strings as 2 * |LCS| / (|str1| + |str2|),
        * with lengths counted in the same units used for the comparison.
        *
        * @param string $str1 First string.
        * @param string $str2 Second string.
        * @return int|float 0 when nothing is shared, 1 when the strings are equal;
        *                   two empty strings count as identical (1.0).
        */
       public function getSimilar($str1, $str2) {
           $this->str1 = $str1;
           $this->str2 = $str2;
           list($a, $b) = $this->splitPair($str1, $str2);
           $this->c = $this->buildTable($a, $b);
           $total = count($a) + count($b);
           if ($total === 0) {
               // Avoid dividing by zero: two empty strings are treated as identical.
               return 1.0;
           }
           // Only the length is needed here, so the bottom-right cell is enough.
           return $this->c[count($a)][count($b)] * 2 / $total;
       }

       /**
        * Rebuilds $this->c for the first $len1 units of $this->str1 and the first
        * $len2 units of $this->str2. Kept for backward compatibility; getLCS() and
        * getSimilar() fill the table themselves.
        *
        * @param int $len1 Number of leading units of $this->str1 to cover.
        * @param int $len2 Number of leading units of $this->str2 to cover.
        * @return void
        */
       public function initC($len1, $len2) {
           list($a, $b) = $this->splitPair($this->str1, $this->str2);
           $a = array_slice($a, 0, max(0, (int) $len1));
           $b = array_slice($b, 0, max(0, (int) $len2));
           $this->c = $this->buildTable($a, $b);
       }

       /**
        * Returns the LCS of units 0..$i of $this->str1 and units 0..$j of
        * $this->str2, read from the table in $this->c. Kept for backward
        * compatibility.
        *
        * @param array $c Accepted for signature compatibility only; as before, the
        *                 table stored on $this->c is the one consulted.
        * @param int   $i Zero-based index of the last unit of $this->str1 to include.
        * @param int   $j Zero-based index of the last unit of $this->str2 to include.
        * @return string
        */
       public function printLCS($c, $i, $j) {
           list($a, $b) = $this->splitPair($this->str1, $this->str2);
           return implode('', $this->traceBack($this->c, $a, $b, (int) $i + 1, (int) $j + 1));
       }

       /**
        * Splits both strings into comparison units: UTF-8 characters when both
        * strings are valid UTF-8, single bytes otherwise.
        *
        * @return array Two-element array holding the unit lists of both strings.
        */
       private function splitPair($first, $second) {
           $first = (string) $first;
           $second = (string) $second;
           $a = preg_split('//u', $first, -1, PREG_SPLIT_NO_EMPTY);
           $b = preg_split('//u', $second, -1, PREG_SPLIT_NO_EMPTY);
           if ($a === false || $b === false) {
               // preg_split() fails on malformed UTF-8; compare raw bytes instead.
               // The '' guard is needed because str_split('') yields array('') on older PHP.
               $a = $first === '' ? array() : str_split($first);
               $b = $second === '' ? array() : str_split($second);
           }
           return array($a, $b);
       }

       /**
        * Builds the (count($a) + 1) x (count($b) + 1) LCS length table.
        * Row 0 and column 0 represent empty prefixes, so the first unit of each
        * string takes part in the comparison like every other unit.
        */
       private function buildTable($a, $b) {
           $rows = count($a);
           $cols = count($b);
           $table = array(array_fill(0, $cols + 1, 0));
           for ($i = 1; $i <= $rows; $i++) {
               $above = $table[$i - 1];
               $unit = $a[$i - 1];
               $row = array(0);
               for ($j = 1; $j <= $cols; $j++) {
                   if ($unit === $b[$j - 1]) {
                       $row[$j] = $above[$j - 1] + 1;
                   } elseif ($above[$j] >= $row[$j - 1]) {
                       $row[$j] = $above[$j];
                   } else {
                       $row[$j] = $row[$j - 1];
                   }
               }
               $table[$i] = $row;
           }
           return $table;
       }

       /**
        * Walks the table backwards from cell ($i, $j) and collects the units of one
        * longest common subsequence. Iterative, so long inputs cannot exhaust the
        * call stack.
        */
       private function traceBack($table, $a, $b, $i, $j) {
           // Clamp the start cell to what the units and the table actually cover.
           $i = min($i, count($a), count($table) - 1);
           $j = min($j, count($b), isset($table[0]) ? count($table[0]) - 1 : -1);
           $picked = array();
           while ($i > 0 && $j > 0) {
               if ($a[$i - 1] === $b[$j - 1]) {
                   $picked[] = $a[$i - 1];
                   $i--;
                   $j--;
               } elseif ($table[$i - 1][$j] >= $table[$i][$j - 1]) {
                   $i--;
               } else {
                   $j--;
               }
           }
           // Units were collected from the end of the strings towards the start.
           return array_reverse($picked);
       }
   }
 
   55 
 
   $lcs = new LCS();
   // Print the longest common subsequence of the two sample strings.
   echo $lcs->getLCS("hello word", "hello china"), PHP_EOL;
   // Print the similarity score of the two news headlines.
   echo $lcs->getSimilar("吉林禽业公司火灾已致112人遇难","吉林宝源丰禽业公司火灾已致112人遇难");

//========================以上是转载==============

上面的代码用的是经典的动态规划解法：先求出两个字符串的最长公共子序列（LCS），再按“相似度 = 2 × LCS 长度 ÷ 两个字符串长度之和”计算，具体原理没有细致研究。

更节省空间的解法可以参考：基于文本比较算法——线性空间求最长公共子序列的Nakatsu算法

http://www.cnblogs.com/grenet/archive/2011/03/11/1964417.html

使用similar_text() 可以看这里《php计算title标题相似比》

end..

代码如下:

<?php
/**
 * Longest-common-subsequence (LCS) helper for scoring how similar two strings are.
 *
 * Strings are compared character by character when both are valid UTF-8, so a
 * multi-byte character is never split; if either string is not valid UTF-8 both
 * are compared byte by byte instead.
 */
class LCS {
    /** @var string|null First string given to the most recent getLCS()/getSimilar() call. */
    public $str1;

    /** @var string|null Second string given to the most recent getLCS()/getSimilar() call. */
    public $str2;

    /**
     * Table built by the most recent computation: $c[$i][$j] is the LCS length of
     * the first $i units of $str1 and the first $j units of $str2 (row and column
     * 0 stand for the empty prefix).
     *
     * @var array
     */
    public $c = array();

    /**
     * Returns the longest common subsequence of two strings.
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @param int    $len1 Number of leading units (characters, or bytes in the
     *                     byte fallback) of $str1 to consider; 0 or any value
     *                     beyond the string's length means the whole string.
     * @param int    $len2 Same limit for $str2.
     * @return string The common subsequence; "" when nothing is shared.
     */
    public function getLCS($str1, $str2, $len1 = 0, $len2 = 0) {
        $this->str1 = $str1;
        $this->str2 = $str2;
        list($a, $b) = $this->splitPair($str1, $str2);
        if ($len1 > 0) {
            $a = array_slice($a, 0, (int) $len1);
        }
        if ($len2 > 0) {
            $b = array_slice($b, 0, (int) $len2);
        }
        $this->c = $this->buildTable($a, $b);
        return implode('', $this->traceBack($this->c, $a, $b, count($a), count($b)));
    }

    /**
     * Returns the similarity of two strings as 2 * |LCS| / (|str1| + |str2|),
     * with lengths counted in the same units used for the comparison.
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @return int|float 0 when nothing is shared, 1 when the strings are equal;
     *                   two empty strings count as identical (1.0).
     */
    public function getSimilar($str1, $str2) {
        $this->str1 = $str1;
        $this->str2 = $str2;
        list($a, $b) = $this->splitPair($str1, $str2);
        $this->c = $this->buildTable($a, $b);
        $total = count($a) + count($b);
        if ($total === 0) {
            // Avoid dividing by zero: two empty strings are treated as identical.
            return 1.0;
        }
        // Only the length is needed here, so the bottom-right cell is enough.
        return $this->c[count($a)][count($b)] * 2 / $total;
    }

    /**
     * Rebuilds $this->c for the first $len1 units of $this->str1 and the first
     * $len2 units of $this->str2. Kept for backward compatibility; getLCS() and
     * getSimilar() fill the table themselves.
     *
     * @param int $len1 Number of leading units of $this->str1 to cover.
     * @param int $len2 Number of leading units of $this->str2 to cover.
     * @return void
     */
    public function initC($len1, $len2) {
        list($a, $b) = $this->splitPair($this->str1, $this->str2);
        $a = array_slice($a, 0, max(0, (int) $len1));
        $b = array_slice($b, 0, max(0, (int) $len2));
        $this->c = $this->buildTable($a, $b);
    }

    /**
     * Returns the LCS of units 0..$i of $this->str1 and units 0..$j of
     * $this->str2, read from the table in $this->c. Kept for backward
     * compatibility.
     *
     * @param array $c Accepted for signature compatibility only; as before, the
     *                 table stored on $this->c is the one consulted.
     * @param int   $i Zero-based index of the last unit of $this->str1 to include.
     * @param int   $j Zero-based index of the last unit of $this->str2 to include.
     * @return string
     */
    public function printLCS($c, $i, $j) {
        list($a, $b) = $this->splitPair($this->str1, $this->str2);
        return implode('', $this->traceBack($this->c, $a, $b, (int) $i + 1, (int) $j + 1));
    }

    /**
     * Splits both strings into comparison units: UTF-8 characters when both
     * strings are valid UTF-8, single bytes otherwise.
     *
     * @return array Two-element array holding the unit lists of both strings.
     */
    private function splitPair($first, $second) {
        $first = (string) $first;
        $second = (string) $second;
        $a = preg_split('//u', $first, -1, PREG_SPLIT_NO_EMPTY);
        $b = preg_split('//u', $second, -1, PREG_SPLIT_NO_EMPTY);
        if ($a === false || $b === false) {
            // preg_split() fails on malformed UTF-8; compare raw bytes instead.
            // The '' guard is needed because str_split('') yields array('') on older PHP.
            $a = $first === '' ? array() : str_split($first);
            $b = $second === '' ? array() : str_split($second);
        }
        return array($a, $b);
    }

    /**
     * Builds the (count($a) + 1) x (count($b) + 1) LCS length table.
     * Row 0 and column 0 represent empty prefixes, so the first unit of each
     * string takes part in the comparison like every other unit.
     */
    private function buildTable($a, $b) {
        $rows = count($a);
        $cols = count($b);
        $table = array(array_fill(0, $cols + 1, 0));
        for ($i = 1; $i <= $rows; $i++) {
            $above = $table[$i - 1];
            $unit = $a[$i - 1];
            $row = array(0);
            for ($j = 1; $j <= $cols; $j++) {
                if ($unit === $b[$j - 1]) {
                    $row[$j] = $above[$j - 1] + 1;
                } elseif ($above[$j] >= $row[$j - 1]) {
                    $row[$j] = $above[$j];
                } else {
                    $row[$j] = $row[$j - 1];
                }
            }
            $table[$i] = $row;
        }
        return $table;
    }

    /**
     * Walks the table backwards from cell ($i, $j) and collects the units of one
     * longest common subsequence. Iterative, so long inputs cannot exhaust the
     * call stack.
     */
    private function traceBack($table, $a, $b, $i, $j) {
        // Clamp the start cell to what the units and the table actually cover.
        $i = min($i, count($a), count($table) - 1);
        $j = min($j, count($b), isset($table[0]) ? count($table[0]) - 1 : -1);
        $picked = array();
        while ($i > 0 && $j > 0) {
            if ($a[$i - 1] === $b[$j - 1]) {
                $picked[] = $a[$i - 1];
                $i--;
                $j--;
            } elseif ($table[$i - 1][$j] >= $table[$i][$j - 1]) {
                $i--;
            } else {
                $j--;
            }
        }
        // Units were collected from the end of the strings towards the start.
        return array_reverse($picked);
    }
}

// Demo: score a company name against an identical copy of itself.
// To print the longest common subsequence instead, echo the result of
// $lcs->getLCS() for a pair of strings.
$lcs = new LCS();
$companyName = "吉林宝源丰禽业公司";
$score = $lcs->getSimilar($companyName, $companyName);
echo $score;