基于PHP的DFA算法(敏感词过滤)
网上有很多DFA算法的实现,但大多存在不同程度的问题。我在它们的基础上做了修改,亲测没有问题,目前用在系统中过滤敏感词汇,速度比正则匹配快很多。
/**
 * Sensitive-word filter backed by a character trie (the "DFA" approach).
 *
 * Keywords are registered with addKeyWord(); searchKey() then masks every
 * occurrence of a registered keyword in a text with '*' characters.
 * Matching is exact and case-sensitive, works per UTF-8 character, and is
 * leftmost / shortest-first: scanning left to right, the first registered
 * keyword completed from a given position is the one that gets masked.
 */
class DFA
{
    /**
     * Trie root. Each node is an array holding the key 'end' (1 when a keyword
     * terminates at this node, 0 otherwise) plus one child node per following
     * character, keyed by that character.
     */
    private $arrHashMap = [];

    /**
     * Returns the raw trie built so far.
     *
     * @return array
     */
    public function getHashMap()
    {
        return $this->arrHashMap;
    }

    /**
     * Registers a keyword in the trie. An empty keyword is a no-op.
     *
     * @param string $strWord keyword to register (UTF-8)
     * @return void
     */
    public function addKeyWord($strWord)
    {
        $chars = $this->splitChars((string) $strWord);
        $last = count($chars) - 1;

        // Walk the trie by reference so missing nodes are created in place.
        $node = &$this->arrHashMap;
        foreach ($chars as $idx => $ch) {
            if (!isset($node[$ch])) {
                $node[$ch] = ['end' => 0];
            }
            if ($idx === $last) {
                // Mark the terminal node; an existing node keeps its children.
                $node[$ch]['end'] = 1;
            }
            $node = &$node[$ch];
        }
        unset($node);
    }

    /**
     * Searches the text for registered keywords and masks each occurrence
     * with one '*' per character.
     *
     * The output is rebuilt character by character instead of going through
     * a regular expression, so keywords containing regex metacharacters
     * ('.', '/', '+', ...) are handled literally and the masked span is always
     * the occurrence that was actually found. Every position in the text is
     * tried as a possible keyword start, so a failed partial match of one
     * keyword cannot hide another keyword that begins inside it.
     *
     * @param string $strWord text to filter (UTF-8)
     * @return string the text with every keyword occurrence masked
     */
    public function searchKey($strWord)
    {
        $chars = $this->splitChars((string) $strWord);
        $len = count($chars);
        $result = '';
        $i = 0;

        while ($i < $len) {
            $matched = $this->matchLengthAt($chars, $i, $len);
            if ($matched > 0) {
                $result .= str_repeat('*', $matched);
                $i += $matched;
            } else {
                $result .= $chars[$i];
                $i++;
            }
        }

        return $result;
    }

    /**
     * Returns the length (in characters) of the shortest keyword that starts
     * exactly at $start, or 0 when no keyword starts there.
     *
     * @param array $chars text split into single characters
     * @param int   $start index to start matching from
     * @param int   $len   number of entries in $chars
     * @return int
     */
    private function matchLengthAt(array $chars, $start, $len)
    {
        $node = $this->arrHashMap;
        for ($j = $start; $j < $len; $j++) {
            $ch = $chars[$j];
            if (!isset($node[$ch])) {
                return 0;
            }
            $node = $node[$ch];
            if ($node['end'] === 1) {
                return $j - $start + 1;
            }
        }

        return 0;
    }

    /**
     * Splits a UTF-8 string into an array of single characters.
     *
     * @param string $str
     * @return array
     */
    private function splitChars($str)
    {
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) {
            return $chars;
        }

        // preg_split() with the 'u' modifier fails on malformed UTF-8;
        // fall back to the slower per-character mb_substr() walk.
        $chars = [];
        $len = mb_strlen($str, 'UTF-8');
        for ($i = 0; $i < $len; $i++) {
            $chars[] = mb_substr($str, $i, 1, 'UTF-8');
        }

        return $chars;
    }
}
// Usage example: register the sensitive phrases, then filter a sample sentence.
$DFA = new DFA();
foreach (['太难了', '非常难'] as $sensitiveWord) {
    $DFA->addKeyWord($sensitiveWord);
}
// searchKey() returns the filtered string; the return value is not used here.
$DFA->searchKey('我真的是太难了而且非常难');