算法:
* 0、构建Trie树
* 1、遍历目标字符串中的每一个字符,如果字符不存在,继续下一个。
* 2、如果字符存在,从该字符下一个字符开始遍历,匹配成功后直接返回匹配字符长度,然后跳过这个长度并不是+1, 算是优化了【匹配到一个词后直接返回,没有往下找最长】,否则回到第1步
* 3、重复1-2
/*
* 敏感词返回
*
* 例: 敏感词: [白粉,白粉人]
吸白粉的白粉人 => [白粉] [白粉]
*/
class TreeMap
{
// 节点字符
public $data;
// 存放子节点引用(因为有任意个子节点,所以靠数组来存储)
public $children = [];
// 是否是字符串结束字符
public $isEndingChar = false;
public function __construct($data)
{
$this->data = $data;
}
}
class TrieTree
{
/**
* 敏感词数组
*
* @var array
* @author qpf
*/
public $trieTreeMap = [];
public function __construct()
{
$this->trieTreeMap = new TreeMap('/');
}
/**
* 获取敏感词Map
*
* @return array
* @author qpf
*/
public function getTreeMap()
{
return $this->trieTreeMap;
}
/**
* 添加敏感词
*
* @param array $txtWords
* @author qpf
*/
public function addWords(array $words)
{
foreach ($words as $word) {
$trieTreeMap = $this->trieTreeMap;
$len = mb_strlen($word);
for ($i = 0; $i < $len; $i++) {
$char = mb_substr($word, $i, 1);
if(!isset($trieTreeMap->children[$char])){
$newNode = new TreeMap($char);
$trieTreeMap->children[$char] = $newNode;
}
$trieTreeMap = $trieTreeMap->children[$char];
}
$trieTreeMap->isEndingChar = true;
}
// var_dump($this->trieTreeMap);
}
/**
* 查找对应敏感词
*
* @param string $txt
* @return array
* @author qpf
*/
public function search($txt)
{
$wordsList = [];
$txtLength = mb_strlen($txt);
for ($i = 0; $i < $txtLength; $i++) {
$wordLength = $this->checkWord($txt, $i, $txtLength);
if ($wordLength > 0) {
$words = mb_substr($txt, $i, $wordLength);
$wordsList[] = $words;
// 匹配到一个词后,跳过整个词
$i += $wordLength - 1;
}
}
return $wordsList;
}
/**
* 敏感词检测
*
* @param $txt
* @param $beginIndex
* @param $length
* @return int
*/
private function checkWord($txt, $beginIndex, $length)
{
$flag = false;
$wordLength = 0;
// 获取敏感词树
$trieTree = $this->trieTreeMap;
for ($i = $beginIndex; $i < $length; $i++) {
// 检验单个字
$word = mb_substr($txt, $i, 1);
// 如果树中不存在,结束
if (!isset($trieTree->children[$word])) {
break;
}
// 如果存在
$wordLength++;
$trieTree = $trieTree->children[$word];
if ($trieTree->isEndingChar === true) {
// 直接退出了,没有匹配最长的敏感词
$flag = true;
break;
}
}
if ($beginIndex > 0) {
// 如果$flag == false 赋值$wordLenth为0
$flag || $wordLength = 0;
}
return $wordLength;
}
}
// 有bug,如果是返回有共同前缀的字符串,就会跳过。比如这句话,之后匹配到[白粉] [白粉人匹配不到]
// $data = ['白粉', '白粉人'];
$data = ['白粉', '白粉人', '白粉人嫩','不该大'];
$wordObj = new TrieTree();
$wordObj->addWords($data);
$txt = '中国人民白粉不应该吸收白粉人';
$words = $wordObj->search($txt);
var_dump($words);die;