关键词匹配类
<?php
namespace App\Library;
use App\Library\Redis;
/**
 * Keyword matcher backed by a character trie ("dict") cached in Redis as JSON.
 *
 * Each trie node is an array keyed by a single UTF-8 character; the special
 * key 'end' (value 1) marks that the path from the root to that node spells a
 * complete keyword.
 */
class SensitiveWordFilter
{
    /** @var array In-memory trie built by loadData() or fetched by filter(). */
    protected $dict;

    /** @var string Redis key under which the JSON-encoded trie is stored. */
    protected $key;

    /**
     * @param string $key Redis key used to store and fetch the trie.
     */
    public function __construct($key)
    {
        $this->dict = [];
        $this->key = $key;
    }

    /**
     * Add every keyword in $data to the trie and store the trie in Redis.
     *
     * Raises the memory limit to 2048M and removes the execution time limit
     * for the current request before building the trie.
     *
     * @param array|\Traversable $data Keywords; non-scalar and blank entries are skipped.
     * @param int                $time Expiry of the Redis key, in seconds.
     */
    public function loadData($data, $time = 7200)
    {
        ini_set("memory_limit", "2048M");
        set_time_limit(0);

        foreach ($data as $v) {
            if (!is_scalar($v)) {
                continue;
            }
            // Trim before the emptiness test so whitespace-only entries are
            // skipped, and compare against '' so the keyword "0" is kept.
            $word = trim((string) $v);
            if ($word === '') {
                continue;
            }
            $this->addWords($word);
        }

        // SETEX replaces any previous value stored under the key, so no
        // separate delete (or hash-type probe) is needed beforehand.
        Redis::connection()->setex(
            $this->key,
            $time,
            json_encode($this->dict, JSON_UNESCAPED_UNICODE)
        );
    }

    /**
     * Check whether the trie is currently cached in Redis.
     *
     * @return mixed Result of the Redis "exists" call for the configured key.
     */
    public function checkLKey()
    {
        return Redis::connection()->exists($this->key);
    }

    /**
     * Split text into an array of single UTF-8 characters.
     *
     * @param string $str
     *
     * @return string[] Empty when $str is empty or is not valid UTF-8.
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", (string) $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() yields false when the subject is not valid UTF-8.
        return $chars === false ? [] : $chars;
    }

    /**
     * Insert one keyword into the in-memory trie.
     *
     * @param string $words Keyword to add.
     */
    protected function addWords($words)
    {
        $chars = $this->splitStr($words);
        if (!$chars) {
            // Nothing to insert; never mark the root itself as a keyword.
            return;
        }

        $node = &$this->dict;
        foreach ($chars as $char) {
            if (!isset($node[$char])) {
                $node[$char] = [];
            }
            $node = &$node[$char];
        }
        // Mark the path that ends at this node as a complete keyword.
        $node['end'] = 1;
    }

    /**
     * Fetch the trie from Redis.
     *
     * @return array Decoded trie, or an empty array when the key is missing
     *               or does not hold a JSON array/object.
     */
    protected function fetchDict()
    {
        $raw = Redis::connection()->get($this->key);
        if (!is_string($raw) || $raw === '') {
            return [];
        }
        $dict = json_decode($raw, true);

        return is_array($dict) ? $dict : [];
    }

    /**
     * Scan a character list and collect every keyword found in the trie.
     *
     * From each start position the trie is followed as far as possible and
     * the longest complete keyword seen on that path is reported, so a short
     * keyword is still found when it is a prefix of a longer entry that only
     * matches partially.
     *
     * @param string[] $chars       Text split into characters.
     * @param int      $maxDistance Scanning from a start position stops once
     *                              this many non-matching characters were met.
     *
     * @return string[] Matched keywords in order of appearance.
     */
    protected function matchWords(array $chars, $maxDistance)
    {
        $found = [];
        $length = count($chars);

        for ($i = 0; $i < $length; $i++) {
            if (!isset($this->dict[$chars[$i]])) {
                continue;
            }
            $node = $this->dict[$chars[$i]];
            $matched = [$i];
            // Number of entries of $matched that form the longest complete keyword so far.
            $endCount = isset($node['end']) ? 1 : 0;
            $dist = 0;

            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                if (!isset($node[$chars[$j]])) {
                    $dist++;
                    continue;
                }
                $matched[] = $j;
                $node = $node[$chars[$j]];
                if (isset($node['end'])) {
                    $endCount = count($matched);
                }
            }

            if ($endCount === 0) {
                continue;
            }

            $word = '';
            $last = $i;
            for ($k = 0; $k < $endCount; $k++) {
                $last = $matched[$k];
                $word .= $chars[$last];
            }
            $found[] = $word;
            // Resume scanning right after the last character of this match.
            $i = $last;
        }

        return $found;
    }

    /**
     * Find the keywords contained in a text.
     *
     * The trie is re-read from Redis on every call and replaces the
     * in-memory trie.
     *
     * @param string $str          Text to scan.
     * @param string $replace      Kept for backward compatibility; unused,
     *                             because this method returns the matched
     *                             keywords rather than a masked text.
     * @param int    $skipDistance Strictness: how many non-matching characters
     *                             may be skipped in total while following one
     *                             candidate keyword (negative values count as 0).
     *
     * @return string[] Matched keywords in order of appearance.
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $this->dict = $this->fetchDict();
        $maxDistance = max($skipDistance, 0) + 1;

        return $this->matchWords($this->splitStr($str), $maxDistance);
    }

    /**
     * Check whether the given text is exactly one keyword of the in-memory trie.
     *
     * Only the trie already held by this instance is consulted (it is filled
     * by loadData() or filter()); Redis is not queried here.
     *
     * @param string|string[] $strArr Text, or the text already split into characters.
     *
     * @return bool|mixed The node's 'end' marker (1) on a match, false otherwise.
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        $node = $this->dict;

        foreach ($chars as $char) {
            if (!isset($node[$char])) {
                return false;
            }
            // Descend one level; without this every lookup stays at the root.
            $node = $node[$char];
        }

        return isset($node['end']) ? $node['end'] : false;
    }
}
// Usage example (调用示例)
$wordFilter = new SensitiveWordFilter('keywords_dict');

// Rebuild the cached dictionary when its Redis key is missing.
if (!$wordFilter->checkLKey()) {
    // Convert to a plain array before the emptiness test: empty() on the
    // object returned by pluck() is never true, even when it holds no rows.
    $keywordData = Keyword::query()->pluck('keyword')->toArray();
    if (!empty($keywordData)) {
        $wordFilter->loadData($keywordData);
    }
    unset($keywordData);
}

// Run the match. The literal must use plain ASCII quotes; typographic
// quotes are not string delimiters in PHP.
$keywords = $wordFilter->filter('努力读书,报效祖国');
var_dump($keywords);
// filter() returns the matched keywords as an array, e.g. ['读书', '祖国']
// when both words are present in the dictionary.