<?php
/**
 * Trie-based sensitive-word detector.
 *
 * The word list returned by getWord() is loaded into a character trie on
 * construction. filter() walks the input text against that trie and either
 * masks or highlights every sensitive word it finds; sensitiveWordFilter()
 * wraps filter() and returns a ready-made verdict message.
 */
class WordFilterService
{
    /**
     * Root of the trie. Each node is an array keyed by a single character
     * whose value is the child node; the extra key 'end' (a counter) marks
     * that the path from the root down to this node spells a complete word.
     *
     * @var array
     */
    protected $dict = [];

    public function __construct()
    {
        $this->loadDataFormFile();
    }

    /**
     * Load the sensitive-word list into the trie.
     */
    protected function loadDataFormFile()
    {
        foreach ($this->getWord() as $word) {
            $this->addWords(trim($word));
        }
    }

    /**
     * Get the sensitive-word list.
     *
     * @return array list of sensitive words
     */
    public function getWord(): array
    {
        return ['大傻瓜', '笨蛋'];
    }

    /**
     * Split a string into its individual characters.
     *
     * @param string $str
     * @return array list of single-character strings, in input order
     */
    protected function splitStr($str): array
    {
        $str = (string) $str;

        // The /u modifier makes PCRE split on UTF-8 code points rather than
        // on bytes, so multi-byte characters stay intact.
        $chars = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) {
            return $chars;
        }

        // preg_split() returns false when the input is not valid UTF-8.
        // Fall back to a byte-level pattern: well-formed multi-byte sequences
        // are kept together and every other byte becomes its own element, so
        // no input is lost and valid words around a stray byte still match.
        preg_match_all(
            '/[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|./s',
            $str,
            $matches
        );

        return $matches[0] ?? [];
    }

    /**
     * Insert one sensitive word into the trie.
     *
     * @param string $words the word to insert; an empty string is ignored
     */
    protected function addWords(string $words)
    {
        $chars = $this->splitStr($words);
        if ($chars === []) {
            // An empty word would put the 'end' marker on the trie root.
            return;
        }

        $node = &$this->dict;
        foreach ($chars as $char) {
            if (!isset($node[$char])) {
                $node[$char] = [];
            }
            $node = &$node[$char];
        }

        // Mark the path leading to this node as a complete sensitive word.
        $node['end'] = ($node['end'] ?? 0) + 1;
        unset($node);
    }

    /**
     * Try to match a sensitive word that starts at position $start.
     *
     * Characters that do not continue the current trie path are skipped as
     * long as fewer than $maxDistance characters have been skipped in total.
     * The longest complete word seen along the path wins, so a short word is
     * still reported when the text only partially follows a longer entry.
     *
     * @param array $chars       the text, split into characters
     * @param int   $start       index of the first character to match
     * @param int   $maxDistance scanning stops once this many characters were skipped
     * @return array indexes of the characters forming the matched word, empty when none
     */
    private function findMatchAt(array $chars, int $start, int $maxDistance): array
    {
        if (!isset($this->dict[$chars[$start]])) {
            return [];
        }

        $node = $this->dict[$chars[$start]];
        $matchIndex = [$start];
        // Number of leading entries of $matchIndex that spell a complete word.
        $completeCount = isset($node['end']) ? 1 : 0;
        $length = count($chars);
        $dist = 0;

        for ($j = $start + 1; $j < $length && $dist < $maxDistance; $j++) {
            $char = $chars[$j];
            if (!isset($node[$char])) {
                $dist++;
                continue;
            }
            // Remember the position so the caller can replace the character.
            $matchIndex[] = $j;
            $node = $node[$char];
            if (isset($node['end'])) {
                $completeCount = count($matchIndex);
            }
        }

        return array_slice($matchIndex, 0, $completeCount);
    }

    /**
     * Check a string for sensitive words and mask or highlight them.
     *
     * HTML tags are stripped from the input before matching.
     *
     * @param string $str          the string to check
     * @param int    $level        matching level: 1 - the characters only have to appear in order;
     *                             2 - at most $skipDistance foreign characters may be skipped;
     *                             any other value - the word must appear contiguously
     * @param int    $skipDistance maximum number of characters that may be skipped (level 2 only)
     * @param int    $handleStyle  1: replace matched characters with $replace; 2: wrap them in a
     *                             red <span>; any other value leaves the text unchanged
     * @param string $replace      replacement used for each matched character when $handleStyle is 1
     * @return array 'match' (bool), 'text' (processed string) and 'match_arr'
     *               (matched words keyed by the index of their first character)
     */
    public function filter(
        string $str,
        int $level = 3,
        int $skipDistance = 2,
        int $handleStyle = 2,
        string $replace = '*'
    ): array {
        $chars = $this->splitStr(strip_tags($str));
        $length = count($chars);

        // Scanning stops as soon as the number of skipped characters reaches
        // this bound, so "n skips allowed" translates to n + 1.
        if ($level === 1) {
            $maxDistance = $length + 1;
        } elseif ($level === 2) {
            $maxDistance = max($skipDistance, 0) + 1;
        } else {
            // Full-word match: no character may be skipped.
            $maxDistance = 1;
        }

        $isMatch = false;
        $matchArr = [];
        for ($i = 0; $i < $length; $i++) {
            $matchIndex = $this->findMatchAt($chars, $i, $maxDistance);
            if ($matchIndex === []) {
                continue;
            }

            $isMatch = true;
            $word = '';
            foreach ($matchIndex as $index) {
                $word .= $chars[$index];
                if ($handleStyle === 1) {
                    $chars[$index] = $replace;
                } elseif ($handleStyle === 2) {
                    $chars[$index] = '<span style="color:red;">' . $chars[$index] . '</span>';
                }
            }
            $matchArr[$i] = $word;

            // Resume scanning right after the last character of this match.
            $i = $matchIndex[count($matchIndex) - 1];
        }

        return ['match' => $isMatch, 'text' => implode('', $chars), 'match_arr' => $matchArr];
    }

    /**
     * Detect sensitive words and build a verdict message.
     *
     * @param string $str the string to check
     * @return array 'code' is 0 when sensitive words were found and 1 otherwise;
     *               'msg' is the corresponding message
     */
    public function sensitiveWordFilter(string $str): array
    {
        $result = $this->filter($str);
        if (!$result['match']) {
            return ['code' => 1, 'msg' => '没有敏感词'];
        }

        $filterWords = '';
        foreach (array_unique($result['match_arr']) as $word) {
            $filterWords .= '【' . $word . '】';
        }

        return ['code' => 0, 'msg' => '含有敏感词:' . $filterWords . '请修改'];
    }
}
// Demo run: check one sample sentence and dump the resulting verdict array.
$obj = new WordFilterService();
$str = '你好呀,大傻瓜';
$res = $obj->sensitiveWordFilter($str);
var_dump($res);
// Sensitive-word detection
// (Web-page residue: "latest recommended article published 2024-09-08 15:57:10")