PHP 自定义词库简单分词与敏感词替换

关键词匹配类

<?php

namespace App\Library;

use App\Library\Redis;

/**
 * Keyword matcher backed by a character trie that is cached in Redis.
 *
 * The trie is a nested array: each key is a single UTF-8 character and each
 * value is the child node. A node that completes a dictionary word carries the
 * extra key 'end' => 1. Because every child key is exactly one character, the
 * three-character key 'end' cannot collide with a child.
 *
 * The trie is serialised as JSON and stored as a plain string under the Redis
 * key given to the constructor.
 */
class SensitiveWordFilter
{
    protected $dict; // in-memory trie built from the word list
    protected $key;  // Redis key under which the JSON-encoded trie is stored

    /**
     * @param string $key Redis key used to store and load the trie
     */
    public function __construct($key)
    {
        $this->dict = [];
        $this->key = $key;
    }

    /**
     * Build the trie from a word list and store it in Redis with an expiry.
     *
     * Words are added on top of whatever is already in the in-memory trie.
     *
     * @param array|\Traversable $data Word list; entries are cast to string and trimmed
     * @param int                $time Expiry of the Redis key in seconds
     *
     * @throws \RuntimeException When the trie cannot be encoded as JSON
     */
    public function loadData($data, $time = 7200)
    {
        // Large dictionaries can exceed the default limits while the trie is built.
        ini_set("memory_limit", "2048M");
        set_time_limit(0);

        foreach ($data as $v) {
            // Trim before testing for emptiness so whitespace-only entries are
            // skipped, and compare against '' so the word "0" is not dropped.
            $word = trim((string) $v);
            if ($word === '') {
                continue;
            }
            $this->addWords($word);
        }

        $payload = json_encode($this->dict, JSON_UNESCAPED_UNICODE);
        if ($payload === false) {
            // Storing a failed encode would leave an unusable value under the key.
            throw new \RuntimeException(
                'Failed to encode the word dictionary: ' . json_last_error_msg()
            );
        }

        // SETEX overwrites any existing value under the key, so no prior
        // existence check or delete is needed.
        Redis::connection()->setex($this->key, $time, $payload);
    }

    /**
     * Check whether the dictionary key currently exists in Redis.
     *
     * @return mixed Result of the Redis EXISTS call
     */
    public function checkLKey()
    {
        return Redis::connection()->exists($this->key);
    }

    /**
     * Split text into individual UTF-8 characters (not bytes).
     *
     * @param string $str
     *
     * @return string[] Empty array when the input is empty or not valid UTF-8
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on malformed UTF-8; normalise to an empty list.
        return $chars === false ? [] : $chars;
    }

    /**
     * Insert one word into the trie.
     *
     * @param string $words The word to add
     */
    protected function addWords($words)
    {
        $wordArr = $this->splitStr($words);
        if (empty($wordArr)) {
            // An empty word would otherwise mark the root itself as a word end.
            return;
        }

        $curNode = &$this->dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char]) || !is_array($curNode[$char])) {
                $curNode[$char] = [];
            }

            $curNode = &$curNode[$char];
        }
        // Mark the path leading to this node as a complete dictionary word.
        $curNode['end'] = 1;
        unset($curNode); // drop the reference into $this->dict
    }

    /**
     * Scan text and collect the dictionary words found in it.
     *
     * The trie is reloaded from Redis on every call. From each position the
     * longest path that ends on a complete word is taken, and scanning then
     * resumes after the last matched character.
     *
     * @param string $str          Text to scan
     * @param string $replace      Character written over matched positions in the
     *                             internal working copy; that copy is not returned
     * @param int    $skipDistance Strictness: number of non-matching characters
     *                             that may be skipped inside one word
     *
     * @return string[] Matched words in order of appearance
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $finalRes = [];

        // A missing key yields a non-string; fall back to an empty trie.
        $raw = Redis::connection()->get($this->key);
        $dict = is_string($raw) ? json_decode($raw, true) : null;
        $this->dict = is_array($dict) ? $dict : [];

        $maxDistance = max($skipDistance, 0) + 1;
        $strArr = $this->splitStr($str);
        $length = count($strArr);
        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];

            if (!isset($this->dict[$char])) {
                continue;
            }

            $curNode = $this->dict[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Number of leading entries of $matchIndex that form the longest
            // complete word seen so far (0 = none yet).
            $endCount = isset($curNode['end']) ? 1 : 0;
            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                if (!isset($curNode[$strArr[$j]])) {
                    $dist++;
                    continue;
                }

                $matchIndex[] = $j;
                $curNode = $curNode[$strArr[$j]];
                if (isset($curNode['end'])) {
                    $endCount = count($matchIndex);
                }
            }

            if ($endCount > 0) {
                // Keep only the part of the path that ends on a complete word, so
                // "AB" is still reported when the text continues into a longer,
                // unfinished dictionary prefix such as "ABC" of "ABCD".
                $matchIndex = array_slice($matchIndex, 0, $endCount);
                $res = [];
                foreach ($matchIndex as $index) {
                    $res[] = $strArr[$index];
                    $strArr[$index] = $replace;
                }
                $finalRes[] = implode("", $res);
                $i = max($matchIndex);
            }
        }

        // Return implode('', $strArr) here instead to get the masked text.
        return $finalRes;
    }

    /**
     * Check whether the given text is exactly one word of the in-memory trie.
     *
     * Uses the trie currently held by this object, which is populated by
     * loadData() or filter(); it does not read from Redis itself.
     *
     * @param string|string[] $strArr Text, or the text already split into characters
     *
     * @return bool|mixed The node's end marker (1) when the text is a word, false otherwise
     */
    public function isMatch($strArr)
    {
        $strArr = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        $curNode = $this->dict;
        foreach ($strArr as $char) {
            if (!is_array($curNode) || !isset($curNode[$char])) {
                return false;
            }
            // Descend one level so the next character is checked against this
            // node's children, not against the root.
            $curNode = $curNode[$char];
        }

        return isset($curNode['end']) ? $curNode['end'] : false;
    }
}

调用示例

$wordFilter = new SensitiveWordFilter('keywords_dict');
// Rebuild the dictionary when it is not cached in Redis.
if (!$wordFilter->checkLKey()) {
    // Convert to a plain array before the emptiness check: pluck() presumably
    // returns a collection object, and empty() on an object is always false.
    $keywordData = Keyword::query()->pluck('keyword')->toArray();
    if (!empty($keywordData)) {
        $wordFilter->loadData($keywordData);
    }
    unset($keywordData);
}
// Run the match.
$keywords = $wordFilter->filter('努力读书,报效祖国');
var_dump($keywords);
// Prints the matched words, e.g. "读书" and "祖国" when both are in the dictionary.
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值