DFA算法 有穷自动机,敏感词过滤,PHP示例,PHP实例

本文给出基于DFA算法(确定有穷自动机)实现敏感词过滤的PHP示例。

1、PHP使用Array实现HashMap类库:

<?php
/**
 * php 使用array 构建HashMap 结构类
 **/

namespace DFAMaster;

class HashMap
{
    /**
     * Backing store: a PHP array mapping keys to values (insertion order is kept).
     * @var array
     */
    protected $hashTable = [];

    public function __construct(){}

    /**
     * Store a key/value pair.
     * @param $key
     * @param $value
     * @return mixed|null the value previously stored under $key, or null if the key was new
     */
    public function put($key, $value)
    {
        $previous = array_key_exists($key, $this->hashTable) ? $this->hashTable[$key] : null;
        $this->hashTable[$key] = $value;
        return $previous;
    }

    /**
     * Look up the value stored under a key.
     * @param $key
     * @return mixed|null the stored value, or null when the key is absent
     */
    public function get($key)
    {
        if (array_key_exists($key, $this->hashTable)) {
            return $this->hashTable[$key];
        }
        return null;
    }

    /**
     * Delete the entry stored under a key.
     * @param $key
     * @return mixed|null the removed value, or null when the key was absent
     */
    public function remove($key)
    {
        if (! array_key_exists($key, $this->hashTable)) {
            return null;
        }
        $removed = $this->hashTable[$key];
        // unset() drops exactly this entry. Walking the table with current()/next()
        // would stop at the first falsy value (e.g. a stored `false`) and lose the rest.
        unset($this->hashTable[$key]);
        return $removed;
    }

    /**
     * List every key in the map.
     * @return array
     */
    public function keys()
    {
        return array_keys($this->hashTable);
    }

    /**
     * List every value in the map.
     * @return array
     */
    public function values()
    {
        return array_values($this->hashTable);
    }

    /**
     * Copy every entry of another map into this one; existing keys are overwritten.
     * @param HashMap $map
     */
    public function putAll($map)
    {
        foreach ($map->keys() as $key) {
            $this->put($key, $map->get($key));
        }
    }

    /**
     * Remove every entry.
     * @return bool always true
     */
    public function removeAll()
    {
        // Reset to an empty array (not null) so count()/array_key_exists() keep working afterwards.
        $this->hashTable = [];
        return true;
    }

    /**
     * Check whether any entry holds the given value.
     * Comparison is loose (==), as it always has been for this method.
     * @param $value
     * @return bool
     */
    public function containsValue($value)
    {
        return in_array($value, $this->hashTable);
    }

    /**
     * Check whether the given key is present.
     * @param $key
     * @return bool
     */
    public function containsKey($key)
    {
        return array_key_exists($key, $this->hashTable);
    }

    /**
     * Number of entries in the map.
     * @return int
     */
    public function size()
    {
        return count($this->hashTable);
    }

    /**
     * Whether the map has no entries.
     * @return bool
     */
    public function isEmpty()
    {
        return 0 === count($this->hashTable);
    }
}

2、封装敏感词过滤逻辑

<?php
/**
 * DFA算法,将敏感词、违规词 按字符进行tree存储,实现有限违规词词库;
 * 将检测文本分字匹配检测记录起始索引位置,方便替换或统计含有违规词个数;
 * 敏感词之间不要有包含关系。
 */

namespace DFAMaster;

use Exception;

class SensitiveWords
{
    /**
     * Character length of the text most recently scanned by getBadWord()/islegal().
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Shared instance handed out by init().
     * @var self|null
     */
    private static $_instance = null;

    /**
     * Root of the sensitive-word tree; null until setTree()/setTreeByFile() has run.
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed hit list consulted by replace() and mark().
     * Nothing in this class assigns it, so by default both methods scan the content themselves.
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Return the shared instance, creating it on first use.
     * @return self
     */
    public static function init()
    {
        if (!(self::$_instance instanceof self)) {
            self::$_instance = new self();
        }
        return self::$_instance;
    }

    /**
     * Multibyte string length that raises instead of returning false.
     * @param      $str
     * @param null $encoding character encoding; null uses the mbstring internal encoding
     * @return int
     * @throws Exception when \mb_strlen() reports failure
     */
    public function mb_strlen($str, $encoding = null)
    {
        // Omit the argument entirely for null so the internal encoding is used on every PHP version.
        $length = null === $encoding ? \mb_strlen($str) : \mb_strlen($str, $encoding);
        if ($length === false) {
            throw new Exception(' encoding 无效');
        }

        return $length;
    }

    /**
     * Add the words of a file (one word per line) to the word tree.
     * Words are appended to an existing tree; a tree is created if none exists yet.
     * @param string $filepath
     * @return $this
     * @throws Exception when the file does not exist or cannot be opened
     */
    public function setTreeByFile($filepath = '')
    {
        if (!file_exists($filepath)) {
            throw new Exception('敏感违规词库文件不存在', 10003);
        }

        // Reuse the current tree if there is one, otherwise start a fresh one.
        $this->wordTree = $this->wordTree ?: new HashMap();

        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }

        return $this;
    }


    /**
     * Build a fresh word tree from a list of words, discarding any previous tree.
     * @param null $sensitiveWords list of sensitive words
     * @return $this
     * @throws Exception when the list is empty
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new Exception('敏感违规词库不能为空', 10002);
        }

        $this->wordTree = new HashMap();

        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }
        return $this;
    }

    /**
     * Collect the sensitive words that occur in a text, scanning left to right.
     * After a hit the scan resumes behind the matched word, so hits never overlap.
     * @param string $content text to scan
     * @param int $matchType 1 = stop at the shortest word at each position (default);
     *                       any other value = take the longest word at each position
     * @param int $wordNum maximum number of hits to return; 0 returns all
     * @return array matched words in order of appearance (duplicates included)
     * @throws Exception when no word tree has been built
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->requireTree();
        $this->contentLength = $this->mb_strlen($content, 'utf-8');
        $badWordList = [];
        for ($start = 0; $start < $this->contentLength; $start++) {
            $matchedLength = $this->matchLengthAt($content, $start, $this->contentLength, 1 === $matchType);

            // No complete word starts here.
            if ($matchedLength <= 0) {
                continue;
            }

            $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');

            // Caller asked for a limited number of hits.
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                return $badWordList;
            }

            // Continue right after the matched word (the loop adds the final +1).
            $start += $matchedLength - 1;
        }
        return $badWordList;
    }

    /**
     * Replace every detected sensitive word in a text.
     * @param        $content      text to filter
     * @param string $replaceChar replacement string
     * @param bool $repeat true => repeat $replaceChar once per character of the matched word
     * @param int $matchType see getBadWord()
     * @return mixed the filtered text
     * @throws Exception when the content is empty or no word tree has been built
     */
    public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
    {
        $this->assertHasContent($content);
        $badWordList = self::$badWordList ?: $this->getBadWord($content, $matchType);
        // Nothing detected: return the text untouched.
        if (empty($badWordList)) {
            return $content;
        }

        // str_replace() already handles every occurrence, so each distinct word needs one pass only.
        foreach (array_unique($badWordList) as $badWord) {
            $replacement = $repeat
                ? $this->dfaBadWordConversChars($badWord, $replaceChar)
                : $replaceChar;
            $content = str_replace($badWord, $replacement, $content);
        }
        return $content;
    }

    /**
     * Wrap every detected sensitive word in a pair of tags.
     * @param        $content    text to annotate
     * @param string $sTag opening tag, e.g. <mark>
     * @param string $eTag closing tag, e.g. </mark>
     * @param int $matchType see getBadWord()
     * @return mixed the annotated text
     * @throws Exception when the content is empty or no word tree has been built
     */
    public function mark($content, $sTag, $eTag, $matchType = 1)
    {
        $this->assertHasContent($content);
        $badWordList = self::$badWordList ?: $this->getBadWord($content, $matchType);
        // Nothing detected: return the text untouched.
        if (empty($badWordList)) {
            return $content;
        }

        // De-duplicate so a word that occurs several times is not wrapped more than once.
        foreach (array_unique($badWordList) as $badWord) {
            $content = str_replace($badWord, $sTag . $badWord . $eTag, $content);
        }
        return $content;
    }

    /**
     * Report whether a text contains at least one sensitive word.
     * NOTE: despite the method name, the return value is TRUE when a sensitive word IS found
     * and FALSE when the text is clean. This is kept as-is for backward compatibility.
     * @param $content
     * @return bool true when a sensitive word was found, false otherwise
     * @throws Exception when no word tree has been built
     */
    public function islegal($content)
    {
        $this->requireTree();
        $this->contentLength = $this->mb_strlen($content, 'utf-8');

        // Every position is tried: skipping ahead after a partial (incomplete) match
        // would miss words that begin inside the skipped span.
        for ($start = 0; $start < $this->contentLength; $start++) {
            if ($this->matchLengthAt($content, $start, $this->contentLength, true) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Lazily read a word file line by line.
     * @param $filepath
     * @return \Generator yields each line as returned by fgets()
     * @throws Exception when the file cannot be opened (raised when iteration starts)
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if (false === $fp) {
            throw new Exception('敏感违规词库文件无法读取', 10003);
        }

        try {
            // fgets() returns false at EOF or on error; stop there instead of yielding false.
            while (false !== ($line = fgets($fp))) {
                yield $line;
            }
        } finally {
            // Runs even when the consumer abandons the generator early.
            fclose($fp);
        }
    }

    /**
     * Insert one word into the tree, one character per level.
     * Every node carries an 'ending' flag that is true where a complete word ends.
     * @param $word
     * @return void
     * @throws Exception
     */
    protected function buildWordToTree($word = '')
    {
        if ('' === $word) {
            return;
        }

        $wordLength = $this->mb_strlen($word, 'utf-8');
        $tree = $this->wordTree;

        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');

            // Descend into the child node for this character, creating it when missing.
            $child = $tree->get($keyChar);
            if (!$child) {
                $child = new HashMap();
                $child->put('ending', false);
                $tree->put($keyChar, $child);
            }
            $tree = $child;

            // The node of the last character marks the end of a complete word.
            if ($i == $wordLength - 1) {
                $tree->put('ending', true);
            }
        }
    }

    /**
     * Build a replacement made of $char repeated once per character of $word.
     * @param $word
     * @param $char
     * @return string
     * @throws Exception
     */
    protected function dfaBadWordConversChars($word, $char)
    {
        return str_repeat($char, $this->mb_strlen($word, 'utf-8'));
    }

    /**
     * Length (in characters) of the sensitive word that starts at $start, or 0 if none does.
     * @param string $content text being scanned
     * @param int $start character offset to match from
     * @param int $contentLength character length of $content
     * @param bool $stopAtFirst true => return the shortest complete word; false => the longest
     * @return int
     */
    private function matchLengthAt($content, $start, $contentLength, $stopAtFirst)
    {
        $node = $this->wordTree;
        $matchedLength = 0;

        for ($i = $start; $i < $contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));

            // No branch for this character: the walk ends here.
            if (empty($node)) {
                break;
            }

            // Still in the middle of a word; keep walking.
            if (false === $node->get('ending')) {
                continue;
            }

            // Record the length only at a word end, so characters walked past the
            // last complete word are never counted as part of the match.
            $matchedLength = $i - $start + 1;
            if ($stopAtFirst) {
                break;
            }
        }

        return $matchedLength;
    }

    /**
     * Ensure a word tree exists before scanning.
     * @return void
     * @throws Exception when setTree()/setTreeByFile() has not been called
     */
    private function requireTree()
    {
        if (null === $this->wordTree) {
            throw new Exception('敏感违规词库不能为空', 10002);
        }
    }

    /**
     * Reject empty content while still accepting numeric text such as "0".
     * @param $content
     * @return void
     * @throws Exception when the content is empty
     */
    private function assertHasContent($content)
    {
        if (empty($content) && !is_numeric($content)) {
            throw new Exception('请填写检测的内容', 10001);
        }
    }
}

3、测试使用

<?php

use DFAMaster\SensitiveWords;

class Test
{
    protected $wordData;
    protected $content = '';
    protected $wordsPath = '';

    /**
     * @param $content text to run the demo against
     */
    public function __construct($content)
    {
        // Text to be checked.
        $this->content = $content;
        // Sample sensitive words, split from a comma-separated list.
        $this->wordData = explode(',', '被指抄袭,本公司担,本无码,毕业證,办证,证书');
        // Path of a sensitive-word file (only needed for the file-based variant).
        $this->wordsPath = '/data/words.txt';
    }

    /**
     * Run each public operation of SensitiveWords once against the sample text.
     * The results are computed but not returned or printed.
     * @return void
     * @throws Exception
     */
    public function test()
    {
        $text = $this->content;

        // Every sensitive word found in the text.
        $allHits = $this->filter()->getBadWord($text);

        // At most two hits: enough to decide whether to reject the text.
        $firstHits = $this->filter()->getBadWord($text, 1, 2);

        // Each hit replaced by a single '*'.
        $masked = $this->filter()->replace($text, '*');

        // Each hit replaced by one '*' per character.
        $maskedPerChar = $this->filter()->replace($text, '*', true);

        // Each hit wrapped in <mark> tags.
        $marked = $this->filter()->mark($text, '<mark>', '</mark>');
    }

    /**
     * Shared filter instance with its tree rebuilt from the sample word list.
     * To load the words from a file instead, chain ->setTreeByFile($this->wordsPath).
     * @return SensitiveWords
     * @throws Exception
     */
    private function filter()
    {
        return SensitiveWords::init()->setTree($this->wordData);
    }

}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值