DFA(确定有穷自动机)算法实现敏感词过滤的 PHP 示例。
1、PHP 使用 Array 实现 HashMap 类库:
<?php
/**
* php 使用array 构建HashMap 结构类
**/
namespace DFAMaster;
/**
 * Minimal HashMap facade over a native PHP array.
 *
 * Keys follow PHP array-key rules (numeric strings such as "1" are stored
 * as integers). Values may be anything, including falsy values such as
 * 0, '' or false.
 */
class HashMap
{
    /**
     * Backing store mapping keys to values.
     * @var array
     */
    protected $hashTable = array();

    public function __construct()
    {
    }

    /**
     * Store a value under a key.
     *
     * @param int|string $key
     * @param mixed $value
     * @return mixed|null the value previously stored under $key, or null if there was none
     */
    public function put($key, $value)
    {
        $previous = $this->get($key);
        $this->hashTable[$key] = $value;
        return $previous;
    }

    /**
     * Look up the value stored under a key.
     *
     * @param int|string $key
     * @return mixed|null the stored value, or null when the key is absent
     */
    public function get($key)
    {
        if (array_key_exists($key, $this->hashTable)) {
            return $this->hashTable[$key];
        }
        return null;
    }

    /**
     * Remove a key and return the value it held.
     *
     * @param int|string $key
     * @return mixed|null the removed value, or null when the key was absent
     */
    public function remove($key)
    {
        if (!array_key_exists($key, $this->hashTable)) {
            return null;
        }
        $removed = $this->hashTable[$key];
        // unset() deletes exactly this key. Walking the table with current()
        // would stop at the first falsy value and drop everything after it.
        unset($this->hashTable[$key]);
        return $removed;
    }

    /**
     * List every key in insertion order.
     *
     * @return array
     */
    public function keys()
    {
        return array_keys($this->hashTable);
    }

    /**
     * List every value in insertion order.
     *
     * @return array
     */
    public function values()
    {
        return array_values($this->hashTable);
    }

    /**
     * Copy every entry of another map into this one, overwriting existing keys.
     *
     * @param HashMap $map
     * @return void
     */
    public function putAll($map)
    {
        // An empty map yields no keys, so no separate emptiness check is needed.
        foreach ($map->keys() as $key) {
            $this->put($key, $map->get($key));
        }
    }

    /**
     * Remove every entry.
     *
     * @return bool always true
     */
    public function removeAll()
    {
        // Reset to an empty array (not null) so count() and
        // array_key_exists() keep working afterwards.
        $this->hashTable = array();
        return true;
    }

    /**
     * Tell whether any entry holds the given value.
     *
     * @param mixed $value
     * @return bool
     */
    public function containsValue($value)
    {
        // Loose (==) comparison is intentional: it is what this method has
        // always used. in_array() inspects every entry regardless of falsy
        // values and does not depend on the array's internal pointer.
        return in_array($value, $this->hashTable);
    }

    /**
     * Tell whether the given key is present.
     *
     * @param int|string $key
     * @return bool
     */
    public function containsKey($key)
    {
        return array_key_exists($key, $this->hashTable);
    }

    /**
     * Number of entries in the map.
     *
     * @return int
     */
    public function size()
    {
        return count($this->hashTable);
    }

    /**
     * Tell whether the map has no entries.
     *
     * @return bool
     */
    public function isEmpty()
    {
        return count($this->hashTable) === 0;
    }
}
2、封装敏感词过滤逻辑
<?php
/**
* DFA算法,将敏感词、违规词 按字符进行tree存储,实现有限违规词词库;
* 将检测文本分字匹配检测记录起始索引位置,方便替换或统计含有违规词个数;
* 敏感词之间不要有包含关系。
*/
namespace DFAMaster;
use Exception;
/**
 * Sensitive-word filter backed by a character trie.
 *
 * Each trie node is a HashMap whose single-character keys point to child
 * nodes and whose reserved 'ending' key holds a boolean telling whether a
 * dictionary word finishes at that node.
 */
class SensitiveWords
{
    /**
     * Length, in UTF-8 characters, of the content most recently scanned.
     * @var int
     */
    protected $contentLength = 0;

    /**
     * Shared instance returned by init().
     * @var self|null
     */
    private static $_instance = null;

    /**
     * Root node of the dictionary trie; null until a dictionary is loaded.
     * @var HashMap|null
     */
    protected $wordTree = null;

    /**
     * Optional pre-computed match list. When non-empty, replace() and mark()
     * use it instead of scanning. Nothing in this class assigns it.
     * @var array|null
     */
    protected static $badWordList = null;

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return self
     */
    public static function init()
    {
        if (!self::$_instance instanceof self) {
            self::$_instance = new self();
        }
        return self::$_instance;
    }

    /**
     * Character length of a string, with failures reported as an Exception.
     *
     * @param string $str
     * @param string|null $encoding encoding name; null uses mbstring's internal encoding
     * @return int
     * @throws Exception when the encoding is not recognised
     */
    public function mb_strlen($str, $encoding = null)
    {
        try {
            // Do not forward a null encoding: older PHP versions treat it as
            // an (invalid) empty encoding name.
            $length = null === $encoding ? \mb_strlen($str) : \mb_strlen($str, $encoding);
        } catch (\ValueError $e) {
            // PHP 8 throws ValueError for an unknown encoding; older versions return false.
            $length = false;
        }
        if ($length === false) {
            throw new Exception(' encoding 无效');
        }
        return $length;
    }

    /**
     * Load dictionary words from a file, one word per line.
     *
     * Words are added to the current trie if one exists; otherwise a new
     * trie is started. Lines are trimmed and blank lines are ignored.
     *
     * @param string $filepath
     * @return $this
     * @throws Exception when the file does not exist or cannot be opened
     */
    public function setTreeByFile($filepath = '')
    {
        // is_file() also rejects directories, which file_exists() would accept.
        if (!is_file($filepath)) {
            throw new Exception('敏感违规词库文件不存在', 10003);
        }
        $this->wordTree = $this->wordTree ?: new HashMap();
        foreach ($this->yieldToReadFile($filepath) as $word) {
            $this->buildWordToTree(trim($word));
        }
        return $this;
    }

    /**
     * Replace the dictionary with the given list of words.
     *
     * @param array|null $sensitiveWords
     * @return $this
     * @throws Exception when the list is empty
     */
    public function setTree($sensitiveWords = null)
    {
        if (empty($sensitiveWords)) {
            throw new Exception('敏感违规词库不能为空', 10002);
        }
        $this->wordTree = new HashMap();
        foreach ($sensitiveWords as $word) {
            $this->buildWordToTree($word);
        }
        return $this;
    }

    /**
     * Collect the sensitive words found in a text, scanning left to right
     * without overlaps.
     *
     * @param string $content text to scan
     * @param int $matchType 1 (strictly) = shortest word at each position; anything else = longest word
     * @param int $wordNum stop after this many matches; 0 = no limit
     * @return array matched words in order of appearance (duplicates kept)
     * @throws Exception when no dictionary has been loaded
     */
    public function getBadWord($content, $matchType = 1, $wordNum = 0)
    {
        $this->requireTree();
        $this->contentLength = $this->mb_strlen($content, 'utf-8');
        $shortestOnly = (1 === $matchType);
        $badWordList = array();
        for ($start = 0; $start < $this->contentLength; $start++) {
            $matchedLength = $this->matchLengthAt($content, $start, $shortestOnly);
            if ($matchedLength <= 0) {
                continue;
            }
            $badWordList[] = mb_substr($content, $start, $matchedLength, 'utf-8');
            if ($wordNum > 0 && count($badWordList) == $wordNum) {
                return $badWordList;
            }
            // Resume right after the matched word (the loop adds the final +1).
            $start += $matchedLength - 1;
        }
        return $badWordList;
    }

    /**
     * Replace every detected sensitive word in a text.
     *
     * @param string $content text to filter
     * @param string $replaceChar replacement text
     * @param bool $repeat true = repeat $replaceChar once per character of the matched word
     * @param int $matchType see getBadWord()
     * @return string
     * @throws Exception when $content is empty or no dictionary has been loaded
     */
    public function replace($content, $replaceChar = '', $repeat = false, $matchType = 1)
    {
        $this->assertHasContent($content);
        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
        if (empty($badWordList)) {
            return $content;
        }
        $pairs = array();
        foreach ($badWordList as $badWord) {
            $pairs[$badWord] = $repeat
                ? $this->dfaBadWordConversChars($badWord, $replaceChar)
                : $replaceChar;
        }
        // strtr() substitutes in a single pass (longest key first), so text
        // inserted by one replacement is never rewritten by another.
        return strtr($content, $pairs);
    }

    /**
     * Wrap every detected sensitive word in a pair of tags.
     *
     * @param string $content text to annotate
     * @param string $sTag opening tag, e.g. <mark>
     * @param string $eTag closing tag, e.g. </mark>
     * @param int $matchType see getBadWord()
     * @return string
     * @throws Exception when $content is empty or no dictionary has been loaded
     */
    public function mark($content, $sTag, $eTag, $matchType = 1)
    {
        $this->assertHasContent($content);
        $badWordList = self::$badWordList ? self::$badWordList : $this->getBadWord($content, $matchType);
        if (empty($badWordList)) {
            return $content;
        }
        $pairs = array();
        foreach (array_unique($badWordList) as $badWord) {
            $pairs[$badWord] = $sTag . $badWord . $eTag;
        }
        // Single-pass substitution avoids nested tags when one word is a
        // substring of another or of the tags themselves.
        return strtr($content, $pairs);
    }

    /**
     * Tell whether a text contains at least one sensitive word.
     *
     * NOTE: despite the name, this returns true when a sensitive word IS
     * found and false when the text is clean. The return convention is kept
     * as-is for existing callers.
     *
     * @param string $content
     * @return bool true if a sensitive word was found
     * @throws Exception when no dictionary has been loaded
     */
    public function islegal($content)
    {
        $this->requireTree();
        $this->contentLength = $this->mb_strlen($content, 'utf-8');
        // Every start position is tried: a failed partial match must not
        // skip characters, because another word may begin inside it.
        for ($start = 0; $start < $this->contentLength; $start++) {
            if ($this->matchLengthAt($content, $start, true) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Lazily read a file line by line.
     *
     * @param string $filepath
     * @return \Generator yields each line, including its line terminator
     * @throws Exception when the file cannot be opened
     */
    protected function yieldToReadFile($filepath)
    {
        $fp = fopen($filepath, 'r');
        if ($fp === false) {
            throw new Exception('敏感违规词库文件无法读取', 10003);
        }
        try {
            while (($line = fgets($fp)) !== false) {
                yield $line;
            }
        } finally {
            // Runs even when the consumer stops iterating early.
            fclose($fp);
        }
    }

    /**
     * Insert one word into the dictionary trie.
     *
     * @param string $word
     * @return void
     * @throws Exception
     */
    protected function buildWordToTree($word = '')
    {
        $word = (string) $word;
        if ('' === $word) {
            return;
        }
        if (null === $this->wordTree) {
            $this->wordTree = new HashMap();
        }
        $node = $this->wordTree;
        $wordLength = $this->mb_strlen($word, 'utf-8');
        for ($i = 0; $i < $wordLength; $i++) {
            $keyChar = mb_substr($word, $i, 1, 'utf-8');
            $child = $node->get($keyChar);
            if (!$child) {
                // New branch: nodes start out as "not the end of a word".
                $child = new HashMap();
                $child->put('ending', false);
                $node->put($keyChar, $child);
            }
            $node = $child;
        }
        // $word is non-empty, so $node is now the node of its last character.
        $node->put('ending', true);
    }

    /**
     * Build the mask for a matched word: $char repeated once per character.
     *
     * @param string $word
     * @param string $char
     * @return string
     * @throws Exception
     */
    protected function dfaBadWordConversChars($word, $char)
    {
        return str_repeat((string) $char, $this->mb_strlen($word, 'utf-8'));
    }

    /**
     * Length of the dictionary word that starts at a given character offset.
     *
     * Relies on $this->contentLength holding the length of $content.
     *
     * @param string $content
     * @param int $start character offset to match from
     * @param bool $shortestOnly stop at the first complete word instead of the longest
     * @return int characters covered by the match, or 0 when no word starts here
     */
    private function matchLengthAt($content, $start, $shortestOnly)
    {
        $node = $this->wordTree;
        $matchedLength = 0;
        for ($i = $start; $i < $this->contentLength; $i++) {
            $node = $node->get(mb_substr($content, $i, 1, 'utf-8'));
            if (empty($node)) {
                break;
            }
            if (false === $node->get('ending')) {
                continue;
            }
            // Record only positions where a complete word ends, so trailing
            // characters of an unfinished longer word are never included.
            $matchedLength = $i - $start + 1;
            if ($shortestOnly) {
                break;
            }
        }
        return $matchedLength;
    }

    /**
     * Fail with the documented Exception when no dictionary has been loaded.
     *
     * @return void
     * @throws Exception
     */
    private function requireTree()
    {
        if (null === $this->wordTree) {
            throw new Exception('敏感违规词库不能为空', 10002);
        }
    }

    /**
     * Reject missing or empty content. Unlike empty(), the text "0" is accepted.
     *
     * @param mixed $content
     * @return void
     * @throws Exception
     */
    private function assertHasContent($content)
    {
        if (!is_scalar($content) || '' === (string) $content) {
            throw new Exception('请填写检测的内容', 10001);
        }
    }
}
3、测试使用
<?php
use DFAMaster\SensitiveWords;
/**
 * Usage demo for SensitiveWords: runs each public operation once against a
 * small in-memory dictionary.
 */
class Test
{
    /**
     * Dictionary used by the demo, one sensitive word per element.
     * @var array
     */
    protected $wordData;

    /**
     * Text to be scanned.
     * @var string
     */
    protected $content = '';

    /**
     * Path of a dictionary file, for the setTreeByFile() variant.
     * @var string
     */
    protected $wordsPath = '';

    /**
     * @param string $content text to be scanned
     */
    public function __construct($content)
    {
        $this->wordsPath = '/data/words.txt';
        $this->wordData = explode(',', '被指抄袭,本公司担,本无码,毕业證,办证,证书');
        $this->content = $content;
    }

    /**
     * Exercise detection, limited detection, replacement, masked replacement
     * and marking. Results are only held in local variables.
     *
     * In every call below, ->setTreeByFile($this->wordsPath) may be used in
     * place of ->setTree($this->wordData) to load the dictionary from a file.
     *
     * @return void
     * @throws Exception
     */
    public function test()
    {
        // init() always hands back the same shared instance.
        $filter = SensitiveWords::init();

        // Detect every sensitive word.
        $filterContent = $filter->setTree($this->wordData)->getBadWord($this->content);

        // Detect at most two words, e.g. to reject content as soon as any is found.
        $badWords = $filter->setTree($this->wordData)->getBadWord($this->content, 1, 2);

        // Replace each sensitive word with a single '*'.
        $filterContent = $filter->setTree($this->wordData)->replace($this->content, '*');

        // Replace each sensitive word with one '*' per character.
        $filterContent = $filter->setTree($this->wordData)->replace($this->content, '*', true);

        // Wrap each sensitive word in <mark> tags.
        $markedContent = $filter->setTree($this->wordData)->mark($this->content, '<mark>', '</mark>');
    }
}