已有6000条关键字,分3批次。
一批为替换 replace,一批为遇到需要审核 censor,最后一批为遇到就禁止发布banned。
设计数据表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+-------------+----------------------+------+-----+---------+----------------+
| id | smallint(6) unsigned | NO | PRI | NULL | auto_increment |
| censortype | smallint(6) | NO | | 1 | |
| find | varchar(120) | NO | UNI | | |
| replacement | varchar(255) | NO | | | |
| extra | varchar(255) | NO | | | |
| uptime | int(11) | YES | | NULL | |
| enable | int(1) | NO | | 1 | |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)
由于有6000多关键字,使用 foreach 的 strstr?还是preg_match ?
追求效率,每小时提交量为10万多文章。
刚刚写的一种:
php
namespace app\helpers;
use app\models\other\Censor;
use app\models\other\CensorLog;
/**
 * Keyword filter for user-submitted text.
 *
 * Keywords are loaded from the Censor model and split into three groups by
 * their `replacement` column:
 *   - '{banned}'  : a hit means the text must not be published;
 *   - '{censor}'  : a hit means the text needs manual review;
 *   - anything else: the keyword (`find`) is rewritten to `replacement`.
 *
 * The compiled groups are stored in the application cache under $id.
 */
class CensorHelper
{
    /** @var string cache key under which the compiled keyword data is stored */
    public $id;

    /** @var array compiled data: 'censor' / 'banned' regex strings and 'replace' from/to lists */
    public $data;

    /** @var string[] banned keywords found so far by banned()/check() */
    public $match_banned;

    /** @var string[] review keywords found so far by censor()/check() */
    public $match_censor;

    /**
     * @param string $id cache key for the compiled keyword data
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getData();
    }

    /**
     * Returns the compiled keyword data, building it from the database when
     * the cache has no entry.
     *
     * Note: entries cached by the earlier implementation stored the replace
     * pairs the wrong way round; call flush() once after deploying this version.
     *
     * @return array ['censor' => string, 'banned' => string, 'replace' => ['from' => string[], 'to' => string[]]]
     */
    public function getData()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = Censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // The keyword is what we search for; `replacement` is what it becomes.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        $data = [
            'censor' => $this->generateRegularExpression($censor),
            'banned' => $this->generateRegularExpression($banned),
            'replace' => $replace,
        ];

        // Cache whenever at least one keyword exists, including replace-only sets.
        if (!empty($words)) {
            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Builds a case-insensitive alternation pattern from a keyword list.
     *
     * @param array $words literal keywords
     * @return string the pattern, or '' when $words is empty (an empty
     *                alternation "//i" would match every string)
     */
    public function generateRegularExpression(array $words)
    {
        if (empty($words)) {
            return '';
        }

        // Escape the "/" delimiter as well, so a keyword containing it cannot break the pattern.
        $quoted = array_map(function ($word) {
            return preg_quote($word, '/');
        }, $words);

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Runs both the banned and the review checks, strictest first.
     *
     * @param string $string text to inspect
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Records every review-level keyword found in $string in $match_censor.
     *
     * @param string $string text to inspect
     */
    public function censor($string)
    {
        $pattern = isset($this->data['censor']) ? $this->data['censor'] : '';
        $this->match_censor = $this->mergeMatches($this->match_censor, $pattern, $string);
    }

    /**
     * Records every banned keyword found in $string in $match_banned.
     *
     * @param string $string text to inspect
     */
    public function banned($string)
    {
        $pattern = isset($this->data['banned']) ? $this->data['banned'] : '';
        $this->match_banned = $this->mergeMatches($this->match_banned, $pattern, $string);
    }

    /**
     * Drops the cached keyword data and reloads it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        $this->data = $this->getData();
    }

    /**
     * Rewrites every replace-type keyword in $string to its replacement.
     *
     * @param string $string
     * @return mixed the rewritten string, or $string unchanged when there are no replace keywords
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Summarises the matches collected so far.
     *
     * @return string 'banned', 'censor' or 'pass'
     */
    public function getLevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Writes a CensorLog row describing the matches collected so far.
     *
     * @param mixed $tableId stored in the log's `datatb` field
     * @param mixed $dataId  stored in the log's `dataid` field
     */
    public function addLog($tableId, $dataId)
    {
        $log = new CensorLog();
        $log->datatb = $tableId;
        $log->dataid = $dataId;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getUname();
        }
        $log->ip = IpHelper::getIP();
        $log->iploc = IpHelper::getLocation($log->ip);
        $log->save();
    }

    /**
     * Appends all matches of $pattern in $string to $found, without duplicates.
     *
     * @param array  $found   matches collected so far
     * @param string $pattern regex from generateRegularExpression(); '' means "no keywords"
     * @param string $string  text to inspect
     * @return array the merged match list
     */
    private function mergeMatches(array $found, $pattern, $string)
    {
        if (empty($pattern)) {
            return $found;
        }

        // preg_match_all returns 0 for no match and false on error; neither adds anything.
        if (!preg_match_all($pattern, $string, $matches)) {
            return $found;
        }

        // $matches[0] is the list of full-pattern matches.
        return array_values(array_unique(array_merge($found, $matches[0])));
    }
}
trie 树算法最适合。
PHP 关键词过滤扩展,该扩展依赖于 libdatrie(Trie 算法的 C 语言实现)。
你这个敏感词匹配,不需要用到正则,只用简单的匹配或者替换就行了。
关键字分成三类存memcached。
然后对文章进行匹配,应该从最严厉的banned来匹配,接着是要censor的关键字,最后才是可以replace的敏感词。
1 遇到就禁止发布 => strpos
2 遇到需要审核 => strpos
3 替换 => str_replace