敏感词过滤的方法有很多,如何能更快速更大范围的匹配敏感词?
敏感词库小的情况,用正则、字符串匹配即可实现,还可以按照首字母对敏感词分组以提高效率。
当词库比较大时,就需要考虑效率问题了···
找了很多实现方法,感觉不是很适合现在的需求,于是自己尝试实现了一下,用的也是DFA算法。
1. 创建敏感词库
1.1 收集敏感词
1.2.1 使用指定字符串分割
// 测试敏感词库:
敏感词|敏感词汇|敏感词语|敏感单词
1.2.2 处理敏感词汇(DFA算法: 有穷自动机, 通过event和当前的state得到下一个state, 即 event + state = nextstate)
将拆分的字符,整理成树状结构
// 词库处理后格式(伪代码, 仅用于展示):
{
item: "R",
isEnd: false,
children:
{
item: "敏",
isEnd: false,
children:
{
item: "感",
isEnd: false,
children:
{
item: "词",
isEnd: true,
children:
{
"item": "汇",
"isEnd": true,
"children": null
},
{
"item": "语",
"isEnd": true,
"children": null
}
},
{
"item": "单",
"isEnd": false,
"children":
{
"item": "词",
"isEnd": true,
"children": null
}
}
}
}
}
处理:
/// <summary>
/// A single node of the sensitive-word character tree.
/// </summary>
public class TreeNode
{
    /// <summary>The character held by this node.</summary>
    public char item;

    /// <summary>Flag marking this node as the last character of a word.</summary>
    public bool isEnd;

    /// <summary>Nodes for the characters that may follow this one; may be null.</summary>
    public List<TreeNode> children;
}
/// <summary>
/// Loads a '|'-separated sensitive-word list from a Unity <c>TextAsset</c> and
/// builds a character tree from it: words that share a prefix share the
/// corresponding chain of nodes.
/// </summary>
public class SensitiveWordsLibrary
{
    // DFA (finite automaton): event + state = next state.
    // Path of the word-list asset, passed to Resources.Load.
    string sensitiveWordsResourcesPath = "SensitiveWordsLibrary/SensitiveWords";

    /// <summary>Root of the word tree; its children are the first characters of all words.</summary>
    private TreeNode treeRoot;

    /// <summary>
    /// Loads the word list and builds the word tree.
    /// </summary>
    public SensitiveWordsLibrary()
    {
        string[] sensitiveWords = LoadSensitiveWords();
        treeRoot = CreateTree(sensitiveWords);
    }

    /// <summary>
    /// Reads the word-list asset and splits its text on '|'.
    /// </summary>
    /// <returns>The raw entries, or an empty array when the asset is missing or empty.</returns>
    private string[] LoadSensitiveWords()
    {
        // TODO: validate the characters of the source file
        TextAsset textAsset = Resources.Load<TextAsset>(sensitiveWordsResourcesPath);
        if (textAsset == null)
            return new string[0];

        string wordStr = textAsset.text;
        if (string.IsNullOrEmpty(wordStr))
            return new string[0];

        return wordStr.Split('|');
    }

    #region --- Create Tree ---

    /// <summary>
    /// Builds the word tree from the given entries.
    /// </summary>
    /// <param name="words">Raw entries; null, empty and whitespace-only entries are ignored.</param>
    /// <returns>
    /// The root node. Its <c>children</c> list is never null, so code that walks
    /// the tree can call <c>children.Find</c> on the root even for an empty library.
    /// </returns>
    private TreeNode CreateTree(string[] words)
    {
        List<TreeNode> treeList = new List<TreeNode>();

        if (words == null || words.Length == 0)
        {
            Debug.Log("无敏感词库");
            // Empty (not null) child list: a null list here made every lookup on the root throw.
            return new TreeNode() { item = 'R', isEnd = true, children = treeList };
        }

        foreach (string rawWord in words)
        {
            string word = NormalizeWord(rawWord);
            if (word.Length == 0)
                continue;

            char cha = word[0];
            TreeNode treeNode = treeList.Find(e => e.item == cha);
            if (treeNode == null)
                treeList.Add(CreateSingleTree(word));
            else
                AddChildTree(treeNode, word);
        }

        return new TreeNode() { item = 'R', isEnd = false, children = treeList };
    }

    /// <summary>
    /// Prepares one raw entry for insertion: surrounding whitespace (for example a
    /// trailing line break in the asset) is removed and ASCII upper-case letters are
    /// lowered, because the checker lower-cases ASCII letters of the input before
    /// comparing them with tree nodes.
    /// </summary>
    /// <returns>The normalized word; an empty string for a null or blank entry.</returns>
    private static string NormalizeWord(string word)
    {
        if (string.IsNullOrEmpty(word))
            return string.Empty;

        char[] chars = word.Trim().ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            if (chars[i] >= 'A' && chars[i] <= 'Z')
                chars[i] = (char)(chars[i] + ('a' - 'A'));
        }
        return new string(chars);
    }

    /// <summary>
    /// Creates a new first-character node for <paramref name="word"/> and hangs the
    /// rest of the word below it.
    /// </summary>
    /// <returns>The new node, or null when the word is null or empty.</returns>
    private TreeNode CreateSingleTree(string word)
    {
        if (string.IsNullOrEmpty(word))
            return null;

        TreeNode root = new TreeNode() { item = word[0], isEnd = false, children = null };
        // Same walk as for an existing first-character node; it also marks the last node as a word end.
        AddChildTree(root, word);
        return root;
    }

    /// <summary>
    /// Inserts <paramref name="word"/> below <paramref name="childTree"/>, which must be
    /// the node of the word's first character. Existing nodes are reused for a shared
    /// prefix; missing ones are created. The node of the last character is marked as a word end.
    /// </summary>
    private void AddChildTree(TreeNode childTree, string word)
    {
        if (childTree == null || string.IsNullOrEmpty(word))
            return;

        TreeNode lastNode = childTree;
        // Start at the second character: the first one is childTree itself.
        for (int i = 1; i < word.Length; i++)
        {
            char cha = word[i];
            if (lastNode.children == null)
                lastNode.children = new List<TreeNode>();

            TreeNode childNode = lastNode.children.Find(e => e.item == cha);
            if (childNode == null)
            {
                childNode = new TreeNode() { item = cha, isEnd = false, children = null };
                lastNode.children.Add(childNode);
            }
            lastNode = childNode;
        }
        // For a one-character word the loop does not run and childTree itself becomes a word end.
        lastNode.isEnd = true;
    }

    #endregion
}
2. 对比字符串
按照树结构依次比对字符
注意问题:全角/半角转换,中文简繁体转换,英文大小写转换,特殊符号判断跳过
#region --- Checker ---
/// <summary>
/// Replaces every character position reported by <c>CheckWord</c> with
/// <paramref name="replaceChar"/>.
/// </summary>
/// <param name="word">Text to check; overwritten with the masked text when positions are reported.</param>
/// <param name="replaceChar">Character written over each reported position.</param>
/// <returns>
/// true when <c>CheckWord</c> reports nothing (the text is left untouched);
/// false when it reports at least one position.
/// </returns>
public bool CheckSensitiveWord(ref string word, char replaceChar = '*')
{
    List<int> hits = CheckWord(word);
    bool hasHits = hits != null && hits.Count > 0;
    if (!hasHits)
        return true;

    char[] buffer = word.ToCharArray();
    foreach (int position in hits)
    {
        // Ignore any position that falls outside the text.
        bool inRange = position >= 0 && position < buffer.Length;
        if (inRange)
            buffer[position] = replaceChar;
    }

    word = new string(buffer);
    return false;
}
/// <summary>
/// Finds the positions of all characters in <paramref name="text"/> that belong to a
/// word of the tree. Characters rejected by <c>CorrectChar</c> are skipped: they
/// neither match nor interrupt a word, and they are not reported.
/// </summary>
/// <remarks>
/// Every accepted character is tried as the start of a word and the tree is walked
/// from the root for as long as characters keep matching. Restarting from each
/// position finds overlapping words and words that begin inside a failed partial
/// match, which a single left-to-right pass with a one-character retry misses
/// (e.g. word "aab" in text "aaab").
/// </remarks>
/// <param name="text">Text to scan.</param>
/// <returns>
/// null when there is no tree or the text is null/empty; otherwise the matched
/// character indexes in ascending order (empty when nothing matched).
/// </returns>
private List<int> CheckWord(string text)
{
    if (treeRoot == null || string.IsNullOrEmpty(text))
        return null;

    // Indexes of sensitive characters
    List<int> checkIndexList = new List<int>();

    // An empty library (root without children) cannot match anything.
    if (treeRoot.children == null || treeRoot.children.Count == 0)
        return checkIndexList;

    // One flag per character: avoids a linear Contains() lookup for every recorded index.
    bool[] flagged = new bool[text.Length];
    // Indexes matched since the last word end on the current walk
    List<int> tempIndexList = new List<int>();

    for (int start = 0; start < text.Length; start++)
    {
        char first = text[start];
        // A word can only start on a character that takes part in matching.
        if (!CorrectChar(ref first))
            continue;

        tempIndexList.Clear();
        TreeNode treeNode = treeRoot;

        for (int i = start; i < text.Length; i++)
        {
            char cha = text[i];
            // Skipped characters do not break the word being matched.
            if (!CorrectChar(ref cha))
                continue;

            if (treeNode.children == null)
                break;

            TreeNode targetNode = treeNode.children.Find(e => e.item == cha);
            if (targetNode == null)
                break;

            tempIndexList.Add(i);

            // A word ends here (a node without children is treated as a word end too).
            bool wordEnds = targetNode.isEnd
                || targetNode.children == null
                || targetNode.children.Count == 0;
            if (wordEnds)
            {
                for (int m = 0; m < tempIndexList.Count; m++)
                    flagged[tempIndexList[m]] = true;
                tempIndexList.Clear();
            }

            // Keep walking: a longer word may share this prefix.
            treeNode = targetNode;
        }
    }

    for (int i = 0; i < flagged.Length; i++)
    {
        if (flagged[i])
            checkIndexList.Add(i);
    }
    return checkIndexList;
}
/// <summary>
/// Decides whether a character takes part in matching and normalizes it in place.
/// </summary>
/// <param name="cha">
/// Character to examine. ASCII letters are replaced by their lower-case form;
/// other characters are left unchanged.
/// </param>
/// <returns>
/// true for ASCII letters and for characters accepted by <c>IsChinese</c>;
/// false for digits and every other character, which callers skip.
/// </returns>
bool CorrectChar(ref char cha)
{
    // TODO: full-width -> half-width conversion (not enabled yet)
    //ToDBC(ref cha);

    // Digits are ignored when comparing
    if (IsNumber(cha))
        return false;

    // English letters are compared in lower case
    if (IsAlphabet(cha))
    {
        // Invariant lowering: the culture-sensitive char.ToLower maps 'I' to a
        // dotless 'ı' under Turkish cultures, which would never match a tree node.
        cha = char.ToLowerInvariant(cha);
        return true;
    }

    // Chinese characters are meant to be compared in Simplified form
    if (IsChinese(cha))
    {
        // TODO: Traditional -> Simplified conversion
        return true;
    }

    // Everything else is treated as a special symbol and ignored
    return false;
}
// ----------
// Scratch field of the original character helpers. IsChinese no longer writes to
// it; it is kept so that any other member still referring to it keeps compiling.
int charValue;

/// <summary>
/// Tests whether a character lies in the range U+4E00..U+9FA5 (inclusive).
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>true when the character is inside that range.</returns>
private static bool IsChinese(char character)
{
    // Compare the character directly instead of going through a shared instance
    // field, so the check has no side effects.
    return character >= '\u4e00' && character <= '\u9fa5';
}
/// <summary>
/// Tests whether a character is an ASCII digit ('0'..'9').
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>true for '0' through '9', false otherwise.</returns>
private static bool IsNumber(char character)
{
    // Direct comparison: no shared scratch field is written, so the check has no side effects.
    return character >= '0' && character <= '9';
}
/// <summary>
/// Tests whether a character is an ASCII letter ('A'..'Z' or 'a'..'z').
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>true for ASCII letters, false otherwise.</returns>
private static bool IsAlphabet(char character)
{
    // Direct comparison: no shared scratch field is written, so the check has no side effects.
    bool isUpper = character >= 'A' && character <= 'Z';
    bool isLower = character >= 'a' && character <= 'z';
    return isUpper || isLower;
}
/// <summary>
/// Converts a full-width character to its half-width counterpart, in place.
/// </summary>
/// <param name="cha">
/// Character to convert. The full-width space (12288) becomes the half-width
/// space (32); characters 65281..65374 are shifted down by 65248 onto 33..126.
/// Any other character is left unchanged.
/// </param>
private void ToDBC(ref char cha)
{
    const int FullWidthSpace = 0x3000;  // 12288
    const int HalfWidthSpace = 0x20;    // 32
    const int FullWidthFirst = 0xFF01;  // 65281
    const int FullWidthLast = 0xFF5E;   // 65374
    const int WidthOffset = 0xFEE0;     // 65248

    int code = cha;
    if (code == FullWidthSpace)
    {
        cha = (char)HalfWidthSpace;
        return;
    }

    if (code >= FullWidthFirst && code <= FullWidthLast)
        cha = (char)(code - WidthOffset);
}
#region --- 简体/繁体 ---
// 需要引用库 Microsoft.VisualBasic
// 中文字符工具类
//private const int LOCALE_SYSTEM_DEFAULT = 0x0800;
//private const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
//private const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;
//[System.Runtime.InteropServices.DllImport("kernel32", CharSet = System.Runtime.InteropServices.CharSet.Auto, SetLastError = true)]
//private static extern int LCMapString(int Locale, int dwMapFlags, string lpSrcStr, int cchSrc, [System.Runtime.InteropServices.Out] string lpDestStr, int cchDest);
// 将字符转换成简体中文
//public static string ToSimplified(string source)
//{
// String target = new String(' ', source.Length);
// int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_SIMPLIFIED_CHINESE, source, source.Length, target, source.Length);
// return target;
//}
// 将字符转换为繁体中文
//public static string ToTraditional(string source)
//{
// String target = new String(' ', source.Length);
// int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_TRADITIONAL_CHINESE, source, source.Length, target, source.Length);
// return target;
//}
#endregion
#endregion