【C#】敏感词过滤校验

敏感词过滤的方法有很多,如何才能更快速、更大范围地匹配敏感词?

敏感词库小的情况,用正则、字符串匹配即可实现,还可以按照首字母对敏感词分组以提高效率。

当词库比较大时,就需要考虑效率问题了···

找了很多实现方法,感觉不是很适合现在的需求,于是自己尝试实现了一下,用的也是DFA算法。

参考:《C#敏感词过滤算法实现》、《Java实现敏感词过滤》

1. 创建敏感词库

1.1 收集敏感词

1.2.1 指定字符串分割

// 测试敏感词库:
敏感词|敏感词汇|敏感词语|敏感单词

1.2.2 处理敏感词汇(DFA算法: 有穷自动机, 通过event和当前的state得到下一个state, 即 event + state = nextstate)

将拆分的字符,整理成树状结构

// 词库处理后格式(伪代码, 仅用于展示):
{
    item: "R",
    isEnd: false,
    children:
        {
            item: "敏",
            isEnd: false,
            children: 
                {
                    item: "感",
                    isEnd: false,
                    children: 
                        {
                            item: "词",
                            isEnd: true,
                            children: 
                                {
                                    "item": "汇",
                                    "isEnd": true,
                                    "children": null 
                                },
                                {
                                    "item": "语",
                                    "isEnd": true,
                                    "children": null
                                }
                        },
                        {
                            "item": "单",
                            "isEnd": false,
                            "children":
                                {
                                    "item": "词",
                                    "isEnd": true,
                                    "children": null
                                }
                        }
                }
        }
}

处理:

    /// Node of the sensitive-word tree: one character plus the nodes for the characters that may follow it.
    public class TreeNode
    {
        // Character stored at this node.
        public char item;
        // True when the path from the root down to this node has been marked as a complete word.
        public bool isEnd;
        // Nodes for the characters that can follow this one; null (or empty) when there are none.
        public List<TreeNode> children;
    }
/// <summary>
/// Sensitive-word dictionary stored as a character tree.
/// The word list is read from a TextAsset whose text separates words with '|';
/// every word becomes a path of nodes below the root, and the node holding the
/// word's last character has isEnd set.
/// </summary>
public class SensitiveWordsLibrary
{
    // DFA (finite automaton): event + current state = next state.
    // Here the event is the next character and the state is the current tree node.

    // Path handed to Resources.Load to locate the word-list asset.
    string sensitiveWordsResourcesPath = "SensitiveWordsLibrary/SensitiveWords";

    /// Root of the word tree; its children hold the first characters of all words.
    private TreeNode treeRoot;

    /// <summary>
    /// Loads the word list and builds the word tree once, at construction time.
    /// </summary>
    public SensitiveWordsLibrary()
    {
        // Raw word list
        string[] sensitiveWords = LoadSensitiveWords();
        // Build the word tree
        treeRoot = CreateTree(sensitiveWords);
    }

    /// <summary>
    /// Reads the word-list asset and splits its text on '|'.
    /// </summary>
    /// <returns>The raw (not yet normalized) words, or null when the asset is missing or empty.</returns>
    private string[] LoadSensitiveWords()
    {
        // TODO: validate the characters of the source file
        TextAsset textAsset = Resources.Load<TextAsset>(sensitiveWordsResourcesPath);
        if (textAsset == null)
        {
            return null;
        }

        string wordStr = textAsset.text;
        if (string.IsNullOrEmpty(wordStr))
        {
            return null;
        }

        return wordStr.Split('|');
    }

    #region --- Create Tree ---

    /// <summary>
    /// Brings a raw word into the form the tree stores: surrounding whitespace
    /// (including line breaks left over from the asset file) is removed and letters
    /// are lower-cased, because the checker lower-cases letters before comparing.
    /// </summary>
    /// <param name="word">Raw word from the list; may be null.</param>
    /// <returns>The normalized word, or an empty string when nothing is left.</returns>
    private static string NormalizeWord(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return string.Empty;
        }

        return word.Trim().ToLowerInvariant();
    }

    /// <summary>
    /// Builds the word tree from the given words.
    /// </summary>
    /// <param name="words">Raw words; null/empty entries are ignored.</param>
    /// <returns>
    /// A root node (item 'R') whose children are the first characters of the words.
    /// When there are no words the root has an empty child list.
    /// </returns>
    private TreeNode CreateTree(string[] words)
    {
        if (words == null || words.Length == 0)
        {
            Debug.Log("无敏感词库");
            // Empty list instead of null so that lookups on the root never dereference null.
            return new TreeNode() { item = 'R', isEnd = true, children = new List<TreeNode>() };
        }

        List<TreeNode> treeList = new List<TreeNode>();

        foreach (string rawWord in words)
        {
            string word = NormalizeWord(rawWord);
            if (word.Length == 0)
            {
                continue;
            }

            // Words are grouped by their first character directly below the root.
            char cha = word[0];
            TreeNode treeNode = treeList.Find(e => e.item == cha);

            if (treeNode == null)
            {
                TreeNode newTreeNode = CreateSingleTree(word);
                if (newTreeNode != null)
                {
                    treeList.Add(newTreeNode);
                }
            }
            else
            {
                AddChildTree(treeNode, word);
            }
        }

        return new TreeNode() { item = 'R', isEnd = false, children = treeList };
    }

    /// <summary>
    /// Creates a new single-branch tree for a word whose first character is not in the tree yet.
    /// </summary>
    /// <param name="word">The word to store.</param>
    /// <returns>The node for the word's first character, or null for a null/empty word.</returns>
    private TreeNode CreateSingleTree(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return null;
        }

        TreeNode root = new TreeNode() { item = word[0], isEnd = false, children = null };
        // The remaining characters are inserted by the same routine that extends existing branches.
        AddChildTree(root, word);

        return root;
    }

    /// <summary>
    /// Inserts a word into an existing branch, reusing nodes shared with earlier words.
    /// </summary>
    /// <param name="childTree">Node that already holds the word's first character.</param>
    /// <param name="word">The word to store; its first character is assumed to equal childTree.item.</param>
    private void AddChildTree(TreeNode childTree, string word)
    {
        if (childTree == null || string.IsNullOrEmpty(word))
        {
            return;
        }

        TreeNode lastNode = childTree;

        // Start at the second character; the first one is childTree itself.
        // For a one-character word the loop does not run and childTree is marked as a word end.
        for (int i = 1; i < word.Length; i++)
        {
            char cha = word[i];

            if (lastNode.children == null)
            {
                lastNode.children = new List<TreeNode>();
            }

            // Reuse the node when an earlier word shares this prefix, otherwise create it.
            TreeNode childNode = lastNode.children.Find(e => e.item == cha);
            if (childNode == null)
            {
                childNode = new TreeNode() { item = cha, isEnd = false, children = null };
                lastNode.children.Add(childNode);
            }

            lastNode = childNode;
        }

        lastNode.isEnd = true;
    }

    #endregion
}

2. 对比字符串

按照树结构依次比对字符

注意问题:全角/半角转换,中文简繁体转换,英文大小写转换,特殊符号判断跳过


    #region --- Checker ---

    /// <summary>
    /// Masks every character position that CheckWord reports for the given text.
    /// </summary>
    /// <param name="word">Text to check; replaced by the masked text when any position was reported.</param>
    /// <param name="replaceChar">Character written over each reported position.</param>
    /// <returns>true when nothing was reported (text left unchanged); false when the text was masked.</returns>
    public bool CheckSensitiveWord(ref string word, char replaceChar = '*')
    {
        List<int> hits = CheckWord(word);

        bool nothingFound = hits == null || hits.Count == 0;
        if (nothingFound)
        {
            return true;
        }

        char[] buffer = word.ToCharArray();

        foreach (int position in hits)
        {
            // Positions outside the buffer are ignored rather than treated as an error.
            bool insideBuffer = position >= 0 && position < buffer.Length;
            if (insideBuffer)
            {
                buffer[position] = replaceChar;
            }
        }

        word = new string(buffer);
        return false;
    }

    /// <summary>
    /// Finds the positions of all characters in <paramref name="text"/> that belong to a word of the tree.
    /// Every position is tried as a possible start of a word, so overlapping words and words that
    /// begin inside a failed partial match are found as well.
    /// Characters rejected by CorrectChar are skipped: they neither break a match nor get reported.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>
    /// null when there is no tree or the text is null/empty; otherwise the reported positions
    /// in ascending order without duplicates (empty when nothing matched).
    /// </returns>
    private List<int> CheckWord(string text)
    {
        if (treeRoot == null || string.IsNullOrEmpty(text))
            return null;

        // Positions of sensitive characters
        List<int> checkIndexList = new List<int>();

        // A root without children (empty library) cannot match anything.
        if (treeRoot.children == null || treeRoot.children.Count == 0)
            return checkIndexList;

        // One flag per character; avoids searching the result list for duplicates.
        bool[] flagged = new bool[text.Length];
        // Positions matched so far for the current start position.
        List<int> tempIndexList = new List<int>();

        for (int start = 0; start < text.Length; start++)
        {
            // A skipped character cannot start a word; the next start position covers what follows it.
            char first = text[start];
            if (!CorrectChar(ref first))
                continue;

            TreeNode treeNode = treeRoot;
            tempIndexList.Clear();

            for (int i = start; i < text.Length; i++)
            {
                char cha = text[i];

                // Normalize the character; skipped characters do not interrupt the match.
                if (!CorrectChar(ref cha))
                    continue;

                if (treeNode.children == null)
                    break;

                TreeNode targetNode = treeNode.children.Find(e => e.item == cha);
                if (targetNode == null)
                    break;

                tempIndexList.Add(i);

                // A node ends a word when it is marked as such or has nothing below it.
                bool wordEnds = targetNode.isEnd
                    || targetNode.children == null
                    || targetNode.children.Count == 0;
                if (wordEnds)
                {
                    // Report everything matched so far, then keep walking so that a longer
                    // word sharing this prefix is reported in full too.
                    for (int m = 0; m < tempIndexList.Count; m++)
                        flagged[tempIndexList[m]] = true;
                }

                treeNode = targetNode;
            }
        }

        for (int i = 0; i < flagged.Length; i++)
        {
            if (flagged[i])
                checkIndexList.Add(i);
        }

        return checkIndexList;
    }

    /// <summary>
    /// Decides whether a character takes part in matching and normalizes it in place.
    /// </summary>
    /// <param name="cha">Character to examine; English letters are replaced by their lower-case form.</param>
    /// <returns>
    /// true for characters accepted by IsAlphabet or IsChinese;
    /// false for digits and for everything else, which is treated as a symbol to be skipped.
    /// </returns>
    bool CorrectChar(ref char cha)
    {
        // TODO: full-width / half-width conversion
        //ToDBC(ref cha);

        // Digits are ignored when comparing
        if (IsNumber(cha))
            return false;

        // English letters are compared in lower case
        if (IsAlphabet(cha))
        {
            // Invariant casing: char.ToLower follows the current culture, under which
            // 'I' does not map to 'i' in some locales (e.g. Turkish).
            cha = char.ToLowerInvariant(cha);
            return true;
        }

        // Chinese characters are meant to be compared in simplified form
        if (IsChinese(cha))
        {
            // TODO: traditional -> simplified conversion
            return true;
        }

        // Anything else counts as a special symbol and is ignored when comparing
        return false;
    }

    // ----------

    /// <summary>
    /// Tells whether the character is a Chinese ideograph.
    /// </summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>
    /// true for the CJK Unified Ideographs block (U+4E00 - U+9FFF) and
    /// CJK Unified Ideographs Extension A (U+3400 - U+4DBF).
    /// </returns>
    private bool IsChinese(char character)
    {
        // The whole blocks are accepted, not only the range up to U+9FA5, so that
        // ideographs added to Unicode later are not mistaken for symbols.
        return (character >= '\u4e00' && character <= '\u9fff')
            || (character >= '\u3400' && character <= '\u4dbf');
    }

    /// <summary>
    /// Tells whether the character is an ASCII digit ('0' - '9').
    /// </summary>
    /// <param name="character">Character to classify.</param>
    private bool IsNumber(char character)
    {
        return character >= '0' && character <= '9';
    }

    /// <summary>
    /// Tells whether the character is an ASCII letter ('A' - 'Z' or 'a' - 'z').
    /// </summary>
    /// <param name="character">Character to classify.</param>
    private bool IsAlphabet(char character)
    {
        return (character >= 'A' && character <= 'Z')
            || (character >= 'a' && character <= 'z');
    }

    /// <summary>
    /// Converts a full-width character to its half-width counterpart, in place.
    /// </summary>
    /// <param name="cha">Character to convert; left untouched when it is not full-width.</param>
    /// <remarks>
    /// The full-width space (U+3000) maps to the ordinary space (U+0020).
    /// The other full-width forms (U+FF01 - U+FF5E) sit exactly 0xFEE0 above
    /// their half-width counterparts (U+0021 - U+007E).
    /// </remarks>
    private void ToDBC(ref char cha)
    {
        int code = cha;

        if (code == 0x3000)
        {
            cha = ' ';
            return;
        }

        bool isFullWidthForm = code >= 0xFF01 && code <= 0xFF5E;
        if (isFullWidthForm)
        {
            cha = (char)(code - 0xFEE0);
        }
    }

    #region --- 简体/繁体 ---

    // 需要引用库 Microsoft.VisualBasic

    // 中文字符工具类
    //private const int LOCALE_SYSTEM_DEFAULT = 0x0800;
    //private const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
    //private const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;
    //[System.Runtime.InteropServices.DllImport("kernel32", CharSet = System.Runtime.InteropServices.CharSet.Auto, SetLastError = true)]
    //private static extern int LCMapString(int Locale, int dwMapFlags, string lpSrcStr, int cchSrc, [System.Runtime.InteropServices.Out] string lpDestStr, int cchDest);

    // 将字符转换成简体中文
    //public static string ToSimplified(string source)
    //{
    //    String target = new String(' ', source.Length);
    //    int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_SIMPLIFIED_CHINESE, source, source.Length, target, source.Length);
    //    return target;
    //}

    // 将字符转换为繁体中文
    //public static string ToTraditional(string source)
    //{
    //    String target = new String(' ', source.Length);
    //    int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_TRADITIONAL_CHINESE, source, source.Length, target, source.Length);
    //    return target;
    //}

    #endregion

    #endregion

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

萧然CS

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值