using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleTest
{
/// <summary>
/// Console demo: masks every blocked keyword found in a sample string and prints the result.
/// </summary>
internal class Program
{
    /// <summary>
    /// Pipe-separated list of blocked keywords used by <see cref="CheckDirtyWords"/>.
    /// </summary>
    private const string DirtyWords = "wolf|jason|特朗普|尼玛币";

    /// <summary>
    /// Entry point. Looks up blocked keywords in a hard-coded sample string, replaces each
    /// occurrence with "****", writes the masked text to the console and waits for Enter.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    private static void Main(string[] args)
    {
        var originstr = "sswo1lfsss殺殺殺尼玛币阿三勾特朗普大蘇打阿薩伊万卡";

        // string.Replace already rewrites every occurrence of a keyword, so one pass per
        // distinct keyword is enough even when the same keyword was hit several times.
        var hitKeywords = CheckDirtyWords(originstr)
            .Select(hit => hit.Keyword)
            .Distinct();

        foreach (var keyword in hitKeywords)
        {
            originstr = originstr.Replace(keyword, "****");
        }

        Console.WriteLine(originstr);
        Console.ReadLine();
    }

    /// <summary>
    /// Finds every blocked keyword occurring in <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text to scan; may be null or empty.</param>
    /// <returns>
    /// One entry per keyword occurrence. The list is empty (never null) when the text is
    /// null or empty, or when no keyword is configured.
    /// </returns>
    private static List<KeywordSearchResult> CheckDirtyWords(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return new List<KeywordSearchResult>();
        }

        // Drop empty entries so a stray '|' in the list cannot register an empty keyword,
        // which string.Replace in Main would reject with an ArgumentException.
        var keywords = DirtyWords.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries);
        if (keywords.Length == 0)
        {
            return new List<KeywordSearchResult>();
        }

        var filter = new KeywordFilter(keywords);
        return filter.FindAllKeywords(text);
    }
}
/// <summary>
/// Multi-keyword matcher implemented as an Aho-Corasick automaton: the keywords are compiled
/// into a trie with failure links in the constructor, and each text is then scanned in a
/// single left-to-right pass.
/// </summary>
public class KeywordFilter
{
    /// <summary>
    /// One state of the automaton (a trie node).
    /// </summary>
    private sealed class Node
    {
        // Outgoing edges keyed by character, for direct lookup.
        private readonly Dictionary<char, Node> transDict;

        public Node(char c, Node parent)
        {
            Char = c;
            Parent = parent;
            Transitions = new List<Node>();
            Results = new List<string>();
            transDict = new Dictionary<char, Node>();
        }

        /// <summary>Character on the edge leading into this node.</summary>
        public char Char { get; }

        /// <summary>Parent node in the trie; null for the root.</summary>
        public Node Parent { get; }

        /// <summary>Node to fall back to when no outgoing edge matches the next character.</summary>
        public Node Failure { get; set; }

        /// <summary>Child nodes in the order they were added.</summary>
        public List<Node> Transitions { get; }

        /// <summary>Keywords that end at this node, longest first.</summary>
        public List<string> Results { get; }

        /// <summary>Registers a keyword ending at this node, ignoring duplicates.</summary>
        public void AddResult(string result)
        {
            if (!Results.Contains(result))
            {
                Results.Add(result);
            }
        }

        /// <summary>Adds a child node reachable through <c>node.Char</c>.</summary>
        public void AddTransition(Node node)
        {
            transDict.Add(node.Char, node);
            // Append instead of rebuilding the whole list from the dictionary on every insert.
            Transitions.Add(node);
        }

        /// <summary>Returns the child reached through <paramref name="c"/>, or null if there is none.</summary>
        public Node GetTransition(char c)
        {
            Node node;
            return transDict.TryGetValue(c, out node) ? node : null;
        }

        /// <summary>Tells whether a child is reachable through <paramref name="c"/>.</summary>
        public bool ContainsTransition(char c)
        {
            return transDict.ContainsKey(c);
        }
    }

    private Node root; // root state of the automaton
    private readonly string[] keywords; // all non-empty keywords

    /// <summary>
    /// Builds the automaton for the given keywords.
    /// </summary>
    /// <param name="keywords">
    /// Keywords to search for. Null and empty entries are ignored: an empty keyword would
    /// match at every position and cannot be masked.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
    public KeywordFilter(IEnumerable<string> keywords)
    {
        if (keywords == null)
        {
            throw new ArgumentNullException(nameof(keywords));
        }

        this.keywords = keywords.Where(k => !string.IsNullOrEmpty(k)).ToArray();
        Initialize();
    }

    /// <summary>
    /// Builds the trie from the keywords and then computes the failure links breadth-first.
    /// </summary>
    private void Initialize()
    {
        root = new Node(' ', null);

        // Insert every keyword into the trie.
        foreach (var keyword in keywords)
        {
            var node = root;
            foreach (var c in keyword)
            {
                var next = node.GetTransition(c);
                if (next == null)
                {
                    next = new Node(c, node);
                    node.AddTransition(next);
                }

                node = next;
            }

            node.AddResult(keyword);
        }

        // First level: every child of the root fails back to the root.
        var pending = new Queue<Node>();
        foreach (var child in root.Transitions)
        {
            child.Failure = root;
            foreach (var grandChild in child.Transitions)
            {
                pending.Enqueue(grandChild);
            }
        }

        // Remaining nodes, breadth-first, so the failure target of a node is always
        // complete before the node itself is processed.
        while (pending.Count != 0)
        {
            var node = pending.Dequeue();

            // root.Failure is still null at this point, which terminates the walk at the root.
            var fallback = node.Parent.Failure;
            while (fallback != null && !fallback.ContainsTransition(node.Char))
            {
                fallback = fallback.Failure;
            }

            if (fallback == null)
            {
                node.Failure = root;
            }
            else
            {
                node.Failure = fallback.GetTransition(node.Char);

                // Every keyword ending at the failure target also ends here.
                foreach (var result in node.Failure.Results)
                {
                    node.AddResult(result);
                }
            }

            foreach (var child in node.Transitions)
            {
                pending.Enqueue(child);
            }
        }

        // The root fails to itself.
        root.Failure = root;
    }

    /// <summary>
    /// Advances the automaton by one character, following failure links until an edge for
    /// <paramref name="c"/> exists or the root is reached.
    /// </summary>
    private Node Step(Node current, char c)
    {
        var next = current.GetTransition(c);
        while (next == null && current != root)
        {
            current = current.Failure;
            next = current.GetTransition(c);
        }

        // When next is null the walk ended at the root, which is where matching restarts.
        return next ?? current;
    }

    /// <summary>
    /// Finds every occurrence of every keyword in the text.
    /// </summary>
    /// <param name="text">Text to scan.</param>
    /// <returns>
    /// One result per occurrence, carrying the zero-based start index and the keyword.
    /// The list is empty when <paramref name="text"/> is null or empty.
    /// </returns>
    public List<KeywordSearchResult> FindAllKeywords(string text)
    {
        var list = new List<KeywordSearchResult>();
        if (string.IsNullOrEmpty(text))
        {
            return list;
        }

        var current = root;
        for (var index = 0; index < text.Length; ++index)
        {
            current = Step(current, text[index]);
            foreach (var s in current.Results)
            {
                // The match ends at index, so it starts s.Length - 1 characters earlier.
                list.Add(new KeywordSearchResult(index - s.Length + 1, s));
            }
        }

        return list;
    }

    /// <summary>
    /// Replaces every keyword occurrence in the text with '*' characters of the same length.
    /// </summary>
    /// <param name="text">Text to filter.</param>
    /// <returns>
    /// The masked text, with the same length as the input. A null or empty
    /// <paramref name="text"/> is returned unchanged.
    /// </returns>
    public string FilterKeywords(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        var sb = new StringBuilder(text.Length);
        var current = root;
        for (var index = 0; index < text.Length; index++)
        {
            current = Step(current, text[index]);

            if (current.Results.Count > 0)
            {
                // Results[0] is the longest keyword ending here. Its earlier characters
                // were already appended, so drop them and write the whole mask at once.
                var longest = current.Results[0];
                var alreadyWritten = longest.Length - 1;
                sb.Remove(sb.Length - alreadyWritten, alreadyWritten);
                sb.Append('*', longest.Length);
            }
            else
            {
                sb.Append(text[index]);
            }
        }

        return sb.ToString();
    }
}
/// <summary>
/// A single search hit: the keyword that matched and the position it was found at.
/// </summary>
public struct KeywordSearchResult
{
    /// <summary>
    /// Placeholder value built from index -1 and an empty keyword.
    /// </summary>
    public static readonly KeywordSearchResult Empty = new KeywordSearchResult(-1, string.Empty);

    /// <summary>
    /// Position of the hit, as passed to the constructor.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// The keyword that was found.
    /// </summary>
    public string Keyword { get; }

    /// <summary>
    /// Creates a result for <paramref name="keyword"/> found at <paramref name="index"/>.
    /// </summary>
    /// <param name="index">Position of the hit.</param>
    /// <param name="keyword">The keyword that was found.</param>
    public KeywordSearchResult(int index, string keyword)
    {
        Keyword = keyword;
        Index = index;
    }
}
}
// Aho-Corasick algorithm implementation in C# (simple keyword filtering)
// Latest recommended article, published 2020-12-08 10:06:13