Erlang实现的AC自动机,通过预处理屏蔽词能够实现接近O(n)的时间复杂度
实现了屏蔽词的检查、匹配、替换功能
先挖个坑,贴个代码,后续有时间再讲讲aho_corasick算法的原理
在源文件所在目录打开命令行。由于模块统一使用UTF-8编码,而Windows命令行默认使用GBK,需要先执行 chcp 65001
切换到UTF-8编码,然后执行命令 erl
启动Erlang shell,在shell中调用
c(aho_corasick).
进行编译(Erlang shell中的表达式需要以句点结尾)
aho_corasick:test().
运行测试用例,可以看到题目和作者名已被成功屏蔽
%%%-------------------------------------------------------------------
%%% @author Huangcanxin cx2298545090@outlook.com
%%% @copyright (C) 2022
%%% @doc
%%% AC自动机
%%% @end
%%% Created : 23. 6月 2022 10:22
%%%-------------------------------------------------------------------
-module(aho_corasick).
%% Node id of the trie root. add_word also passes it as the default to
%% Mod:get/3, so a result equal to it means "no transition stored".
-define(AC_TRIE_ROOT, 0).
%% State of the Aho-Corasick automaton. The tables are opaque to this record;
%% they are read and written through the module stored in `mod`.
-record(ac_trie, {
mod, % module called as Mod:get/3, Mod:set/3 and Mod:foldl/3 to access the tables
success, % success (goto) transition table, keyed by {Node, Char}
failure, % failure (fallback) table
output, % output table
child, % per-node list of the characters that have an outgoing transition
next_node = 1 % id that add_word assigns to the next node it creates
}).
%% API
-export([build/2, rebuild/2, rebuild/3]). % build the trie
-export([add_word/2, check/2, match/2, replace/3, replace/4, make_child/1, append_word/2]).
-export([init_trie/0, get/3, set/3, clean/1, foldl/3, erase/2]). % ac_trie backed by maps
-export([merge/1, sub/2]).
-export([test/0]).
%%%% -----------------------------------
%%%% Add a keyword, creating prefix-tree (trie) nodes as needed
%%%% -----------------------------------
%% To save space and speed up replacement, the output table stores the
%% keyword's size (bit_size(Word), i.e. measured in bits) rather than the
%% keyword itself.
add_word(Word, AcTrie) ->
    add_word(Word, bit_size(Word), ?AC_TRIE_ROOT, AcTrie).

%% add_word(Remaining, OutputWord, Node, AcTrie) -> AcTrie
%%   Remaining  - UTF-8 binary still to be inserted
%%   OutputWord - value recorded in the output table at the keyword's last node
%%   Node       - node reached so far
%% Keyword fully consumed, but there is nothing to record: leave the trie as is.
add_word(<<>>, OutputWord, _Node, AcTrie) when OutputWord == 0; OutputWord == <<>> ->
    AcTrie;
%% Keyword fully consumed: mark the final node in the output table.
add_word(<<>>, OutputWord, Node, AcTrie) ->
    #ac_trie{mod = Mod, output = Output} = AcTrie,
    AcTrie#ac_trie{output = Mod:set(Node, OutputWord, Output)};
%% Consume one code point: follow the existing transition or create a new node.
add_word(<<Char/utf8, Left/binary>>, OutputWord, Node,
         #ac_trie{mod = Mod, success = Success, child = Child, next_node = NextNode} = AcTrie) ->
    Edge = {Node, Char},
    case Mod:get(Edge, Success, ?AC_TRIE_ROOT) of
        ?AC_TRIE_ROOT ->
            %% No transition yet: allocate NextNode and register Char as a
            %% child of Node.
            Success1 = Mod:set(Edge, NextNode, Success),
            Siblings = Mod:get(Node, Child, []),
            Child1 = Mod:set(Node, [Char | Siblings], Child),
            add_word(Left, OutputWord, NextNode, AcTrie#ac_trie{
                success = Success1,
                child = Child1,
                next_node = NextNode + 1
            });
        Existing ->
            add_word(Left, OutputWord, Existing, AcTrie)
    end.
%%%% -----------------------------------
%%%% Build the child table from the success table
%%%% -----------------------------------
%% Folds over every {{Node, Char}, _Target} entry of the success table and
%% prepends Char to the list stored for Node in the child table. The fold
%% starts from the trie's current child table.
make_child(AcTrie) ->
    #ac_trie{mod = Mod, child = Child0, success = Success} = AcTrie,
    AddEdge =
        fun({{Parent, Char}, _Target}, Acc) ->
            Siblings = Mod:get(Parent, Acc, []),
            Mod:set(Parent, [Char | Siblings], Acc)
        end,
    AcTrie#ac_trie{child = Mod:foldl(AddEdge, Child0, Success)}.
%%%% -----------------------------------
%%%% 需要确保child存在
%%%% 追加关键词 会重新建立failure 需要添加多个词的话最好一次性插入
%%%% -----------------------------------
append_word([], AcTrie) ->
#ac_trie{
failure = Failure, mod = Mod} = AcTrie,