目录
一、前提
本文是学习 1975 年的论文《Efficient string matching: an aid to bibliographic search》时的笔记。论文看了很久,但是很多内容还是没看懂。
这里把其中一些算法的 Java 版本实现以及自己的理解记录下来。
这个算法用于文本匹配,在搜索中很常用。比如有关键词 he、his,输入文本 she,可以匹配出 he。
二、java实现(版本1)
节点类:
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * A single state of the pattern-matching machine.
 *
 * <p>Each node carries its state number, its outgoing goto transitions keyed by
 * character, and a reference to its failure state. The output table is stored
 * on a node as well; per the original author's note only the root node holds one.
 */
public class ApNode {
    /** Number identifying this state. */
    private int state;

    /** Goto transitions; a {@link TreeMap} by default, so keys iterate in sorted order. */
    private Map<Character, ApNode> go = new TreeMap<>();

    /** Failure state; stays {@code null} until {@link #setFail(ApNode)} is called. */
    private ApNode fail;

    /**
     * Output function, only present on the root node. It is {@code null} until
     * {@link #setOutput(Map)} is called.
     */
    private Map<ApNode, List<String>> output;

    /** @return the number of this state */
    public int getState() {
        return state;
    }

    /** @param state the number to assign to this state */
    public void setState(int state) {
        this.state = state;
    }

    /** @return the live (mutable) goto transition map of this state */
    public Map<Character, ApNode> getGo() {
        return go;
    }

    /** @param go the map that replaces this state's goto transitions */
    public void setGo(Map<Character, ApNode> go) {
        this.go = go;
    }

    /** @return the failure state, or {@code null} if none has been set */
    public ApNode getFail() {
        return fail;
    }

    /** @param fail the failure state to store */
    public void setFail(ApNode fail) {
        this.fail = fail;
    }

    /** @return the output table, or {@code null} if none has been set */
    public Map<ApNode, List<String>> getOutput() {
        return output;
    }

    /** @param output the output table to store on this node */
    public void setOutput(Map<ApNode, List<String>> output) {
        this.output = output;
    }

    /**
     * Renders this state and, recursively, every state reachable through goto
     * transitions as a tab-indented tree: the state number, then {@code "->"} if
     * there are children, then one {@code "\n\t<char>:<child>"} line per transition.
     *
     * @return the textual tree rooted at this state
     */
    @Override
    public String toString() {
        StringBuilder rendered = new StringBuilder();
        rendered.append(state);
        if (!go.isEmpty()) {
            rendered.append("->");
        }
        for (Map.Entry<Character, ApNode> transition : go.entrySet()) {
            rendered.append("\n\t").append(transition.getKey()).append(":");
            // Push the child's own lines one level deeper by doubling the first
            // tab that follows each line break.
            String childText = transition.getValue().toString();
            rendered.append(childText.replace("\n\t", "\n\t\t"));
        }
        return rendered.toString();
    }
}
节点帮助类:
import java.util.*;
/**
 * Static helpers for working with {@link ApNode} graphs: goto lookup, debug
 * dumps of the failure and output tables, and output-table maintenance.
 */
public class ApNodeHelper {
    /**
     * Looks up the goto transition of {@code state} on character {@code a}.
     *
     * @param root       the root state of the machine
     * @param state      the state whose transition is looked up
     * @param a          the input character
     * @param isCreating {@code true} while the goto graph is being built; a missing
     *                   transition then always yields {@code null}
     * @return the mapped target if {@code state} has an entry for {@code a};
     *         otherwise {@code root} when not creating and {@code state} is the
     *         root itself (missing root transitions loop back to the root);
     *         otherwise {@code null}, meaning there is no path
     */
    public static ApNode goFunc(ApNode root, ApNode state, char a, boolean isCreating) {
        Map<Character, ApNode> transitions = state.getGo();
        if (!transitions.containsKey(a)) {
            boolean loopsBackToRoot = !isCreating && state == root;
            return loopsBackToRoot ? root : null;
        }
        return transitions.get(a);
    }

    /**
     * Builds a two-line dump of the failure function for the subtree under
     * {@code node}: the first line lists state numbers in breadth-first order,
     * the second line lists the state number of each one's failure state, both
     * space-separated. {@code node} itself is included only when its state
     * number is not 0.
     *
     * @param node the state whose subtree is dumped
     * @return the state line, a line break, then the failure line
     */
    public static String outputFail(ApNode node) {
        StringBuilder stateLine = new StringBuilder();
        StringBuilder failLine = new StringBuilder();
        if (node.getState() != 0) {
            stateLine.append(node.getState()).append(" ");
            failLine.append(node.getFail().getState()).append(" ");
        }
        // Breadth-first walk over all descendants.
        Deque<ApNode> pending = new ArrayDeque<>(node.getGo().values());
        while (!pending.isEmpty()) {
            ApNode visited = pending.poll();
            stateLine.append(visited.getState()).append(" ");
            failLine.append(visited.getFail().getState()).append(" ");
            pending.addAll(visited.getGo().values());
        }
        stateLine.append('\n');
        stateLine.append(failLine);
        return stateLine.toString();
    }

    /**
     * Prints the output table stored on {@code root} to standard output, one
     * {@code <state>:[keywords]} line per entry.
     *
     * @param root the node holding the output table
     */
    public static void output(ApNode root) {
        for (Map.Entry<ApNode, List<String>> emission : root.getOutput().entrySet()) {
            int stateNo = emission.getKey().getState();
            String keywords = Arrays.toString(emission.getValue().toArray());
            System.out.println(stateNo + ":" + keywords);
        }
    }

    /**
     * Adds several keywords to the output list of state {@code c}. A
     * {@code null} or empty {@code keywords} list is ignored. When {@code c} has
     * no list yet, a copy of {@code keywords} becomes its list; otherwise only
     * keywords not already present are appended.
     *
     * @param output   the output table to update
     * @param c        the state that emits the keywords
     * @param keywords the keywords to add
     */
    public static void addOutput(Map<ApNode, List<String>> output, ApNode c, List<String> keywords) {
        if (keywords == null || keywords.isEmpty()) {
            return;
        }
        List<String> existing = output.get(c);
        if (existing == null) {
            List<String> copy = new ArrayList<>(keywords);
            output.put(c, copy);
            return;
        }
        for (String candidate : keywords) {
            if (!existing.contains(candidate)) {
                existing.add(candidate);
            }
        }
    }

    /**
     * Adds one keyword to the output list of state {@code c}, creating the list
     * when absent and skipping the keyword when it is already listed.
     *
     * @param output  the output table to update
     * @param c       the state that emits the keyword
     * @param keyword the keyword to add
     */
    public static void addOutput(Map<ApNode, List<String>> output, ApNode c, String keyword) {
        List<String> existing = output.get(c);
        if (existing != null) {
            if (!existing.contains(keyword)) {
                existing.add(keyword);
            }
            return;
        }
        List<String> created = new ArrayList<>();
        created.add(keyword);
        output.put(c, created);
    }
}
算法实现类:
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
public class AhoPaper {
/**
 * Enters one keyword into the goto graph rooted at {@code root}.
 *
 * <p>The longest prefix of {@code keyword} that already exists as a path from
 * the root is followed first; a new state is then created for every remaining
 * character, numbered via {@code newStateNo.incrementAndGet()}. Finally the
 * keyword is registered in the output table read from {@code root.getOutput()}
 * for the state where the keyword ends.
 *
 * <p>A keyword whose whole path already exists (a duplicate, or a prefix of an
 * earlier keyword) creates no new state; its existing end state simply receives
 * the keyword as output.
 *
 * @param root       the root state; its output table is passed to
 *                   {@code ApNodeHelper.addOutput}, which dereferences it, so it
 *                   must have been set beforehand
 * @param newStateNo counter holding the last state number handed out
 * @param keyword    the keyword to enter
 * @return the state at which {@code keyword} ends, or {@code root} when
 *         {@code keyword} is {@code null} or empty
 */
public static ApNode gotoFunc(ApNode root, AtomicInteger newStateNo, String keyword) {
    if (keyword == null || keyword.isEmpty()) {
        return root;
    }
    final int length = keyword.length();
    ApNode state = root;
    int j = 0;
    // Follow the existing path. The walk is bounded by the keyword length so a
    // keyword that is already fully present does not index past its last character.
    while (j < length) {
        ApNode goState = ApNodeHelper.goFunc(root, state, keyword.charAt(j), true);
        if (goState == null) {
            break;
        }
        state = goState;
        j++;
    }
    // Append one fresh state per character that has no path yet.
    for (int p = j; p < length; p++) {
        ApNode newState = new ApNode();
        newState.setState(newStateNo.incrementAndGet());
        state.getGo().put(keyword.charAt(p), newState);
        state = newState;
    }
    ApNodeHelper.addOutput(root.getOutput(), state, keyword);
    return state;
}
public static void failFunc(ApNode root) {
LinkedList<ApNode> queue = new LinkedList<>();
//第一层
for (ApNode node : root.getGo().values()) {
node.setFail(root);
queue.offer(node);
}
//剩余层
while (!queue.isEmpty()) {
ApNode parentNode = queue.poll();
for (Map.Entry<Character, ApNode> entry : parentNode.getGo().entrySet()) {
char a = entry.getKey();
ApNode currentNode = entry.getValue();
queue.offer(currentNode);
ApNode state = parentNode.getFail();
while (true) {
ApNode goState = ApNodeHelper.goFunc(root, state, a, false);
if (goState !=