在csdn上看到一篇DFA算法替换敏感词的全文替换文章,同时也看到网上不少文章说对于这种敏感词功能替换无疑DFA是一种效率不错的实现。
下面是基于DFA实现的Java代码:
/**
 * Sensitive-word filter backed by a character trie.
 *
 * <p>Words are registered with {@link #createTree(String...)}; afterwards
 * {@link #searchWord(String)} returns a copy of the input in which every
 * occurrence of a registered word is masked with {@code '*'} characters.
 * Matching is case-sensitive. Only complete words are masked: a prefix of a
 * registered word that is not itself registered is left untouched.
 */
public class DFA {
    /** Character written in place of each character of a matched word. */
    private static final char MASK = '*';

    /** Root of the trie; its own character is a placeholder and is never compared. */
    private final Node rootNode = new Node('R');

    /**
     * Masks every registered word found in {@code content}.
     *
     * <p>At each position the longest registered word starting there wins;
     * scanning then resumes right after the masked word. All scan state is
     * local, so the method can be called repeatedly on the same instance.
     *
     * @param content text to filter; {@code null} or empty input is returned unchanged
     * @return the filtered text, same length as the input
     */
    public String searchWord(String content) {
        if (content == null || content.isEmpty()) {
            return content;
        }
        char[] chars = content.toCharArray();
        StringBuilder masked = new StringBuilder(chars.length);
        int pos = 0;
        while (pos < chars.length) {
            int matchLength = longestMatchAt(chars, pos);
            if (matchLength == 0) {
                // No word starts here: keep the character and move on by one.
                masked.append(chars[pos]);
                pos++;
            } else {
                for (int k = 0; k < matchLength; k++) {
                    masked.append(MASK);
                }
                pos += matchLength;
            }
        }
        return masked.toString();
    }

    /**
     * Adds the given words to the trie.
     *
     * <p>Declared with varargs so that both {@code createTree()} and
     * {@code createTree(array)} compile. {@code null} and empty words are skipped.
     *
     * @param arr words to register; may be {@code null} or empty
     */
    public void createTree(String... arr) {
        if (arr == null) {
            return;
        }
        for (String str : arr) {
            if (str != null && !str.isEmpty()) {
                insertWord(str.toCharArray());
            }
        }
    }

    /** Returns the length of the longest registered word starting at {@code start}, or 0. */
    private int longestMatchAt(char[] chars, int start) {
        Node node = rootNode;
        int longest = 0;
        for (int i = start; i < chars.length; i++) {
            node = findNode(node, chars[i]);
            if (node == null) {
                break;
            }
            if (node.terminal) {
                // Remember the match but keep walking: a longer word may share this prefix.
                longest = i - start + 1;
            }
        }
        return longest;
    }

    /** Walks/extends the trie along {@code cs} and marks the last node as end-of-word. */
    private void insertWord(char[] cs) {
        Node node = rootNode;
        for (char c : cs) {
            Node child = findNode(node, c);
            if (child == null) {
                child = new Node(c);
                node.nodes.add(child);
            }
            node = child;
        }
        node.terminal = true;
    }

    /** Returns the child of {@code node} labelled {@code c}, or {@code null} if there is none. */
    private Node findNode(Node node, char c) {
        for (Node n : node.nodes) {
            if (n.c == c) {
                return n;
            }
        }
        return null;
    }

    /** One trie node: a character, an end-of-word marker and the child nodes. */
    private static class Node {
        final char c;
        /** True when the path from the root to this node spells a complete registered word. */
        boolean terminal;
        final List<Node> nodes = new ArrayList<Node>();

        Node(char c) {
            this.c = c;
        }
    }
}
下面是自己写的用Map实现的一段替换代码
/**
 * Replaces every occurrence of a sensitive word in {@code content} with
 * {@code '*'} characters, one per character of the matched word.
 *
 * <p>Words are grouped by their first character so that only candidates
 * that can start at the current position are compared. When several words
 * match at the same position the longest one is masked. Matching is
 * case-sensitive.
 *
 * @param arr     sensitive words; {@code null} and empty entries are ignored
 * @param content text to filter; {@code null} or empty input is returned unchanged
 * @return the filtered text, same length as the input
 */
public String replaceAllWord(String[] arr, String content) {
    if (content == null || content.isEmpty() || arr == null) {
        return content;
    }

    // Key: first character of a word; value: every word starting with that character.
    Map<Character, List<String>> wordsByFirstChar = new HashMap<Character, List<String>>();
    for (String str : arr) {
        if (str == null || str.isEmpty()) {
            continue;
        }
        Character key = str.charAt(0);
        List<String> list = wordsByFirstChar.get(key);
        if (list == null) {
            list = new ArrayList<String>();
            wordsByFirstChar.put(key, list);
        }
        list.add(str);
    }

    char[] conCharArry = content.toCharArray();
    int i = 0;
    while (i < conCharArry.length) {
        int matchLength = 0;
        List<String> candidates = wordsByFirstChar.get(conCharArry[i]);
        if (candidates != null) {
            for (String candidate : candidates) {
                // startsWith(prefix, offset) is bounds-safe: a word that would run
                // past the end of the content simply does not match.
                if (candidate.length() > matchLength && content.startsWith(candidate, i)) {
                    matchLength = candidate.length();
                }
            }
        }
        if (matchLength == 0) {
            i++;
            continue;
        }
        // Mask the word and leave i on the first character after it.
        for (int end = i + matchLength; i < end; i++) {
            conCharArry[i] = '*';
        }
    }
    return new String(conCharArry);
}
当我们在main函数中测试代码
public static void main(String[] args) {
String[] arr = {"tmd", "小姐", "DA"};
String content = "tmd ITeye文章版权属于作者,受法律保护 Da 小姐"
long start = System.currentTimeMillis();
for(int i = 0; i < 10000 ; i++){
DFA dfa = new DFA();
dfa.createTree(arr );
dfa.searchWord();
//dfa.replaseAllword(arr,content);
}
long end =System.currentTimeMillis();
System.out.println(end - start);
}
测试结果
DFA实现:45毫秒
Map实现:16毫秒
当把待替换的内容增至11220字、敏感词数组增至十来个时
DFA实现:16903毫秒
Map实现:4758毫秒
这里DFA效率不如下面的hash实现的疑惑还请各位指点下。
ps:
加一段备用的修改后代码(敏感词及其替换内容通过properties配置)
/**
 * Replaces every configured sensitive word in {@code content} with the
 * replacement stored for it in {@code properties} (key: the word,
 * value: its replacement).
 *
 * <p>Words are grouped by first character so only plausible candidates are
 * compared at each position. When several words match at the same position
 * the longest one is replaced. Characters that start no match are copied to
 * the output exactly once. Empty keys are ignored.
 *
 * @param content text to filter; {@code null} or empty input is returned unchanged
 * @return the filtered text
 */
public String replaceWordStr(String content) {
    if (content == null || content.isEmpty()) {
        return content;
    }

    // Key: first character of a word; value: every word starting with that character.
    Map<Character, List<String>> wordsByFirstChar = new HashMap<Character, List<String>>();
    for (Object rawKey : properties.keySet()) {
        String keyWord = rawKey.toString();
        if (keyWord.isEmpty()) {
            // An empty word would match everywhere; skip it.
            continue;
        }
        Character key = keyWord.charAt(0);
        List<String> list = wordsByFirstChar.get(key);
        if (list == null) {
            list = new ArrayList<String>();
            wordsByFirstChar.put(key, list);
        }
        list.add(keyWord);
    }

    StringBuilder sb = new StringBuilder(content.length());
    int i = 0;
    while (i < content.length()) {
        String matched = null;
        List<String> candidates = wordsByFirstChar.get(content.charAt(i));
        if (candidates != null) {
            for (String candidate : candidates) {
                boolean longer = matched == null || candidate.length() > matched.length();
                // startsWith(prefix, offset) is bounds-safe: a word that would run
                // past the end of the content simply does not match.
                if (longer && content.startsWith(candidate, i)) {
                    matched = candidate;
                }
            }
        }
        if (matched == null) {
            sb.append(content.charAt(i));
            i++;
        } else {
            sb.append(properties.get(matched));
            i += matched.length();
        }
    }
    return sb.toString();
}