参考资料:http://bbs.9ria.com/thread-226068-1-1.html
/**
 * One node of the sensitive-word trie.
 *
 * Each node stores the string it was created with (`value`), a link to the
 * node it was added to (`parent`) and its own children keyed by string.
 */
class TreeNode {
    /**
     * Children of this node, keyed by the string passed to `addChild`.
     * The object has a null prototype so that keys such as "length", "push"
     * or "constructor" are plain data and never resolve to built-in members.
     */
    private data: { [key: string]: TreeNode } = Object.create(null);
    /**
     * Whether this node is the last character of a sensitive word.
     * In a trie built from whole words a leaf is always a word end;
     * an inner node may or may not be one.
     */
    public isEnd: boolean = false;
    /** Node this one was attached to by `addChild`; left unset on a root. */
    public parent: TreeNode;
    /**
     * String held by this node. Defaults to "" so that a root which never
     * had a value assigned contributes nothing to `getFullWord`.
     */
    public value: string = "";

    public constructor() {
    } //end of Function

    /**
     * Looks up a direct child.
     * @param name key the child was added under
     * @returns the child node, or `undefined` when there is none
     */
    public getChild(name: string): TreeNode {
        return this.data[name];
    } //end of Function

    /**
     * Creates a new child under `char` and links it back to this node.
     * An existing child stored under the same key is replaced.
     * @param char key (and value) of the new child
     * @returns the newly created child
     */
    public addChild(char: string): TreeNode {
        const node = new TreeNode();
        node.value = char;
        node.parent = this;
        this.data[char] = node;
        return node;
    } //end of Function

    /**
     * Rebuilds the string spelled by the path from the root down to this
     * node by prepending the value of every ancestor.
     * @returns concatenated values, root first
     */
    public getFullWord(): string {
        let word = this.value;
        for (let node = this.parent; node; node = node.parent) {
            word = node.value + word;
        }
        return word;
    } //end of Function

    /**
     * Whether this node has no children.
     */
    public get isLeaf(): boolean {
        return Object.keys(this.data).length === 0;
    }
}
/**
 * Minimal string-keyed lookup table for trie nodes.
 *
 * The backing store stays an Array (used as a property bag) so that code
 * which enumerates `dic` with `for...in` keeps working unchanged.
 */
class Dictionary {
    /** Backing store; entries are stored as properties named by their key. */
    public dic: Array<TreeNode>;

    public constructor() {
        this.dic = [];
    }

    /**
     * Returns the node stored under `name`.
     * @param name key used in `SetName`
     * @returns the stored node, or `undefined` when nothing was stored
     */
    public GetName(name: string): TreeNode {
        // propertyIsEnumerable is true only for own, enumerable properties, so
        // inherited Array members ("push", "constructor", ...) and the array's
        // own non-enumerable "length" are never handed out as if they were nodes.
        return Object.prototype.propertyIsEnumerable.call(this.dic, name)
            ? this.dic[name]
            : undefined;
    }

    /**
     * Stores `src` under `name`, replacing any previous entry.
     * NOTE: because the store is an Array, the key "length" cannot be used;
     * assigning a node to it throws a RangeError.
     * @param name key to store under
     * @param src node to store
     */
    public SetName(name: string, src: TreeNode) {
        this.dic[name] = src;
    }
}
/**
 * Trie-based sensitive-word filter, exposed as a lazily created singleton.
 *
 * Call `regSensitiveWords` once to build the trie, then use
 * `replaceSensitiveWord` or `containsBadWords` on the text to check.
 */
class SensitiveWordFilter {
    private static instance: SensitiveWordFilter;

    /** Root of the word trie; unset until `regSensitiveWords` has been called. */
    public treeRoot: TreeNode;

    public constructor() {
    }

    /** Returns the shared filter instance, creating it on first use. */
    public static GetInstance(): SensitiveWordFilter {
        if (!this.instance) {
            this.instance = new SensitiveWordFilter();
        }
        return this.instance;
    }

    /**
     * Builds the sensitive-word trie from `words`, replacing any trie built by
     * an earlier call. This preprocessing costs more than a lookup but is
     * meant to run once at start-up.
     * @param words sensitive words; empty entries are ignored
     */
    public regSensitiveWords(words: Array<string>): void {
        const root = new TreeNode();
        root.value = "";
        for (let i = 0; i < words.length; i++) {
            const word = words[i];
            if (!word) {
                // An empty entry would flag the root itself as a word end.
                continue;
            }
            let branch = root;
            for (let c = 0; c < word.length; c++) {
                const char = word.charAt(c);
                // Reuse the existing branch for a shared prefix, otherwise grow a new one.
                branch = branch.getChild(char) || branch.addChild(char);
            }
            branch.isEnd = true;
        }
        this.treeRoot = root;
    } //end of Function

    /**
     * Builds the mask for a matched word.
     * @param len number of characters to mask
     * @returns a string of `len` asterisks
     */
    private getReplaceWord(len: number): string {
        let replaceWord = "";
        for (let i = 0; i < len; i++) {
            replaceWord += "*";
        }
        return replaceWord;
    }

    /**
     * Walks the trie along `text` starting at index `start`.
     * @param text text being scanned
     * @param start index of the first character to match
     * @returns index of the last character of the longest registered word
     *          that begins at `start`, or -1 when no word begins there
     */
    private findLongestMatchEnd(text: string, start: number): number {
        let node: TreeNode = this.treeRoot;
        let lastEnd = -1;
        for (let i = start; i < text.length; i++) {
            node = node.getChild(text.charAt(i));
            if (!node) {
                break;
            }
            if (node.isEnd) {
                // Keep going: a longer word may share this prefix.
                lastEnd = i;
            }
        }
        return lastEnd;
    }

    /**
     * Returns `dirtyWords` with every registered sensitive word masked by
     * asterisks, one per character, so the length of the text is preserved.
     * The text is scanned left to right; at each position the longest word
     * starting there wins, and scanning resumes right after it.
     * @param dirtyWords text to clean
     * @returns the masked text; the input is returned unchanged when it is
     *          empty or when no words have been registered
     */
    public replaceSensitiveWord(dirtyWords: string): string {
        if (!this.treeRoot || !dirtyWords) {
            return dirtyWords;
        }
        let cleaned = "";
        let start = 0;
        while (start < dirtyWords.length) {
            const matchEnd = this.findLongestMatchEnd(dirtyWords, start);
            if (matchEnd === -1) {
                cleaned += dirtyWords.charAt(start);
                start++;
            } else {
                // Mask exactly the matched span instead of searching the text
                // for the word again, so the right occurrence is replaced.
                cleaned += this.getReplaceWord(matchEnd - start + 1);
                start = matchEnd + 1;
            }
        }
        return cleaned;
    }

    /**
     * Tells whether `dirtyWords` contains at least one registered sensitive word.
     * @param dirtyWords text to check
     * @returns true when a sensitive word occurs anywhere in the text; false
     *          for empty text or when no words have been registered
     */
    public containsBadWords(dirtyWords: string): boolean {
        if (!this.treeRoot || !dirtyWords) {
            return false;
        }
        for (let start = 0; start < dirtyWords.length; start++) {
            if (this.findLongestMatchEnd(dirtyWords, start) !== -1) {
                return true;
            }
        }
        return false;
    }
}
我只是把原实现搬运过来,改成了 Egret(TypeScript)能用的版本。
主要思路和代码都来自上面的参考资料。
demo
// Demo: register three sample words, then print a sentence before and after filtering.
const filter = SensitiveWordFilter.GetInstance();
filter.regSensitiveWords(["敏感", "词", "和谐"]);

const sample = "这些都是被和谐的敏感词啊哈哈哈";
console.log(sample);

const masked = filter.replaceSensitiveWord(sample);
console.log(masked);
以下是替换前(before)和替换后(after)的输出对比:
before:这些都是被和谐的敏感词啊哈哈哈
after:这些都是被**的***啊哈哈哈
当然,实际的敏感词库应该是一个相当长的数组,这里只是简单测试一下。