1.MutiPatternParser.java
package com.mutiplepatternmatch;
import java.util.Vector;
/**
*
* @author Daniel
*
*/
public class MutiPatternParser {
public MutiPatternParser() {
}
private boolean initFlag = false;
private UnionPatternSet unionPatternSet = new UnionPatternSet();
private int maxIndex = (int) java.lang.Math.pow(2, 16);
private int shiftTable[] = new int[maxIndex];
public Vector<AtomicPattern> hashTable[] = new Vector[maxIndex];
private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();
public boolean addFilterKeyWord(String keyWord, int level) {
if (initFlag == true)
return false;
UnionPattern unionPattern = new UnionPattern();
String []strArray = keyWord.split(" ");
for (int i = 0; i < strArray.length; i++) {
Pattern pattern = new Pattern(strArray[i]);
AtomicPattern atomicPattern = new AtomicPattern(pattern);
unionPattern.addNewAtomicPattrn(atomicPattern);
unionPattern.setLevel(level);
atomicPattern.setBelongUnionPattern(unionPattern);
}
tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
return true;
}
private boolean isValidChar(char ch) {
if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z'))
return true;
if ((ch >= 0x4e00 && ch <= 0x7fff) || (ch >= 0x8000 && ch <= 0x952f))
return true;//简体中文汉字编码
return false;
}
public String parse(String content, Vector<Integer> levelSet) {
if (initFlag == false)
init();
Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
String preContent = preConvert(content);
for (int i = 0; i < preContent.length();) {
char checkChar = preContent.charAt(i);
if (shiftTable[checkChar] == 0) {
Vector<AtomicPattern> tmpAps = new Vector<AtomicPattern>();
tmpAps = findMathAps(preContent.substring(0, i + 1),
hashTable[checkChar]);
aps.addAll(tmpAps);
i++;
} else
i = i + shiftTable[checkChar];
}
parseAtomicPatternSet(aps, levelSet);
return content;
}
private void parseAtomicPatternSet(Vector<AtomicPattern> aps,
Vector<Integer> levelSet) {
while (aps.size() > 0) {
AtomicPattern ap = aps.get(0);
UnionPattern up = ap.belongUnionPattern;
if (up.isIncludeAllAp(aps) == true) {
levelSet.add(new Integer(up.getLevel()));
}
aps.remove(0);
}
}
private Vector<AtomicPattern> findMathAps(String src,
Vector<AtomicPattern> destAps) {
Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
for (int i = 0; i < destAps.size(); i++) {
AtomicPattern ap = destAps.get(i);
if (ap.findMatchInString(src) == true)
aps.add(ap);
}
return aps;
}
private String preConvert(String content) {
String retStr = new String();
for (int i = 0; i < content.length(); i++) {
char ch = content.charAt(i);
if (this.isValidChar(ch) == true) {
retStr = retStr + ch;
}
}
return retStr;
}
// shift table and hash table of initialize
private void init() {
initFlag = true;
for (int i = 0; i < maxIndex; i++)
hashTable[i] = new Vector<AtomicPattern>();
shiftTableInit();
hashTableInit();
}
public void clear() {
tmpUnionPatternSet.clear();
initFlag = false;
}
private void shiftTableInit() {
for (int i = 0; i < maxIndex; i++)
shiftTable[i] = 2;
Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
for (int i = 0; i < upSet.size(); i++) {
Vector<AtomicPattern> apSet = upSet.get(i).getSet();
for (int j = 0; j < apSet.size(); j++) {
AtomicPattern ap = apSet.get(j);
Pattern pattern = ap.getPattern();
if (shiftTable[pattern.charAtEnd(1)] != 0)
shiftTable[pattern.charAtEnd(1)] = 1;
if (shiftTable[pattern.charAtEnd(0)] != 0)
shiftTable[pattern.charAtEnd(0)] = 0;
}
}
}
private void hashTableInit() {
Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
for (int i = 0; i < upSet.size(); i++) {
Vector<AtomicPattern> apSet = upSet.get(i).getSet();
for (int j = 0; j < apSet.size(); j++) {
AtomicPattern ap = apSet.get(j);
Pattern pattern = ap.getPattern();
if (pattern.charAtEnd(0) != 0) {
hashTable[pattern.charAtEnd(0)].add(ap);
}
}
}
}
}
class Pattern { // string
Pattern(String str) {
this.str = str;
}
public char charAtEnd(int index) {
if (str.length() > index) {
return str.charAt(str.length() - index - 1);
} else
return 0;
}
public String str;
public String getStr() {
return str;
};
}
class AtomicPattern {
public boolean findMatchInString(String str) {
if (this.pattern.str.length() > str.length())
return false;
int beginIndex = str.length() - this.pattern.str.length();
String eqaulLengthStr = str.substring(beginIndex);
if (this.pattern.str.equalsIgnoreCase(eqaulLengthStr))
return true;
return false;
}
AtomicPattern(Pattern pattern) {
this.pattern = pattern;
};
private Pattern pattern;
public UnionPattern belongUnionPattern;
public UnionPattern getBelongUnionPattern() {
return belongUnionPattern;
}
public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
this.belongUnionPattern = belongUnionPattern;
}
public Pattern getPattern() {
return pattern;
}
public void setPattern(Pattern pattern) {
this.pattern = pattern;
}
}
class SameAtomicPatternSet {
SameAtomicPatternSet() {
SAPS = new Vector<AtomicPattern>();
};
public Vector<AtomicPattern> SAPS;
}
class UnionPattern { // union string
UnionPattern() {
this.apSet = new Vector<AtomicPattern>();
}
public Vector<AtomicPattern> apSet;
public void addNewAtomicPattrn(AtomicPattern ap) {
this.apSet.add(ap);
}
public Vector<AtomicPattern> getSet() {
return apSet;
}
public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
if (apSet.size() > inAps.size())
return false;
for (int i = 0; i < apSet.size(); i++) {
AtomicPattern ap = apSet.get(i);
if (isInAps(ap, inAps) == false)
return false;
}
return true;
}
private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
for (int i = 0; i < inAps.size(); i++) {
AtomicPattern destAp = inAps.get(i);
if (ap.getPattern().str.equalsIgnoreCase(destAp.getPattern().str) == true)
return true;
}
return false;
}
public void setLevel(int level) {
this.level = level;
}
public int getLevel() {
return this.level;
}
private int level;
}
class UnionPatternSet { // union string set
UnionPatternSet() {
this.unionPatternSet = new Vector<UnionPattern>();
}
public void addNewUnionPattrn(UnionPattern up) {
this.unionPatternSet.add(up);
}
public Vector<UnionPattern> unionPatternSet;
public Vector<UnionPattern> getSet() {
return unionPatternSet;
}
public void clear() {
unionPatternSet.clear();
}
}
2.TxtReader.java
package com.mutiplepatternmatch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
/**
*
* @author Daniel
*
*/
public class TxtReader implements Serializable{
public TxtReader() {
super();
}
public static BufferedReader keywordReader(String fileName){
File file=new File(fileName);
BufferedReader br=null;
try {
FileInputStream in=new FileInputStream(file);
InputStreamReader inReader = new InputStreamReader(in, "UTF-8");
br=new BufferedReader(inReader);
} catch (FileNotFoundException e) {
System.out.println("你想加载的文件没有找到!!!");
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
System.out.println("你指定的编码类型不支持哦!!!");
e.printStackTrace();
}
return br;
}
}
3.FilterTest.java
package com.mutiplepatternmatch;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Vector;
public class FilterTest {
public static void main(String args[]) {
MutiPatternParser filterEngine = new MutiPatternParser();
BufferedReader brKeyword = TxtReader
.keywordReader("file/illegalkeyword.txt");
BufferedReader brArticle = TxtReader.keywordReader("file/article.txt");
String keyword = null;
String article = null;
long startTime=System.currentTimeMillis();
StringBuffer buffer=new StringBuffer();
Vector<Integer> levelSet = new Vector<Integer>();
try {
while ((keyword = brKeyword.readLine()) != null) {
filterEngine.addFilterKeyWord(keyword, 1);
}
while ((article = brArticle.readLine()) != null) {
buffer.append(article);
}
} catch (IOException e) {
// TODO Auto-generated catch block
System.out.println("读取文件IO异常!!!");
e.printStackTrace();
}
String content=filterEngine.parse(buffer.toString(), levelSet);
System.out.println("article.txt test:levelSet.size="
+ levelSet.size()+"\n this article's content: "+content);
long endTime=System.currentTimeMillis();
System.out.println("Cost:"+(endTime-startTime)+"ms");
/* 清除过滤算法引擎,带来的后果是引擎中将找不到任何的字符,测试代码,用来重载关键字时使用! */
filterEngine.clear();
levelSet.clear();
filterEngine.parse("我我我 你你你 他他他", levelSet);
System.out.println("test:levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符
levelSet.clear();
filterEngine.parse("a'b'c a b c", levelSet);
System.out.println("test:tz levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符*/
}
}
ps:09年网上掀起了敏感字词过滤热,一时想到就动手干起来了,此算法借鉴经典的WM算法改写而成,不足之处还需优化和改进,测试时只对屏蔽的illegal keywords 进行了个数统计!
还有两个txt文件不能发出来(illegalkeyword.txt,article.txt)这两个文件包含收集的大量脏字和敏感不法字词,故不能发布!