敏感词过滤代码

/**
 * One keyword fragment together with a back-reference to the {@link UnionPattern}
 * it was registered under.
 */
public class AtomicPattern {

    /** Text of this fragment. */
    private Pattern pattern;

    /** Union pattern this fragment belongs to; assigned through the setter or directly. */
    public UnionPattern belongUnionPattern;

    /**
     * Wraps the given fragment text.
     *
     * @param pattern the fragment text holder
     */
    AtomicPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * Tells whether {@code str} ends with this fragment, comparing case-insensitively.
     *
     * @param str text whose tail is inspected
     * @return {@code true} when the last characters of {@code str} equal the fragment
     *         (ignoring case); {@code false} when they differ or {@code str} is shorter
     *         than the fragment
     */
    public boolean findMatchInString(String str) {
        final String needle = this.pattern.str;
        final int tailStart = str.length() - needle.length();
        if (tailStart < 0) {
            // The text is shorter than the fragment, so it cannot end with it.
            return false;
        }
        return needle.equalsIgnoreCase(str.substring(tailStart));
    }

    /** @return the fragment text holder */
    public Pattern getPattern() {
        return this.pattern;
    }

    /** @param pattern the new fragment text holder */
    public void setPattern(Pattern pattern) {
        this.pattern = pattern;
    }

    /** @return the union pattern this fragment belongs to */
    public UnionPattern getBelongUnionPattern() {
        return this.belongUnionPattern;
    }

    /** @param belongUnionPattern the union pattern this fragment belongs to */
    public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
        this.belongUnionPattern = belongUnionPattern;
    }
}
/**
 * Keyword filter that scans a text once, using a per-character shift table to skip
 * positions where no registered fragment can end and a per-character bucket table to
 * find the fragments that may end at the current position.
 *
 * <p>Keywords are registered with {@link #addFilterKeyWord(String, int)}; the tables are
 * built lazily on the first {@link #parse(String, Vector)} call. After that, new keywords
 * are rejected until {@link #clear()} is called.
 */
public class MutiPatternParser {

    /** One table slot for every possible {@code char} value. */
    private static final int TABLE_SIZE = 1 << 16;

    /** Largest skip ever taken; only the last two characters of a fragment are indexed. */
    private static final int MAX_SHIFT = 2;

    /** {@code true} once the tables have been built from the registered keywords. */
    private boolean initFlag = false;

    /** Length of the longest registered fragment; bounds the window handed to the matcher. */
    private int maxPatternLength = 0;

    /** For each (case-folded) character: how far the scanner may advance after seeing it. */
    private int shiftTable[] = new int[TABLE_SIZE];

    /** For each (case-folded) character: the fragments whose last character it is. */
    @SuppressWarnings("unchecked")
    public Vector<AtomicPattern> hashTable[] = new Vector[TABLE_SIZE];

    /** Keywords registered so far. */
    private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();

    /**
     * Registers a keyword. The keyword is split on single spaces and every non-empty
     * piece becomes one fragment of the same union pattern.
     *
     * @param keyWord the keyword, optionally made of several space-separated fragments
     * @param level   the level reported when the keyword is found
     * @return {@code true} if the keyword was registered; {@code false} if the tables are
     *         already built, or if {@code keyWord} is {@code null} or has no non-empty fragment
     */
    public boolean addFilterKeyWord(String keyWord, int level) {
        if (initFlag || keyWord == null) {
            return false;
        }
        UnionPattern unionPattern = new UnionPattern();
        unionPattern.setLevel(level);
        for (String fragment : keyWord.split(" ")) {
            if (fragment.length() == 0) {
                // Doubled spaces yield empty pieces; an empty fragment can never be found
                // in the text and would make the whole keyword unmatchable.
                continue;
            }
            AtomicPattern atomicPattern = new AtomicPattern(new Pattern(fragment));
            atomicPattern.setBelongUnionPattern(unionPattern);
            unionPattern.addNewAtomicPattrn(atomicPattern);
        }
        if (unionPattern.getSet().isEmpty()) {
            return false;
        }
        tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
        return true;
    }

    /**
     * Decides whether a content character is kept for scanning: ASCII digits, ASCII
     * letters and the CJK Unified Ideographs block (U+4E00..U+9FFF).
     */
    private boolean isValidChar(char ch) {
        if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
            return true;
        }
        // The whole ideograph block is accepted; stopping at U+952F would drop common
        // characters such as U+95E8 and U+9A6C from the content.
        return ch >= 0x4e00 && ch <= 0x9fff;
    }

    /**
     * Folds a character the same way {@link String#equalsIgnoreCase(String)} compares
     * characters, so that table lookups agree with the fragment comparison.
     */
    private static char foldCase(char ch) {
        return Character.toLowerCase(Character.toUpperCase(ch));
    }

    /**
     * Scans {@code content} and appends to {@code levelSet} the level of every keyword
     * occurrence accepted by the fragment check. Builds the tables on first use.
     *
     * @param content  the text to scan; {@code null} is treated as empty
     * @param levelSet receives one entry per accepted keyword occurrence
     * @return {@code content}, unchanged
     */
    public String parse(String content, Vector<Integer> levelSet) {
        if (!initFlag) {
            init();
        }
        if (content == null) {
            return content;
        }
        Vector<AtomicPattern> matched = new Vector<AtomicPattern>();
        String preContent = preConvert(content);
        int i = 0;
        while (i < preContent.length()) {
            char key = foldCase(preContent.charAt(i));
            int shift = shiftTable[key];
            if (shift == 0) {
                // Some fragment ends with this character. The matcher only inspects the
                // tail of its argument, so a window as long as the longest fragment gives
                // the same answer as the whole prefix without copying it.
                int end = i + 1;
                int start = Math.max(0, end - maxPatternLength);
                matched.addAll(findMathAps(preContent.substring(start, end), hashTable[key]));
                i++;
            } else {
                i += shift;
            }
        }
        parseAtomicPatternSet(matched, levelSet);
        return content;
    }

    /**
     * Consumes {@code aps} front to back. For each fragment, if its union pattern accepts
     * the fragments still in the list, the union's level is appended to {@code levelSet}.
     * The list is empty when this method returns.
     */
    private void parseAtomicPatternSet(Vector<AtomicPattern> aps,
            Vector<Integer> levelSet) {
        while (aps.size() > 0) {
            AtomicPattern ap = aps.get(0);
            UnionPattern up = ap.belongUnionPattern;
            if (up.isIncludeAllAp(aps)) {
                levelSet.add(Integer.valueOf(up.getLevel()));
            }
            aps.remove(0);
        }
    }

    /** Returns the fragments from {@code destAps} that {@code src} ends with. */
    private Vector<AtomicPattern> findMathAps(String src,
            Vector<AtomicPattern> destAps) {
        Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
        for (AtomicPattern ap : destAps) {
            if (ap.findMatchInString(src)) {
                aps.add(ap);
            }
        }
        return aps;
    }

    /** Returns {@code content} with every character rejected by {@link #isValidChar} removed. */
    private String preConvert(String content) {
        StringBuilder kept = new StringBuilder(content.length());
        for (int i = 0; i < content.length(); i++) {
            char ch = content.charAt(i);
            if (isValidChar(ch)) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /** Builds the shift table and the bucket table from the registered keywords. */
    private void init() {
        initFlag = true;
        for (int i = 0; i < TABLE_SIZE; i++) {
            hashTable[i] = new Vector<AtomicPattern>();
        }
        shiftTableInit();
        hashTableInit();
    }

    /** Drops all registered keywords and allows new ones to be added again. */
    public void clear() {
        tmpUnionPatternSet.clear();
        initFlag = false;
    }

    /** Flattens the registered keywords into their fragments, skipping null or empty text. */
    private Vector<AtomicPattern> collectAtomicPatterns() {
        Vector<AtomicPattern> all = new Vector<AtomicPattern>();
        for (UnionPattern up : tmpUnionPatternSet.getSet()) {
            for (AtomicPattern ap : up.getSet()) {
                String text = ap.getPattern().getStr();
                if (text != null && text.length() > 0) {
                    all.add(ap);
                }
            }
        }
        return all;
    }

    /**
     * Fills the shift table: 0 for a character that ends some fragment, 1 for a character
     * that is second-to-last in some fragment, and the default shift for everything else.
     * Also records the longest fragment length.
     */
    private void shiftTableInit() {
        Vector<AtomicPattern> all = collectAtomicPatterns();

        int shortest = Integer.MAX_VALUE;
        int longest = 0;
        for (AtomicPattern ap : all) {
            int len = ap.getPattern().getStr().length();
            shortest = Math.min(shortest, len);
            longest = Math.max(longest, len);
        }
        maxPatternLength = longest;

        // Skipping two positions is only safe when every fragment has a second-to-last
        // character; with a one-character fragment registered, the very next position
        // could already be a match, so the default skip must drop to 1.
        int defaultShift = Math.min(MAX_SHIFT, shortest);
        for (int i = 0; i < TABLE_SIZE; i++) {
            shiftTable[i] = defaultShift;
        }

        for (AtomicPattern ap : all) {
            Pattern pattern = ap.getPattern();
            if (pattern.getStr().length() >= 2) {
                int secondLast = foldCase(pattern.charAtEnd(1));
                if (shiftTable[secondLast] > 1) {
                    shiftTable[secondLast] = 1;
                }
            }
            shiftTable[foldCase(pattern.charAtEnd(0))] = 0;
        }
    }

    /** Files every fragment under the bucket of its (case-folded) last character. */
    private void hashTableInit() {
        for (AtomicPattern ap : collectAtomicPatterns()) {
            hashTable[foldCase(ap.getPattern().charAtEnd(0))].add(ap);
        }
    }
}
/**
 * Holder for the text of one keyword fragment, with access to its characters
 * counted from the end.
 */
public class Pattern {

    /** The fragment text. */
    public String str;

    /**
     * @param str the fragment text
     */
    Pattern(String str) {
        this.str = str;
    }

    /**
     * Returns the character {@code index} positions before the last one
     * ({@code 0} is the last character, {@code 1} the one before it, and so on).
     *
     * @param index distance from the end of the text
     * @return the character at that position, or {@code 0} when the position does not
     *         exist (text is {@code null}, {@code index} is negative, or {@code index}
     *         is not smaller than the text length)
     */
    public char charAtEnd(int index) {
        // A negative index would address a position past the end of the text, so it is
        // reported as "no such character" like any other out-of-range request.
        if (str == null || index < 0 || index >= str.length()) {
            return 0;
        }
        return str.charAt(str.length() - 1 - index);
    }

    /** @return the fragment text */
    public String getStr() {
        return str;
    }
}
/**
 * Plain holder for a list of {@link AtomicPattern} objects.
 */
public class SameAtomicPatternSet {

    /** The held patterns; starts out empty. */
    public Vector<AtomicPattern> SAPS = new Vector<AtomicPattern>();

    /** Creates a holder with an empty list. */
    SameAtomicPatternSet() {
    }
}
/**
 * Helper for opening text files as UTF-8 readers.
 */
public class TxtReader {

    public TxtReader() {
    }

    /**
     * Opens {@code fileName} for reading, decoding its bytes as UTF-8.
     *
     * @param fileName path of the file to open
     * @return a buffered reader over the file, or {@code null} when the file cannot be
     *         found or the encoding is not supported (a message and the stack trace are
     *         printed in that case)
     */
    public static BufferedReader keywordReader(String fileName) {
        try {
            FileInputStream rawBytes = new FileInputStream(new File(fileName));
            return new BufferedReader(new InputStreamReader(rawBytes, "UTF-8"));
        } catch (FileNotFoundException e) {
            System.out.println("你想加载的文件没有找到!!!");
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            System.out.println("你指定的编码类型不支持哦!!!");
            e.printStackTrace();
        }
        // Reached only after one of the failures above was reported.
        return null;
    }
}
/**
 * A keyword made of one or more {@link AtomicPattern} fragments, tagged with a level.
 */
public class UnionPattern {

    /** Level attached to this keyword. */
    private int level;

    /** Fragments that make up this keyword. */
    public Vector<AtomicPattern> apSet;

    /** Creates a keyword with no fragments yet. */
    UnionPattern() {
        apSet = new Vector<AtomicPattern>();
    }

    /** Appends a fragment to this keyword. */
    public void addNewAtomicPattrn(AtomicPattern ap) {
        apSet.add(ap);
    }

    /** @return the fragments of this keyword (the internal list itself) */
    public Vector<AtomicPattern> getSet() {
        return apSet;
    }

    /**
     * Checks whether every fragment of this keyword has a counterpart with the same text
     * (ignoring case) in {@code inAps}.
     *
     * @param inAps the fragments to search in
     * @return {@code false} if {@code inAps} holds fewer entries than this keyword has
     *         fragments or some fragment has no counterpart; {@code true} otherwise
     */
    public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
        if (inAps.size() < apSet.size()) {
            return false;
        }
        for (AtomicPattern required : apSet) {
            if (!isInAps(required, inAps)) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether {@code inAps} contains a fragment whose text equals that of {@code ap}, ignoring case. */
    private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
        for (AtomicPattern candidate : inAps) {
            if (ap.getPattern().str.equalsIgnoreCase(candidate.getPattern().str)) {
                return true;
            }
        }
        return false;
    }

    /** @return the level attached to this keyword */
    public int getLevel() {
        return level;
    }

    /** @param level the level to attach to this keyword */
    public void setLevel(int level) {
        this.level = level;
    }
}
/**
 * Growable collection of {@link UnionPattern} keywords.
 */
public class UnionPatternSet {

    /** The stored keywords; starts out empty. */
    public Vector<UnionPattern> unionPatternSet = new Vector<UnionPattern>();

    /** Creates an empty collection. */
    UnionPatternSet() {
    }

    /** @return the stored keywords (the internal list itself) */
    public Vector<UnionPattern> getSet() {
        return unionPatternSet;
    }

    /** Appends a keyword to the collection. */
    public void addNewUnionPattrn(UnionPattern up) {
        unionPatternSet.add(up);
    }

    /** Removes every stored keyword. */
    public void clear() {
        unionPatternSet.clear();
    }
}
public class FilterTest {
public static void main(String args[]) {
MutiPatternParser filterEngine = new MutiPatternParser();
BufferedReader brKeyword = TxtReader.keywordReader("D://file/illegalkeyword.txt");//关键字的文件,文件太肮脏了,这里就不上传了
BufferedReader brArticle = TxtReader.keywordReader("D://file/article.txt");//待验证的文章
String keyword = null;
String article = null;
StringBuffer buffer = new StringBuffer();
Vector<Integer> levelSet = new Vector<Integer>();
try {
while ((keyword = brKeyword.readLine()) != null) {
filterEngine.addFilterKeyWord(keyword, 1);
}
while ((article = brArticle.readLine()) != null) {
buffer.append(article);
}
} catch (IOException e) {
System.out.println("读取文件IO异常!!!");
e.printStackTrace();
}

String content = filterEngine.parse(buffer.toString(), levelSet);


levelSet.clear();
filterEngine.parse(content, levelSet);
System.out.println("有违法字符" + levelSet.size()+"处
levelSet.clear();
}
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值