1.基于DFA算法的实现
1.1 DFA介绍
DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。
1.2构建模型
举个例子来说,在bash脚本命令检测中,我们规定“rm”,“reboot”,“shutdown”,"::","/dev/null","rmr"为敏感词,则我们需要根据这6个敏感词来构建检测模型,使用json格式来表示:
{"r":{"e":{"b":{"isEnd":"0","o":{"isEnd":"0","o":{"t":{"deepCount":"6","isEnd":"1"},"isEnd":"0"}}},"isEnd":"0"},"isEnd":"0","m":{"r":{"deepCount":"3","isEnd":"1"},"deepCount":"2","isEnd":"1"}},":":{":":{"deepCount":"2","isEnd":"1"},"isEnd":"0"},"s":{"h":{"u":{"t":{"d":{"isEnd":"0","o":{"w":{"isEnd":"0","n":{"deepCount":"8","isEnd":"1"}},"isEnd":"0"}},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"},"/":{"d":{"e":{"v":{"isEnd":"0","/":{"isEnd":"0","n":{"u":{"l":{"l":{"deepCount":"9","isEnd":"1"},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"}}},"isEnd":"0"},"isEnd":"0"},"isEnd":"0"}}
代码实现:
/**
 * Builds the nested-map lookup tree for the hard-coded set of sensitive words.
 *
 * <p>Each character of a (lower-cased) word is a key that maps to a child map.
 * Every node carries an {@code "isEnd"} entry ({@code "0"} or {@code "1"}); a node
 * that terminates a word additionally carries {@code "deepCount"}, the length of
 * that word as a decimal string. The finished tree is printed to standard output
 * as JSON before being returned.
 *
 * @return the root of the lookup tree
 */
private static Map initSensitiveWordMap(){
    Set<String> words = new HashSet<>();
    words.add("rm");
    words.add("reboot");
    words.add("shutdown");
    words.add("::");
    words.add("/dev/null");
    words.add("rmr");

    Map root = new HashMap(words.size());
    for (String rawWord : words) {
        String word = rawWord.toLowerCase();
        int lastIndex = word.length() - 1;
        // Walk down from the root, creating missing nodes along the way.
        Map node = root;
        for (int pos = 0; pos <= lastIndex; pos++) {
            char ch = word.charAt(pos);
            Map child = (Map) node.get(ch);
            if (child == null) {
                child = new HashMap<String, String>();
                child.put("isEnd", "0");
                node.put(ch, child);
            }
            node = child;
            if (pos == lastIndex) {
                // Last character: flag the node as a word end and record the word length.
                node.put("deepCount", String.valueOf(pos + 1));
                node.put("isEnd", "1");
            }
        }
    }
    System.out.println(JSONObject.toJSONString(root));
    return root;
}
1.3检测脚本内容
/**
 * Scans a script and collects every sensitive word found in it.
 *
 * <p>The lookup tree is rebuilt through {@code initSensitiveWordMap()} on every call.
 * After a hit, scanning resumes right behind the matched text, so matches never overlap.
 *
 * @param scriptText the script content to scan
 * @param matchType  matching mode, handed unchanged to {@code testSensitiveWord}
 * @return the distinct matched substrings; empty when nothing matched
 */
public static Set<String> checkSensitiveWord(String scriptText, int matchType){
    Map wordTree = initSensitiveWordMap();
    Set<String> hits = new HashSet<>();
    int cursor = 0;
    while (cursor < scriptText.length()) {
        int matchedLength = testSensitiveWord(scriptText, cursor, matchType, wordTree);
        if (matchedLength > 0) {
            hits.add(scriptText.substring(cursor, cursor + matchedLength));
            // Jump past the whole match.
            cursor += matchedLength;
        } else {
            cursor++;
        }
    }
    return hits;
}
/**
 * Tries to match a sensitive word that starts exactly at {@code index}.
 *
 * <p>The text is followed character by character through the lookup tree. Whenever a
 * node flagged {@code "isEnd" = "1"} is reached, {@code isWord} decides whether the
 * text matched so far stands alone as a word. The walk stops at the first stand-alone
 * word, at the first end node when {@code matchType} is 1, or when the tree has no
 * branch for the next character.
 *
 * @param scriptText       the text being scanned
 * @param index            position in {@code scriptText} where matching starts
 * @param matchType        1 = minimal match (stop at the first end node), 2 = full match
 * @param sensitiveWordMap root of the lookup tree
 * @return the number of characters matched, or 0 when fewer than two characters matched
 *         or the last end node checked was not a stand-alone word
 */
private static int testSensitiveWord(String scriptText,int index,int matchType,Map sensitiveWordMap){
    Map node = sensitiveWordMap;
    int matched = 0;
    boolean wholeWord = false;
    int pos = index;
    while (pos < scriptText.length()) {
        node = (Map) node.get(scriptText.charAt(pos));
        if (node == null) {
            // No branch for this character: the walk ends here.
            break;
        }
        matched++;
        if ("1".equals(node.get("isEnd"))) {
            int depth = Integer.parseInt((String) node.get("deepCount"));
            wholeWord = isWord(scriptText, pos, depth);
            if (wholeWord || matchType == 1) {
                break;
            }
        }
        pos++;
    }
    return (wholeWord && matched >= 2) ? matched : 0;
}
/**
 * Decides whether the keyword ending at index {@code i} stands alone as a word.
 *
 * <p>The keyword occupies {@code scriptText[i - deepCount + 1 .. i]}. It counts as a word when
 * <ul>
 *   <li>it is followed by the end of the text or by a whitespace character
 *       (space, tab, newline, carriage return, ...), and</li>
 *   <li>it is not directly preceded by a lowercase ASCII letter {@code a-z}.</li>
 * </ul>
 * Any whitespace is accepted as the trailing boundary, not only a blank, so that a keyword
 * at the end of a line in a multi-line script is still recognised.
 *
 * @param scriptText the text being scanned
 * @param i          index of the last character of the keyword
 * @param deepCount  length of the keyword
 * @return {@code true} when the keyword is delimited as described above
 */
private static boolean isWord(String scriptText, int i,int deepCount) {
    // Trailing boundary: end of text or any whitespace.
    int after = i + 1;
    if (after < scriptText.length() && !Character.isWhitespace(scriptText.charAt(after))) {
        return false;
    }
    // Leading boundary: a lowercase letter right before the keyword means the keyword
    // is only the tail of a longer identifier (e.g. "farm" ends with "rm").
    int before = i - deepCount;
    if (before >= 0) {
        char previous = scriptText.charAt(before);
        if (previous >= 'a' && previous <= 'z') {
            return false;
        }
    }
    return true;
}
2.基于动态正则法实现
/**
 * Finds sensitive words in a script using a regular expression assembled at run time.
 *
 * <p>The text is split into tokens made of {@code a-z}, {@code 0-9} and any additional
 * symbol that occurs in one of the keywords. A token is reported only when it is exactly
 * equal to a keyword, so {@code "rm"} is reported while {@code "farm"} is not.
 *
 * @param scriptText the script content to scan; {@code null} yields an empty result
 * @return the distinct keywords found in {@code scriptText}; empty when none was found
 */
private static Set<String> checkSensitiveWord(String scriptText){
    Set<String> base_set = new HashSet<>();
    base_set.add("rm");
    base_set.add("dd");
    base_set.add("reboot");

    Set<String> match_set = new HashSet<>();
    if (scriptText == null) {
        return match_set;
    }

    // Collect the distinct characters used by the keywords.
    Set<Character> characters = new HashSet<>();
    for (String s : base_set) {
        for (int i = 0; i < s.length(); i++) {
            characters.add(s.charAt(i));
        }
    }
    StringBuilder all = new StringBuilder();
    for (Character character : characters) {
        all.append(character);
    }

    // Pick out the characters that are neither a-z, 0-9 nor whitespace; they have to be
    // added to the token character class so keywords containing them can be matched.
    String unexpected_character_reg = "[^a-z0-9\\s]";
    StringBuilder unexpected_character = new StringBuilder();
    Matcher m = Pattern.compile(unexpected_character_reg).matcher(all);
    while (m.find()) {
        String symbol = m.group();
        // Escape punctuation such as ']', '^', '-' or '\' so it is taken literally inside
        // the character class. Letters and digits must not be escaped: a backslash before
        // an alphabetic character is either a regex construct or an error.
        if (!Character.isLetterOrDigit(symbol.codePointAt(0))) {
            unexpected_character.append('\\');
        }
        unexpected_character.append(symbol);
    }

    // Assemble the final tokenising expression: [a-z0-9<symbols>]+
    String regexPre = "[a-z0-9";
    String regexSuf = "]+";
    String main_regex = regexPre + unexpected_character + regexSuf;
    Matcher main_m = Pattern.compile(main_regex).matcher(scriptText);
    while (main_m.find()) {
        String token = main_m.group();
        if (base_set.contains(token)) {
            match_set.add(token);
        }
    }
    return match_set;
}