本小节演示如何通过神经网络检测Java溢出攻击,工作原理如下
1、数据集
本小节基于ADFA-LD数据集,处理源码如下
def load_one_flle(filename):
    """Return the first line of *filename* without its newline.

    Only the first line is read; any further lines in the file are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line with newline characters stripped from both ends,
        or an empty string when the file is empty.
    """
    with open(filename) as f:
        return f.readline().strip('\n')
def load_adfa_training_files(rootdir):
    """Load every regular file directly under *rootdir* as a normal sample.

    Sub-directories are skipped; the listing is not recursive.

    Args:
        rootdir: Directory holding the normal-behaviour trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of strings returned by
        ``load_one_flle`` for each file and ``y`` is a list of ``0`` labels
        of the same length.
    """
    x = []
    y = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order reproducible across runs and machines.
    for name in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            print("Load file(%s)" % path)
            y.append(0)
    return x, y
def dirlist(path, allfile):
    """Recursively collect the paths of all non-directory entries under *path*.

    Args:
        path: Directory to walk.
        allfile: List that the discovered paths are appended to; it is
            mutated in place.

    Returns:
        The same ``allfile`` list that was passed in.
    """
    # Sorted so the resulting order does not depend on the arbitrary order
    # in which os.listdir() reports directory entries.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile
def load_adfa_java_files(rootdir):
    """Load the Java Meterpreter attack traces found under *rootdir*.

    A file is selected when its path relative to *rootdir* starts with
    ``Java_Meterpreter_``.

    Args:
        rootdir: Directory that is searched recursively via ``dirlist``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of strings returned by
        ``load_one_flle`` for each selected file and ``y`` is a list of
        ``1`` labels of the same length.
    """
    x = []
    y = []
    for path in dirlist(rootdir, []):
        # Match relative to rootdir rather than against a hard-coded regex
        # of the default dataset location (whose unescaped dots were also
        # wildcards), so the loader works wherever the dataset is stored.
        if os.path.relpath(path, rootdir).startswith("Java_Meterpreter_"):
            print("Load file(%s)" % path)
            x.append(load_one_flle(path))
            y.append(1)
    return x, y
if __name__ == '__main__':
    # Normal traces are labelled 0, Java Meterpreter attack traces 1.
    x1,y1=load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_java_files("../data/ADFA-LD/Attack_Data_Master/")
    # Merge both classes into one sample list and one label list.
    x=x1+x2
    y=y1+y2
2、特征化
本小节使用词袋模型(CountVectorizer 按词出现次数计数)进行特征化,代码如下所示
# Convert each trace string into a dense matrix row of token counts.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so one-character tokens in the traces would be
# dropped - confirm this is intended for this dataset.
vectorizer = CountVectorizer(min_df=1)
x = vectorizer.fit_transform(x).toarray()
3、完整代码
# -*- coding:utf-8 -*-
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
import os
import numpy as np
from sklearn.neural_network import MLPClassifier
def load_one_flle(filename):
    """Return the first line of *filename* without its newline.

    Only the first line is read; any further lines in the file are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line with newline characters stripped from both ends,
        or an empty string when the file is empty.
    """
    with open(filename) as f:
        return f.readline().strip('\n')
def load_adfa_training_files(rootdir):
    """Load every regular file directly under *rootdir* as a normal sample.

    Sub-directories are skipped; the listing is not recursive.

    Args:
        rootdir: Directory holding the normal-behaviour trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of strings returned by
        ``load_one_flle`` for each file and ``y`` is a list of ``0`` labels
        of the same length.
    """
    x = []
    y = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order reproducible across runs and machines.
    for name in sorted(os.listdir(rootdir)):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            print("Load file(%s)" % path)
            y.append(0)
    return x, y
def dirlist(path, allfile):
    """Recursively collect the paths of all non-directory entries under *path*.

    Args:
        path: Directory to walk.
        allfile: List that the discovered paths are appended to; it is
            mutated in place.

    Returns:
        The same ``allfile`` list that was passed in.
    """
    # Sorted so the resulting order does not depend on the arbitrary order
    # in which os.listdir() reports directory entries.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile
def load_adfa_java_files(rootdir):
    """Load the Java Meterpreter attack traces found under *rootdir*.

    A file is selected when its path relative to *rootdir* starts with
    ``Java_Meterpreter_``.

    Args:
        rootdir: Directory that is searched recursively via ``dirlist``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of strings returned by
        ``load_one_flle`` for each selected file and ``y`` is a list of
        ``1`` labels of the same length.
    """
    x = []
    y = []
    for path in dirlist(rootdir, []):
        # Match relative to rootdir rather than against a hard-coded regex
        # of the default dataset location (whose unescaped dots were also
        # wildcards), so the loader works wherever the dataset is stored.
        if os.path.relpath(path, rootdir).startswith("Java_Meterpreter_"):
            print("Load file(%s)" % path)
            x.append(load_one_flle(path))
            y.append(1)
    return x, y
if __name__ == '__main__':
    # Normal traces are labelled 0, Java Meterpreter attack traces 1.
    x1,y1=load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_java_files("../data/ADFA-LD/Attack_Data_Master/")
    x=x1+x2
    y=y1+y2
    # Turn each trace string into a dense row of token counts.
    vectorizer = CountVectorizer(min_df=1)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    # MLP with two hidden layers (150 and 50 units), trained with SGD for at
    # most 10 iterations; random_state is fixed for repeatable runs.
    mlp = MLPClassifier(hidden_layer_sizes=(150,50), max_iter=10, alpha=1e-4,
                        solver='sgd', verbose=10, tol=1e-4, random_state=1,
                        learning_rate_init=.1)
    # 10-fold cross-validation; print the per-fold scores and their mean.
    score=model_selection.cross_val_score(mlp, x, y, n_jobs=1, cv=10)
    print(score)
    print(np.mean(score))
4、运行结果
这里使用十折交叉验证,测试结果准确率约为87%,效果一般般啊,毕竟正常样本833个,黑样本124个,一共才957个。需要注意的是,即使把所有样本都判为正常,准确率也有约87%(833/957),所以这个准确率确实不理想啊
[0.86597938 0.86597938 0.87628866 0.86458333 0.87368421 0.87368421
0.87368421 0.87368421 0.87368421 0.87368421]
0.871493601917164