1.使用ADFA-LD数据集,逐个文件读取系统调用序列(每个文件只读取第一行),并记录系统调用序号的最大值。
# Fixed sequence length: do_rnn passes it as `maxlen` to pad_sequences and
# as the width of the network's input layer.
max_sequences_len = 300

# Largest system-call number seen so far. load_one_flle raises it as trace
# files are read; do_rnn sizes the embedding table from it.
max_sys_call = 0
def load_one_flle(filename):
    """Read one system-call trace file and return its call numbers.

    Only the first line of the file is parsed; it is expected to hold
    whitespace-separated integers. Any further lines are ignored.

    Side effect: the module-level ``max_sys_call`` is raised to the largest
    call number found in this file, if that exceeds its current value.

    Args:
        filename: Path of the trace file to read.

    Returns:
        A list of ints, in file order. Empty if the first line has no tokens.

    Raises:
        ValueError: If a token on the first line is not an integer.
        IOError/OSError: If the file cannot be opened.
    """
    global max_sys_call

    with open(filename) as f:
        first_line = f.readline()

    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and a trailing newline never produce empty or fused tokens.
    calls = [int(token) for token in first_line.split()]

    # Guard against an empty line: max() of an empty list would raise.
    if calls:
        max_sys_call = max(max_sys_call, max(calls))
    return calls
加载正常系统调用序列,并标记为0
def load_adfa_training_files(rootdir):
    """Load every trace file directly inside ``rootdir`` with label 0.

    Only regular files that are immediate children of ``rootdir`` are read;
    sub-directories are skipped and not descended into. Files are visited in
    the order ``os.listdir`` returns them.

    Args:
        rootdir: Directory holding the normal (non-attack) trace files.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list with one ``load_one_flle`` result
        per file, and ``y`` is a list of the same length filled with 0.
    """
    x = []
    y = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y
加载WebShell运行下的系统调用序列,并标记为1
def load_adfa_webshell_files(rootdir):
    """Load the Web_Shell attack trace files under ``rootdir`` with label 1.

    Candidate paths come from ``dirlist(rootdir, [])``, which is defined
    elsewhere in this file (presumably a recursive directory listing --
    TODO confirm). A path is kept only if it matches the pattern below from
    its first character.

    Args:
        rootdir: Directory passed to ``dirlist`` to enumerate candidates.

    Returns:
        A tuple ``(x, y)``: ``x`` is a list with one ``load_one_flle`` result
        per matching path, and ``y`` is a list of the same length filled
        with 1.
    """
    # NOTE(review): the pattern is anchored to a hard-coded relative prefix,
    # so a ``rootdir`` that does not produce paths of this shape will most
    # likely match nothing. Also, each unescaped '.' matches any character and
    # 'W*' means "zero or more W" (it reads like a shell glob). The pattern is
    # left untouched here so the set of matched files does not change.
    webshell_path = re.compile(
        r"../data/ADFA-LD/Attack_Data_Master/Web_Shell_\d+/UAD-W*")

    x = [load_one_flle(path)
         for path in dirlist(rootdir, [])
         if webshell_path.match(path)]
    y = [1] * len(x)
    return x, y
2.用RNN训练
def do_rnn(trainX, testX, trainY, testY):
    """Train an LSTM classifier on the sequences and print its test metrics.

    The network is: input -> embedding (128) -> LSTM (128) -> 2-way softmax,
    trained with Adam on categorical cross-entropy. After training, the test
    set is predicted and a classification report and confusion matrix are
    printed. Nothing is returned.

    Reads the module-level ``max_sequences_len`` and ``max_sys_call``, so the
    data-loading functions should run first so that ``max_sys_call`` is final.

    Args:
        trainX: Training sequences (lists of system-call numbers).
        testX: Test sequences, same format as ``trainX``.
        trainY: Training labels as class ids (0 or 1).
        testY: Test labels as class ids (0 or 1).
    """
    # Keep the class-id labels for the report; `testY` itself is replaced
    # below by the to_categorical output that the network is validated on.
    testY_old = testY

    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("GET max_sequences_len embedding %d" % max_sequences_len)
    print("GET max_sys_call embedding %d" % max_sys_call)

    net = tflearn.input_data([None, max_sequences_len])
    # input_dim is max_sys_call + 1, presumably so the largest call number is
    # itself a valid embedding index.
    net = tflearn.embedding(net, input_dim=max_sys_call + 1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    # NOTE(review): learning_rate=0.1 looks high for Adam and may be limiting
    # the results; left unchanged so previously reported numbers still apply.
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32, run_id="kkk")

    # Each prediction row is indexed by class; a first entry above 0.5 is
    # mapped to class 0, anything else to class 1.
    y_predict = [0 if scores[0] > 0.5 else 1
                 for scores in model.predict(testX)]

    print(classification_report(testY_old, y_predict))
    print(metrics.confusion_matrix(testY_old, y_predict))
3.结果
验证效果不是很好,准确率为87%。