Our task is to train a neural network on the KDD99 dataset to recognize network intrusions.
1、
For an introduction to the KDD99 dataset, see this post: https://blog.csdn.net/com_stu_zhang/article/details/6987632. My understanding of the dataset comes mainly from that post, so I will not describe it again here.
2、
The task breaks down into three parts: data preprocessing, model training, and classification.
Let's go through them one part at a time.
2.1、 Data preprocessing
The KDD99 records mix numeric and string fields, and the labels are strings as well. The numeric fields also differ wildly in range: some are 0/1 discrete values, others run from 0 up to several million. Training on such data directly will not work, so we preprocess it first.
First, we convert the strings into discrete integers. For that we need every string value that can appear in each symbolic column, which can be collected with a single pass over the raw file.
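Below is a minimal sketch of such a scan; collect_symbols is my own illustrative helper, not part of the original code, and columns 1, 2 and 3 are the protocol, service and flag fields in the KDD99 layout:

import csv

def collect_symbols(path):
    # one pass over the raw file, recording every distinct string
    # in the symbolic columns and in the label column
    protocols, services, flags, labels = set(), set(), set(), set()
    with open(path) as f:
        for row in csv.reader(f):
            if not row:  # skip blank lines
                continue
            protocols.add(row[1])
            services.add(row[2])
            flags.add(row[3])
            labels.add(row[-1])
    return sorted(protocols), sorted(services), sorted(flags), sorted(labels)

Since the result is fixed for KDD99, I simply list the collected values, wrapped in a function so later steps can fetch them easily: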
def get_col_types():
    protocol_type = ['icmp', 'tcp', 'udp']
    service_type = ['IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain',
                    'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher',
                    'hostnames', 'http', 'http_443', 'icmp', 'imap4', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link',
                    'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp',
                    'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
                    'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i',
                    'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois']
    flag_type = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH']
    # the raw label sets of the training and test files, listed for reference
    train_label_type = ['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'imap.', 'ipsweep.', 'land.',
                        'loadmodule.', 'multihop.', 'neptune.', 'nmap.', 'normal.', 'perl.', 'phf.', 'pod.',
                        'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.', 'warezclient.',
                        'warezmaster.']
    test_label_type = ['apache2.', 'back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'httptunnel.', 'imap.',
                       'ipsweep.', 'land.', 'loadmodule.', 'mailbomb.', 'mscan.', 'multihop.', 'named.', 'neptune.',
                       'nmap.', 'normal.', 'perl.', 'phf.', 'pod.', 'portsweep.', 'processtable.', 'ps.', 'rootkit.',
                       'saint.', 'satan.', 'sendmail.', 'smurf.', 'snmpgetattack.', 'snmpguess.', 'sqlattack.',
                       'teardrop.', 'udpstorm.', 'warezmaster.', 'worm.', 'xlock.', 'xsnoop.', 'xterm.']
    # the five label groups we actually train on
    label_type = [['normal.'],
                  ['ipsweep.', 'mscan.', 'nmap.', 'portsweep.', 'saint.', 'satan.'],
                  ['apache2.', 'back.', 'land.', 'mailbomb.', 'neptune.', 'pod.', 'processtable.', 'smurf.', 'teardrop.', 'udpstorm.'],
                  ['buffer_overflow.', 'httptunnel.', 'loadmodule.', 'perl.', 'ps.', 'rootkit.', 'sqlattack.', 'xterm.'],
                  ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'named.', 'phf.', 'sendmail.', 'snmpgetattack.',
                   'snmpguess.', 'spy.', 'warezclient.', 'warezmaster.', 'worm.', 'xlock.', 'xsnoop.']]
    return protocol_type, service_type, flag_type, label_type
Note that for the labels we follow the grouping from the post referenced above and collapse them into just 5 classes: with too many fine-grained classes, some would have so little training data that the model could never fit them. Of course, feel free to experiment with finer-grained labels.
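As a quick sanity check of the grouping, here is a small helper mapping a raw label string to its 5-class index (label_to_class is my own name, not from the original code; the five groups follow the standard KDD99 taxonomy of normal, probe, DoS, U2R and R2L):

def label_to_class(label):
    # return the index of the group in label_type that contains the label
    _, _, _, label_type = get_col_types()
    for i, labels in enumerate(label_type):
        if label in labels:
            return i
    raise ValueError('unknown label: ' + label)

print(label_to_class('normal.'))  # 0
print(label_to_class('smurf.'))   # 2, i.e. the DoS group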
Next comes the actual data processing:
import csv

def handle_data():
    protocol_type, service_type, flag_type, label_type = get_col_types()

    def convert(source_file, handled_file):
        # replace each string field with its index and write a csv copy
        with open(handled_file, 'w', newline='') as data_file:
            csv_writer = csv.writer(data_file)
            with open(source_file, 'r') as data_source:
                for row in csv.reader(data_source):
                    if not row:  # skip blank lines such as a trailing newline
                        continue
                    row[1] = protocol_type.index(row[1])
                    row[2] = service_type.index(row[2])
                    row[3] = flag_type.index(row[3])
                    for i, labels in enumerate(label_type):
                        if row[-1] in labels:
                            row[-1] = i
                            break
                    csv_writer.writerow(row)

    # the 10% training subset and the labelled test set of KDD99
    convert('kddcup.data_10_percent_corrected', 'train_data.csv')
    convert('corrected', 'test_data.csv')
    print('pre process completed!')
After handle_data runs, every symbolic field in a record is replaced by its index in the corresponding list: a row that starts 0,tcp,http,SF,... and ends in normal. becomes 0,1,22,9,... ending in 0.
That completes the first preprocessing step. But the problem remains that some columns span huge ranges while others are tiny, which can still easily derail training, so we also normalize the data. The function below is a method of the DNN class defined in the next section, which is why it takes a self parameter:
def normalization(self, minibatch):
    data = np.delete(minibatch, -1, axis=1)
    labels = np.array(minibatch, dtype=np.int32)[:, -1]
    mmax = np.max(data, axis=0)
    mmin = np.min(data, axis=0)
    for i in range(len(mmax)):
        if mmax[i] == mmin[i]:
            mmax[i] += 0.000001  # avoid division by zero
    res = (data - mmin) / (mmax - mmin)
    res = np.c_[res, labels]
    return res
The normalization has to guard against a column whose max equals its min, and this really does occur in the data. When it does, we add 0.000001 to the max to avoid dividing by zero. The exact constant does not matter: every entry in such a column equals the min, so the column normalizes to all zeros whatever we add.
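A tiny check of the constant-column case, using made-up numbers rather than real KDD99 rows:

import numpy as np

toy = np.array([[1.0, 5.0, 0.0],
                [1.0, 9.0, 2.0]])      # last column is the label
data = np.delete(toy, -1, axis=1)
mmax, mmin = np.max(data, axis=0), np.min(data, axis=0)
mmax[mmax == mmin] += 0.000001         # the same guard as above, vectorized
print((data - mmin) / (mmax - mmin))
# [[0. 0.]
#  [0. 1.]]  -> the constant first column normalizes to all zeros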
With that, preprocessing is essentially done. Now for the second part: training.
2.2 Model training and prediction
For the model we build a fully connected network with three hidden layers, using ReLU activations and dropout. This is the more straightforward part of the task, so I give the code directly.
# train the network and use the model to predict the test set
import tensorflow as tf
import numpy as np
import csv, os, random

# Hyper parameters
LEARNING_RATE = 0.0001
BATCH_SIZE = 32
DATA_DIM = 41              # 41 features per record (the 42nd column is the label)
NUM_HIDDEN1_NODES = 32
NUM_HIDDEN2_NODES = 64
NUM_HIDDEN3_NODES = 64
OUTPUT_DIM = 5             # the five label groups
MODEL_SAVE_PATH = './model/'
MODEL_NAME = 'kdd_model'
KEEP_PROB = 0.5            # dropout keep probability during training
class DNN():
    def __init__(self):
        self.learning_rate = LEARNING_RATE
        self.batch_size = BATCH_SIZE
        self.x_input = tf.placeholder(dtype=tf.float32, shape=[None, DATA_DIM])
        self.target = tf.placeholder(dtype=tf.int32, shape=[None])
        # fed with KEEP_PROB during training and 1.0 at test time,
        # so that dropout is switched off when predicting
        self.keep_prob = tf.placeholder(dtype=tf.float32)
        self.train_step = tf.Variable(0, trainable=False)
        self.init_data()
        self.create_DNN()
        self.build_loss()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
        self.saver = tf.train.Saver()
        self.train_start = 0
        # resume from the latest checkpoint if one exists
        if self.ckpt and self.ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, self.ckpt.model_checkpoint_path)
            self.train_start = self.sess.run(self.train_step)
    def create_DNN(self):
        w1 = tf.Variable(tf.truncated_normal(dtype=tf.float32, shape=[DATA_DIM, NUM_HIDDEN1_NODES], stddev=0.1))
        b1 = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[NUM_HIDDEN1_NODES]))
        w2 = tf.Variable(tf.truncated_normal(dtype=tf.float32, shape=[NUM_HIDDEN1_NODES, NUM_HIDDEN2_NODES], stddev=0.1))
        b2 = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[NUM_HIDDEN2_NODES]))
        w3 = tf.Variable(tf.truncated_normal(dtype=tf.float32, shape=[NUM_HIDDEN2_NODES, NUM_HIDDEN3_NODES], stddev=0.1))
        b3 = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[NUM_HIDDEN3_NODES]))
        w4 = tf.Variable(tf.truncated_normal(dtype=tf.float32, shape=[NUM_HIDDEN3_NODES, OUTPUT_DIM], stddev=0.1))
        b4 = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[OUTPUT_DIM]))
        h_layer = tf.nn.relu(tf.nn.dropout(tf.matmul(self.x_input, w1) + b1, self.keep_prob))
        h_layer2 = tf.nn.relu(tf.nn.dropout(tf.matmul(h_layer, w2) + b2, self.keep_prob))
        h_layer3 = tf.nn.relu(tf.matmul(h_layer2, w3) + b3)
        # keep the raw logits separate: sparse_softmax_cross_entropy_with_logits
        # applies softmax internally, so it must be fed logits, not probabilities
        self.logits = tf.matmul(h_layer3, w4) + b4
        self.y = tf.nn.softmax(self.logits)

    def build_loss(self):
        ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target)
        self.loss = tf.reduce_sum(ce)  # tf.reduce_mean() works as well, see the notes at the end
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.train_step)
    def train(self):
        step = self.sess.run(self.train_step)
        batch = self.get_a_train_batch(step)
        data, label = self.get_data_label(batch)
        self.sess.run(self.train_op,
                      feed_dict={self.x_input: data, self.target: label, self.keep_prob: KEEP_PROB})
        if (step + 1) % 1000 == 0:
            # report the loss on the current batch and checkpoint the model
            curr_loss = self.sess.run(self.loss,
                                      feed_dict={self.x_input: data, self.target: label, self.keep_prob: 1.0})
            print('trained {} rounds, current loss: {}'.format(step + 1, curr_loss))
            self.saver.save(self.sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=step + 1)

    def get_a_train_batch(self, step):
        # cycle through the training set one batch at a time
        step = step % int(self.train_length / BATCH_SIZE)
        start = int(step * BATCH_SIZE)
        return self.train_data[start:start + BATCH_SIZE]

    def get_data_label(self, batch):
        # split a batch into features and integer labels
        data = np.delete(batch, -1, axis=1)
        label = np.array(batch, dtype=np.int32)[:, -1]
        return data, label
    def init_data(self):
        self.train_data = []
        self.test_data = []
        self.label_status = {}

        def read_rows(filename):
            # read a handled csv file, casting every field from string to float32
            rows = []
            with open(filename) as f:
                for row in csv.reader(f):
                    if not row:
                        continue
                    rows.append([0 if ch == 'None' else np.float32(ch) for ch in row])
            return rows

        # group the training rows by label (0-4) and count each label
        label_data = [[] for _ in range(5)]
        for data in read_rows('train_data.csv'):
            label = int(data[-1])
            label_data[label].append(data)
            self.label_status[str(label)] = self.label_status.get(str(label), 0) + 1
        # balance the classes: double each group until it holds at least
        # 10000 rows, then randomly sample exactly 10000 of them
        for i in range(5):
            while len(label_data[i]) < 10000:
                label_data[i] = label_data[i] + label_data[i]
            label_data[i] = random.sample(label_data[i], 10000)
        self.train_data = [row for group in label_data for row in group]

        self.test_data = read_rows('test_data.csv')
        self.train_length = len(self.train_data)
        self.test_length = len(self.test_data)
        self.train_data = self.normalization(self.train_data)
        self.test_data = self.normalization(self.test_data)
        np.random.shuffle(self.train_data)
        print('init data completed!')
    def normalization(self, minibatch):
        data = np.delete(minibatch, -1, axis=1)
        labels = np.array(minibatch, dtype=np.int32)[:, -1]
        mmax = np.max(data, axis=0)
        mmin = np.min(data, axis=0)
        for i in range(len(mmax)):
            if mmax[i] == mmin[i]:
                mmax[i] += 0.000001  # avoid division by zero
        res = (data - mmin) / (mmax - mmin)
        res = np.c_[res, labels]
        return res
    def predict(self, x_feature):
        # keep_prob is 1.0 here so that dropout is disabled at test time
        predict = self.sess.run(self.y, feed_dict={self.x_input: [x_feature], self.keep_prob: 1.0})[0]
        classLabel = np.argmax(predict)
        return classLabel

    def test(self):
        length = len(self.test_data)
        rightNum = 0
        np.random.shuffle(self.test_data)
        for row in self.test_data:
            feature = row[0:-1]
            label = row[-1]
            classLabel = self.predict(np.array(feature))
            if label == classLabel:
                rightNum += 1
        accuracy = rightNum / length
        return accuracy
if __name__ == '__main__':
    dnn = DNN()
    print('the number of training data: ', dnn.train_length)
    print('the number of test data: ', dnn.test_length)
    print('the training data status : ', dnn.label_status)
    print('start training...')
    for i in range(dnn.train_start, 20000):
        dnn.train()
    print('start testing...')
    accuracy = dnn.test()
    print('the accuracy on test data: ', '%.4f' % (accuracy * 100), '%')
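One caveat: the script is written against the TensorFlow 1.x API (tf.placeholder, tf.Session and so on). If you only have TensorFlow 2.x installed, the standard workaround is to swap the import for the compatibility layer:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()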
A few key points in the code:
init_data balances the data so that every label ends up with 10000 rows. This is simple brute-force augmentation and sampling: it discards a lot of data, but it keeps the training set roughly balanced, so the model does not collapse onto classes 0 and 2 alone. Label 3 has only 52 rows; even augmented they contribute little, since the network can hardly extract the class's features from so few examples, and this data could simply be dropped. In my tests it made little difference either way.
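The balancing logic boils down to a helper like this (a sketch only; balance_to is my own name and does not appear in the code above):

import random

def balance_to(rows, n=10000):
    # oversample by repeated doubling, then downsample to exactly n rows
    while len(rows) < n:
        rows = rows + rows
    return random.sample(rows, n)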
The model is organized as a class, which is a common way to structure this kind of code.
In build_loss, reduce_sum and reduce_mean make little practical difference: the two losses differ only by a factor of the batch size, and Adam is largely insensitive to a constant rescaling of the loss. reduce_mean is the more standard form of the cross entropy; you can try different learning rates with each to get a feel for the gap.
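To see the scale relationship concretely, with toy per-example losses rather than real KDD99 numbers:

import numpy as np

ce = np.array([0.5, 1.25, 0.75, 0.5])  # cross entropies for a batch of 4
print(np.sum(ce))    # 3.0
print(np.mean(ce))   # 0.75, i.e. sum / batch_size
# The gradients scale by the same factor, so reduce_sum with learning rate lr
# behaves roughly like reduce_mean with lr * BATCH_SIZE; Adam's per-parameter
# normalization absorbs most of this constant factor in practice.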
The overall flow of the code is:
read the data from the csv files;
augment and sample each label's data, putting 10000 rows per class into the training set;
shuffle the training set;
train on one batch at a time;
test.