用于做网络流量分类
下载地址
https://www.cl.cam.ac.uk/research/srg/netos/projects/archive/nprobe/data/papers/sigmetrics/index.html
使用 Moore 数据集做流量分类
这里用了BP神经网络、CNN神经网络、朴素贝叶斯、决策树、KNN、SVM。导入的模块有:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn import metrics, neighbors
from tensorflow import keras
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
from sklearn.ensemble import RandomForestClassifier
数据处理
# Traffic class names; a label's position in this list is its integer class id.
list_y = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P', 'DATABASE', 'FTP-DATA',
          'MULTIMEDIA', 'SERVICES', 'INTERACTIVE', 'GAMES']


def data_prepross(filename):
    """Parse ARFF data files into noisy 256-value feature vectors and class ids.

    For every data row the feature columns are converted to floats
    ('Y' -> 1, 'N' -> 0, '?' -> 0), padded with eight copies of the mean of
    the non-missing features, and perturbed with Gaussian noise N(0, 1).
    Rows with more than eight missing ('?') features and blank lines are
    skipped.

    Args:
        filename: iterable of file names. Relative names are resolved against
            the current working directory; absolute paths are used as given.

    Returns:
        (X, Y): X is a list of NumPy arrays of length 256, Y is the list of
        matching integer class ids (indices into ``list_y``).

    Raises:
        ValueError: if a label is not in ``list_y``, a feature is not numeric,
            or a row does not have 248 feature columns (248 + 8 padding
            values must match the 256 noise samples).
    """
    header_lines = 253   # leading lines skipped in each file (presumably the ARFF header)
    max_missing = 8      # rows with more '?' features than this are dropped
    pad_len = 8          # number of mean values appended to each row
    vector_len = 256     # final vector length (reshaped to 16x16 elsewhere in this file)

    X, Y = [], []
    for name in filename:
        print(name)
        # os.path.join keeps the "relative to cwd" lookup but leaves absolute paths intact.
        with open(os.path.join(os.getcwd(), name), 'r') as file:
            for line in file.readlines()[header_lines:]:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line has no features; skip it
                    # instead of dividing by zero when computing the mean.
                    continue
                # Split features from the label first so the Y/N substitution
                # below can never corrupt a class name.
                *raw_features, label = [field.strip() for field in line.split(',')]
                if raw_features.count('?') > max_missing:
                    continue
                known = [float(field.replace('Y', '1').replace('N', '0'))
                         for field in raw_features if field != '?']
                mean_value = sum(known) / len(known)
                values = [0.0 if field == '?'
                          else float(field.replace('Y', '1').replace('N', '0'))
                          for field in raw_features]
                # Mean padding followed by additive Gaussian white noise.
                x = np.asarray(values + [mean_value] * pad_len) + np.random.normal(0, 1, vector_len)
                X.append(x)
                Y.append(list_y.index(label))
    return X, Y
# Load every trace file, hold out 25 % of the rows for testing, convert to
# tensors and normalise each sample (row) with tf.keras.utils.normalize.
# The file order is kept as-is because it determines the seeded split below.
# To experiment on a single trace, pass ['entry01.weka.allclass.arff'] instead.
total_x, total_y = data_prepross([
    'entry01.weka.allclass.arff',
    'entry02.weka.allclass.arff',
    'entry03.weka.allclass.arff',
    'entry04.weka.allclass.arff',
    'entry05.weka.allclass.arff',
    'entry09.weka.allclass.arff',
    'entry10.weka.allclass.arff',
    'entry07.weka.allclass.arff',
    'entry08.weka.allclass.arff',
    'entry06.weka.allclass.arff',
])
train_x, test_x, train_y, test_y = train_test_split(
    total_x, total_y, test_size=0.25, random_state=0)

# Features become float32 tensors and are normalised along axis 1 (per sample).
train_x, test_x = (
    tf.keras.utils.normalize(tf.convert_to_tensor(part, dtype=tf.float32), axis=1)
    for part in (train_x, test_x)
)
# Labels become int32 tensors (sparse class ids).
train_y, test_y = (
    tf.convert_to_tensor(part, dtype=tf.int32) for part in (train_y, test_y)
)
BP神经网络
# Width of the output layer (one unit per traffic class) and of the input vector.
num_classes = 12
num_pixels = 256


def baseline_model():
    """Build, summarise and compile the fully connected (BP) baseline network.

    The network has one hidden ReLU layer of ``num_pixels`` units and a
    softmax output of ``num_classes`` units. It is compiled with Adam and
    sparse categorical cross-entropy, so labels are plain integer class ids.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    network = Sequential([
        layers.Dense(num_pixels, input_dim=num_pixels, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])
    network.summary()
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network
#,class_weight=class_weight
def baseline():
    """Train and evaluate the fully connected baseline on the global data split.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and ``test_y``
    tensors, trains for 20 epochs using the test split as validation data,
    then prints the error rate, the elapsed wall-clock time (training,
    evaluation and prediction) and the Keras training history.

    Returns:
        (scores, predict_y): the ``model.evaluate`` result ([loss, accuracy])
        and the per-class probabilities predicted for the test split.
    """
    t1 = time.time()
    model = baseline_model()
    X_train = tf.reshape(train_x, [-1, 256])
    X_test = tf.reshape(test_x, [-1, 256])
    # `epochs` replaces the Keras-1 keyword `nb_epoch`, which current
    # tf.keras `Model.fit` no longer accepts.
    # NOTE(review): `batchsize` must exist as a module-level name before this
    # function is called; it is not defined next to this function - confirm.
    history = model.fit(
        X_train, train_y,
        validation_data=(X_test, test_y),
        epochs=20,
        batch_size=batchsize,
        verbose=2,
    )
    scores = model.evaluate(X_test, test_y, verbose=0)
    predict_y = model.predict(X_test)
    t2 = time.time()
    print("Baseline Error: %.2f%%" % (100 - scores[1] * 100), t2 - t1)
    print(history.history)
    return scores, predict_y
#CNN
def simple_CNNmodel():
    """Build and compile a small two-stage CNN for 16x16x1 inputs.

    Two Conv2D + MaxPooling2D stages (8 then 16 filters, 'same' padding) are
    followed by dense layers of 256 and 128 units and a 12-way softmax.
    The model is compiled with Adam and sparse categorical cross-entropy.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    cnn = keras.models.Sequential()

    # Stage 1: 16x16x1 -> conv (same) 16x16x8 -> 2x2 pool 8x8x8
    cnn.add(layers.Conv2D(filters=8, kernel_size=(3, 3), padding='same',
                          input_shape=(16, 16, 1), activation='relu'))
    cnn.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

    # Stage 2: 8x8x8 -> conv (same) 8x8x16 -> 2x2 pool 4x4x16
    cnn.add(layers.Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu'))
    cnn.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

    # Classifier head: flatten (4*4*16 = 256 values) -> 256 -> 128 -> 12 classes
    cnn.add(layers.Flatten())
    cnn.add(layers.Dense(256, activation='relu'))
    cnn.add(layers.Dense(128, activation='relu'))
    cnn.add(layers.Dense(12, activation='softmax'))

    cnn.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=['accuracy'])
    return cnn
#,class_weight=class_weight
def simple_CNN():
    """Train and evaluate the CNN on the global data split.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and ``test_y``
    tensors, reshapes each 256-value sample to a 16x16x1 image, trains for
    25 epochs with batch size 128 using the test split as validation data,
    then prints the evaluation scores, the error rate, the elapsed
    wall-clock time (training and evaluation) and the training history.

    Returns:
        (scores, pred_y): the ``model.evaluate`` result ([loss, accuracy])
        and the per-class probabilities predicted for the test split.
    """
    t1 = time.time()
    model = simple_CNNmodel()
    X_train = tf.reshape(train_x, [-1, 16, 16, 1])
    X_test = tf.reshape(test_x, [-1, 16, 16, 1])
    model.summary()
    # `epochs` replaces the Keras-1 keyword `nb_epoch`, which current
    # tf.keras `Model.fit` no longer accepts.
    history = model.fit(
        X_train, train_y,
        validation_data=(X_test, test_y),
        epochs=25,
        batch_size=128,
        verbose=2,
    )
    scores = model.evaluate(X_test, test_y, verbose=0)
    t2 = time.time()  # prediction below is deliberately left out of the timing, as before
    pred_y = model.predict(X_test)
    print(scores)
    print("Baseline Error: %.2f%%" % (100 - scores[1] * 100), t2 - t1)
    print(history.history)
    return scores, pred_y
朴素贝叶斯
def Bayes(trainData,trainLable,testData,testLable):
    """Fit a Gaussian naive Bayes classifier and print its test results.

    Prints, in order: the fit + predict time in seconds, the confusion
    matrix on the test data, and the test accuracy.

    Args:
        trainData: training feature matrix.
        trainLable: training class labels.
        testData: test feature matrix.
        testLable: test class labels.
    """
    started = time.time()
    classifier = GaussianNB()
    classifier.fit(trainData, trainLable)
    predictions = classifier.predict(testData)
    elapsed = time.time() - started
    print(elapsed)
    print(confusion_matrix(testLable, predictions))
    accuracy = classifier.score(testData, testLable)
    print('The Accuracy of Naive Bayes Classifier is:', accuracy)
决策树
def DecisionTr(trainData,trainLable,testData,testLable):
    """Fit a decision tree classifier and print its test results.

    Prints the elapsed time in seconds (fit, predict and scoring) together
    with the test accuracy, followed by the confusion matrix.

    Args:
        trainData: training feature matrix.
        trainLable: training class labels.
        testData: test feature matrix.
        testLable: test class labels.
    """
    started = time.time()
    tree = DecisionTreeClassifier()
    tree.fit(trainData, trainLable)
    predictions = tree.predict(testData)
    accuracy = metrics.accuracy_score(testLable, predictions)
    elapsed = time.time() - started
    print(elapsed, accuracy)
    print(confusion_matrix(testLable, predictions))
SVM支持向量机
def SVM(trainData,trainLable,testData,testLable):
    """Fit a support vector classifier (default ``SVC`` settings) and print its results.

    Prints the elapsed time in seconds (fit, predict and scoring) together
    with the test accuracy.

    Args:
        trainData: training feature matrix.
        trainLable: training class labels.
        testData: test feature matrix.
        testLable: test class labels.
    """
    started = time.time()
    classifier = SVC()
    classifier.fit(trainData, trainLable)
    predictions = classifier.predict(testData)
    accuracy = metrics.accuracy_score(testLable, predictions)
    elapsed = time.time() - started
    print(elapsed, accuracy)
KNN
def Knn(trainData,trainLable,testData,testLable):
    """Fit a k-nearest-neighbours classifier (default settings) and print its results.

    Prints the elapsed time in seconds (fit, predict and scoring) together
    with the test accuracy, followed by the confusion matrix.

    Args:
        trainData: training feature matrix.
        trainLable: training class labels.
        testData: test feature matrix.
        testLable: test class labels.
    """
    started = time.time()
    classifier = KNeighborsClassifier()
    classifier.fit(trainData, trainLable)
    predictions = classifier.predict(testData)
    accuracy = metrics.accuracy_score(testLable, predictions)
    elapsed = time.time() - started
    print(elapsed, accuracy)
    print(confusion_matrix(testLable, predictions))
# Run the KNN experiment on the prepared split; the tensors are converted to
# NumPy arrays before being handed to scikit-learn.
Knn(*(tensor.numpy() for tensor in (train_x, train_y, test_x, test_y)))
灰度图片绘图
def plt_image(trainx,trainy):
    """Show the first sample of every traffic class as a 16x16 image.

    Draws a 3x4 grid with one panel per class id 0..11. Each panel shows the
    first sample in ``trainx`` whose label equals that id, reshaped to 16x16.
    A class with no sample gets an empty, titled panel instead of raising
    IndexError.

    Args:
        trainx: indexable collection of samples, 256 values each.
        trainy: class ids aligned with ``trainx`` (anything ``np.asarray``
            can convert).
    """
    # Panel titles in class-id order.
    class_titles = ('WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P',
                    'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
                    'INTERACTIVE', 'GAMES')
    labels = np.asarray(trainy)

    plt.figure(num='classffication', figsize=(6, 12))
    for class_id, title in enumerate(class_titles):
        plt.subplot(3, 4, class_id + 1)
        plt.title(title)
        matches = np.where(labels == class_id)[0]
        if len(matches) == 0:
            # No sample of this class: leave the panel blank.
            plt.axis('off')
            continue
        # Always index the `trainx` argument; three panels previously read
        # the module-level `train_x` by mistake.
        plt.imshow(np.reshape(trainx[int(matches[0])], (16, 16)))
    plt.show()
混淆矩阵绘图
def plot_confusion_matrix(title, pred_y):
    """Plot the row-normalised confusion matrix of predictions on the test split.

    The true labels come from the module-level ``test_y``; tick labels come
    from the module-level ``list_y``.

    Args:
        title: title of the figure.
        pred_y: array of shape (n_samples, n_classes) with per-class scores;
            the predicted class is the arg-max along axis 1.
    """
    labels_name = list_y
    class_ids = list(range(len(labels_name)))
    # Passing `labels` fixes the matrix to one row/column per known class, so
    # the tick labels stay aligned even when a class is absent from the data.
    cm = confusion_matrix(test_y, np.argmax(pred_y, 1), labels=class_ids)
    # Normalise each row by its number of true samples; rows without samples
    # stay all-zero instead of becoming NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    num_local = np.array(class_ids)
    plt.xticks(num_local, labels_name, rotation=90)  # class names along the x axis
    plt.yticks(num_local, labels_name)  # class names along the y axis
    plt.ylabel('True')
    plt.xlabel('Predicted')
    plt.show()