1.python2->python3源码报错修改:
(1)print加括号:python3中print是函数而不是语句,python2的写法会报如下语法错误
print cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=10)
^
SyntaxError: invalid syntax
(2)导入语句与调用修改(新版sklearn中cross_validation模块已被移除,改用model_selection),并将n_jobs改为1:
#头文件变更
from sklearn import model_selection
#调用函数变更
print(model_selection.cross_val_score(clf, x, y, n_jobs=1, cv=10))
2.ADFA-LD数据集
(1)加载正常样本数据
def load_adfa_training_files(rootdir):
    """Load every regular file directly under *rootdir* as a normal sample.

    Sub-directories are skipped; the directory is not walked recursively.
    Each file is read with ``load_one_flle`` and paired with the label 0.

    Args:
        rootdir: Directory containing the normal-behaviour trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of values returned by
        ``load_one_flle`` and ``y`` is a list of zeros of the same length.
    """
    x = []
    y = []
    # Iterate the names directly instead of indexing a variable called
    # ``list``, which would shadow the builtin.
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y
(2)从攻击数据集加载ftp暴力破解数据,并将其标签赋值为1
# Regular expression (not a glob) applied with re.match, so it is anchored at
# the start of each path. It requires a literal "/" between the
# Hydra_FTP_<n> directory and the file name, and the trailing "P*" means
# "zero or more P". The unescaped "." characters match any character.
file_prefix = r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+/UAD-Hydra-FTP*"


def load_adfa_hydra_ftp_files(rootdir):
    """Load the Hydra FTP brute-force traces found under *rootdir*.

    All files below *rootdir* are collected with ``dirlist``; those whose
    path matches ``file_prefix`` are read with ``load_one_flle`` and given
    the label 1.

    Args:
        rootdir: Root directory of the attack data set.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of loaded traces and
        ``y`` is a list of ones of the same length.
    """
    x = []
    y = []
    allfile = dirlist(rootdir, [])
    for file in allfile:
        if re.match(file_prefix, file):
            x.append(load_one_flle(file))
            y.append(1)
    return x, y
实际运行时,调用命令如下:
x2,y2=load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")
结果为空,即load_adfa_hydra_ftp_files函数的prefix匹配失败,修改代码如下:
file_prefix = r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+"
或者如下亦可
file_prefix = r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_"
这部分内容可以参考专栏中5.6内容
3.完整代码
修改后可以在python3运行的完整代码如下
# -*- coding:utf-8 -*-
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
import os
from sklearn import tree
import pydotplus
def load_one_flle(filename):
    """Return the first line of *filename* with newline characters stripped.

    Only the first line is read; any further lines are ignored. An empty
    file yields an empty string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line as a string, with leading and trailing ``"\\n"``
        characters removed.
    """
    with open(filename) as f:
        line = f.readline()
    return line.strip('\n')
def load_adfa_training_files(rootdir):
    """Load every regular file directly under *rootdir* as a normal sample.

    Sub-directories are skipped; the directory is not walked recursively.
    Each file is read with ``load_one_flle`` and paired with the label 0.

    Args:
        rootdir: Directory containing the normal-behaviour trace files.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of values returned by
        ``load_one_flle`` and ``y`` is a list of zeros of the same length.
    """
    x = []
    y = []
    # Iterate the names directly instead of indexing a variable called
    # ``list``, which would shadow the builtin.
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_flle(path))
            y.append(0)
    return x, y
def dirlist(path, allfile):
    """Recursively collect the paths of all non-directory entries under *path*.

    Args:
        path: Directory to walk.
        allfile: List that found paths are appended to; it is modified in
            place so that recursive calls share one accumulator.

    Returns:
        The same ``allfile`` list object, for convenience.
    """
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            # Descend; the recursive call appends into the shared list.
            dirlist(filepath, allfile)
        else:
            allfile.append(filepath)
    return allfile
# Regular expression (not a glob) applied with re.match, so it is anchored at
# the start of each path and only the directory prefix has to match. The
# unescaped "." characters match any character. Paths are built by dirlist
# with os.path.join from the rootdir argument, so rootdir must begin with
# text this pattern accepts.
file_prefix = r"../data/ADFA-LD/Attack_Data_Master/Hydra_FTP_\d+"


def load_adfa_hydra_ftp_files(rootdir):
    """Load the Hydra FTP brute-force traces found under *rootdir*.

    All files below *rootdir* are collected with ``dirlist``; those whose
    path matches ``file_prefix`` are read with ``load_one_flle`` and given
    the label 1.

    Args:
        rootdir: Root directory of the attack data set.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of loaded traces and
        ``y`` is a list of ones of the same length.
    """
    x = []
    y = []
    for path in dirlist(rootdir, []):
        if re.match(file_prefix, path):
            x.append(load_one_flle(path))
            y.append(1)
    return x, y
if __name__ == '__main__':
    # Normal traces are labelled 0, Hydra FTP attack traces are labelled 1.
    x1, y1 = load_adfa_training_files("../data/ADFA-LD/Training_Data_Master/")
    x2, y2 = load_adfa_hydra_ftp_files("../data/ADFA-LD/Attack_Data_Master/")
    x = x1 + x2
    y = y1 + y2
    # Sample counts per class; 0 for x2 means file_prefix matched nothing.
    print(len(x1), len(x2))

    # Turn each trace string into a token-count vector, then densify it.
    vectorizer = CountVectorizer(min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()

    clf = tree.DecisionTreeClassifier()
    # Prints one score per fold (cv=10), computed in a single process.
    print(model_selection.cross_val_score(clf, x, y, n_jobs=1, cv=10))

    # Refit on the full data set and export the tree as a PDF.
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("../photo/6/ftp.pdf")
4.运行结果如下,准确率约95%
833 162
[1. 0.97029703 0.93 0.96969697 0.96969697 0.87878788
0.98989899 0.97979798 0.94949495 0.94949495]
决策树可视化如下所示: