学习使用决策树和随机森林算法检测暴力破解-CSDN博客

本文链接：https://blog.csdn.net/qq_37865996/article/details/87743230

1.使用决策树算法检测POP3暴力破解

# -*- coding:utf-8 -*-

import re
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
import os
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus

#加载KDD99数据集中的数据 
def load_kdd99(filename):
    """Load a comma-separated KDD99 data file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        string fields obtained by splitting that line on ','.
    """
    x=[]
    with open(filename) as f:
        for line in f:
            # Remove the line terminator (including a Windows carriage
            # return) so the last field of the record stays clean.
            line=line.rstrip('\r\n')
            # A blank line would produce [''] and break callers that index
            # into the record by position.
            if not line:
                continue
            x.append(line.split(','))
    return x

def get_guess_passwdandNormal(x):
    """Select POP3 guess_passwd/normal records and build features and labels.

    Args:
        x: List of records, each a list of string fields (field 2 is compared
            with 'pop_3', field 41 with the two label strings).

    Returns:
        A tuple ``(w, y)``: ``w`` is a list of float feature rows built from
        fields 0, 4-7 and 22-29 of each selected record; ``y`` holds 1 for
        'guess_passwd.' records and 0 for 'normal.' records.
    """
    selected = []
    y = []
    for record in x:
        label = record[41]
        # Keep only records labelled guess_passwd or normal that use POP3.
        if label not in ['guess_passwd.', 'normal.'] or record[2] != 'pop_3':
            continue
        y.append(1 if label == 'guess_passwd.' else 0)
        # Features chosen as relevant to POP3 password guessing, together
        # with the TCP-level features.
        selected.append([record[0]] + record[4:8] + record[22:30])

    # Convert every selected string field to float once selection is done.
    w = [[float(value) for value in row] for row in selected]
    return w, y

if __name__ == '__main__':
    # Load the KDD99 records and keep the POP3 guess_passwd/normal samples.
    v=load_kdd99("/Users/zhanglipeng/Data/kdd99/corrected")
    x,y=get_guess_passwdandNormal(v)
    # Instantiate the decision tree classifier.
    clf = tree.DecisionTreeClassifier()
    # Score it with ten-fold cross-validation.  The parenthesised print
    # produces the same output under Python 2 and also parses under Python 3.
    print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=10))

    # Fit on all samples, export the tree as DOT data and render it to a PDF.
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("/Users/zhanglipeng/Data/iris-dt.pdf")

运行结果：

(base) zhanglipengdeMacBook-Pro:WSaL zhanglipeng$ python TreeFTP.py

/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

"This module will be removed in 0.20.", DeprecationWarning)

[0.90463215 1. 1. 1. 1. 1.

1. 1. 1. 1. ]

可知十折交叉验证中第一折的准确率约为90.5%，其余九折均为100%，平均准确率约为99.0%

2.使用决策树算法检测FTP暴力破解

原代码：

# -*- coding:utf-8 -*-

import re
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
import os
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus


def load_one_flle(filename):
    """Return the first line of ``filename`` with newline characters stripped.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line as a string ('' for an empty file).
    """
    with open(filename) as handle:
        return handle.readline().strip('\n')

def load_adfa_training_files(rootdir):
    """Load every regular file directly inside ``rootdir`` as a sample labelled 0.

    Args:
        rootdir: Directory to scan; sub-directories are not entered.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each file and ``y`` holds a 0 for each of them.
    """
    x=[]
    y=[]
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        x.append(load_one_flle(path))
        y.append(0)
    return x,y

def dirlist(path, allfile):
    """Recursively append the path of every non-directory entry under ``path``.

    Args:
        path: Directory to walk.
        allfile: List that collected paths are appended to (mutated in place).

    Returns:
        The same ``allfile`` list that was passed in.
    """
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        if not os.path.isdir(full):
            allfile.append(full)
            continue
        # Descend into the sub-directory, collecting into the same list.
        dirlist(full, allfile)
    return allfile

def load_adfa_hydra_ftp_files(rootdir):
    """Load the Hydra-FTP attack traces found under ``rootdir`` as samples labelled 1.

    Args:
        rootdir: Directory that is searched recursively with ``dirlist``.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each matching file and ``y`` holds a 1 for each of them.
    """
    # Match only the "Hydra_FTP_<n>/UAD-Hydra-FTP..." tail of each path.  The
    # pattern is deliberately not anchored to a dataset location: a pattern
    # tied to a "../data/ADFA-LD/..." prefix matches nothing when rootdir is
    # given as any other (e.g. absolute) path, so no attack sample is loaded.
    pattern = re.compile(r"Hydra_FTP_\d+[/\\]UAD-Hydra-FTP")
    x=[]
    y=[]
    for filepath in dirlist(rootdir,[]):
        if pattern.search(filepath):
            x.append(load_one_flle(filepath))
            y.append(1)
    return x,y



if __name__ == '__main__':
    # Normal traces are loaded with label 0, Hydra-FTP attack traces with 1.
    x1,y1=load_adfa_training_files("/Users/zhanglipeng/Data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_hydra_ftp_files("/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/")

    x=x1+x2
    y=y1+y2
    # Turn each trace (one line of text per file) into a token-count vector.
    vectorizer = CountVectorizer(min_df=1)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    clf = tree.DecisionTreeClassifier()
    # Ten-fold cross-validation.  The parenthesised print produces the same
    # output under Python 2 and also parses under Python 3.
    print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=10))

    # Fit on all samples, export the tree as DOT data and render it to a PDF.
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("/Users/zhanglipeng/Data/photo/6/ftp.pdf")

报错：

Traceback (most recent call last):

File "TreeFTP2.py", line 4, in <module>

import matplotlib.pyplot as plt

File "/anaconda2/envs/python27/lib/python2.7/site-packages/matplotlib/pyplot.py", line 115, in <module>

_backend_mod, new_figure_manager, draw_if_interactive, _show = pylab_setup()

File "/anaconda2/envs/python27/lib/python2.7/site-packages/matplotlib/backends/__init__.py", line 62, in pylab_setup

[backend_name], 0)

File "/anaconda2/envs/python27/lib/python2.7/site-packages/matplotlib/backends/backend_macosx.py", line 17, in <module>

from matplotlib.backends import _macosx

RuntimeError: Python is not installed as a framework. The Mac OS X backend will not be able to function correctly if Python is not installed as a framework. See the Python documentation for more information on installing Python as a framework on Mac OS X. Please either reinstall Python as a framework, or try one of the other backends. If you are using (Ana)Conda please install python.app and replace the use of 'python' with 'pythonw'. See 'Working with Matplotlib on OSX' in the Matplotlib FAQ for more information.

把 import matplotlib.pyplot as plt 改成下面的写法（在导入 pyplot 之前先指定 TkAgg 后端）：

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

出现新的报错：

(python27) zhanglipengdeMacBook-Pro:WSaL zhanglipeng$ python TreeFTP2.py

Traceback (most recent call last):

File "TreeFTP2.py", line 9, in <module>

from sklearn import cross_validation

ImportError: cannot import name cross_validation

解决方法可以看：https://blog.csdn.net/sinat_17697111/article/details/84835873

出现新的报错：

(python27) zhanglipengdeMacBook-Pro:WSaL zhanglipeng$ python TreeFTP2.py

Traceback (most recent call last):

File "TreeFTP2.py", line 13, in <module>

import pydotplus

ImportError: No module named pydotplus

使用 conda 命令安装 pydotplus 就可以了（例如：conda install -c conda-forge pydotplus）。

最后的代码为：

# -*- coding:utf-8 -*-

import re
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import os
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus


def load_one_flle(filename):
    """Return the first line of ``filename`` with newline characters stripped.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line as a string ('' for an empty file).
    """
    with open(filename) as handle:
        first_line = handle.readline()
    return first_line.strip('\n')

def load_adfa_training_files(rootdir):
    """Load every regular file directly inside ``rootdir`` as a sample labelled 0.

    Args:
        rootdir: Directory to scan; sub-directories are not entered.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each file and ``y`` holds a 0 for each of them.
    """
    x=[]
    y=[]
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        x.append(load_one_flle(path))
        y.append(0)
    return x,y

def dirlist(path, allfile):
    """Recursively append the path of every non-directory entry under ``path``.

    Args:
        path: Directory to walk.
        allfile: List that collected paths are appended to (mutated in place).

    Returns:
        The same ``allfile`` list that was passed in.
    """
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        if not os.path.isdir(full):
            allfile.append(full)
            continue
        # Descend into the sub-directory, collecting into the same list.
        dirlist(full, allfile)
    return allfile

def load_adfa_hydra_ftp_files(rootdir):
    """Load the Hydra-FTP attack traces found under ``rootdir`` as samples labelled 1.

    Args:
        rootdir: Directory that is searched recursively with ``dirlist``.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each matching file and ``y`` holds a 1 for each of them.
    """
    # Match only the "Hydra_FTP_<n>/UAD-Hydra-FTP..." tail of each path.  The
    # pattern is deliberately not anchored to a dataset location: a pattern
    # tied to a "../data/ADFA-LD/..." prefix matches nothing when rootdir is
    # given as any other (e.g. absolute) path, so no attack sample is loaded.
    pattern = re.compile(r"Hydra_FTP_\d+[/\\]UAD-Hydra-FTP")
    x=[]
    y=[]
    for filepath in dirlist(rootdir,[]):
        if pattern.search(filepath):
            x.append(load_one_flle(filepath))
            y.append(1)
    return x,y



if __name__ == '__main__':
    # Normal traces are loaded with label 0, Hydra-FTP attack traces with 1.
    x1,y1=load_adfa_training_files("/Users/zhanglipeng/Data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_hydra_ftp_files("/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/")

    x=x1+x2
    y=y1+y2
    # Turn each trace (one line of text per file) into a token-count vector.
    vectorizer = CountVectorizer(min_df=1)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    clf = tree.DecisionTreeClassifier()
    # Ten-fold cross-validation.  The parenthesised print produces the same
    # output under Python 2 and also parses under Python 3.
    print(cross_val_score(clf, x, y, n_jobs=-1, cv=10))

    # Fit on all samples, export the tree as DOT data and render it to a PDF.
    clf = clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("/Users/zhanglipeng/Data/ftp.pdf")

3.使用随机森林算法检测FTP暴力破解

# -*- coding:utf-8 -*-

import re
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import os
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def load_one_flle(filename):
    """Return the first line of ``filename`` with newline characters stripped.

    Args:
        filename: Path of the text file to read.

    Returns:
        The first line as a string ('' for an empty file).
    """
    with open(filename) as handle:
        return handle.readline().strip('\n')

def load_adfa_training_files(rootdir):
    """Load every regular file directly inside ``rootdir`` as a sample labelled 0.

    Args:
        rootdir: Directory to scan; sub-directories are not entered.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each file and ``y`` holds a 0 for each of them.
    """
    x=[]
    y=[]
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        x.append(load_one_flle(path))
        y.append(0)
    return x,y

def dirlist(path, allfile):
    """Recursively append the path of every non-directory entry under ``path``.

    Args:
        path: Directory to walk.
        allfile: List that collected paths are appended to (mutated in place).

    Returns:
        The same ``allfile`` list that was passed in.
    """
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        if not os.path.isdir(full):
            allfile.append(full)
            continue
        # Descend into the sub-directory, collecting into the same list.
        dirlist(full, allfile)
    return allfile

def load_adfa_hydra_ftp_files(rootdir):
    """Load the Hydra-FTP attack traces found under ``rootdir`` as samples labelled 1.

    Args:
        rootdir: Directory that is searched recursively with ``dirlist``.

    Returns:
        A tuple ``(x, y)``: ``x`` holds what ``load_one_flle`` returns for
        each matching file and ``y`` holds a 1 for each of them.
    """
    # Match only the "Hydra_FTP_<n>/UAD-Hydra-FTP..." tail of each path so the
    # function works for whatever rootdir is passed in, rather than for one
    # hard-coded absolute dataset location.
    pattern = re.compile(r"Hydra_FTP_\d+[/\\]UAD-Hydra-FTP")
    x=[]
    y=[]
    for filepath in dirlist(rootdir,[]):
        if pattern.search(filepath):
            x.append(load_one_flle(filepath))
            y.append(1)
    return x,y



if __name__ == '__main__':

    # Normal traces are loaded with label 0, Hydra-FTP attack traces with 1.
    x1,y1=load_adfa_training_files("/Users/zhanglipeng/Data/ADFA-LD/Training_Data_Master/")
    x2,y2=load_adfa_hydra_ftp_files("/Users/zhanglipeng/Data/ADFA-LD/Attack_Data_Master/")

    x=x1+x2
    y=y1+y2
    # Turn each trace (one line of text per file) into a token-count vector.
    vectorizer = CountVectorizer(min_df=1)
    x=vectorizer.fit_transform(x)
    x=x.toarray()
    # Mean ten-fold cross-validation score of a single decision tree.  The
    # parenthesised print produces the same output under Python 2 and also
    # parses under Python 3.
    clf1 = tree.DecisionTreeClassifier()
    score=cross_val_score(clf1, x, y, n_jobs=-1, cv=10)
    print(np.mean(score))
    # Mean ten-fold cross-validation score of a ten-tree random forest.
    clf2 = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
    score=cross_val_score(clf2, x, y, n_jobs=-1, cv=10)
    print(np.mean(score))

和上面的程序一样，输出结果（原文中用红色标出）的同时也出现了警告（是 RuntimeWarning 而非报错，程序仍正常输出结果）：

/anaconda2/envs/python27/lib/python2.7/site-packages/sklearn/externals/joblib/externals/loky/backend/semlock.py:217: RuntimeWarning: semaphore are broken on OSX, release might increase its maximal value

"increase its maximal value", RuntimeWarning)

0.9678470847084707

"increase its maximal value", RuntimeWarning)

0.9838684868486848

"increase its maximal value", RuntimeWarning)