支持向量机区分僵尸网络DGA

僵尸网络为了躲避域名黑名单,一般会使用DGA(域名生成算法)动态生成域名;通过DGA域名的不同特征,可以识别不同的DGA家族。

DGA文件格式如下:


首先从DGA文件中提取域名数据

def load_alexa(filename):
    """Load domain names from a CSV file.

    The domain is read from the second column (``row[1]``) of every row;
    domains shorter than ``MIN_LEN`` characters are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the domains that passed the length filter, in file order.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the bare open() used before left it to the garbage collector.
    with open(filename) as csv_file:
        return [
            row[1]
            for row in csv.reader(csv_file)
            if len(row[1]) >= MIN_LEN
        ]
def load_dga(filename):
    """Read domain names from a comma-separated text file.

    The text before the first comma of each line is taken as the domain;
    entries shorter than ``MIN_LEN`` characters are dropped.

    Returns:
        A list of the retained domains, in file order.
    """
    with open(filename) as dga_file:
        first_fields = [record.split(',')[0] for record in dga_file]
    return [name for name in first_fields if len(name) >= MIN_LEN]

接下来是进行特征提取

1、元音字母个数

人们在取正常域名的时候,会偏向好读的字母组合,所以英文元音字母的比例会比较高;而DGA域名带有随机因素,因此这个特征不明显。从图中可以看出,不同家族之间具有明显的聚合效果,其中alexa为正常域名。


def get_aeiou(domain_list):
    """Compute the vowel ratio of every domain.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the fraction of its characters that are one of
        ``a``, ``e``, ``i``, ``o``, ``u`` (case-insensitive). An empty
        domain yields a ratio of 0.0.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        x.append(length)
        if length == 0:
            # Avoid ZeroDivisionError while keeping x and y aligned.
            y.append(0.0)
            continue
        vowels = len(re.findall(r'[aeiou]', domain.lower()))
        y.append(float(vowels) / length)
    return x, y

2、去重后的字母个数与域名长度的比例

这个反映了域名字符组成的统计特征。


def get_uniq_char_num(domain_list):
    """Compute the ratio of distinct characters to length for every domain.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the number of distinct characters divided by that
        length. An empty domain yields a ratio of 0.0.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        x.append(length)
        if length == 0:
            # Avoid ZeroDivisionError while keeping x and y aligned.
            y.append(0.0)
            continue
        y.append(float(len(set(domain))) / length)
    return x, y

3、平均Jaccard系数(代码中的函数名拼写为jarccard)

Jaccard系数定义为两个集合的交集与并集元素个数的比值,这里基于域名的2-gram集合计算。


def count2string_jarccard_index(a,b):
    """Return the Jaccard index of two strings over their character 2-grams.

    Each string is padded with one space on both sides before its 2-grams
    are collected, so the first and last characters contribute a boundary
    2-gram. The result is ``|A & B| / |A | B|`` for the two 2-gram sets.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in ``[0.0, 1.0]``; 1.0 when both 2-gram sets are equal.
    """
    def bigrams(text):
        # Padding also makes the empty string well defined (one 2-gram of
        # two spaces) instead of raising IndexError.
        padded = ' ' + text + ' '
        return set(padded[i:i + 2] for i in range(len(padded) - 1))

    x = bigrams(a)
    y = bigrams(b)
    # Jaccard is intersection over union; the previous code divided the set
    # difference by the union, and seeded the sets with single characters
    # instead of the leading 2-gram.
    return float(len(x & y)) / len(x | y)
def get_jarccard_index(a_list,b_list):
    """Average the pairwise index of each string in a_list against b_list.

    For every string in ``a_list`` the values returned by
    ``count2string_jarccard_index`` against each entry of ``b_list`` are
    summed and divided by ``len(b_list)``.

    Returns:
        A tuple of two parallel lists: the string lengths and the averages.
    """
    points = []
    for candidate in a_list:
        total = 0.0
        for reference in b_list:
            total = total + count2string_jarccard_index(candidate, reference)
        points.append((len(candidate), total / len(b_list)))
    lengths = [point[0] for point in points]
    averages = [point[1] for point in points]
    return lengths, averages

4、HMM系数

人们在取正常域名的时候,会偏向选取常见单词的组合;以常见的英文单词训练HMM后,正常域名的HMM系数较高,而DGA生成的域名是随机的,HMM系数偏低。


def train_hmm(domain_list):
    """Fit a Gaussian HMM on the given domains and save it to disk.

    Every domain is converted with ``domain2ver``; the resulting
    observation arrays are stacked into one array and fitted together with
    the list of per-sequence lengths. The fitted model is written to
    ``FILE_MODEL`` with ``joblib.dump``.

    Args:
        domain_list: Iterable of domain strings used as training sequences.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance.
    """
    # NOTE(review): the leading [[0]] observation (a sequence of length 1)
    # is kept from the original code so the fitted model does not change;
    # it looks like a seed for the concatenation rather than real data --
    # confirm before removing it.
    chunks = [np.array([[0]])]
    X_lens = [1]
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        chunks.append(np_ver)
        X_lens.append(len(np_ver))
    # Concatenate once: re-concatenating the growing array for every domain
    # copied all previous rows each time (quadratic in the total length).
    X = np.concatenate(chunks)
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X,X_lens)
    joblib.dump(remodel, FILE_MODEL)
    return remodel
def test_dga(remodel,filename):
    """Score every domain loaded by ``load_dga`` with a trained model.

    Args:
        remodel: Model object exposing ``score(observations)``.
        filename: Path passed to ``load_dga``.

    Returns:
        A tuple of two parallel lists: domain lengths and the values
        returned by ``remodel.score`` for each domain.
    """
    lengths = []
    scores = []
    for name in load_dga(filename):
        observation = np.array(domain2ver(name))
        score = remodel.score(observation)
        lengths.append(len(name))
        scores.append(score)
    return lengths, scores

完整代码:

import sys
import urllib
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import nltk
import csv
import matplotlib.pyplot as plt
import os
# Minimum domain length; load_alexa and load_dga skip shorter domains
MIN_LEN = 10
# Number of HMM states (passed as n_components)
N = 8
# Maximum-likelihood probability threshold (not referenced in the visible code)
T = -50
# Model file name
FILE_MODEL = "svm_study.m"

def load_alexa(filename):
    """Load domain names from a CSV file.

    The domain is read from the second column (``row[1]``) of every row;
    domains shorter than ``MIN_LEN`` characters are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the domains that passed the length filter, in file order.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the bare open() used before left it to the garbage collector.
    with open(filename) as csv_file:
        return [
            row[1]
            for row in csv.reader(csv_file)
            if len(row[1]) >= MIN_LEN
        ]
def load_dga(filename):
    """Read domain names from a comma-separated text file.

    The text before the first comma of each line is taken as the domain;
    entries shorter than ``MIN_LEN`` characters are dropped.

    Returns:
        A list of the retained domains, in file order.
    """
    with open(filename) as dga_file:
        first_fields = [record.split(',')[0] for record in dga_file]
    return [name for name in first_fields if len(name) >= MIN_LEN]
def domain2ver(domain):
    """Turn a string into a list of one-element lists of character codes.

    Each character becomes ``[ord(character)]``, so the result has one row
    per character of ``domain``.
    """
    return [[ord(character)] for character in domain]
def train_hmm(domain_list):
    """Fit a Gaussian HMM on the given domains and save it to disk.

    Every domain is converted with ``domain2ver``; the resulting
    observation arrays are stacked into one array and fitted together with
    the list of per-sequence lengths. The fitted model is written to
    ``FILE_MODEL`` with ``joblib.dump``.

    Args:
        domain_list: Iterable of domain strings used as training sequences.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance.
    """
    # NOTE(review): the leading [[0]] observation (a sequence of length 1)
    # is kept from the original code so the fitted model does not change;
    # it looks like a seed for the concatenation rather than real data --
    # confirm before removing it.
    chunks = [np.array([[0]])]
    X_lens = [1]
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        chunks.append(np_ver)
        X_lens.append(len(np_ver))
    # Concatenate once: re-concatenating the growing array for every domain
    # copied all previous rows each time (quadratic in the total length).
    X = np.concatenate(chunks)
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X,X_lens)
    joblib.dump(remodel, FILE_MODEL)
    return remodel
def test_dga(remodel,filename):
    """Score every domain loaded by ``load_dga`` with a trained model.

    Args:
        remodel: Model object exposing ``score(observations)``.
        filename: Path passed to ``load_dga``.

    Returns:
        A tuple of two parallel lists: domain lengths and the values
        returned by ``remodel.score`` for each domain.
    """
    lengths = []
    scores = []
    for name in load_dga(filename):
        observation = np.array(domain2ver(name))
        score = remodel.score(observation)
        lengths.append(len(name))
        scores.append(score)
    return lengths, scores
def test_alexa(remodel,filename):
    """Score every domain loaded by ``load_alexa`` with a trained model.

    Args:
        remodel: Model object exposing ``score(observations)``.
        filename: Path passed to ``load_alexa``.

    Returns:
        A tuple of two parallel lists: domain lengths and the values
        returned by ``remodel.score`` for each domain.
    """
    lengths = []
    scores = []
    for name in load_alexa(filename):
        observation = np.array(domain2ver(name))
        score = remodel.score(observation)
        lengths.append(len(name))
        scores.append(score)
    return lengths, scores
def show_hmm():
    """Scatter-plot HMM scores against domain length for three data sets.

    The model is trained from the top-1000 list only when ``FILE_MODEL``
    does not exist yet; in either case it is then loaded from that file
    before the two DGA files and the test list are scored.
    """
    training_domains = load_alexa("G:/data/top-1000.csv")
    model_is_cached = os.path.exists(FILE_MODEL)
    if not model_is_cached:
        train_hmm(training_domains)
    remodel = joblib.load(FILE_MODEL)

    goz_points = test_dga(remodel, "G:/data/dga-post-tovar-goz-1000.txt")
    cryptolocker_points = test_dga(remodel,"G:/data/dga-cryptolocke-1000.txt")
    alexa_points = test_alexa(remodel, "G:/data/test-top-1000.csv")

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    # Drawing order matches the legend order: goz, cryptolocker, alexa.
    series = (
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    )
    for (xs, ys), colour, label, marker in series:
        ax.scatter(xs, ys, color=colour, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()
def get_aeiou(domain_list):
    """Compute the vowel ratio of every domain.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the fraction of its characters that are one of
        ``a``, ``e``, ``i``, ``o``, ``u`` (case-insensitive). An empty
        domain yields a ratio of 0.0.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        x.append(length)
        if length == 0:
            # Avoid ZeroDivisionError while keeping x and y aligned.
            y.append(0.0)
            continue
        vowels = len(re.findall(r'[aeiou]', domain.lower()))
        y.append(float(vowels) / length)
    return x, y
def show_aeiou():
    """Scatter-plot the vowel ratio against domain length for three data sets.

    The top-1000 list and the two DGA files are loaded and passed through
    ``get_aeiou``; the three point clouds are drawn on one axes.
    """
    alexa_points = get_aeiou(load_alexa("G:/data/top-1000.csv"))
    cryptolocker_points = get_aeiou(load_dga("G:/data/dga-cryptolocke-1000.txt"))
    goz_points = get_aeiou(load_dga("G:/data/dga-post-tovar-goz-1000.txt"))

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('AEIOU Score')
    # Drawing order matches the legend order: goz, cryptolocker, alexa.
    series = (
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    )
    for (xs, ys), colour, label, marker in series:
        ax.scatter(xs, ys, color=colour, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()

def get_uniq_char_num(domain_list):
    """Compute the ratio of distinct characters to length for every domain.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the number of distinct characters divided by that
        length. An empty domain yields a ratio of 0.0.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        x.append(length)
        if length == 0:
            # Avoid ZeroDivisionError while keeping x and y aligned.
            y.append(0.0)
            continue
        y.append(float(len(set(domain))) / length)
    return x, y

def show_uniq_char_num():
    """Scatter-plot the distinct-character ratio against domain length.

    The top-1000 list and the two DGA files are loaded and passed through
    ``get_uniq_char_num``; the three point clouds are drawn on one axes.
    """
    alexa_points = get_uniq_char_num(load_alexa("G:/data/top-1000.csv"))
    cryptolocker_points = get_uniq_char_num(load_dga("G:/data/dga-cryptolocke-1000.txt"))
    goz_points = get_uniq_char_num(load_dga("G:/data/dga-post-tovar-goz-1000.txt"))

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('UNIQ CHAR NUMBER')
    # Drawing order matches the legend order: goz, cryptolocker, alexa.
    series = (
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    )
    for (xs, ys), colour, label, marker in series:
        ax.scatter(xs, ys, color=colour, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()
def count2string_jarccard_index(a,b):
    """Return the Jaccard index of two strings over their character 2-grams.

    Each string is padded with one space on both sides before its 2-grams
    are collected, so the first and last characters contribute a boundary
    2-gram. The result is ``|A & B| / |A | B|`` for the two 2-gram sets.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in ``[0.0, 1.0]``; 1.0 when both 2-gram sets are equal.
    """
    def bigrams(text):
        # Padding also makes the empty string well defined (one 2-gram of
        # two spaces) instead of raising IndexError.
        padded = ' ' + text + ' '
        return set(padded[i:i + 2] for i in range(len(padded) - 1))

    x = bigrams(a)
    y = bigrams(b)
    # Jaccard is intersection over union; the previous code divided the set
    # difference by the union, and seeded the sets with single characters
    # instead of the leading 2-gram.
    return float(len(x & y)) / len(x | y)
def get_jarccard_index(a_list,b_list):
    """Average the pairwise index of each string in a_list against b_list.

    For every string in ``a_list`` the values returned by
    ``count2string_jarccard_index`` against each entry of ``b_list`` are
    summed and divided by ``len(b_list)``.

    Returns:
        A tuple of two parallel lists: the string lengths and the averages.
    """
    points = []
    for candidate in a_list:
        total = 0.0
        for reference in b_list:
            total = total + count2string_jarccard_index(candidate, reference)
        points.append((len(candidate), total / len(b_list)))
    lengths = [point[0] for point in points]
    averages = [point[1] for point in points]
    return lengths, averages

def show_jarccard_index():
    """Scatter-plot the averaged 2-gram index against domain length.

    The top-1000 list serves as the reference list for all three data sets
    (including itself); the two DGA files are compared against it.
    """
    reference_domains = load_alexa("G:/data/top-1000.csv")
    alexa_points = get_jarccard_index(reference_domains, reference_domains)
    cryptolocker_domains = load_dga("G:/data/dga-cryptolocke-1000.txt")
    cryptolocker_points = get_jarccard_index(cryptolocker_domains, reference_domains)
    goz_domains = load_dga("G:/data/dga-post-tovar-goz-1000.txt")
    goz_points = get_jarccard_index(goz_domains, reference_domains)

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('JARCCARD INDEX')
    # Drawing order matches the legend order: goz, cryptolocker, alexa.
    series = (
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    )
    for (xs, ys), colour, label, marker in series:
        ax.scatter(xs, ys, color=colour, label=label, marker=marker)
    ax.legend(loc='lower right')
    plt.show()
# Script entry: draw the HMM-score, vowel-ratio, distinct-character-ratio and
# 2-gram-index plots one after another.
show_hmm()
show_aeiou()
show_uniq_char_num()
show_jarccard_index()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值