DGA(Domain Generation Algorithm)域名生成算法是一种利用随机字符等算法来生成C&C域名,从而逃避安全设备域名黑名单检测的技术手段。
1.黑样本
def load_dga(filename):
    """Load DGA (black-sample) domains from a Bambenek-style feed file."""
    domain_list=[]
    # Feed line format, e.g.:
    #xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,
    # http://osint.bambenekconsulting.com/manual/cl.txt
    with open(filename) as f:
        for line in f:
            # The first comma-separated field is the domain itself.
            domain=line.split(",")[0]
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list
# NOTE(review): this snippet assumes x1_domain_list (the benign Alexa set,
# produced by load_alexa below) is already defined.
x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
x_2,y_2=get_jarccard_index(x2_domain_list,x1_domain_list)
x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")
x_3,y_3=get_jarccard_index(x3_domain_list,x1_domain_list)
2.白样本
def load_alexa(filename):
    """Load benign (white-sample) domains from an Alexa top-sites CSV."""
    domain_list=[]
    csv_reader = csv.reader(open(filename))
    for row in csv_reader:
        # Column 1 holds the domain (column 0 is the rank).
        domain=row[1]
        if len(domain) >= MIN_LEN:
            domain_list.append(domain)
    return domain_list
x1_domain_list = load_alexa("../data/top-1000.csv")
# Jaccard of the benign set against itself serves as the baseline.
x_1,y_1=get_jarccard_index(x1_domain_list,x1_domain_list)
3.特征化
(1) 元音字母个数
show_aeiou()
获取元音字母比例的函数源码如下
def get_aeiou(domain_list):
    """Per domain: length (x) and fraction of vowel characters (y)."""
    x=[]
    y=[]
    for domain in domain_list:
        x.append(len(domain))
        count=len(re.findall(r'[aeiou]',domain.lower()))
        # 0.0+count forces float division.
        count=(0.0+count)/len(domain)
        y.append(count)
    return x,y
计算两类僵尸网络(蓝色和绿色)和正常域名(红色)的数据集的元音字母比例, 以域名长度为横轴,元音字母比例为纵轴作图,并图形展示的源码。
def show_aeiou():
    """Scatter: domain length (x) vs. vowel ratio (y) for the benign set
    (red) and the two DGA families (blue/green)."""
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x_1,y_1=get_aeiou(x1_domain_list)
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x_2,y_2=get_aeiou(x2_domain_list)
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_3,y_3=get_aeiou(x3_domain_list)
    fig,ax=plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('AEIOU Score')
    ax.scatter(x_3,y_3,color='b',label="dga_post-tovar-goz",marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock",marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa",marker='*')
    ax.legend(loc='best')
    plt.show()
运行结果如下所示:
如图所示,dga两个家族之间有明显的聚合效果,正常域名与DGA之间有一定的区分性。
(2)去重后字母数字个数与域名长度的比例
简单来讲,比方说baidu的个数为5,facebook的个数为7(去除了一个重复的o),google的个数为5(去掉了一个重复的g,去掉一个重复的o),代码如下
def get_uniq_char_num(domain_list):
    """Per domain: length (x) and ratio of distinct characters to length (y)."""
    x=[]
    y=[]
    for domain in domain_list:
        x.append(len(domain))
        # set() de-duplicates the characters.
        count=len(set(domain))
        count=(0.0+count)/len(domain)
        y.append(count)
    return x,y
计算两类僵尸网络(蓝色和绿色)和正常域名(红色)的数据集的去重后字母数字个数与域名长度之比, 以域名长度为横轴,去重后字母数字个数与字符长度比例为纵轴作图,并图形展示的源码。
def show_uniq_char_num():
    """Scatter: domain length vs. distinct-character ratio for the benign
    set (red) and the two DGA families (blue/green)."""
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x_1,y_1=get_uniq_char_num(x1_domain_list)
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x_2,y_2=get_uniq_char_num(x2_domain_list)
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_3,y_3=get_uniq_char_num(x3_domain_list)
    fig,ax=plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('UNIQ CHAR NUMBER')
    ax.scatter(x_3,y_3,color='b',label="dga_post-tovar-goz",marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock",marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa",marker='*')
    ax.legend(loc='best')
    plt.show()
运行结果如下所示
如图所示,dga两个家族之间有明显的聚合效果,正常域名与DGA之间有一定的区分性。
(3)平均Jaccard(杰卡德)系数(代码中的标识符拼写为jarccard):
定义为两个集合交集与并集元素个数的比值,本次基于2-gram计算
def get_jarccard_index(a_list,b_list):
    """Per domain in a_list: length (x) and mean 2-gram Jaccard index
    against every domain in b_list (y)."""
    x=[]
    y=[]
    for a in a_list:
        j=0.0
        for b in b_list:
            j+=count2string_jarccard_index(a,b)
        x.append(len(a))
        # Average over all of b_list.
        y.append(j/len(b_list))
    return x,y
计算两类僵尸网络(蓝色和绿色)和正常域名(红色)的数据集与正常的交集与并集之比, 以域名长度为横轴,交集与并集之比例为纵轴作图,并图形展示的源码。
def show_jarccard_index():
    """Scatter: domain length vs. mean Jaccard index against the benign
    set, for benign (red) and the two DGA families (blue/green)."""
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x_1,y_1=get_jarccard_index(x1_domain_list,x1_domain_list)
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x_2,y_2=get_jarccard_index(x2_domain_list,x1_domain_list)
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_3,y_3=get_jarccard_index(x3_domain_list,x1_domain_list)
    fig,ax=plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('JARCCARD INDEX')
    ax.scatter(x_3,y_3,color='b',label="dga_post-tovar-goz",marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock",marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa",marker='*')
    ax.legend(loc='lower right')
    plt.show()
运行结果如下:
(4)HMM系数:
正常人注册域名时偏向选取常见单词的组合,这一规律可以抽象为字符序列的统计模型。因此用常见的正常域名训练HMM模型后,正常域名的HMM打分(对数似然)偏高;僵尸网络DGA域名由于是随机生成的,所以HMM打分偏低。
首先通过正常的域名训练HMM,代码如下所示。为了节省训练时间,如果已存在训练好的模型,则直接load训练好的模型即可。
def train_hmm(domain_list):
    """Train a Gaussian HMM on benign-domain character sequences and
    persist it to FILE_MODEL."""
    # Seed with a dummy length-1 sequence so concatenation has a start.
    X = [[0]]
    X_lens = [1]
    for domain in domain_list:
        ver=domain2ver(domain)
        np_ver = np.array(ver)
        # NOTE(review): concatenating inside the loop is quadratic in the
        # total number of characters.
        X=np.concatenate([X,np_ver])
        X_lens.append(len(np_ver))
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X,X_lens)
    joblib.dump(remodel, FILE_MODEL)
    return remodel
def show_hmm():
    """Load or train the HMM model (the article quotes only the opening
    of this function here; the full definition appears in the complete
    source listing)."""
    domain_list = load_alexa("../data/top-1000.csv")
    if not os.path.exists(FILE_MODEL):
        remodel=train_hmm(domain_list)
    remodel=joblib.load(FILE_MODEL)
计算两类僵尸网络(蓝色和绿色)和正常域名(红色)的HMM, 以域名长度为横轴,HMM分数为纵轴作图,并图形展示的源码处理
def test_dga(remodel,filename):
    """Score each DGA domain in *filename* with the trained HMM; return
    (lengths, log-likelihood scores) for plotting."""
    x = []
    y = []
    dga_cryptolocke_list = load_dga(filename)
    for domain in dga_cryptolocke_list:
        domain_ver = domain2ver(domain)
        np_ver = np.array(domain_ver)
        pro = remodel.score(np_ver)
        x.append(len(domain))
        y.append(pro)
    return x,y
def test_alexa(remodel,filename):
    """Score each benign Alexa domain in *filename* with the trained HMM;
    return (lengths, log-likelihood scores) for plotting."""
    x=[]
    y=[]
    alexa_list = load_alexa(filename)
    for domain in alexa_list:
        domain_ver=domain2ver(domain)
        np_ver = np.array(domain_ver)
        pro = remodel.score(np_ver)
        x.append(len(domain))
        y.append(pro)
    return x, y
def show_hmm():
    """Train/load the HMM and scatter-plot HMM scores vs. domain length
    for the two DGA families (blue/green) against benign domains (red)."""
    domain_list = load_alexa("../data/top-1000.csv")
    if not os.path.exists(FILE_MODEL):
        remodel=train_hmm(domain_list)
    # NOTE(review): the model is re-loaded from disk even right after
    # training; a Python-2-generated 9-2.m here triggers the load error
    # quoted in the article.
    remodel=joblib.load(FILE_MODEL)
    x_3,y_3=test_dga(remodel, "../data/dga-post-tovar-goz-1000.txt")
    x_2,y_2=test_dga(remodel,"../data/dga-cryptolocke-1000.txt")
    x_1,y_1=test_alexa(remodel, "../data/test-top-1000.csv")
    fig,ax=plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_3,y_3,color='b',label="dga_post-tovar-goz",marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock",marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa",marker='*')
    ax.legend(loc='best')
    plt.show()
配套源码报错
C:\ProgramData\Anaconda3\python.exe C:/Users/liujiannan/PycharmProjects/pythonProject/Web安全之机器学习入门/code/9-3.py
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
warnings.warn(msg, category=DeprecationWarning)
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\numpy_pickle.py", line 526, in _unpickle
obj = unpickler.load()
File "C:\ProgramData\Anaconda3\lib\pickle.py", line 1088, in load
dispatch[key[0]](self)
File "C:\ProgramData\Anaconda3\lib\pickle.py", line 1264, in load_short_binstring
self.append(self._decode_string(data))
File "C:\ProgramData\Anaconda3\lib\pickle.py", line 1204, in _decode_string
return value.decode(self.encoding, self.errors)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe8 in position 1: ordinal not in range(128)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/liujiannan/PycharmProjects/pythonProject/Web安全之机器学习入门/code/9-3.py", line 207, in <module>
show_hmm()
File "C:/Users/liujiannan/PycharmProjects/pythonProject/Web安全之机器学习入门/code/9-3.py", line 93, in show_hmm
remodel=joblib.load(FILE_MODEL)
File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\numpy_pickle.py", line 598, in load
obj = _unpickle(fobj, filename, mmap_mode)
File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\numpy_pickle.py", line 541, in _unpickle
raise new_exc
ValueError: You may be trying to read with python 3 a joblib pickle generated with python 2. This feature is not supported by joblib.
该错误是因为配套源码附带的9-2.m模型文件由Python 2序列化生成,Python 3下的joblib无法反序列化。只要删掉配套源码目录中对应的9-2.m文件,让程序重新训练并生成模型即可。
运行结果如下所示
4.完整源码
# -*- coding:utf-8 -*-
import re
from hmmlearn import hmm
import numpy as np
import joblib
import csv
import matplotlib.pyplot as plt
import os
# Minimum length for a domain to be processed at all
MIN_LEN=10
# Number of HMM hidden states
N=8
# Maximum-likelihood (log probability) threshold
T=-50
# File name used to persist the trained HMM model
FILE_MODEL="9-2.m"
def load_alexa(filename, min_len=None):
    """Load benign (white-sample) domains from an Alexa top-sites CSV.

    Each row is ``rank,domain``; column 1 is the domain. Domains shorter
    than *min_len* (default: module-level MIN_LEN) are skipped, matching
    the filter used by load_dga so both sample sets are comparable.

    Returns a list of domain strings in file order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    # Context manager closes the file deterministically; the original
    # opened the file without ever closing the handle.
    with open(filename) as f:
        for row in csv.reader(f):
            domain = row[1]
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list
def domain2ver(domain):
    """Map a domain string to a sequence of single-element ordinal vectors.

    Returns ``[[ord(c)] for each character c]`` — the column-vector layout
    that hmmlearn's fit/score APIs expect for 1-D observations.
    """
    return [[ord(ch)] for ch in domain]
def train_hmm(domain_list):
    """Train a Gaussian HMM on character sequences of benign domains.

    Each domain is converted to a column vector of character ordinals;
    all sequences are stacked into one array with a parallel list of
    per-sequence lengths, as hmmlearn's fit API requires. The fitted
    model is persisted to FILE_MODEL.

    Returns the fitted GaussianHMM instance.
    """
    # Keep the original [[0]] seed sequence (length 1) so the training
    # data layout matches the previous implementation exactly.
    sequences = [np.array([[0]])]
    lengths = [1]
    for domain in domain_list:
        vec = np.array(domain2ver(domain))
        sequences.append(vec)
        lengths.append(len(vec))
    # Concatenate once at the end: the original called np.concatenate
    # inside the loop, which is quadratic in the total character count.
    X = np.concatenate(sequences)
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X, lengths)
    joblib.dump(remodel, FILE_MODEL)
    return remodel
def test_dga(remodel,filename):
    """Score every DGA domain in *filename* with the trained HMM.

    Returns (lengths, scores): domain lengths for the x axis and the
    model's log-likelihood per domain for the y axis.
    """
    lengths = []
    scores = []
    for domain in load_dga(filename):
        observation = np.array(domain2ver(domain))
        likelihood = remodel.score(observation)
        lengths.append(len(domain))
        scores.append(likelihood)
    return lengths, scores
def load_dga(filename, min_len=None):
    """Load DGA (black-sample) domains from a Bambenek-style feed file.

    Each line looks like:
      xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,
    (http://osint.bambenekconsulting.com/manual/cl.txt)

    The first comma-separated field is the domain. Domains shorter than
    *min_len* (default: module-level MIN_LEN) are skipped.

    Returns a list of domain strings in file order.
    """
    if min_len is None:
        min_len = MIN_LEN
    domain_list = []
    with open(filename) as f:
        for line in f:
            # strip() guards against the trailing newline leaking into the
            # domain when a line contains no comma.
            domain = line.strip().split(",")[0]
            if len(domain) >= min_len:
                domain_list.append(domain)
    return domain_list
def test_alexa(remodel,filename):
    """Score every benign Alexa domain in *filename* with the trained HMM.

    Returns (lengths, scores): domain lengths for the x axis and the
    model's log-likelihood per domain for the y axis.
    """
    lengths = []
    scores = []
    for domain in load_alexa(filename):
        observation = np.array(domain2ver(domain))
        likelihood = remodel.score(observation)
        #print "SCORE:(%d) DOMAIN:(%s) " % (pro, domain)
        lengths.append(len(domain))
        scores.append(likelihood)
    return lengths, scores
def show_hmm():
    """Scatter-plot HMM log-likelihood vs. domain length for two DGA
    families (blue/green) against benign Alexa domains (red).

    Loads the persisted model from FILE_MODEL when available; otherwise
    trains one from the benign top-1000 set first.
    """
    if os.path.exists(FILE_MODEL):
        # Reuse the persisted model; the original re-loaded from disk
        # even immediately after training, and loaded the Alexa list
        # even when it was not needed.
        remodel = joblib.load(FILE_MODEL)
    else:
        remodel = train_hmm(load_alexa("../data/top-1000.csv"))
    x_3, y_3 = test_dga(remodel, "../data/dga-post-tovar-goz-1000.txt")
    x_2, y_2 = test_dga(remodel, "../data/dga-cryptolocke-1000.txt")
    x_1, y_1 = test_alexa(remodel, "../data/test-top-1000.csv")
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz", marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock", marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa", marker='*')
    ax.legend(loc='best')
    plt.show()
def get_aeiou(domain_list):
    """Per domain: (length, fraction of characters that are vowels).

    Returns two parallel lists: domain lengths (x) and the ratio of
    a/e/i/o/u characters (case-insensitive) to the domain length (y).
    """
    lengths = []
    ratios = []
    vowel_pattern = re.compile(r'[aeiou]')
    for domain in domain_list:
        lengths.append(len(domain))
        vowel_count = len(vowel_pattern.findall(domain.lower()))
        ratios.append(float(vowel_count) / len(domain))
    return lengths, ratios
def show_aeiou():
    """Plot vowel ratio vs. domain length: benign Alexa domains in red,
    the two DGA families in blue and green."""
    benign = load_alexa("../data/top-1000.csv")
    cryptolocker = load_dga("../data/dga-cryptolocke-1000.txt")
    goz = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_1, y_1 = get_aeiou(benign)
    x_2, y_2 = get_aeiou(cryptolocker)
    x_3, y_3 = get_aeiou(goz)
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('AEIOU Score')
    ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz", marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock", marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa", marker='*')
    ax.legend(loc='best')
    plt.show()
def get_uniq_char_num(domain_list):
    """Per domain: (length, ratio of distinct characters to length).

    Returns two parallel lists: domain lengths (x) and the number of
    unique characters divided by the domain length (y).
    """
    lengths = []
    ratios = []
    for domain in domain_list:
        lengths.append(len(domain))
        ratios.append(float(len(set(domain))) / len(domain))
    return lengths, ratios
def show_uniq_char_num():
    """Plot distinct-character ratio vs. domain length: benign Alexa
    domains in red, the two DGA families in blue and green."""
    benign = load_alexa("../data/top-1000.csv")
    cryptolocker = load_dga("../data/dga-cryptolocke-1000.txt")
    goz = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_1, y_1 = get_uniq_char_num(benign)
    x_2, y_2 = get_uniq_char_num(cryptolocker)
    x_3, y_3 = get_uniq_char_num(goz)
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('UNIQ CHAR NUMBER')
    ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz", marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock", marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa", marker='*')
    ax.legend(loc='best')
    plt.show()
def _bigram_set(s):
    """2-gram set of *s*, padded with a leading and trailing space so the
    first and last characters form boundary bigrams.

    Raises IndexError for an empty string (as the original code did).
    """
    grams = {' ' + s[0], s[-1] + ' '}
    for i in range(len(s) - 1):
        grams.add(s[i] + s[i + 1])
    return grams

def count2string_jarccard_index(a, b):
    """Return the Jaccard index of the 2-gram sets of strings *a* and *b*.

    The Jaccard index is |X ∩ Y| / |X ∪ Y|. The original implementation
    returned len(x - y) / len(x | y) — a set *difference* in the
    numerator — which contradicts the metric described in the text
    (intersection over union) and scored identical strings as 0.0.
    """
    x = _bigram_set(a)
    y = _bigram_set(b)
    # Intersection over union, as the Jaccard index is defined.
    return float(len(x & y)) / len(x | y)
def get_jarccard_index(a_list, b_list):
    """For each domain in *a_list*: (length, mean 2-gram Jaccard index
    against every domain in *b_list*).

    Returns two parallel lists: domain lengths (x) and the averaged
    pairwise index values (y).
    """
    lengths = []
    means = []
    for candidate in a_list:
        total = sum(count2string_jarccard_index(candidate, other)
                    for other in b_list)
        lengths.append(len(candidate))
        means.append(total / len(b_list))
    return lengths, means
def show_jarccard_index():
    """Plot mean Jaccard index against the benign set vs. domain length:
    benign Alexa domains in red, the two DGA families in blue and green."""
    benign = load_alexa("../data/top-1000.csv")
    cryptolocker = load_dga("../data/dga-cryptolocke-1000.txt")
    goz = load_dga("../data/dga-post-tovar-goz-1000.txt")
    x_1, y_1 = get_jarccard_index(benign, benign)
    x_2, y_2 = get_jarccard_index(cryptolocker, benign)
    x_3, y_3 = get_jarccard_index(goz, benign)
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('JARCCARD INDEX')
    ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz", marker='o')
    ax.scatter(x_2, y_2, color='g', label="dga_cryptolock", marker='v')
    ax.scatter(x_1, y_1, color='r', label="alexa", marker='*')
    ax.legend(loc='lower right')
    plt.show()
if __name__ == '__main__':
    # Render each feature's scatter plot in turn; each plt.show() call
    # blocks until its window is closed.
    show_hmm()
    show_aeiou()
    show_uniq_char_num()
    show_jarccard_index()