9.4节通过SVM算法识别DGA域名时使用了HMM特征。本小节详细讲解如何使用隐马尔可夫模型(HMM)识别DGA域名。
1、白名单
def load_alexa(filename):
    """Load whitelist domains from an Alexa-style CSV file.

    The domain is taken from the second column (index 1) of each row and is
    kept only when it has at least MIN_LEN characters.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of domain strings, in file order.
    """
    domain_list = []
    # The context manager guarantees the file handle is closed, even if
    # parsing raises part-way through.
    with open(filename) as csv_file:
        for row in csv.reader(csv_file):
            # Blank or single-column lines carry no domain field; skip them
            # rather than failing with IndexError on row[1].
            if len(row) < 2:
                continue
            domain = row[1]
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list
# Load the whitelist (benign) domains; they are later passed to train_hmm().
domain_list = load_alexa("../data/top-1000.csv")
2、训练HMM模型
首先使用正常域名训练HMM,代码如下所示。为了节省训练时间,如果已存在训练好的模型,则直接加载该模型即可。
def train_hmm(domain_list):
    """Train a Gaussian HMM on the given domains and persist it to FILE_MODEL.

    Each domain is converted to an observation sequence with domain2ver().
    All sequences are stacked into a single array and fitted together; the
    per-domain lengths tell the model where each sequence starts and ends.

    Args:
        domain_list: Iterable of domain strings used as training data.

    Returns:
        The fitted hmm.GaussianHMM instance (also dumped to FILE_MODEL).

    Raises:
        ValueError: If domain_list yields no domains.
    """
    # NOTE(review): assumes domain2ver() returns a 2-D sequence with one
    # column per observation (e.g. shape (n_chars, 1)) — confirm.
    sequences = [np.array(domain2ver(domain)) for domain in domain_list]
    if not sequences:
        raise ValueError("domain_list is empty; nothing to train on")

    # Concatenate once instead of growing the array inside the loop (which
    # copies the whole array on every iteration).  No dummy [[0]] seed row is
    # used, so the model is fitted only on real domain observations.
    X = np.concatenate(sequences)
    X_lens = [len(seq) for seq in sequences]

    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X, X_lens)
    joblib.dump(remodel, FILE_MODEL)
    return remodel
# Train only when no saved model exists; otherwise reuse the one on disk.
if not os.path.exists(FILE_MODEL):
    # train_hmm() returns the fitted model and also dumps it to FILE_MODEL,
    # so there is no need to read the file back right after writing it.
    remodel = train_hmm(domain_list)
else:
    # NOTE(review): joblib.load unpickles the file — only load model files
    # that come from a trusted location.
    remodel = joblib.load(FILE_MODEL)
3、验证HMM模型
def test_dga(remodel, filename):
    """Score every domain returned by load_dga(filename) with the model.

    Args:
        remodel: Model object exposing a score() method.
        filename: Path handed to load_dga().

    Returns:
        A pair (x, y): x holds each domain's length and y the matching
        value returned by remodel.score(), in the order the domains load.
    """
    lengths = []
    scores = []
    for name in load_dga(filename):
        # Convert the domain to its observation sequence before scoring.
        observations = np.array(domain2ver(name))
        lengths.append(len(name))
        scores.append(remodel.score(observations))
    return lengths, scores
def test_alexa(remodel, filename):
    """Score every domain returned by load_alexa(filename) with the model.

    Args:
        remodel: Model object exposing a score() method.
        filename: Path handed to load_alexa().

    Returns:
        A pair (x, y): x holds each domain's length and y the matching
        value returned by remodel.score(), in the order the domains load.
    """
    lengths = []
    scores = []
    for name in load_alexa(filename):
        # Convert the domain to its observation sequence before scoring.
        observations = np.array(domain2ver(name))
        lengths.append(len(name))
        scores.append(remodel.score(observations))
    return lengths, scores
# Score the two DGA sample files and the Alexa test file with the model.
# Each call returns (domain lengths, scores); the numeric suffix only
# distinguishes the three data sets.
x_3,y_3=test_dga(remodel, "../data/dga-post-tovar-goz-1000.txt")
x_2,y_2=test_dga(remodel,"../data/dga-cryptolocke-1000.txt")
x_1,y_1=test_alexa(remodel, "../data/test-top-1000.csv")
4、可视化HMM
计算两类僵尸网络DGA域名(蓝色和绿色)的HMM分数,以域名长度为横轴、HMM分数为纵轴作图,可视化的源码如下所示。
# Scatter plot: domain length on the x axis, HMM score on the y axis.
fig, ax = plt.subplots()
ax.set(xlabel='Domain Length', ylabel='HMM Score')

# One series per DGA sample set: blue, then green.
ax.scatter(x_3, y_3, color='b', label="dga_post-tovar-goz")
ax.scatter(x_2, y_2, color='g', label="dga_cryptolock")
# The Alexa series (red) is left disabled in this figure:
#ax.scatter(x_1, y_1, color='r', label="alexa")

ax.legend(loc='right')
plt.show()
可视化运行结果如下: