基于TensorFlow2实现MalConv恶意软件检测

鹏阿鹏

已于 2023-04-15 22:17:03 修改

阅读量327

点赞数

分类专栏：机器学习与深度学习、TensorFlow2、网络安全
文章标签：tensorflow、python、深度学习

于 2023-04-15 22:14:35 首次发布

本文链接：https://blog.csdn.net/awesomep/article/details/130123842

版权

机器学习与深度学习同时被 3 个专栏收录

18 篇文章 2 订阅

订阅专栏

TensorFlow2

16 篇文章 4 订阅

订阅专栏

网络安全

3 篇文章 0 订阅

订阅专栏

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Conv1D, multiply, GlobalMaxPool1D, Input, Activation
import tensorflow as tf
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
import csv
import os
import struct
from sklearn.model_selection import train_test_split

定义一维卷积神经网络

def Malconv(max_len=2000000, win_size=500, vocab_size=256):
    """Build the gated 1-D convolutional classifier over byte sequences.

    Args:
        max_len: Length of the integer input sequence fed to the model.
        win_size: Used as both kernel size and stride of the two convolutions.
        vocab_size: Number of distinct input values handled by the embedding.

    Returns:
        An uncompiled Keras ``Model`` mapping a ``(max_len,)`` input to a
        single sigmoid output.
    """
    byte_ids = Input((max_len,))
    embedded = Embedding(vocab_size, 8)(byte_ids)

    # Two parallel convolutions over the same embedding: one produces the
    # features, the other produces the gate that scales them.
    conv_options = dict(
        kernel_size=win_size,
        filters=128,
        strides=win_size,
        padding='same',
    )
    features = Conv1D(**conv_options)(embedded)
    gate_input = Conv1D(**conv_options)(embedded)
    gate = Activation('sigmoid', name='sigmoid')(gate_input)

    gated = multiply([features, gate])
    rectified = Activation('relu', name='relu')(gated)

    # Collapse the sequence axis, then classify.
    pooled = GlobalMaxPool1D()(rectified)
    hidden = Dense(64)(pooled)
    prediction = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=byte_ids, outputs=prediction)

定义变量

# Sample locations and per-class caps used when listing files.
# Raw strings keep the Windows backslashes literal: '\m' and '\P' are not
# valid escape sequences and trigger warnings on recent Python versions.
mal_file_dir = r'E:\malware1500'      # directory of malicious PE samples
ben_file_dir = r'E:\PeBenFileForEXE'  # directory of benign PE samples
ben_max_sum = 1000  # maximum number of benign files to list
mal_max_sum = 1000  # maximum number of malicious files to list

遍历PE文件夹中的恶意与良性软件，做好标签存入CSV

def _collect_labeled_files(directory, label, limit, max_file_size):
    """Return up to `limit` (filename, label) rows for small regular files in `directory`."""
    rows = []
    for name in os.listdir(directory):
        if len(rows) >= limit:
            break
        path = os.path.join(directory, name)
        # Sub-directories also report a size, but they cannot be read as
        # samples later, so only regular files are listed.
        if os.path.isfile(path) and os.path.getsize(path) < max_file_size:
            rows.append((name, label))
    return rows


def gen_csv(max_file_size=2000000):
    """Write ``MalAndBenFile.csv`` listing the labelled sample files.

    Files from ``mal_file_dir`` are labelled ``'1'`` and files from
    ``ben_file_dir`` are labelled ``'0'``. At most ``mal_max_sum`` /
    ``ben_max_sum`` files are taken from each directory, and only files
    strictly smaller than ``max_file_size`` bytes are kept.

    Args:
        max_file_size: Exclusive upper bound on the size (in bytes) of a
            file to be listed.

    The CSV is written to the current working directory with the header
    ``filename,label``; malicious rows come first, then benign rows. The
    benign and malicious counts are printed, in that order.
    """
    mal_rows = _collect_labeled_files(mal_file_dir, '1', mal_max_sum, max_file_size)
    ben_rows = _collect_labeled_files(ben_file_dir, '0', ben_max_sum, max_file_size)
    print(len(ben_rows), len(mal_rows))

    headers = ['filename', 'label']
    with open('MalAndBenFile.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(mal_rows + ben_rows)

执行获取CSV

# Scan both sample directories and write the filename/label table to MalAndBenFile.csv.
gen_csv()

字节填充

def fill_list(my_list: list, length, fill=100):
    """Pad `my_list` on the right with `fill` until it holds `length` items.

    A list that already has `length` or more items is returned as-is
    (same object, not truncated); otherwise a new, padded list is returned.
    """
    missing = length - len(my_list)
    if missing <= 0:
        return my_list
    return my_list + [fill] * missing

获取数据

# Load the filename/label table from MalAndBenFile.csv in the working directory.
data = pd.read_csv('./MalAndBenFile.csv')

在这里插入图片描述
PE文件转字节列表

def read_bin(file_name):
    """Read a binary file and return its content as a list of byte values.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list of ints in the range 0-255, one per byte of the file
        (an empty list for an empty file).
    """
    with open(file_name, "rb") as f:
        # Iterating a bytes object already yields ints, so there is no need
        # to build a file-sized struct format string; the `with` block also
        # closes the file on exit.
        return list(f.read())

构造训练与测试数据集

# Turn the label table into a plain array of (filename, label) rows.
data = data.to_numpy()
new_data = []
new_label = []

# Label value paired with the directory that holds files carrying that label.
labelled_dirs = ((0, ben_file_dir), (1, mal_file_dir))

for row in data:
    for label_value, directory in labelled_dirs:
        if row[1] == label_value:
            sample_path = directory + os.sep + row[0]
            # Read the file as byte values and pad it to the fixed input length.
            new_data.append(fill_list(read_bin(sample_path), 2000000))
            new_label.append(label_value)
            break

new_data = np.array(new_data, dtype='int32')
new_label = np.array(new_label, dtype='int32')

# Hold out a quarter of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    new_data,
    new_label,
    test_size=0.25,
)

x_train, x_test, y_train, y_test = (
    tf.cast(part, tf.int32) for part in (x_train, x_test, y_train, y_test)
)

# One-hot encoding of the labels (optional)
# train_labels = tf.keras.utils.to_categorical(train_labels,num_classes=10) 
# test_labels = tf.keras.utils.to_categorical(test_labels,num_classes=10)

# Create the Datasets: shuffled batches for training, plain batches for testing.
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = train_slices.shuffle(2000).batch(32)
test_slices = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_slices.batch(32)

模型训练

# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()
model = Malconv()

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train for 10 epochs, evaluating on the held-out batches after each epoch.
model.fit(dataset, epochs=10, validation_data=test_dataset)

Epoch 1/10
47/47 [==============================] - 321s 7s/step - loss: 0.4718 - acc: 0.7947 - val_loss: 0.2674 - val_acc: 0.8700
Epoch 2/10
47/47 [==============================] - 320s 7s/step - loss: 0.1739 - acc: 0.9353 - val_loss: 0.1817 - val_acc: 0.9260
Epoch 3/10
47/47 [==============================] - 321s 7s/step - loss: 0.0577 - acc: 0.9833 - val_loss: 0.1847 - val_acc: 0.9220
Epoch 4/10
47/47 [==============================] - 322s 7s/step - loss: 0.0191 - acc: 0.9973 - val_loss: 0.2146 - val_acc: 0.9340
Epoch 5/10
47/47 [==============================] - 321s 7s/step - loss: 0.0035 - acc: 1.0000 - val_loss: 0.2066 - val_acc: 0.9280
Epoch 6/10
47/47 [==============================] - 322s 7s/step - loss: 0.0015 - acc: 1.0000 - val_loss: 0.2175 - val_acc: 0.9320
Epoch 7/10
47/47 [==============================] - 320s 7s/step - loss: 9.3740e-04 - acc: 1.0000 - val_loss: 0.2352 - val_acc: 0.9360
Epoch 8/10
47/47 [==============================] - 320s 7s/step - loss: 6.8522e-04 - acc: 1.0000 - val_loss: 0.2359 - val_acc: 0.9360
Epoch 9/10
47/47 [==============================] - 321s 7s/step - loss: 5.1924e-04 - acc: 1.0000 - val_loss: 0.2400 - val_acc: 0.9360
Epoch 10/10
47/47 [==============================] - 321s 7s/step - loss: 4.1028e-04 - acc: 1.0000 - val_loss: 0.2508 - val_acc: 0.9360