Andrew Ng DeepLearning Part 2 Assignment, Week 2. Optimizer design: mini-batch, momentum gradient descent, and the Adam optimization algorithm

This post works through three optimization approaches for deep learning: batch gradient descent, momentum gradient descent, and Adam. Each is implemented by hand and compared on training time and accuracy. In the experiments, Adam converges fastest, followed by momentum gradient descent, with batch gradient descent slowest. Splitting the data into mini-batches gives up some of the benefit of vectorization, but improves the training result.

This assignment is fairly straightforward; just implement the formulas as given:

First, import the packages, load the dataset, and take a look at it:

import numpy as np
import matplotlib.pyplot as plt
import opt_utils
import time

x_train, y_train = opt_utils.load_dataset()
# print(x_train.shape)
# print(y_train.shape)

Build the neural network:

def init(x):
    first_num = 9
    second_num = 5
    third_num = 2
    np.random.seed(5)
    w1 = np.random.randn(first_num, x.shape[0]) * np.sqrt(2 / x.shape[0])
    b1 = np.zeros((first_num, 1))
    w2 = np.random.randn(second_num, first_num) * np.sqrt(2 / first_num)
    b2 = np.zeros((second_num, 1))
    w3 = np.random.randn(third_num, second_num) * np.sqrt(2 / second_num)
    b3 = np.zeros((third_num, 1))
    w4 = np.random.randn(1, third_num) * np.sqrt(2 / third_num)
    b4 = np.zeros((1, 1))
    ini_param = {
        "w1": w1,
        "b1": b1,
        "w2": w2,
        "b2": b2,
        "w3": w3,
        "b3": b3,
        "w4": w4,
        "b4": b4
    }
    return ini_param
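
Note that each weight matrix is scaled by np.sqrt(2 / n_prev), i.e. He initialization, which keeps the variance of the activations roughly stable across the ReLU layers; the biases start at zero.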


# Forward propagation
def cal_z(w, a, b):  # a: (units in previous layer, m)   w: (units in this layer, units in previous layer)
    return np.dot(w, a) + b


def cal_sigma(z):  # z: (units in this layer, m)
    return 1 / (1 + np.exp(-z))


def cal_tan_h(z):  # z: (units in this layer, m); not actually used in forward_f below
    return np.tanh(z)


def cal_relu(z):  # z: (units in this layer, m)
    return np.maximum(0, z)


def forward_f(x, p):
    z1 = cal_z(p["w1"], x, p["b1"])
    a1 = cal_relu(z1)
    z2 = cal_z(p["w2"], a1, p["b2"])
    a2 = cal_relu(z2)
    z3 = cal_z(p["w3"], a2, p["b3"])
    a3 = cal_relu(z3)
    z4 = cal_z(p["w4"], a3, p["b4"])
    a4 = cal_sigma(z4)
    forward_param = {
        "z1": z1,
        "a1": a1,
        "z2": z2,
        "a2": a2,
        "z3": z3,
        "a3": a3,
        "z4": z4,
        "a4": a4
    }
    return forward_param


# Cross-entropy cost
def cost_f(a, y):  # a: (1, m)   y: (1, m)
    m = y.shape[1]
    return -np.sum(y * np.log(a) + (1 - y) * np.log(1 - a)) / m
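
One caveat not handled in the original code: if a4 ever reaches exactly 0 or 1 (possible when the sigmoid saturates), np.log returns -inf and the cost becomes nan. A common safeguard is to clip the activations before taking the logs, for example:

def cost_f(a, y):  # a: (1, m)   y: (1, m)
    m = y.shape[1]
    a = np.clip(a, 1e-10, 1 - 1e-10)  # keep both log() calls finite
    return -np.sum(y * np.log(a) + (1 - y) * np.log(1 - a)) / m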


# Backward propagation
def cal_dz_last(a, y):  # a: output of the last layer, (1, m)   y: (1, m)
    return a - y


def cal_dw_db(dz, a, m):  # dz: (units in layer i, m)   a: activations of layer i-1, (units in layer i-1, m)
    return np.dot(dz, a.T) / m, np.sum(dz, axis=1, keepdims=True) / m


def cal_da(dz, w):  # dz: (units in layer i, m)   w: (units in layer i, units in layer i-1)
    return np.dot(w.T, dz)


def cal_drelu(da, z):  # da: (units in layer i, m)   z: (units in layer i, m)
    t = np.ones(z.shape)
    t[z <= 0] = 0
    return da * t


def back_f(p, f_p, x, y):
    dz4 = cal_dz_last(f_p["a4"], y)
    dw4, db4 = cal_dw_db(dz4, f_p["a3"], y.shape[1])

    da3 = cal_da(dz4, p["w4"])
    dz3 = cal_drelu(da3, f_p["z3"])
    dw3, db3 = cal_dw_db(dz3, f_p["a2"], y.shape[1])

    da2 = cal_da(dz3, p["w3"])
    dz2 = cal_drelu(da2, f_p["z2"])
    dw2, db2 = cal_dw_db(dz2, f_p["a1"], y.shape[1])

    da1 = cal_da(dz2, p["w2"])
    dz1 = cal_drelu(da1, f_p["z1"])
    dw1, db1 = cal_dw_db(dz1, x, y.shape[1])

    back_param = {
        "dw4": dw4,
        "db4": db4,
        "dw3": dw3,
        "db3": db3,
        "dw2": dw2,
        "db2": db2,
        "dw1": dw1,
        "db1": db1
    }
    return back_param



# Update the parameters (plain gradient descent step)
def update_p(p, b_p, learning_rate):
    upd_p = {
        "w1": p["w1"] - learning_rate * b_p["dw1"],
        "b1": p["b1"] - learning_rate * b_p["db1"],
        "w2": p["w2"] - learning_rate * b_p["dw2"],
        "b2": p["b2"] - learning_rate * b_p["db2"],
        "w3": p["w3"] - learning_rate * b_p["dw3"],
        "b3": p["b3"] - learning_rate * b_p["db3"],
        "w4": p["w4"] - learning_rate * b_p["dw4"],
        "b4": p["b4"] - learning_rate * b_p["db4"]
    }
    return upd_p

Next, build and train the model. The training code is written so that the same functions can be used for both batch gradient descent and mini-batch gradient descent:

# Train on one (mini-)batch for loop_num iterations
def model(x, y, learning_rate, loop_num, p):
    cost_t = []
    for i in range(loop_num):
        f_p = forward_f(x, p)
        b_p = back_f(p, f_p, x, y)
        p = update_p(p, b_p, learning_rate)
        if i % 10 == 0:
            cost_t.append(cost_f(f_p["a4"], y))
    return p, np.array(cost_t)


def model_min_batch(x, y, min_batch_size, learning_rate, loop_num):
    # Shuffle the training examples
    size = x.shape[1]
    temp = np.random.permutation(size)  # a random permutation of the indices 0..m-1
    shuttle_x = x[:, temp]
    shuttle_y = y[:, temp]

    # Partition into mini-batches
    p = init(x)
    cost = []
    num = int(size / min_batch_size)
    if size % min_batch_size != 0:
        start = int(num * min_batch_size)
        x_min_batch = shuttle_x[:, start:size]  # the slice end is exclusive, so use size (not size - 1) to keep the last example
        y_min_batch = shuttle_y[:, start:size]
        p, cost_t = model(x_min_batch, y_min_batch, learning_rate, loop_num, p)
        cost.extend(cost_t)
    for i in range(num):
        start = int(i * min_batch_size)
        end = int((i + 1) * min_batch_size)  # exclusive upper bound, so each batch gets exactly min_batch_size examples
        x_min_batch = shuttle_x[:, start:end]
        y_min_batch = shuttle_y[:, start:end]
        p, cost_t = model(x_min_batch, y_min_batch, learning_rate, loop_num, p)
        cost.extend(cost_t)
    return p, cost
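
Note that the remainder batch (when the dataset size is not a multiple of min_batch_size) is trained first, followed by the full batches. With the 300-example dataset used here and min_batch_size = 64, that gives num = 4 full batches of 64 examples plus one remainder batch of 44 examples.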

First, for plain batch gradient descent no splitting is needed: just set min_batch_size to the size of the whole dataset.

time_start_gd = time.time()
model_p_gd, cost_gd = model_min_batch(x_train, y_train, x_train.shape[1], 0.003, 800)
time_end_gd = time.time()
print("Batch gradient descent time:", time_end_gd - time_start_gd)
train_f_p_gd = forward_f(x_train, model_p_gd)
print("Batch gradient descent accuracy:", 100 * (1 - np.sum(np.abs(np.round(train_f_p_gd["a4"]) - y_train)) / y_train.shape[1]), '%')
print_figure(x_train, y_train, model_p_gd, cost_gd, "batch_gd")
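
print_figure is used above but never defined in the post; it is presumably a small plotting helper. A minimal sketch of what it could look like, plotting the cost curve and the decision boundary using this post's own forward_f (the details here are an assumption, not the original helper):

def print_figure(x, y, p, cost, title):
    # cost curve (one value recorded every 10 iterations)
    plt.plot(np.squeeze(cost))
    plt.xlabel("iterations (per 10)")
    plt.ylabel("cost")
    plt.title(title)
    plt.show()

    # decision boundary: evaluate the trained network on a grid and overlay the data
    xx, yy = np.meshgrid(np.linspace(x[0].min() - 1, x[0].max() + 1, 200),
                         np.linspace(x[1].min() - 1, x[1].max() + 1, 200))
    grid = np.c_[xx.ravel(), yy.ravel()].T                      # shape (2, 200*200)
    preds = np.round(forward_f(grid, p)["a4"]).reshape(xx.shape)
    plt.contourf(xx, yy, preds, cmap=plt.cm.Spectral, alpha=0.5)
    plt.scatter(x[0, :], x[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral, s=10)
    plt.title(title)
    plt.show()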

To run with mini-batches, only the batch-size argument changes:

time_start_mb = time.time()
model_p_mb, cost_mb = model_min_batch(x_train, y_train, 64, 0.003, 800)  # a power-of-2 mini-batch size tends to work best
time_end_mb = time.time()
print("Mini-batch time:", time_end_mb - time_start_mb)
train_f_p_mb = forward_f(x_train, model_p_mb)
print("Mini-batch accuracy:", 100 * (1 - np.sum(np.abs(np.round(train_f_p_mb["a4"]) - y_train)) / y_train.shape[1]), '%')
print_figure(x_train, y_train, model_p_mb, cost_mb, "minbatch")

To use momentum gradient descent, three parts of the code need to change (the momentum update itself is summarized right after this list):

1. The backward pass additionally maintains vdw and vdb, the exponentially weighted averages of the gradients.

2. The parameter update takes the v values instead of the raw gradients from the backward pass.

3. The training function is modified to carry v along.
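
For every layer, the momentum update is:

v_dW = beta1 * v_dW + (1 - beta1) * dW
v_db = beta1 * v_db + (1 - beta1) * db
W = W - learning_rate * v_dW
b = b - learning_rate * v_db

With beta1 = 0.9 this roughly averages the gradients over the last 10 steps, which smooths out the oscillations of mini-batch gradient descent.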

Modified backward pass:

def back_f_momentum(p, f_p, x, y, v, beta1):
    dz4 = cal_dz_last(f_p["a4"], y)
    dw4, db4 = cal_dw_db(dz4, f_p["a3"], y.shape[1])
    v["vdw4"] = beta1 * v["vdw4"] + (1 - beta1) * dw4
    v["vdb4"] = beta1 * v["vdb4"] + (1 - beta1) * db4

    da3 = cal_da(dz4, p["w4"])
    dz3 = cal_drelu(da3, f_p["z3"])
    dw3, db3 = cal_dw_db(dz3, f_p["a2"], y.shape[1])
    v["vdw3"] = beta1 * v["vdw3"] + (1 - beta1) * dw3
    v["vdb3"] = beta1 * v["vdb3"] + (1 - beta1) * db3

    da2 = cal_da(dz3, p["w3"])
    dz2 = cal_drelu(da2, f_p["z2"])
    dw2, db2 = cal_dw_db(dz2, f_p["a1"], y.shape[1])
    v["vdw2"] = beta1 * v["vdw2"] + (1 - beta1) * dw2
    v["vdb2"] = beta1 * v["vdb2"] + (1 - beta1) * db2

    da1 = cal_da(dz2, p["w2"])
    dz1 = cal_drelu(da1, f_p["z1"])
    dw1, db1 = cal_dw_db(dz1, x, y.shape[1])
    v["vdw1"] = beta1 * v["vdw1"] + (1 - beta1) * dw1
    v["vdb1"] = beta1 * v["vdb1"] + (1 - beta1) * db1

    back_param = {
        "dw4": dw4,
        "db4": db4,
        "dw3": dw3,
        "db3": db3,
        "dw2": dw2,
        "db2": db2,
        "dw1": dw1,
        "db1": db1
    }
    return back_param, v

Modified update function for w and b:

def update_p_momentum(p, v, learning_rate):
    upd_p = {
        "w1": p["w1"] - learning_rate * v["vdw1"],
        "b1": p["b1"] - learning_rate * v["vdb1"],
        "w2": p["w2"] - learning_rate * v["vdw2"],
        "b2": p["b2"] - learning_rate * v["vdb2"],
        "w3": p["w3"] - learning_rate * v["vdw3"],
        "b3": p["b3"] - learning_rate * v["vdb3"],
        "w4": p["w4"] - learning_rate * v["vdw4"],
        "b4": p["b4"] - learning_rate * v["vdb4"]
    }
    return upd_p

Modified training function:

def model_momentum(x, y, learning_rate, loop_num, p, v, beta1):
    cost_t = []

    for i in range(loop_num):
        f_p = forward_f(x, p)
        b_p, v = back_f_momentum(p, f_p, x, y, v, beta1)
        p = update_p_momentum(p, v, learning_rate)
        if i % 10 == 0:
            cost_t.append(cost_f(f_p["a4"], y))
    return p, np.array(cost_t), v


def model_min_batch_momentum(x, y, min_batch_size, learning_rate, loop_num, beta1=0.9):
    # Shuffle the training examples
    size = x.shape[1]
    temp = np.random.permutation(size)  # a random permutation of the indices 0..m-1
    shuttle_x = x[:, temp]
    shuttle_y = y[:, temp]

    # Partition into mini-batches
    p = init(x)
    cost = []
    num = int(size / min_batch_size)
    v = {
        "vdw4": 0,
        "vdb4": 0,
        "vdw3": 0,
        "vdb3": 0,
        "vdw2": 0,
        "vdb2": 0,
        "vdw1": 0,
        "vdb1": 0
    }
    if size % min_batch_size != 0:
        start = int(num * min_batch_size)
        x_min_batch = shuttle_x[:, start:size]  # slice end is exclusive, so this keeps the last example
        y_min_batch = shuttle_y[:, start:size]
        p, cost_t, v = model_momentum(x_min_batch, y_min_batch, learning_rate, loop_num, p, v, beta1)
        cost.extend(cost_t)
    for i in range(num):
        start = int(i * min_batch_size)
        end = int((i + 1) * min_batch_size)  # exclusive upper bound
        x_min_batch = shuttle_x[:, start:end]
        y_min_batch = shuttle_y[:, start:end]
        p, cost_t, v = model_momentum(x_min_batch, y_min_batch, learning_rate, loop_num, p, v, beta1)
        cost.extend(cost_t)
    return p, cost
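
The post does not show the call for the momentum version; a sketch analogous to the runs above (the names model_p_mo and cost_mo are mine, not from the original) would be:

time_start_mo = time.time()
model_p_mo, cost_mo = model_min_batch_momentum(x_train, y_train, 64, 0.003, 800)
time_end_mo = time.time()
print("Momentum time:", time_end_mo - time_start_mo)
train_f_p_mo = forward_f(x_train, model_p_mo)
print("Momentum accuracy:", 100 * (1 - np.sum(np.abs(np.round(train_f_p_mo["a4"]) - y_train)) / y_train.shape[1]), '%')
print_figure(x_train, y_train, model_p_mo, cost_mo, "momentum")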

To use the Adam algorithm, the changes build on the momentum version (the Adam update is summarized right after this list):

1. The backward pass additionally maintains s, the exponentially weighted average of the squared gradients, alongside v.

2. The update of w and b uses the bias-corrected v and s.

3. The training function is modified accordingly.
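
For every layer, with t the 1-based step count, Adam combines the two moving averages and applies bias correction before the update (epsilon keeps the denominator away from zero):

v_dW = beta1 * v_dW + (1 - beta1) * dW
s_dW = beta2 * s_dW + (1 - beta2) * dW ** 2
v_corrected = v_dW / (1 - beta1 ** t)
s_corrected = s_dW / (1 - beta2 ** t)
W = W - learning_rate * v_corrected / (np.sqrt(s_corrected) + epsilon)

and the same for b.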

Modified backward pass:

def back_f_Adam(p, f_p, x, y, v, s, beta1, beta2):
    dz4 = cal_dz_last(f_p["a4"], y)
    dw4, db4 = cal_dw_db(dz4, f_p["a3"], y.shape[1])
    v["vdw4"] = beta1 * v["vdw4"] + (1 - beta1) * dw4
    v["vdb4"] = beta1 * v["vdb4"] + (1 - beta1) * db4
    s["sdw4"] = beta2 * s["sdw4"] + (1 - beta2) * (dw4 ** 2)
    s["sdb4"] = beta2 * s["sdb4"] + (1 - beta2) * (db4 ** 2)

    da3 = cal_da(dz4, p["w4"])
    dz3 = cal_drelu(da3, f_p["z3"])
    dw3, db3 = cal_dw_db(dz3, f_p["a2"], y.shape[1])
    v["vdw3"] = beta1 * v["vdw3"] + (1 - beta1) * dw3
    v["vdb3"] = beta1 * v["vdb3"] + (1 - beta1) * db3
    s["sdw3"] = beta2 * s["sdw3"] + (1 - beta2) * (dw3 ** 2)
    s["sdb3"] = beta2 * s["sdb3"] + (1 - beta2) * (db3 ** 2)

    da2 = cal_da(dz3, p["w3"])
    dz2 = cal_drelu(da2, f_p["z2"])
    dw2, db2 = cal_dw_db(dz2, f_p["a1"], y.shape[1])
    v["vdw2"] = beta1 * v["vdw2"] + (1 - beta1) * dw2
    v["vdb2"] = beta1 * v["vdb2"] + (1 - beta1) * db2
    s["sdw2"] = beta2 * s["sdw2"] + (1 - beta2) * (dw2 ** 2)
    s["sdb2"] = beta2 * s["sdb2"] + (1 - beta2) * (db2 ** 2)

    da1 = cal_da(dz2, p["w2"])
    dz1 = cal_drelu(da1, f_p["z1"])
    dw1, db1 = cal_dw_db(dz1, x, y.shape[1])
    v["vdw1"] = beta1 * v["vdw1"] + (1 - beta1) * dw1
    v["vdb1"] = beta1 * v["vdb1"] + (1 - beta1) * db1
    s["sdw1"] = beta2 * s["sdw1"] + (1 - beta2) * (dw1 ** 2)
    s["sdb1"] = beta2 * s["sdb1"] + (1 - beta2) * (db1 ** 2)

    back_param = {
        "dw4": dw4,
        "db4": db4,
        "dw3": dw3,
        "db3": db3,
        "dw2": dw2,
        "db2": db2,
        "dw1": dw1,
        "db1": db1
    }
    return back_param, v, s

Modified update function:

def update_p_Adam(p, v, s, learning_rate, i, beta1, beta2):
    e = 1e-8  # small constant added to the denominator (not to the bias correction) to avoid division by zero
    # bias correction: i is the 0-based iteration index, so the step count is i + 1
    vbias_cor = 1 - beta1 ** (i + 1)
    sbias_cor = 1 - beta2 ** (i + 1)
    upd_p = {
        "w1": p["w1"] - learning_rate * (v["vdw1"] / vbias_cor) / (np.sqrt(s["sdw1"] / sbias_cor) + e),
        "b1": p["b1"] - learning_rate * (v["vdb1"] / vbias_cor) / (np.sqrt(s["sdb1"] / sbias_cor) + e),
        "w2": p["w2"] - learning_rate * (v["vdw2"] / vbias_cor) / (np.sqrt(s["sdw2"] / sbias_cor) + e),
        "b2": p["b2"] - learning_rate * (v["vdb2"] / vbias_cor) / (np.sqrt(s["sdb2"] / sbias_cor) + e),
        "w3": p["w3"] - learning_rate * (v["vdw3"] / vbias_cor) / (np.sqrt(s["sdw3"] / sbias_cor) + e),
        "b3": p["b3"] - learning_rate * (v["vdb3"] / vbias_cor) / (np.sqrt(s["sdb3"] / sbias_cor) + e),
        "w4": p["w4"] - learning_rate * (v["vdw4"] / vbias_cor) / (np.sqrt(s["sdw4"] / sbias_cor) + e),
        "b4": p["b4"] - learning_rate * (v["vdb4"] / vbias_cor) / (np.sqrt(s["sdb4"] / sbias_cor) + e)
    }
    return upd_p

Modified training function:

def model_Adam(x, y, learning_rate, loop_num, p, v, s, beta1, beta2):
    cost_t = []

    for i in range(loop_num):
        f_p = forward_f(x, p)
        b_p, v, s = back_f_Adam(p, f_p, x, y, v, s, beta1, beta2)
        p = update_p_Adam(p, v, s, learning_rate, i, beta1, beta2)
        if i % 10 == 0:
            cost_t.append(cost_f(f_p["a4"], y))
    return p, np.array(cost_t), v, s


def model_min_batch_Adam(x, y, min_batch_size, learning_rate, loop_num, beta1=0.9, beta2=0.999):
    # Shuffle the training examples
    size = x.shape[1]
    temp = np.random.permutation(size)  # a random permutation of the indices 0..m-1
    shuttle_x = x[:, temp]
    shuttle_y = y[:, temp]

    # Partition into mini-batches
    p = init(x)
    cost = []
    num = int(size / min_batch_size)

    v = {
        "vdw4": 0,
        "vdb4": 0,
        "vdw3": 0,
        "vdb3": 0,
        "vdw2": 0,
        "vdb2": 0,
        "vdw1": 0,
        "vdb1": 0
    }
    s = {
        "sdw4": 0,
        "sdb4": 0,
        "sdw3": 0,
        "sdb3": 0,
        "sdw2": 0,
        "sdb2": 0,
        "sdw1": 0,
        "sdb1": 0
    }
    if size % min_batch_size != 0:
        start = int(num * min_batch_size)
        x_min_batch = shuttle_x[:, start:size]  # slice end is exclusive, so this keeps the last example
        y_min_batch = shuttle_y[:, start:size]
        p, cost_t, v, s = model_Adam(x_min_batch, y_min_batch, learning_rate, loop_num, p, v, s, beta1, beta2)
        cost.extend(cost_t)

    for i in range(num):
        start = int(i * min_batch_size)
        end = int((i + 1) * min_batch_size)  # exclusive upper bound
        x_min_batch = shuttle_x[:, start:end]
        y_min_batch = shuttle_y[:, start:end]
        p, cost_t, v, s = model_Adam(x_min_batch, y_min_batch, learning_rate, loop_num, p, v, s, beta1, beta2)
        cost.extend(cost_t)

    return p, cost
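
Again, the call for the Adam version is not shown in the post; a sketch analogous to the runs above (model_p_adam and cost_adam are my own names):

time_start_adam = time.time()
model_p_adam, cost_adam = model_min_batch_Adam(x_train, y_train, 64, 0.003, 800)
time_end_adam = time.time()
print("Adam time:", time_end_adam - time_start_adam)
train_f_p_adam = forward_f(x_train, model_p_adam)
print("Adam accuracy:", 100 * (1 - np.sum(np.abs(np.round(train_f_p_adam["a4"]) - y_train)) / y_train.shape[1]), '%')
print_figure(x_train, y_train, model_p_adam, cost_adam, "adam")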

Batch gradient descent time: 0.28107190132141113
Batch gradient descent accuracy: 84.33333333333333 %

Mini-batch time: 0.9800760746002197
Mini-batch accuracy: 88.66666666666667 %

Momentum time: 1.166001319885254
Momentum accuracy: 88.66666666666667 %

Adam time: 1.5370657444000244
Adam accuracy: 92.0 %

All four runs use the same number of training iterations (800) and the same learning rate (0.003). From the cost curves, the rate of descent is Adam > momentum ≈ mini-batch > batch gradient descent.

Mini-batching gives up some of the benefit of vectorization, so the wall-clock time goes up. The dataset here is probably too small for momentum to show a clear advantage over plain mini-batch gradient descent.
