Optimization Methods in Deep Learning

Task 1: Gradient Descent in Practice: BGD and SGD

Task: implement a 3-layer DNN that can be trained with BGD, SGD, and mini-batch gradient descent respectively, using sklearn's breast_cancer dataset as the training and test sets.
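
For reference, the three variants differ only in how many training examples enter each parameter update. With parameters θ, learning rate α, and per-example loss J(θ; x, y), the updates can be written as:

% Batch gradient descent (BGD): average the gradient over all m training examples
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, \frac{1}{m}\sum_{i=1}^{m} J\big(\theta;\, x^{(i)}, y^{(i)}\big)

% Stochastic gradient descent (SGD): a single example per update
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, J\big(\theta;\, x^{(i)}, y^{(i)}\big)

% Mini-batch gradient descent: a batch of m' examples per update (m' = 64 below)
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, \frac{1}{m'}\sum_{i \in \text{batch}} J\big(\theta;\, x^{(i)}, y^{(i)}\big)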


import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def initialize_parameters(layer_dims):
    '''
    Initialize the parameters w and b.
    :param layer_dims: number of units in each layer of the network, as a list
    :return: dictionary holding the parameters W1, W2, ..., WL, b1, ..., bL
    '''
    np.random.seed(3)
    L = len(layer_dims)  # the number of layers in the network
    parameters = {}
    for l in range(1, L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(
            2 / layer_dims[l - 1])  # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters


def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0, Z)
    return A


def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A


def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None, None, None, X)]  # layer 0 is (None, None, None, A0); w, b, z are filled with None so list indices match layer numbers; stores (w, b, z, A) for every layer
    # calculate from 1 to L-1 layer
    for l in range(1, L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W, A_pre) + b  # compute z = wx + b
        A = relu(z)  # relu activation function
        caches.append((W, b, z, A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches


def compute_cost(AL, Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    # squeeze removes single-dimensional entries from the shape, e.g. turns [[[2]]] into 2
    cost = np.squeeze(cost)
    return cost


def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA


def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于dW,db的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    # calculate the Lth layer gradients
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # calculate from L-1 to 1 layer gradients
    for l in reversed(range(1, L)):  # L-1, L-2, ..., 1
        post_W = caches[l + 1][0]  # W of the next (deeper) layer
        dz = dzL  # dz of the next (deeper) layer

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # z of the current layer
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # A of the previous layer
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # carry dz to the next (shallower) layer
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    :param parameters: dictionary,  W,b
    :param grads: dW,db
    :param learning_rate: 学习率alpha
    :return:
    """
    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#


def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, gradient_descent='bgd', mini_batch_size=64):
    """
    :param X:输入数据, of shape (input size, number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :param layer_dims:网络中每层的节点数,list
    :param learning_rate:学习率
    :param num_iterations:迭代次数
    :return:
    parameters:最终的网络参数:(W,b)
    """
    m = Y.shape[1]
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if gradient_descent == 'bgd':
        for i in range(0, num_iterations):
            # foward propagation
            AL, caches = forward_propagation(X, parameters)
            # calculate the cost
            cost = compute_cost(AL, Y)
            if i % 1000 == 0:
                # print("Cost after iteration {}: {}".format(i, cost))
                costs.append(cost)
            # backward propagation
            grads = backward_propagation(AL, Y, caches)
            # update parameters
            parameters = update_parameters(parameters, grads, learning_rate)
    elif gradient_descent == 'sgd':
        np.random.seed(3)
        # shuffle the dataset first; this matters for SGD
        permutation = list(np.random.permutation(m))
        shuffled_X = X[:, permutation]
        shuffled_Y = Y[:, permutation].reshape((1, m))
        for i in range(0, num_iterations):
            for j in range(0, m):  # train on one example at a time
                # Forward propagation
                AL, caches = forward_propagation(shuffled_X[:, j].reshape(-1, 1), parameters)
                # Compute cost
                cost = compute_cost(AL, shuffled_Y[:, j].reshape(1, 1))
                # Backward propagation
                grads = backward_propagation(AL, shuffled_Y[:, j].reshape(1, 1), caches)
                # Update parameters.
                parameters = update_parameters(parameters, grads, learning_rate)
                
    elif gradient_descent == 'mini-batch':
        seed = 0
        for i in range(0, num_iterations):
            # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
            seed = seed + 1
            minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
            for minibatch in minibatches:
                # Add your code here to complete this task
                # ********** Begin *********#
                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch
                # Forward propagation
                AL, caches = forward_propagation(minibatch_X, parameters)
                # Compute cost
                cost = compute_cost(AL, minibatch_Y)
                # Backward propagation
                grads = backward_propagation(AL, minibatch_Y, caches)
                # Update parameters
                parameters = update_parameters(parameters, grads, learning_rate)
                # ********** End **********#
            
    return parameters


# DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0006, num_iterations=30000,
        gradient_descent='bgd', mini_batch_size=64):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, gradient_descent,
                               mini_batch_size)
    return parameters
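
The template above leaves out the data loading and evaluation code. Below is a minimal usage sketch meant to be appended to the end of the script above (so its imports are reused); it assumes an 80/20 train/test split of breast_cancer, feature standardization, and the (features, examples) layout the network expects. The layer sizes, learning rate, and iteration count are illustrative choices, not values prescribed by the task.

if __name__ == '__main__':
    # hypothetical driver code: split, standardize, and transpose to (features, examples)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    # 30 input features -> two hidden layers -> 1 sigmoid output (a "3-layer" DNN)
    layer_dims = [X_train.shape[0], 16, 8, 1]
    parameters = DNN(X_train, y_train, layer_dims, learning_rate=0.0006,
                     num_iterations=1000, gradient_descent='mini-batch', mini_batch_size=64)

    # evaluate by thresholding the sigmoid output at 0.5
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))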



Task 2: Momentum-Based Gradient Descent

Task: implement a 3-layer DNN trained with the momentum and NAG (Nesterov accelerated gradient) optimizers respectively, using sklearn's breast_cancer dataset as the training and test sets.

import numpy as np
from sklearn.datasets import  load_breast_cancer
from sklearn.model_selection import train_test_split
#initialize parameters(w,b)
def initialize_parameters(layer_dims):
    '''
    初始化参数w,b
    :param layer_dims: 网络中每一层的单元个数,类型为list
    :return:存储参数w1,w2,...,wL,b1,...,bL,类型为dictionary
    '''
    np.random.seed(3)
    L = len(layer_dims)#the number of layers in the network
    parameters = {}
    for l in range(1,L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*np.sqrt(2/layer_dims[l-1]) # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l],1))
    return parameters
def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0,Z)
    return A

def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A

def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None,None,None,X)]  # 第0层(None,None,None,A0) w,b,z用none填充,下标与层数一致,用于存储每一层的,w,b,z,A
    # calculate from 1 to L-1 layer
    for l in range(1,L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W,A_pre) + b #计算z = wx + b
        A = relu(z) #relu activation function
        caches.append((W,b,z,A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL,A) + bL
    AL = sigmoid(zL)
    caches.append((WL,bL,zL,AL))
    return AL, caches

def compute_cost(AL,Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    # cost = -1.0/m * np.sum(Y*np.log(AL)+(1-Y)*np.log(1.0 - AL))#py中*是点乘
    # cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T)) #推荐用这个,上面那个容易出错
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    #从数组的形状中删除单维条目,即把shape中为1的维度去掉,比如把[[[2]]]变成2
    cost = np.squeeze(cost)
    return cost

# derivation of relu
def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA

def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于W,b的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    #calculate the Lth layer gradients
    prev_AL = caches[L-1][3]
    dzL = 1./m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW"+str(L):dWL, "db"+str(L):dbL}
    #calculate from L-1 to 1 layer gradients
    for l in reversed(range(1,L)): # L-1,L-3,....,1
        post_W= caches[l+1][0] #要用后一层的W
        dz = dzL #用后一层的dz

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]#当前层的z
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l-1][3]#前一层的A
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl #更新dz
    return gradients

def random_mini_batches(X, Y, mini_batch_size = 64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def initialize_velocity(parameters):
    """
    初始化参数velocity,类型为directory:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    :params:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    v -- 当前的velocity变量,类型为python dictionary.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    # Initialize velocity
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return v

#momentum
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    使用Momentum算法更新网络参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary :
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- 当前的velocity变量,类型为python dictionary:
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    beta -- momentum超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters --  更新后的网络参数,python dictionary
    '''
    VdW = beta * VdW + (1-beta) * dW
    Vdb = beta * Vdb + (1-beta) * db
    W = W - learning_rate * VdW
    b = b - learning_rate * Vdb
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        v["dW"+str(l+1)]=beta*v["dW"+str(l+1)]+(1-beta)*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta*v["db"+str(l+1)]+(1-beta)*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#

#nesterov momentum
def update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate):
    """
    用nesterov_mmentum算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary :
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- 当前的velocity变量,类型为python dictionary:
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    beta -- momentum超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters -- 更新后的网络参数,python dictionary
    v -- 更新后的velocities,python dictionary
    '''
    VdW = beta * VdW - learning_rate * dW
    Vdb = beta * Vdb - learning_rate * db
    W = W + beta * VdW - learning_rate * dW
    b = b + beta * Vdb - learning_rate * db
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        v["dW"+str(l+1)]=beta*v["dW"+str(l+1)]-learning_rate*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta*v["db"+str(l+1)]-learning_rate*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]+=beta*v["dW"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]+=beta*v["db"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta, mini_batch_size=64):
    """
    :param X: input data, of shape (input size, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :param layer_dims: number of units in each layer, list
    :param learning_rate: learning rate
    :param num_iterations: number of iterations (epochs)
    :param optimizer: 'momentum' or 'nesterov_momentum'
    :param beta: momentum hyperparameter
    :param mini_batch_size: mini-batch size
    :return:
    parameters: the trained network parameters (W, b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if optimizer == "momentum" or optimizer == "nesterov_momentum" :
        v = initialize_velocity(parameters)
    t = 0 # initializing the counter required for Adam update
    seed = 0
    for i in range(0, num_iterations):
        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            if optimizer == "momentum":
                parameters = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "nesterov_momentum":
                parameters = update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate)

        if i % 100 == 0:
            #print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)
    return parameters

#DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0005, num_iterations=5000, optimizer='momentum', beta=0.9, mini_batch_size=64):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, optimizer, beta, mini_batch_size)
    return parameters
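
The same kind of driver works for this task; only the optimizer-related arguments change. A minimal sketch appended to the script above, again with assumed preprocessing and illustrative hyperparameters:

if __name__ == '__main__':
    # hypothetical driver code, mirroring the Task 1 sketch
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    layer_dims = [X_train.shape[0], 16, 8, 1]
    # switch optimizer to 'nesterov_momentum' to compare the two variants
    parameters = DNN(X_train, y_train, layer_dims, learning_rate=0.0005,
                     num_iterations=500, optimizer='momentum', beta=0.9, mini_batch_size=64)
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))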



Task 3: Adaptive Learning-Rate Algorithms

Task: implement a 3-layer DNN trained with the Adagrad, Adadelta, RMSProp, and Adam optimizers respectively, using sklearn's breast_cancer dataset as the training and test sets.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


# initialize parameters(w,b)
def initialize_parameters(layer_dims):
    '''
    初始化参数w,b
    :param layer_dims: 网络中每一层的单元个数,类型为list
    :return:存储参数w1,w2,...,wL,b1,...,bL,类型为dictionary
    '''
    np.random.seed(3)
    L = len(layer_dims)  # the number of layers in the network
    parameters = {}
    for l in range(1, L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(
            2 / layer_dims[l - 1])  # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters


def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0, Z)
    return A


# implement the activation function(ReLU and sigmoid)
def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A


def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None, None, None, X)]  # 第0层(None,None,None,A0) w,b,z用none填充,下标与层数一致,用于存储每一层的,w,b,z,A
    # calculate from 1 to L-1 layer
    for l in range(1, L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W, A_pre) + b  # 计算z = wx + b
        A = relu(z)  # relu activation function
        caches.append((W, b, z, A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches


# calculate cost function
def compute_cost(AL, Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    # cost = -1.0/m * np.sum(Y*np.log(AL)+(1-Y)*np.log(1.0 - AL))#py中*是点乘
    # cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T)) #推荐用这个,上面那个容易出错
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    # 从数组的形状中删除单维条目,即把shape中为1的维度去掉,比如把[[[2]]]变成2
    cost = np.squeeze(cost)
    # print('=====================cost===================')
    # print(cost)
    return cost


# derivation of relu
def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA


def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于W,b的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    # calculate the Lth layer gradients
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # calculate from L-1 to 1 layer gradients
    for l in reversed(range(1, L)):  # L-1,L-3,....,1
        post_W = caches[l + 1][0]  # 要用后一层的W
        dz = dzL  # 用后一层的dz

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # 当前层的z
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # 前一层的A
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # 更新dz
    return gradients


def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def initialize_velocity(parameters):
    """
    初始化参数velocity,类型为directory:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    :params:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    v -- 当前的velocity变量,类型为python dictionary.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    # Initialize velocity
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return v


# AdaGrad initialization
def initialize_adagrad(parameters):
    """
    初始化 velocity变量, 类型为python dictionary :
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    Gt -- 到第t次迭代对dw,db的平方梯度和,python dictionary
                    G['dW' + str(l)] = sum of the squares of the gradients up to dwl
                    G['db' + str(l)] = sum of the squares of the gradients up to db1
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    G = {}
    # Initialize velocity
    for l in range(L):
        G["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        G["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return G


# AdaGrad
def update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon=1e-7):
    """
    用Adagrad算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    G -- 当前的G变量,python dictionary :
                    G['dW' + str(l)] = G of dW1
                    G['db' + str(l)] = G of db1
    learning_rate -- xue'xi学习率, scalar
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的网络参数,python dictionary
    '''
    GW += (dW)^2
    W -= learning_rate/sqrt(GW + epsilon)*dW
    Gb += (db)^2
    b -= learning_rate/sqrt(Gb + epsilon)*db
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********# 
    L=len(parameters) // 2
    for l in range(L):
        G["dW"+str(l+1)]+=grads['dW'+str(l+1)]**2
        G["db"+str(l+1)]+=grads['db'+str(l+1)]**2
        parameters["W"+str(l+1)]-=learning_rate/(np.sqrt(G["dW"+str(l+1)])+epsilon)*grads['dW'+str(l+1)]
        parameters["b"+str(l+1)]-=learning_rate/(np.sqrt(G["db"+str(l+1)])+epsilon)*grads['db'+str(l+1)]
    return parameters
    # ********** End **********#


# initialize_adadelta
def initialize_adadelta(parameters):
    """
    初始化s和delta 变量,均为 python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    Returns:
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient of dw
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...
    v -- python dictionary that will contain the RMS
                v["dW" + str(l)] = ...
                v["db" + str(l)] = ...
    delta --  dw的平方梯度的指数加权平均,python dictionary 
                    delta["dW" + str(l)] = ...
                    delta["db" + str(l)] = ...
    """

    L = len(parameters) // 2  # number of layers in the neural networks
    s = {}
    v = {}
    delta = {}
    # Initialize s, v, delta. Input: "parameters". Outputs: "s, v, delta".
    for l in range(L):
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        delta["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        delta["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)

    return s, v, delta


# adadelta
def update_parameters_with_adadelta(parameters, grads, rho, s, v, delta, epsilon=1e-6):
    """
    使用adadelta算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    rho -- 衰减常数 ,和 momentum方法类似, scalar
    s -- 当前的velocity变量,python dictionary:
                    s['dW' + str(l)] = ...
                    s['db' + str(l)] = ...
    delta -- 当前的RMS变量,python dictionary:
                    delta['dW' + str(l)] = ...
                    delta['db' + str(l)] = ...
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的参数,python dictionary
    '''
    Sdw = rho*Sdw + (1 - rho)*(dW)^2
    Sdb = rho*Sdb + (1 - rho)*(db)^2
    Vdw = sqrt((delta_w + epsilon) / (Sdw + epsilon))*dW
    Vdb = sqrt((delta_b + epsilon) / (Sdb + epsilon))*dW
    W -= Vdw
    b -= Vdb
    delta_w = rho*delta_w + (1 - rho)*(Vdw)^2
    delta_b = rho*delta_b + (1 - rho)*(Vdb)^2
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        s["dW"+str(l+1)]=rho*s["dW"+str(l+1)]+(1-rho)*grads['dW'+str(l+1)]**2
        s["db"+str(l+1)]=rho*s["db"+str(l+1)]+(1-rho)*grads['db'+str(l+1)]**2
        v["dW"+str(l+1)]=np.sqrt((delta["db"+str(l+1)]+epsilon)/(s["dW"+str(l+1)]+epsilon))*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=np.sqrt((delta["db"+str(l+1)]+epsilon)/(s["db"+str(l+1)]+epsilon))*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]-=v["dW"+str(l+1)]
        parameters["b"+str(l+1)]-=v["db"+str(l+1)]
        delta["dW"+str(l+1)]=rho*delta["dW"+str(l+1)]+(1-rho)*v["dW"+str(l+1)]**2
        delta["db"+str(l+1)]=rho*delta["db"+str(l+1)]+(1-rho)*v["db"+str(l+1)]**2
    return parameters
    # ********** End **********#


# RMSprop
def update_parameters_with_rmsprop(parameters, grads, s, beta=0.9, learning_rate=0.01, epsilon=1e-6):
    """
    使用RMSprop算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    s -- 当前的velocity变量,python dictionary:
                    v['dW' + str(l)] = ...
                    v['db' + str(l)] = ...
    beta -- momentum 超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters -- 更新后的参数,python dictionary
    '''
    SdW = beta * SdW + (1-beta) * (dW)^2
    sdb = beta * Sdb + (1-beta) * (db)^2
    W = W - learning_rate * dW/sqrt(SdW + epsilon)
    b = b - learning_rate * db/sqrt(Sdb + epsilon)
    '''
    """
    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        s["dW"+str(l+1)]=beta*s["dW"+str(l+1)]+(1-beta)*grads['dW'+str(l+1)]**2
        s["db"+str(l+1)]=beta*s["db"+str(l+1)]+(1-beta)*grads['db'+str(l+1)]**2
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads['dW'+str(l+1)]/np.sqrt(s["dW"+str(l+1)]+epsilon)
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads['db'+str(l+1)]/np.sqrt(s["db"+str(l+1)]+epsilon)
    return parameters
    # ********** End **********#


# initialize adam
def initialize_adam(parameters):
    """
    初始化v和 s ,均为python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络参数w,b,python dictionary 
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    Returns:
    v -- 梯度指数加权平均,python dictionary 
                    v["dW" + str(l)] = ...
                    v["db" + str(l)] = ...
    s -- 平方梯度指数加权平均,python dictionary .
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    s = {}
    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)

    return v, s


# adam
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    使用Adam算法更新参数
    Arguments:
    parameters -- 网络参数w,b,python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, 梯度指数加权平均,python dictionary 
    s -- Adam variable, 平方梯度指数加权平均,python dictionary 
    learning_rate -- 学习率, scalar.
    beta1 -- 第一个动量估计的指数衰减超参数
    beta2 -- 第二个动量估计的指数衰减超参数
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的参数,python dictionary
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    v_corrected={}
    s_corrected={}
    for l in range(L):
        v["dW"+str(l+1)]=beta1*v["dW"+str(l+1)]+(1-beta1)*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta1*v["db"+str(l+1)]+(1-beta1)*grads['db'+str(l+1)]
        v_corrected["dW"+str(l+1)]=v["dW"+str(l+1)]/(1-np.power(beta1,t))
        v_corrected["db"+str(l+1)]=v["db"+str(l+1)]/(1-np.power(beta1,t))
        s["dW"+str(l+1)]=beta2*s["dW"+str(l+1)]+(1-beta2)*np.power(grads['dW'+str(l+1)],2)
        s["db"+str(l+1)]=beta2*s["db"+str(l+1)]+(1-beta2)*np.power(grads['db'+str(l+1)],2)
        s_corrected["dW"+str(l+1)]=s["dW"+str(l+1)]/(1-np.power(beta2,t))
        s_corrected["db"+str(l+1)]=s["db"+str(l+1)]/(1-np.power(beta2,t))
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*v_corrected["dW"+str(l+1)]/np.sqrt(s_corrected["dW"+str(l+1)]+epsilon)
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*v_corrected["db"+str(l+1)]/np.sqrt(s_corrected["db"+str(l+1)]+epsilon)
    return parameters
    # ********** End **********#


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta=0.9, beta2=0.999, mini_batch_size=64,
                  epsilon=1e-8):
    """
    :param X:输入数据, of shape (input size, number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :param layer_dims:网络中每层的节点数,list
    :param learning_rate:学习率
    :param num_iterations:迭代次数
    :return:
    parameters:最终的网络参数:(W,b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if optimizer == "sgd":
        pass  # no initialization required for gradient descent
    elif optimizer == "momentum" or optimizer == "nesterov_momentum" or optimizer == "rmsprop":
        v = initialize_velocity(parameters)
    elif optimizer == "adagrad":
        G = initialize_adagrad(parameters)
    elif optimizer == "adadelta":
        s, v, delta = initialize_adadelta(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)
    t = 0  # initializing the counter required for Adam update
    seed = 0
    for i in range(0, num_iterations):
        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            if optimizer == "adagrad":
                parameters = update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon)
            elif optimizer == "adadelta":
                parameters = update_parameters_with_adadelta(parameters, grads, beta, s, v, delta, epsilon)
            elif optimizer == "rmsprop":
                parameters = update_parameters_with_rmsprop(parameters, grads, v, beta, learning_rate, epsilon)
            elif optimizer == "adam":
                t += 1
                parameters = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta, beta2,
                                                         epsilon)

        if i % 100 == 0:
            #print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)
    return parameters


# DNN model
def DNN(X_train, y_train, X_test, y_test, layer_dims, learning_rate=0.0005, num_iterations=5000, optimizer='adam',
        beta=0.9, beta2=0.999, mini_batch_size=64, epsilon=1e-8):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, optimizer, beta, beta2,
                               mini_batch_size, epsilon)
    return parameters
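
And the corresponding sketch for this task, appended to the script above (assumed preprocessing, illustrative hyperparameters); the four adaptive methods are selected through the optimizer argument. Note that this DNN signature also takes X_test and y_test, even though L_layer_model does not use them:

if __name__ == '__main__':
    # hypothetical driver code; try optimizer='adagrad', 'adadelta', 'rmsprop' or 'adam'
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    layer_dims = [X_train.shape[0], 16, 8, 1]
    parameters = DNN(X_train, y_train, X_test, y_test, layer_dims, learning_rate=0.0005,
                     num_iterations=500, optimizer='adam', beta=0.9, beta2=0.999, mini_batch_size=64)
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))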






Task 4: Using Keras Optimizers

Task: build a simple DNN with the Keras framework to classify the MNIST dataset.

import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.optimizers import SGD,Adagrad,Adadelta,RMSprop,Adam

# input image dimensions
img_rows, img_cols = 28, 28

def dnn(X_train, Y_train, X_test, Y_test, choice):
    """
    Build a simple DNN with Keras to classify the MNIST dataset; the point is to practice using the different optimizers.
    Arguments:
    X_train -- training samples
    Y_train -- training labels
    X_test -- test samples
    Y_test -- test labels
    choice -- which optimizer to use (1=SGD, 2=momentum, 3=NAG, 4=Adagrad, 5=Adadelta, 6=RMSprop, 7=Adam)
    Returns:
    model -- the Keras model
    """
    epoch = 2
    sgd = SGD(lr=0.01, momentum=0., decay=0., nesterov=False)
    momentum = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    nag = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    adagrad = Adagrad(lr=0.01, epsilon=1e-6)
    adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-6)
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-6)
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # define the network layers
    model = Sequential()
    model.add(Dense(512, input_shape=(img_cols*img_rows,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    

    #choose the optimizer
    if choice==1:
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    elif choice==2:
        model.compile(loss='categorical_crossentropy', optimizer=momentum, metrics=['accuracy'])
    elif choice==3:
        model.compile(loss='categorical_crossentropy', optimizer=nag, metrics=['accuracy'])
    elif choice==4:
        model.compile(loss='categorical_crossentropy', optimizer=adagrad, metrics=['accuracy'])
    elif choice==5:
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
    elif choice==6:
        model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    elif choice==7:
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    model.fit(X_train, Y_train,
              batch_size=128, epochs=epoch,
              verbose=0, validation_data=(X_test, Y_test))
    return model
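
A minimal usage sketch, assuming the classic Keras MNIST pipeline (keras.datasets.mnist plus one-hot labels from np_utils.to_categorical); the choice index follows the mapping in the docstring above:

if __name__ == '__main__':
    from keras.datasets import mnist
    from keras.utils import np_utils

    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # flatten the 28x28 images and scale pixel values to [0, 1]
    X_train = X_train.reshape(-1, img_rows * img_cols).astype('float32') / 255.
    X_test = X_test.reshape(-1, img_rows * img_cols).astype('float32') / 255.
    Y_train = np_utils.to_categorical(y_train, 10)
    Y_test = np_utils.to_categorical(y_test, 10)

    model = dnn(X_train, Y_train, X_test, Y_test, choice=7)  # 7 = Adam
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('test loss:', score[0], 'test accuracy:', score[1])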

Optimization methods in deep learning are the algorithms used to update model parameters so as to minimize the loss function during training. Commonly used methods include Momentum, Nesterov Momentum, AdaGrad, Adadelta, RMSprop, and Adam.

1. Momentum
Momentum is a gradient-descent-based method that speeds up convergence by introducing a velocity term. When updating the parameters it considers not only the current gradient but also the influence of past gradients on the update direction; by accumulating past gradients it makes the update direction more stable and accelerates convergence.

2. Nesterov Momentum
Nesterov Momentum is a variant of Momentum. Before updating the parameters it first "looks ahead" one step, moving the parameters along the current velocity, and only then evaluates the gradient, combining the two pieces of information to perform the update. Compared with plain Momentum, Nesterov Momentum tends to converge faster.

3. AdaGrad
AdaGrad is an adaptive-learning-rate method that adjusts the learning rate dynamically to match each parameter's update needs. Concretely, it applies a separate learning rate to each parameter, and each parameter's effective learning rate keeps shrinking as training proceeds, which reduces oscillation in the updates.

4. Adadelta
Adadelta is also an adaptive-learning-rate method. Unlike AdaGrad, it takes into account not only past gradient information but also past parameter-update information: it maintains an exponentially decaying average of the squared gradients and an exponentially decaying average of the squared parameter updates, and uses both to scale the update, which makes the updates smoother.

5. RMSprop
RMSprop is another adaptive-learning-rate method. It is similar to Adadelta but uses only past gradient information, not past update information: it maintains an exponentially decaying average of the squared gradients to adjust the learning rate, which also smooths the updates.

6. Adam
Adam combines Momentum and RMSprop: it uses both the first and the second moment of the gradients and adds bias correction, which makes the updates more accurate. Compared with the other methods, Adam usually converges quickly and performs well in practice.
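
For reference, the update rules behind these six methods, written in the notation of the code above (g_t is the current gradient, α the learning rate; bias correction is shown only for Adam):

% Momentum
v_t = \beta v_{t-1} + (1-\beta)\, g_t, \qquad \theta_t = \theta_{t-1} - \alpha\, v_t

% Nesterov momentum (as implemented above)
v_t = \beta v_{t-1} - \alpha\, g_t, \qquad \theta_t = \theta_{t-1} + \beta v_t - \alpha\, g_t

% AdaGrad
G_t = G_{t-1} + g_t^2, \qquad \theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{G_t} + \epsilon}\, g_t

% Adadelta
s_t = \rho\, s_{t-1} + (1-\rho)\, g_t^2, \quad
\Delta_t = \sqrt{\frac{d_{t-1} + \epsilon}{s_t + \epsilon}}\; g_t, \quad
\theta_t = \theta_{t-1} - \Delta_t, \quad
d_t = \rho\, d_{t-1} + (1-\rho)\, \Delta_t^2

% RMSprop
s_t = \beta\, s_{t-1} + (1-\beta)\, g_t^2, \qquad \theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{s_t + \epsilon}}\, g_t

% Adam
v_t = \beta_1 v_{t-1} + (1-\beta_1)\, g_t, \quad s_t = \beta_2 s_{t-1} + (1-\beta_2)\, g_t^2, \quad
\hat v_t = \frac{v_t}{1-\beta_1^t}, \quad \hat s_t = \frac{s_t}{1-\beta_2^t}, \quad
\theta_t = \theta_{t-1} - \alpha\, \frac{\hat v_t}{\sqrt{\hat s_t + \epsilon}}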
