Optimization Methods in Deep Learning

Task 1: Gradient Descent in Practice: BGD and SGD

Task: implement a 3-layer DNN that can be trained with BGD, SGD, and mini-batch gradient descent respectively, using sklearn's breast_cancer dataset as the training and test sets.
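
For reference, the three variants differ only in how many training examples enter each parameter update. With parameters θ, learning rate α, and per-example loss J(θ; x, y), the updates can be written as:

% Batch gradient descent (BGD): average the gradient over all m training examples
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, \frac{1}{m}\sum_{i=1}^{m} J\big(\theta;\, x^{(i)}, y^{(i)}\big)

% Stochastic gradient descent (SGD): a single example per update
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, J\big(\theta;\, x^{(i)}, y^{(i)}\big)

% Mini-batch gradient descent: a batch of m' examples per update (m' = 64 below)
\theta \leftarrow \theta - \alpha \, \nabla_\theta \, \frac{1}{m'}\sum_{i \in \text{batch}} J\big(\theta;\, x^{(i)}, y^{(i)}\big)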


import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def initialize_parameters(layer_dims):
    '''
    Initialize the parameters w and b.
    :param layer_dims: number of units in each layer of the network, as a list
    :return: dictionary holding the parameters W1, W2, ..., WL, b1, ..., bL
    '''
    np.random.seed(3)
    L = len(layer_dims)  # the number of layers in the network
    parameters = {}
    for l in range(1, L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(
            2 / layer_dims[l - 1])  # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters


def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0, Z)
    return A


def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A


def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None, None, None, X)]  # layer 0 is (None, None, None, A0); w, b, z are filled with None so list indices match layer numbers; stores (w, b, z, A) for every layer
    # calculate from 1 to L-1 layer
    for l in range(1, L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W, A_pre) + b  # compute z = wx + b
        A = relu(z)  # relu activation function
        caches.append((W, b, z, A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches


def compute_cost(AL, Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    # squeeze removes single-dimensional entries from the shape, e.g. turns [[[2]]] into 2
    cost = np.squeeze(cost)
    return cost


def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA


def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于dW,db的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    # calculate the Lth layer gradients
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # calculate from L-1 to 1 layer gradients
    for l in reversed(range(1, L)):  # L-1, L-2, ..., 1
        post_W = caches[l + 1][0]  # W of the next (deeper) layer
        dz = dzL  # dz of the next (deeper) layer

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # z of the current layer
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # A of the previous layer
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # carry dz to the next (shallower) layer
    return gradients


def update_parameters(parameters, grads, learning_rate):
    """
    :param parameters: dictionary,  W,b
    :param grads: dW,db
    :param learning_rate: 学习率alpha
    :return:
    """
    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#


def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, gradient_descent='bgd', mini_batch_size=64):
    """
    :param X:输入数据, of shape (input size, number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :param layer_dims:网络中每层的节点数,list
    :param learning_rate:学习率
    :param num_iterations:迭代次数
    :return:
    parameters:最终的网络参数:(W,b)
    """
    m = Y.shape[1]
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if gradient_descent == 'bgd':
        for i in range(0, num_iterations):
            # foward propagation
            AL, caches = forward_propagation(X, parameters)
            # calculate the cost
            cost = compute_cost(AL, Y)
            if i % 1000 == 0:
                # print("Cost after iteration {}: {}".format(i, cost))
                costs.append(cost)
            # backward propagation
            grads = backward_propagation(AL, Y, caches)
            # update parameters
            parameters = update_parameters(parameters, grads, learning_rate)
    elif gradient_descent == 'sgd':
        np.random.seed(3)
        # shuffle the dataset first; this matters for SGD
        permutation = list(np.random.permutation(m))
        shuffled_X = X[:, permutation]
        shuffled_Y = Y[:, permutation].reshape((1, m))
        for i in range(0, num_iterations):
            for j in range(0, m):  # train on one example at a time
                # Forward propagation
                AL, caches = forward_propagation(shuffled_X[:, j].reshape(-1, 1), parameters)
                # Compute cost
                cost = compute_cost(AL, shuffled_Y[:, j].reshape(1, 1))
                # Backward propagation
                grads = backward_propagation(AL, shuffled_Y[:, j].reshape(1, 1), caches)
                # Update parameters.
                parameters = update_parameters(parameters, grads, learning_rate)
                
    elif gradient_descent == 'mini-batch':
        seed = 0
        for i in range(0, num_iterations):
            # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
            seed = seed + 1
            minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
            for minibatch in minibatches:
                # Add your code here to complete this task
                # ********** Begin *********#
                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch
                # Forward propagation
                AL, caches = forward_propagation(minibatch_X, parameters)
                # Compute cost
                cost = compute_cost(AL, minibatch_Y)
                # Backward propagation
                grads = backward_propagation(AL, minibatch_Y, caches)
                # Update parameters
                parameters = update_parameters(parameters, grads, learning_rate)
                # ********** End **********#
            
    return parameters


# DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0006, num_iterations=30000,
        gradient_descent='bgd', mini_batch_size=64):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, gradient_descent,
                               mini_batch_size)
    return parameters
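
The template above leaves out the data loading and evaluation code. Below is a minimal usage sketch meant to be appended to the end of the script above (so its imports are reused); it assumes an 80/20 train/test split of breast_cancer, feature standardization, and the (features, examples) layout the network expects. The layer sizes, learning rate, and iteration count are illustrative choices, not values prescribed by the task.

if __name__ == '__main__':
    # hypothetical driver code: split, standardize, and transpose to (features, examples)
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    # 30 input features -> two hidden layers -> 1 sigmoid output (a "3-layer" DNN)
    layer_dims = [X_train.shape[0], 16, 8, 1]
    parameters = DNN(X_train, y_train, layer_dims, learning_rate=0.0006,
                     num_iterations=1000, gradient_descent='mini-batch', mini_batch_size=64)

    # evaluate by thresholding the sigmoid output at 0.5
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))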



Task 2: Momentum-Based Gradient Descent

Task: implement a 3-layer DNN trained with the momentum and NAG (Nesterov accelerated gradient) optimizers respectively, using sklearn's breast_cancer dataset as the training and test sets.

import numpy as np
from sklearn.datasets import  load_breast_cancer
from sklearn.model_selection import train_test_split
#initialize parameters(w,b)
def initialize_parameters(layer_dims):
    '''
    初始化参数w,b
    :param layer_dims: 网络中每一层的单元个数,类型为list
    :return:存储参数w1,w2,...,wL,b1,...,bL,类型为dictionary
    '''
    np.random.seed(3)
    L = len(layer_dims)#the number of layers in the network
    parameters = {}
    for l in range(1,L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*np.sqrt(2/layer_dims[l-1]) # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l],1))
    return parameters
def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0,Z)
    return A

def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A

def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None,None,None,X)]  # 第0层(None,None,None,A0) w,b,z用none填充,下标与层数一致,用于存储每一层的,w,b,z,A
    # calculate from 1 to L-1 layer
    for l in range(1,L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W,A_pre) + b #计算z = wx + b
        A = relu(z) #relu activation function
        caches.append((W,b,z,A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL,A) + bL
    AL = sigmoid(zL)
    caches.append((WL,bL,zL,AL))
    return AL, caches

def compute_cost(AL,Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    # cost = -1.0/m * np.sum(Y*np.log(AL)+(1-Y)*np.log(1.0 - AL))#py中*是点乘
    # cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T)) #推荐用这个,上面那个容易出错
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    #从数组的形状中删除单维条目,即把shape中为1的维度去掉,比如把[[[2]]]变成2
    cost = np.squeeze(cost)
    return cost

# derivation of relu
def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA

def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于W,b的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    #calculate the Lth layer gradients
    prev_AL = caches[L-1][3]
    dzL = 1./m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW"+str(L):dWL, "db"+str(L):dbL}
    #calculate from L-1 to 1 layer gradients
    for l in reversed(range(1,L)): # L-1,L-3,....,1
        post_W= caches[l+1][0] #要用后一层的W
        dz = dzL #用后一层的dz

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]#当前层的z
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l-1][3]#前一层的A
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl #更新dz
    return gradients

def random_mini_batches(X, Y, mini_batch_size = 64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def initialize_velocity(parameters):
    """
    初始化参数velocity,类型为directory:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    :params:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    v -- 当前的velocity变量,类型为python dictionary.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    # Initialize velocity
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return v

#momentum
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    使用Momentum算法更新网络参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary :
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- 当前的velocity变量,类型为python dictionary:
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    beta -- momentum超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters --  更新后的网络参数,python dictionary
    '''
    VdW = beta * VdW + (1-beta) * dW
    Vdb = beta * Vdb + (1-beta) * db
    W = W - learning_rate * VdW
    b = b - learning_rate * Vdb
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        v["dW"+str(l+1)]=beta*v["dW"+str(l+1)]+(1-beta)*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta*v["db"+str(l+1)]+(1-beta)*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#

#nesterov momentum
def update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate):
    """
    用nesterov_mmentum算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary :
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- 当前的velocity变量,类型为python dictionary:
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    beta -- momentum超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters -- 更新后的网络参数,python dictionary
    v -- 更新后的velocities,python dictionary
    '''
    VdW = beta * VdW - learning_rate * dW
    Vdb = beta * Vdb - learning_rate * db
    W = W + beta * VdW - learning_rate * dW
    b = b + beta * Vdb - learning_rate * db
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        v["dW"+str(l+1)]=beta*v["dW"+str(l+1)]-learning_rate*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta*v["db"+str(l+1)]-learning_rate*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]+=beta*v["dW"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]+=beta*v["db"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    return parameters
    # ********** End **********#


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta, mini_batch_size=64):
    """
    :param X: input data, of shape (input size, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :param layer_dims: number of units in each layer, list
    :param learning_rate: learning rate
    :param num_iterations: number of iterations (epochs)
    :param optimizer: 'momentum' or 'nesterov_momentum'
    :param beta: momentum hyperparameter
    :param mini_batch_size: mini-batch size
    :return:
    parameters: the trained network parameters (W, b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if optimizer == "momentum" or optimizer == "nesterov_momentum" :
        v = initialize_velocity(parameters)
    t = 0 # initializing the counter required for Adam update
    seed = 0
    for i in range(0, num_iterations):
        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            if optimizer == "momentum":
                parameters = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "nesterov_momentum":
                parameters = update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate)

        if i % 100 == 0:
            #print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)
    return parameters

#DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0005, num_iterations=5000, optimizer='momentum', beta=0.9, mini_batch_size=64):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, optimizer, beta, mini_batch_size)
    return parameters
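
The same kind of driver works for this task; only the optimizer-related arguments change. A minimal sketch appended to the script above, again with assumed preprocessing and illustrative hyperparameters:

if __name__ == '__main__':
    # hypothetical driver code, mirroring the Task 1 sketch
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    layer_dims = [X_train.shape[0], 16, 8, 1]
    # switch optimizer to 'nesterov_momentum' to compare the two variants
    parameters = DNN(X_train, y_train, layer_dims, learning_rate=0.0005,
                     num_iterations=500, optimizer='momentum', beta=0.9, mini_batch_size=64)
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))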



Task 3: Adaptive Learning-Rate Algorithms

Task: implement a 3-layer DNN trained with the Adagrad, Adadelta, RMSProp, and Adam optimizers respectively, using sklearn's breast_cancer dataset as the training and test sets.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


# initialize parameters(w,b)
def initialize_parameters(layer_dims):
    '''
    初始化参数w,b
    :param layer_dims: 网络中每一层的单元个数,类型为list
    :return:存储参数w1,w2,...,wL,b1,...,bL,类型为dictionary
    '''
    np.random.seed(3)
    L = len(layer_dims)  # the number of layers in the network
    parameters = {}
    for l in range(1, L):
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(
            2 / layer_dims[l - 1])  # he initialization
        # parameters["W" + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1])) #为了测试初始化为0的后果
        # parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(1 / layer_dims[l - 1])  # xavier initialization
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters


def relu(Z):
    """
    实现relu激活层
    :param Z: 线性层的输出
    :return A: 激活后的输出
    """
    A = np.maximum(0, Z)
    return A


# implement the activation function(ReLU and sigmoid)
def sigmoid(Z):
    """
    实现sigmoid激活层
    :param Z: 线性层的输出
    :return A:激活后的输出
    """
    A = 1 / (1 + np.exp(-Z))
    return A


def forward_propagation(X, parameters):
    """
    前向传播
    :param X: 输入数据,大小为 (input size, number of examples)
    :param parameters -- python字典包含参数 "W1", "b1", "W2", "b2",...,"WL", "bL"
                    W -- 网络节点的权重,大小为 (size of current layer, size of previous layer)
                    b -- 网络节点的偏置,大小为 (size of current layer,1)
    :return:
    AL: 输出层(y_predict)的输出
    caches: list类型, list中每个元素为tuple类型:(W,b,z,A_pre)
    """
    L = len(parameters) // 2  # number of layer
    A = X
    caches = [(None, None, None, X)]  # 第0层(None,None,None,A0) w,b,z用none填充,下标与层数一致,用于存储每一层的,w,b,z,A
    # calculate from 1 to L-1 layer
    for l in range(1, L):
        A_pre = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        z = np.dot(W, A_pre) + b  # 计算z = wx + b
        A = relu(z)  # relu activation function
        caches.append((W, b, z, A))
    # calculate Lth layer
    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches


# calculate cost function
def compute_cost(AL, Y):
    """
    计算损失
    :param AL: 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :return:
    """
    m = Y.shape[1]
    # cost = -1.0/m * np.sum(Y*np.log(AL)+(1-Y)*np.log(1.0 - AL))#py中*是点乘
    # cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T)) #推荐用这个,上面那个容易出错
    cost = 1. / m * np.nansum(np.multiply(-np.log(AL), Y) +
                              np.multiply(-np.log(1 - AL), 1 - Y))
    # 从数组的形状中删除单维条目,即把shape中为1的维度去掉,比如把[[[2]]]变成2
    cost = np.squeeze(cost)
    # print('=====================cost===================')
    # print(cost)
    return cost


# derivation of relu
def relu_backward(Z):
    """
    :param Z: 激活层的输入
    :return: 激活层的输出
    """
    dA = np.int64(Z > 0)
    return dA


def backward_propagation(AL, Y, caches):
    """
    反向传播
    :param AL -- 最后一层的激活值,即预测值,shape:(1,number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param caches -- 调用forward_propagation()函数得到的caches,(W,b,z,pre_A)
    Returns:
    gradients --  关于W,b的梯度值,类型为directory
    """
    m = Y.shape[1]
    L = len(caches) - 1
    # print("L:   " + str(L))
    # calculate the Lth layer gradients
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    # print(dzL.shape)
    # print(prev_AL.T.shape)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # calculate from L-1 to 1 layer gradients
    for l in reversed(range(1, L)):  # L-1,L-3,....,1
        post_W = caches[l + 1][0]  # 要用后一层的W
        dz = dzL  # 用后一层的dz

        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # 当前层的z
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # 前一层的A
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)

        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # 更新dz
    return gradients


def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    从(X, Y)创建minibatches
    :param X -- 输入数据, of shape (input size, number of examples)
    :param Y -- 真实值,shape:(1, number of examples)
    :param mini_batch_size -- mini-batches的大小, integer
    Returns:
    mini_batches -- 对应(mini_batch_X, mini_batch_Y)的list
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size  # number of mini-batches of size mini_batch_size in your partitioning
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def initialize_velocity(parameters):
    """
    初始化参数velocity,类型为directory:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    :params:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    v -- 当前的velocity变量,类型为python dictionary.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    # Initialize velocity
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return v


# AdaGrad initialization
def initialize_adagrad(parameters):
    """
    初始化 velocity变量, 类型为python dictionary :
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary 
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    Returns:
    Gt -- 到第t次迭代对dw,db的平方梯度和,python dictionary
                    G['dW' + str(l)] = sum of the squares of the gradients up to dwl
                    G['db' + str(l)] = sum of the squares of the gradients up to db1
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    G = {}
    # Initialize velocity
    for l in range(L):
        G["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        G["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
    return G


# AdaGrad
def update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon=1e-7):
    """
    用Adagrad算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    G -- 当前的G变量,python dictionary :
                    G['dW' + str(l)] = G of dW1
                    G['db' + str(l)] = G of db1
    learning_rate -- xue'xi学习率, scalar
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的网络参数,python dictionary
    '''
    GW += (dW)^2
    W -= learning_rate/sqrt(GW + epsilon)*dW
    Gb += (db)^2
    b -= learning_rate/sqrt(Gb + epsilon)*db
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********# 
    L=len(parameters) // 2
    for l in range(L):
        G["dW"+str(l+1)]+=grads['dW'+str(l+1)]**2
        G["db"+str(l+1)]+=grads['db'+str(l+1)]**2
        parameters["W"+str(l+1)]-=learning_rate/(np.sqrt(G["dW"+str(l+1)])+epsilon)*grads['dW'+str(l+1)]
        parameters["b"+str(l+1)]-=learning_rate/(np.sqrt(G["db"+str(l+1)])+epsilon)*grads['db'+str(l+1)]
    return parameters
    # ********** End **********#


# initialize_adadelta
def initialize_adadelta(parameters):
    """
    初始化s和delta 变量,均为 python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    Returns:
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient of dw
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...
    v -- python dictionary that will contain the RMS
                v["dW" + str(l)] = ...
                v["db" + str(l)] = ...
    delta --  dw的平方梯度的指数加权平均,python dictionary 
                    delta["dW" + str(l)] = ...
                    delta["db" + str(l)] = ...
    """

    L = len(parameters) // 2  # number of layers in the neural networks
    s = {}
    v = {}
    delta = {}
    # Initialize s, v, delta. Input: "parameters". Outputs: "s, v, delta".
    for l in range(L):
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        delta["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        delta["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)

    return s, v, delta


# adadelta
def update_parameters_with_adadelta(parameters, grads, rho, s, v, delta, epsilon=1e-6):
    """
    使用adadelta算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    rho -- 衰减常数 ,和 momentum方法类似, scalar
    s -- 当前的velocity变量,python dictionary:
                    s['dW' + str(l)] = ...
                    s['db' + str(l)] = ...
    delta -- 当前的RMS变量,python dictionary:
                    delta['dW' + str(l)] = ...
                    delta['db' + str(l)] = ...
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的参数,python dictionary
    '''
    Sdw = rho*Sdw + (1 - rho)*(dW)^2
    Sdb = rho*Sdb + (1 - rho)*(db)^2
    Vdw = sqrt((delta_w + epsilon) / (Sdw + epsilon))*dW
    Vdb = sqrt((delta_b + epsilon) / (Sdb + epsilon))*dW
    W -= Vdw
    b -= Vdb
    delta_w = rho*delta_w + (1 - rho)*(Vdw)^2
    delta_b = rho*delta_b + (1 - rho)*(Vdb)^2
    '''
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        s["dW"+str(l+1)]=rho*s["dW"+str(l+1)]+(1-rho)*grads['dW'+str(l+1)]**2
        s["db"+str(l+1)]=rho*s["db"+str(l+1)]+(1-rho)*grads['db'+str(l+1)]**2
        v["dW"+str(l+1)]=np.sqrt((delta["db"+str(l+1)]+epsilon)/(s["dW"+str(l+1)]+epsilon))*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=np.sqrt((delta["db"+str(l+1)]+epsilon)/(s["db"+str(l+1)]+epsilon))*grads['db'+str(l+1)]
        parameters["W"+str(l+1)]-=v["dW"+str(l+1)]
        parameters["b"+str(l+1)]-=v["db"+str(l+1)]
        delta["dW"+str(l+1)]=rho*delta["dW"+str(l+1)]+(1-rho)*v["dW"+str(l+1)]**2
        delta["db"+str(l+1)]=rho*delta["db"+str(l+1)]+(1-rho)*v["db"+str(l+1)]**2
    return parameters
    # ********** End **********#


# RMSprop
def update_parameters_with_rmsprop(parameters, grads, s, beta=0.9, learning_rate=0.01, epsilon=1e-6):
    """
    使用RMSprop算法更新参数
    Arguments:
    parameters -- 网络模型参数w,b,类型为python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    s -- 当前的velocity变量,python dictionary:
                    v['dW' + str(l)] = ...
                    v['db' + str(l)] = ...
    beta -- momentum 超参数, scalar
    learning_rate -- 学习率, scalar
    Returns:
    parameters -- 更新后的参数,python dictionary
    '''
    SdW = beta * SdW + (1-beta) * (dW)^2
    sdb = beta * Sdb + (1-beta) * (db)^2
    W = W - learning_rate * dW/sqrt(SdW + epsilon)
    b = b - learning_rate * db/sqrt(Sdb + epsilon)
    '''
    """
    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    for l in range(L):
        s["dW"+str(l+1)]=beta*s["dW"+str(l+1)]+(1-beta)*grads['dW'+str(l+1)]**2
        s["db"+str(l+1)]=beta*s["db"+str(l+1)]+(1-beta)*grads['db'+str(l+1)]**2
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads['dW'+str(l+1)]/np.sqrt(s["dW"+str(l+1)]+epsilon)
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads['db'+str(l+1)]/np.sqrt(s["db"+str(l+1)]+epsilon)
    return parameters
    # ********** End **********#


# initialize adam
def initialize_adam(parameters):
    """
    初始化v和 s ,均为python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: 全0的numpy arrays ,shape和相应的参数一致
    Arguments:
    parameters -- 网络参数w,b,python dictionary 
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    Returns:
    v -- 梯度指数加权平均,python dictionary 
                    v["dW" + str(l)] = ...
                    v["db" + str(l)] = ...
    s -- 平方梯度指数加权平均,python dictionary .
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    s = {}
    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)

    return v, s


# adam
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    使用Adam算法更新参数
    Arguments:
    parameters -- 网络参数w,b,python dictionary:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, 梯度指数加权平均,python dictionary 
    s -- Adam variable, 平方梯度指数加权平均,python dictionary 
    learning_rate -- 学习率, scalar.
    beta1 -- 第一个动量估计的指数衰减超参数
    beta2 -- 第二个动量估计的指数衰减超参数
    epsilon -- 防止除0错误的超参数
    Returns:
    parameters -- 更新后的参数,python dictionary
    """

    # 请在此添加代码 完成本关任务
    # ********** Begin *********#
    L=len(parameters) // 2
    v_corrected={}
    s_corrected={}
    for l in range(L):
        v["dW"+str(l+1)]=beta1*v["dW"+str(l+1)]+(1-beta1)*grads['dW'+str(l+1)]
        v["db"+str(l+1)]=beta1*v["db"+str(l+1)]+(1-beta1)*grads['db'+str(l+1)]
        v_corrected["dW"+str(l+1)]=v["dW"+str(l+1)]/(1-np.power(beta1,t))
        v_corrected["db"+str(l+1)]=v["db"+str(l+1)]/(1-np.power(beta1,t))
        s["dW"+str(l+1)]=beta2*s["dW"+str(l+1)]+(1-beta2)*np.power(grads['dW'+str(l+1)],2)
        s["db"+str(l+1)]=beta2*s["db"+str(l+1)]+(1-beta2)*np.power(grads['db'+str(l+1)],2)
        s_corrected["dW"+str(l+1)]=s["dW"+str(l+1)]/(1-np.power(beta2,t))
        s_corrected["db"+str(l+1)]=s["db"+str(l+1)]/(1-np.power(beta2,t))
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*v_corrected["dW"+str(l+1)]/np.sqrt(s_corrected["dW"+str(l+1)]+epsilon)
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*v_corrected["db"+str(l+1)]/np.sqrt(s_corrected["db"+str(l+1)]+epsilon)
    return parameters
    # ********** End **********#


def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta=0.9, beta2=0.999, mini_batch_size=64,
                  epsilon=1e-8):
    """
    :param X:输入数据, of shape (input size, number of examples)
    :param Y:真实值,shape:(1, number of examples)
    :param layer_dims:网络中每层的节点数,list
    :param learning_rate:学习率
    :param num_iterations:迭代次数
    :return:
    parameters:最终的网络参数:(W,b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if optimizer == "sgd":
        pass  # no initialization required for gradient descent
    elif optimizer == "momentum" or optimizer == "nesterov_momentum" or optimizer == "rmsprop":
        v = initialize_velocity(parameters)
    elif optimizer == "adagrad":
        G = initialize_adagrad(parameters)
    elif optimizer == "adadelta":
        s, v, delta = initialize_adadelta(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)
    t = 0  # initializing the counter required for Adam update
    seed = 0
    for i in range(0, num_iterations):
        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            if optimizer == "adagrad":
                parameters = update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon)
            elif optimizer == "adadelta":
                parameters = update_parameters_with_adadelta(parameters, grads, beta, s, v, delta, epsilon)
            elif optimizer == "rmsprop":
                parameters = update_parameters_with_rmsprop(parameters, grads, v, beta, learning_rate, epsilon)
            elif optimizer == "adam":
                t += 1
                parameters = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta, beta2,
                                                         epsilon)

        if i % 100 == 0:
            #print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)
    return parameters


# DNN model
def DNN(X_train, y_train, X_test, y_test, layer_dims, learning_rate=0.0005, num_iterations=5000, optimizer='adam',
        beta=0.9, beta2=0.999, mini_batch_size=64, epsilon=1e-8):
    parameters = L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations, optimizer, beta, beta2,
                               mini_batch_size, epsilon)
    return parameters
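
And the corresponding sketch for this task, appended to the script above (assumed preprocessing, illustrative hyperparameters); the four adaptive methods are selected through the optimizer argument. Note that this DNN signature also takes X_test and y_test, even though L_layer_model does not use them:

if __name__ == '__main__':
    # hypothetical driver code; try optimizer='adagrad', 'adadelta', 'rmsprop' or 'adam'
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=1)
    mean, std = X_train.mean(axis=0), X_train.std(axis=0)
    X_train, X_test = ((X_train - mean) / std).T, ((X_test - mean) / std).T
    y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)

    layer_dims = [X_train.shape[0], 16, 8, 1]
    parameters = DNN(X_train, y_train, X_test, y_test, layer_dims, learning_rate=0.0005,
                     num_iterations=500, optimizer='adam', beta=0.9, beta2=0.999, mini_batch_size=64)
    AL, _ = forward_propagation(X_test, parameters)
    print("test accuracy:", np.mean((AL > 0.5) == y_test))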






Task 4: Using Keras Optimizers

Task: build a simple DNN with the Keras framework to classify the MNIST dataset.

import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.optimizers import SGD,Adagrad,Adadelta,RMSprop,Adam

# input image dimensions
img_rows, img_cols = 28, 28

def dnn(X_train, Y_train, X_test, Y_test, choice):
    """
    Build a simple DNN with Keras to classify the MNIST dataset; the point is to practice using the different optimizers.
    Arguments:
    X_train -- training samples
    Y_train -- training labels
    X_test -- test samples
    Y_test -- test labels
    choice -- which optimizer to use (1=SGD, 2=momentum, 3=NAG, 4=Adagrad, 5=Adadelta, 6=RMSprop, 7=Adam)
    Returns:
    model -- the Keras model
    """
    epoch = 2
    sgd = SGD(lr=0.01, momentum=0., decay=0., nesterov=False)
    momentum = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    nag = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    adagrad = Adagrad(lr=0.01, epsilon=1e-6)
    adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-6)
    rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-6)
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    # define the network layers
    model = Sequential()
    model.add(Dense(512, input_shape=(img_cols*img_rows,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    

    #choose the optimizer
    if choice==1:
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    elif choice==2:
        model.compile(loss='categorical_crossentropy', optimizer=momentum, metrics=['accuracy'])
    elif choice==3:
        model.compile(loss='categorical_crossentropy', optimizer=nag, metrics=['accuracy'])
    elif choice==4:
        model.compile(loss='categorical_crossentropy', optimizer=adagrad, metrics=['accuracy'])
    elif choice==5:
        model.compile(loss='categorical_crossentropy', optimizer=adadelta, metrics=['accuracy'])
    elif choice==6:
        model.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
    elif choice==7:
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    model.fit(X_train, Y_train,
              batch_size=128, epochs=epoch,
              verbose=0, validation_data=(X_test, Y_test))
    return model
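
A minimal usage sketch, assuming the classic Keras MNIST pipeline (keras.datasets.mnist plus one-hot labels from np_utils.to_categorical); the choice index follows the mapping in the docstring above:

if __name__ == '__main__':
    from keras.datasets import mnist
    from keras.utils import np_utils

    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # flatten the 28x28 images and scale pixel values to [0, 1]
    X_train = X_train.reshape(-1, img_rows * img_cols).astype('float32') / 255.
    X_test = X_test.reshape(-1, img_rows * img_cols).astype('float32') / 255.
    Y_train = np_utils.to_categorical(y_train, 10)
    Y_test = np_utils.to_categorical(y_test, 10)

    model = dnn(X_train, Y_train, X_test, Y_test, choice=7)  # 7 = Adam
    score = model.evaluate(X_test, Y_test, verbose=0)
    print('test loss:', score[0], 'test accuracy:', score[1])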

Optimization methods in deep learning are the algorithms used to update model parameters so as to minimize the loss function during training. Commonly used methods include Momentum, Nesterov Momentum, AdaGrad, Adadelta, RMSprop, and Adam.

1. Momentum
Momentum is a gradient-descent-based method that speeds up convergence by introducing a velocity term. When updating the parameters it considers not only the current gradient but also the influence of past gradients on the update direction; by accumulating past gradients it makes the update direction more stable and accelerates convergence.

2. Nesterov Momentum
Nesterov Momentum is a variant of Momentum. Before updating the parameters it first "looks ahead" one step, moving the parameters along the current velocity, and only then evaluates the gradient, combining the two pieces of information to perform the update. Compared with plain Momentum, Nesterov Momentum tends to converge faster.

3. AdaGrad
AdaGrad is an adaptive-learning-rate method that adjusts the learning rate dynamically to match each parameter's update needs. Concretely, it applies a separate learning rate to each parameter, and each parameter's effective learning rate keeps shrinking as training proceeds, which reduces oscillation in the updates.

4. Adadelta
Adadelta is also an adaptive-learning-rate method. Unlike AdaGrad, it takes into account not only past gradient information but also past parameter-update information: it maintains an exponentially decaying average of the squared gradients and an exponentially decaying average of the squared parameter updates, and uses both to scale the update, which makes the updates smoother.

5. RMSprop
RMSprop is another adaptive-learning-rate method. It is similar to Adadelta but uses only past gradient information, not past update information: it maintains an exponentially decaying average of the squared gradients to adjust the learning rate, which also smooths the updates.

6. Adam
Adam combines Momentum and RMSprop: it uses both the first and the second moment of the gradients and adds bias correction, which makes the updates more accurate. Compared with the other methods, Adam usually converges quickly and performs well in practice.
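
For reference, the update rules behind these six methods, written in the notation of the code above (g_t is the current gradient, α the learning rate; bias correction is shown only for Adam):

% Momentum
v_t = \beta v_{t-1} + (1-\beta)\, g_t, \qquad \theta_t = \theta_{t-1} - \alpha\, v_t

% Nesterov momentum (as implemented above)
v_t = \beta v_{t-1} - \alpha\, g_t, \qquad \theta_t = \theta_{t-1} + \beta v_t - \alpha\, g_t

% AdaGrad
G_t = G_{t-1} + g_t^2, \qquad \theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{G_t} + \epsilon}\, g_t

% Adadelta
s_t = \rho\, s_{t-1} + (1-\rho)\, g_t^2, \quad
\Delta_t = \sqrt{\frac{d_{t-1} + \epsilon}{s_t + \epsilon}}\; g_t, \quad
\theta_t = \theta_{t-1} - \Delta_t, \quad
d_t = \rho\, d_{t-1} + (1-\rho)\, \Delta_t^2

% RMSprop
s_t = \beta\, s_{t-1} + (1-\beta)\, g_t^2, \qquad \theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{s_t + \epsilon}}\, g_t

% Adam
v_t = \beta_1 v_{t-1} + (1-\beta_1)\, g_t, \quad s_t = \beta_2 s_{t-1} + (1-\beta_2)\, g_t^2, \quad
\hat v_t = \frac{v_t}{1-\beta_1^t}, \quad \hat s_t = \frac{s_t}{1-\beta_2^t}, \quad
\theta_t = \theta_{t-1} - \alpha\, \frac{\hat v_t}{\sqrt{\hat s_t + \epsilon}}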
