Andrew Ng Neural Networks Course - L2W2 Assignment 1

import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets

from opt_utils import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation
from opt_utils import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset
from testCases_L2W2 import *

plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

#1. Gradient Descent
# Implement the gradient descent update rule.
def update_parameters_with_gd(parameters,grads,learning_rate):
    """
        Update parameters using one step of gradient descent

        Arguments:
        parameters -- python dictionary containing your parameters to be updated:
                        parameters['W' + str(l)] = Wl
                        parameters['b' + str(l)] = bl
        grads -- python dictionary containing your gradients to update each parameters:
                        grads['dW' + str(l)] = dWl
                        grads['db' + str(l)] = dbl
        learning_rate -- the learning rate, scalar.

        Returns:
        parameters -- python dictionary containing your updated parameters
        """
    L=len(parameters)//2

    for l in range(L):
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]

    return parameters

parameters, grads, learning_rate=update_parameters_with_gd_test_case()
parameters = update_parameters_with_gd(parameters, grads, learning_rate)
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

# One variant of gradient descent is stochastic gradient descent (SGD), which is equivalent to mini-batch gradient descent where each mini-batch contains a single example. The update rule you just implemented does not change.
# What changes is that SGD computes the gradient on only one training example at a time, rather than on the whole training set. The code snippets below illustrate the difference between stochastic gradient descent and (batch) gradient descent.

#(Batch) Gradient Descent:
# X = data_input
# Y = labels
# parameters = initialize_parameters(layers_dims)
# for i in range(0, num_iterations):
#     # Forward propagation
#     a, caches = forward_propagation(X, parameters)
#     # Compute cost.
#     cost = compute_cost(a, Y)
#     # Backward propagation.
#     grads = backward_propagation(a, caches, parameters)
#     # Update parameters.
#     parameters = update_parameters(parameters, grads)

#Stochastic Gradient Descent:
# X = data_input
# Y = labels
# parameters = initialize_parameters(layers_dims)
# for i in range(0, num_iterations):
#     for j in range(0, m):
#         # Forward propagation
#         a, caches = forward_propagation(X[:,j], parameters)
#         # Compute cost
#         cost = compute_cost(a, Y[:,j])
#         # Backward propagation
#         grads = backward_propagation(a, caches, parameters)
#         # Update parameters.
#         parameters = update_parameters(parameters, grads)

# The difference between gradient descent, mini-batch gradient descent, and stochastic gradient descent is the number of examples used to perform one update step.
# The learning rate hyperparameter must be tuned.
# With a well-chosen mini-batch size, mini-batch gradient descent usually outperforms both gradient descent and stochastic gradient descent (especially when the training set is large). A quick sketch of the update counts per epoch is given below.
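
# A minimal sketch (toy numbers, not taken from the assignment) of how many parameter
# updates each variant performs in one pass (epoch) over m training examples:
# m_toy = 1024
# print("batch GD updates per epoch: " + str(m_toy // m_toy))          # 1
# print("mini-batch (size 64) updates per epoch: " + str(m_toy // 64)) # 16
# print("SGD updates per epoch: " + str(m_toy // 1))                   # 1024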


#2. Mini-Batch Gradient Descent
# Learn how to build mini-batches from the training set (X, Y).
# There are two steps:
# 1. Shuffle: create a shuffled version of the training set (X, Y). Each column of X and Y represents one training example. Note that X and Y are shuffled synchronously,
#    so that after shuffling, each column of X still corresponds to its label in Y. The shuffling step ensures that examples are distributed randomly across the mini-batches.
# 2. Partition: split the shuffled (X, Y) into mini-batches of size mini_batch_size (here 64). Note that the number of training examples is not always divisible by mini_batch_size
#    (a small sanity check of this end case appears after the test prints below).

# Implement random_mini_batches. The shuffling part has been coded for you; the partition step slices the shuffled data, for example:
# first_mini_batch_X = shuffled_X[:, 0 : mini_batch_size]
# second_mini_batch_X = shuffled_X[:, mini_batch_size : 2 * mini_batch_size]

def random_mini_batches(X,Y,mini_batch_size=64,seed=0):
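    """
    Create a list of random mini-batches from (X, Y).

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector, of shape (1, number of examples)
    mini_batch_size -- size of each mini-batch, integer
    seed -- random seed, so the shuffling is reproducible

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y) tuples
    """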
    np.random.seed(seed)
    m=X.shape[1]
    mini_batches=[]

    # Step 1: Shuffle (X, Y)
    permutation=list(np.random.permutation(m)) #np.random.permutation(m): a random permutation of the integers 0..m-1
    shuffled_X=X[:,permutation]
    shuffled_Y=Y[:,permutation].reshape((1,m))

    # Step 2: Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches=math.floor(m/mini_batch_size) #math.floor(s) rounds s down to the nearest integer
    for k in range(num_complete_minibatches):
        mini_batch_X=shuffled_X[:,k*mini_batch_size:(k+1)*mini_batch_size]
        mini_batch_Y=shuffled_Y[:,k*mini_batch_size:(k+1)*mini_batch_size]
        mini_batch=(mini_batch_X,mini_batch_Y)
        mini_batches.append(mini_batch)

    if m%mini_batch_size!=0:
        mini_batch_X=shuffled_X[:,num_complete_minibatches*mini_batch_size:m]
        mini_batch_Y=shuffled_Y[:,num_complete_minibatches*mini_batch_size:m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches

X_assess, Y_assess, mini_batch_size = random_mini_batches_test_case()
mini_batches = random_mini_batches(X_assess, Y_assess, mini_batch_size)

# print ("shape of the 1st mini_batch_X: " + str(mini_batches[0][0].shape))
# print ("shape of the 2nd mini_batch_X: " + str(mini_batches[1][0].shape))
# print ("shape of the 3rd mini_batch_X: " + str(mini_batches[2][0].shape))
# print ("shape of the 1st mini_batch_Y: " + str(mini_batches[0][1].shape))
# print ("shape of the 2nd mini_batch_Y: " + str(mini_batches[1][1].shape))
# print ("shape of the 3rd mini_batch_Y: " + str(mini_batches[2][1].shape))
# print ("mini batch sanity check: " + str(mini_batches[0][0][0][0:3]))


#3. Momentum
# Initialize the velocity. The velocity v is a Python dictionary that must be initialized with arrays of zeros. Its keys are the same as the keys of the grads dictionary, that is:
# v["dW" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["W" + str(l+1)])
# v["db" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["b" + str(l+1)])

def initialize_velocity(parameters):
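    """
    Initialize the velocity as a python dictionary with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros with the same shape as the corresponding parameters/gradients.
    """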
    L=len(parameters)//2
    v={}

    for l in range(L):
        v["dW"+str(l+1)]=np.zeros((parameters["W"+str(l+1)].shape[0],parameters["W"+str(l+1)].shape[1]))
        v["db"+str(l+1)]=np.zeros((parameters["b"+str(l+1)].shape[0],parameters["b"+str(l+1)].shape[1]))

    return v

parameters = initialize_velocity_test_case()

v = initialize_velocity(parameters)
# print("v[\"dW1\"] = " + str(v["dW1"]))
# print("v[\"db1\"] = " + str(v["db1"]))
# print("v[\"dW2\"] = " + str(v["dW2"]))
# print("v[\"db2\"] = " + str(v["db2"]))

# Implement the parameter update with momentum.
# Momentum takes past gradients into account to smooth out the steps of gradient descent. It can be applied to batch gradient descent, mini-batch gradient descent, or stochastic gradient descent. The rule it applies is summarized in the comments below.
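# Momentum update rule, for l = 1, ..., L (this is exactly what the function below implements):
#   v_dWl = beta * v_dWl + (1 - beta) * dWl        Wl = Wl - learning_rate * v_dWl
#   v_dbl = beta * v_dbl + (1 - beta) * dbl        bl = bl - learning_rate * v_dbl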
def update_parameters_with_momentum(parameters,grads,v,beta,learning_rate):
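    """
    Update parameters using one step of gradient descent with momentum.

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients for each parameter
    v -- python dictionary containing the current velocities
    beta -- the momentum hyperparameter, scalar
    learning_rate -- the learning rate, scalar

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- python dictionary containing your updated velocities
    """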
    L=len(parameters)//2
    for l in range(L):
        v["dW"+str(l+1)]=beta*v["dW"+str(l+1)]+(1-beta)*grads["dW"+str(l+1)]
        v["db"+str(l+1)]=beta*v["db"+str(l+1)]+(1-beta)*grads["db"+str(l+1)]
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*v["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*v["db"+str(l+1)]

    return parameters,v

parameters, grads, v = update_parameters_with_momentum_test_case()

parameters, v = update_parameters_with_momentum(parameters, grads, v, beta = 0.9, learning_rate = 0.01)
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))
# print("v[\"dW1\"] = " + str(v["dW1"]))
# print("v[\"db1\"] = " + str(v["db1"]))
# print("v[\"dW2\"] = " + str(v["dW2"]))
# print("v[\"db2\"] = " + str(v["db2"]))


#4. Adam
# Initialize the Adam variables v and s, which keep track of past information. They are python dictionaries that must be initialized with arrays of zeros. Their keys are the same as the keys of grads, that is:
# v["dW" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["W" + str(l+1)])
# v["db" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["b" + str(l+1)])
# s["dW" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["W" + str(l+1)])
# s["db" + str(l+1)] = ... #(numpy array of zeros with the same shape as parameters["b" + str(l+1)])

def initialize_adam(parameters):
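    """
    Initialize v and s as two python dictionaries with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros with the same shape as the corresponding parameters/gradients.
    v tracks the exponentially weighted average of the gradient, s of the squared gradient.
    """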
    L=len(parameters)//2
    v={}
    s={}

    for l in range(L):
        v["dW"+str(l+1)]=np.zeros((parameters["W"+str(l+1)].shape[0],parameters["W"+str(l+1)].shape[1]))
        v["db" + str(l + 1)] = np.zeros((parameters["b" + str(l + 1)].shape[0], parameters["b" + str(l + 1)].shape[1]))
        s["dW" + str(l + 1)] = np.zeros((parameters["W" + str(l + 1)].shape[0], parameters["W" + str(l + 1)].shape[1]))
        s["db" + str(l + 1)] = np.zeros((parameters["b" + str(l + 1)].shape[0], parameters["b" + str(l + 1)].shape[1]))

    return v,s

parameters = initialize_adam_test_case()

v, s = initialize_adam(parameters)
# print("v[\"dW1\"] = " + str(v["dW1"]))
# print("v[\"db1\"] = " + str(v["db1"]))
# print("v[\"dW2\"] = " + str(v["dW2"]))
# print("v[\"db2\"] = " + str(v["db2"]))
# print("s[\"dW1\"] = " + str(s["dW1"]))
# print("s[\"db1\"] = " + str(s["db1"]))
# print("s[\"dW2\"] = " + str(s["dW2"]))
# print("s[\"db2\"] = " + str(s["db2"]))

def update_parameters_with_adam(parameters,grads,v,s,t,learning_rate=0.01,beta1=0.9,beta2=0.999,epsilon=1e-8):
    """
        Update parameters using Adam

        Arguments:
        parameters -- python dictionary containing your parameters:
                        parameters['W' + str(l)] = Wl
                        parameters['b' + str(l)] = bl
        grads -- python dictionary containing your gradients for each parameters:
                        grads['dW' + str(l)] = dWl
                        grads['db' + str(l)] = dbl
        v -- Adam variable, moving average of the first gradient, python dictionary
        s -- Adam variable, moving average of the squared gradient, python dictionary
        learning_rate -- the learning rate, scalar.
        beta1 -- Exponential decay hyperparameter for the first moment estimates
        beta2 -- Exponential decay hyperparameter for the second moment estimates
        epsilon -- hyperparameter preventing division by zero in Adam updates

        Returns:
        parameters -- python dictionary containing your updated parameters
        v -- Adam variable, moving average of the first gradient, python dictionary
        s -- Adam variable, moving average of the squared gradient, python dictionary
        """
    L=len(parameters)//2
    v_correct={}
    s_correct={}

    for l in range(L):
        v["dW"+str(l+1)]=beta1*v["dW"+str(l+1)]+(1-beta1)*grads["dW"+str(l+1)]
        v["db"+str(l+1)]=beta1*v["db"+str(l+1)]+(1-beta1)*grads["db"+str(l+1)]
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * (grads["dW" + str(l + 1)]**2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * (grads["db" + str(l + 1)]**2)

        v_correct["dW"+str(l+1)]=v["dW"+str(l+1)]/(1-beta1**t)
        v_correct["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - beta1 ** t)
        s_correct["dW"+str(l+1)]=s["dW" + str(l + 1)]/(1-beta2**t)
        s_correct["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - beta2 ** t)

        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*v_correct["dW"+str(l+1)]/(np.sqrt(s_correct["dW"+str(l+1)]+epsilon))
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*v_correct["db"+str(l+1)]/(np.sqrt(s_correct["db"+str(l+1)]+epsilon))

    return parameters,v,s

parameters, grads, v, s = update_parameters_with_adam_test_case()
parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t=2)

# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))
# print("v[\"dW1\"] = " + str(v["dW1"]))
# print("v[\"db1\"] = " + str(v["db1"]))
# print("v[\"dW2\"] = " + str(v["dW2"]))
# print("v[\"db2\"] = " + str(v["db2"]))
# print("s[\"dW1\"] = " + str(s["dW1"]))
# print("s[\"db1\"] = " + str(s["db1"]))
# print("s[\"dW2\"] = " + str(s["dW2"]))
# print("s[\"db2\"] = " + str(s["db2"]))


#5. Models with different optimization algorithms
train_X, train_Y = load_dataset()
plt.show()

# A three-layer neural network has already been implemented for you. You will train it with:
#
# Mini-batch gradient descent: it will call your function:
#      - update_parameters_with_gd()
# Mini-batch gradient descent with momentum: it will call your functions:
#      - initialize_velocity() and update_parameters_with_momentum()
# Mini-batch Adam: it will call your functions:
#      - initialize_adam() and update_parameters_with_adam()

def model(X, Y, layer_dims, optimizer, learning_rate=0.0007, mini_batch_size=64,
          beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
    """
        3-layer neural network model which can be run in different optimizer modes.

        Arguments:
        X -- input data, of shape (2, number of examples)
        Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
        layer_dims -- python list, containing the size of each layer
        optimizer -- the optimization method to use: "gd", "momentum" or "adam"
        learning_rate -- the learning rate, scalar.
        mini_batch_size -- the size of a mini batch
        beta -- Momentum hyperparameter
        beta1 -- Exponential decay hyperparameter for the past gradients estimates
        beta2 -- Exponential decay hyperparameter for the past squared gradients estimates
        epsilon -- hyperparameter preventing division by zero in Adam updates
        num_epochs -- number of epochs
        print_cost -- True to print the cost every 1000 epochs

        Returns:
        parameters -- python dictionary containing your updated parameters
        """

    L=len(layer_dims)
    costs=[]
    t=0
    seed=10

    parameters=initialize_parameters(layer_dims)

    if optimizer=="gd":
        pass
    elif optimizer=="momentum":
        v=initialize_velocity(parameters)
    elif optimizer=="adam":
        v,s=initialize_adam(parameters)

    for i in range(num_epochs):
        seed+=1
        mini_batches = random_mini_batches(X, Y, mini_batch_size, seed)

        for (batch_X,batch_Y) in mini_batches:
            a3,cache=forward_propagation(batch_X,parameters)
            cost=compute_cost(a3,batch_Y)
            grads=backward_propagation(batch_X,batch_Y,cache)

            if optimizer=="gd":
                parameters=update_parameters_with_gd(parameters,grads,learning_rate)
            elif optimizer=="momentum":
                parameters,v=update_parameters_with_momentum(parameters,grads,v,beta,learning_rate)
            elif optimizer=="adam":
                t+=1 # Adam counter
                parameters,v,s=update_parameters_with_adam(parameters,grads,v,s,t,learning_rate,beta1,beta2,epsilon)

        # Print the cost every 1000 epochs and record it every 100 epochs
        if print_cost and i % 1000 == 0:
            print("Cost after epoch %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return parameters

#5.1 Mini-batch gradient descent
# train 3-layer model
layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, optimizer = "gd")
# Predict
predictions = predict(train_X, train_Y, parameters)
# Plot decision boundary
plt.title("Model with Gradient Descent optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

#5.2 Mini-batch gradient descent with momentum
# train 3-layer model
layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, beta = 0.9, optimizer = "momentum")
# Predict
predictions = predict(train_X, train_Y, parameters)
# Plot decision boundary
plt.title("Model with Momentum optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

#5.3 Mini-batch gradient descent with Adam
# train 3-layer model
layers_dims = [train_X.shape[0], 5, 2, 1]
parameters = model(train_X, train_Y, layers_dims, optimizer = "adam")
# Predict
predictions = predict(train_X, train_Y, parameters)
# Plot decision boundary
plt.title("Model with Adam optimization")
axes = plt.gca()
axes.set_xlim([-1.5,2.5])
axes.set_ylim([-1,1.5])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

# Some advantages of Adam include:
# Relatively low memory requirements (though higher than gradient descent and gradient descent with momentum)
# It usually works well even with little hyperparameter tuning
