第1关:梯度下降算法实战学习:BGD和SGD
本关任务:编写一个分别应用BGD、SGD、mini-batch梯度下降优化方法的3层DNN,使用sklearn的breast_cancer数据集作训练集和测试集。
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
def initialize_parameters(layer_dims):
    """
    He-initialize the weights and zero the biases for every layer.

    :param layer_dims: units per layer, e.g. [n_x, n_h, n_y], type list
    :return: dict with keys "W1".."WL" and "b1".."bL"
    """
    np.random.seed(3)  # fixed seed so every run is reproducible
    parameters = {}
    for layer, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        # He initialization: scale by sqrt(2 / fan_in), well suited to ReLU units
        parameters["W" + str(layer)] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        parameters["b" + str(layer)] = np.zeros((fan_out, 1))
    return parameters
def relu(Z):
    """ReLU activation: element-wise max(0, Z).

    :param Z: linear-layer output
    :return: activated output, same shape as Z
    """
    return np.maximum(Z, 0)
def sigmoid(Z):
    """Logistic sigmoid activation, applied element-wise.

    :param Z: linear-layer output
    :return: activated output in (0, 1), same shape as Z
    """
    return 1 / (1 + np.exp(-Z))
def forward_propagation(X, parameters):
    """
    Forward pass: (L-1) ReLU hidden layers followed by a sigmoid output layer.

    :param X: input data, shape (input size, number of examples)
    :param parameters: dict holding "W1", "b1", ..., "WL", "bL"
    :return: (AL, caches); AL is the output-layer activation (y_predict) and
             caches[l] = (W, b, z, A) for layer l, with caches[0] = (None, None, None, X)
    """
    L = len(parameters) // 2  # number of weight layers
    A = X
    # dummy layer-0 entry keeps cache indices aligned with layer numbers
    caches = [(None, None, None, X)]
    for l in range(1, L):
        W, b = parameters["W" + str(l)], parameters["b" + str(l)]
        z = np.dot(W, A) + b  # z = Wx + b
        A = relu(z)
        caches.append((W, b, z, A))
    WL, bL = parameters["W" + str(L)], parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches
def compute_cost(AL, Y):
    """
    Binary cross-entropy cost averaged over the batch.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :return: scalar cost
    """
    m = Y.shape[1]
    # nansum drops nan terms such as 0 * log(0), treating them as 0
    losses = -(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
    # squeeze collapses any singleton dimensions, e.g. [[[2]]] -> 2
    return np.squeeze(1. / m * np.nansum(losses))
def relu_backward(Z):
    """Derivative of ReLU w.r.t. its input: 1 where Z > 0, else 0 (int64 mask).

    :param Z: pre-activation input of the layer
    :return: int64 array, same shape as Z
    """
    return (Z > 0).astype(np.int64)
def backward_propagation(AL, Y, caches):
    """
    Backward pass for the ReLU-hidden / sigmoid-output network.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :param caches: list from forward_propagation(); caches[l] = (W, b, z, A) for layer l,
                   with caches[0] = (None, None, None, X)
    :return: gradients dict with keys "dW1".."dWL" and "db1".."dbL"
    """
    m = Y.shape[1]
    L = len(caches) - 1  # number of weight layers (caches[0] is the input)
    # Output (Lth) layer: for sigmoid + cross-entropy, dZL = (AL - Y) averaged over m
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # Hidden layers, from L-1 down to 1
    for l in reversed(range(1, L)):
        post_W = caches[l + 1][0]  # weights of the layer after l
        dz = dzL  # dz of the layer after l
        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # this layer's pre-activation
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # activation of the previous layer
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)
        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # carry dz down to the next (shallower) layer
    return gradients
def update_parameters(parameters, grads, learning_rate):
    """
    Vanilla gradient-descent step: p <- p - alpha * dp for every W and b.

    :param parameters: dict of "W1".."WL", "b1".."bL"
    :param grads: dict of "dW1".."dWL", "db1".."dbL"
    :param learning_rate: step size alpha
    :return: updated parameters dict
    """
    num_layers = len(parameters) // 2
    for l in range(1, num_layers + 1):
        parameters["W" + str(l)] = parameters["W" + str(l)] - learning_rate * grads["dW" + str(l)]
        parameters["b" + str(l)] = parameters["b" + str(l)] - learning_rate * grads["db" + str(l)]
    return parameters
def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    Shuffle (X, Y) consistently and split into mini-batches along the example axis.

    :param X: input data, shape (input size, number of examples)
    :param Y: labels, shape (1, number of examples)
    :param mini_batch_size: batch size; the last batch may be smaller
    :param seed: RNG seed so each epoch can reshuffle differently
    :return: list of (mini_batch_X, mini_batch_Y) tuples
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    # shuffle both arrays with the same permutation so pairs stay aligned
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))
    # one slice every mini_batch_size columns; the tail batch is the remainder
    return [
        (shuffled_X[:, start:start + mini_batch_size],
         shuffled_Y[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]
def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, gradient_descent='bgd', mini_batch_size=64):
    """
    Train the DNN with one of three gradient-descent flavours.

    :param X: input data, shape (input size, number of examples)
    :param Y: labels, shape (1, number of examples)
    :param layer_dims: units per layer, list
    :param learning_rate: step size
    :param num_iterations: number of epochs
    :param gradient_descent: 'bgd' (full batch), 'sgd' (one example per update)
                             or 'mini-batch'
    :param mini_batch_size: batch size used when gradient_descent == 'mini-batch'
    :return: trained parameters dict (W, b)
    """
    m = Y.shape[1]
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if gradient_descent == 'bgd':
        for i in range(0, num_iterations):
            # forward propagation over the whole training set
            AL, caches = forward_propagation(X, parameters)
            # calculate the cost
            cost = compute_cost(AL, Y)
            if i % 1000 == 0:
                costs.append(cost)  # sample the cost curve every 1000 epochs
            # backward propagation
            grads = backward_propagation(AL, Y, caches)
            # update parameters
            parameters = update_parameters(parameters, grads, learning_rate)
    elif gradient_descent == 'sgd':
        np.random.seed(3)
        # shuffle the data set once up front -- important for SGD
        permutation = list(np.random.permutation(m))
        shuffled_X = X[:, permutation]
        shuffled_Y = Y[:, permutation].reshape((1, m))
        for i in range(0, num_iterations):
            for j in range(0, m):  # one training example per parameter update
                # Forward propagation (reshape keeps the column-vector layout)
                AL, caches = forward_propagation(shuffled_X[:, j].reshape(-1, 1), parameters)
                # Compute cost
                cost = compute_cost(AL, shuffled_Y[:, j].reshape(1, 1))
                # Backward propagation
                grads = backward_propagation(AL, shuffled_Y[:, j].reshape(1, 1), caches)
                # Update parameters.
                parameters = update_parameters(parameters, grads, learning_rate)
    elif gradient_descent == 'mini-batch':
        seed = 0
        for i in range(0, num_iterations):
            # Increment the seed so each epoch reshuffles the dataset differently
            seed = seed + 1
            minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
            for minibatch in minibatches:
                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch
                # Forward propagation
                AL, caches = forward_propagation(minibatch_X, parameters)
                # Compute cost
                cost = compute_cost(AL, minibatch_Y)
                # Backward propagation
                grads = backward_propagation(AL, minibatch_Y, caches)
                parameters = update_parameters(parameters, grads, learning_rate)
    return parameters
# DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0006, num_iterations=30000,
        gradient_descent='bgd', mini_batch_size=64):
    """Convenience wrapper: train an L-layer DNN and return the learned parameters."""
    return L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations,
                         gradient_descent, mini_batch_size)
第2关:动量梯度下降法
本关任务:编写一个分别应用momentum、NAG梯度下降优化算法法的3层DNN,使用sklearn的breast_cancer数据集作训练集和测试集。
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# initialize parameters (w, b)
def initialize_parameters(layer_dims):
    """
    He-initialize the weights and zero the biases for every layer.

    :param layer_dims: units per layer, e.g. [n_x, n_h, n_y], type list
    :return: dict with keys "W1".."WL" and "b1".."bL"
    """
    np.random.seed(3)  # fixed seed so every run is reproducible
    parameters = {}
    for layer, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        # He initialization: scale by sqrt(2 / fan_in), well suited to ReLU units
        parameters["W" + str(layer)] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        parameters["b" + str(layer)] = np.zeros((fan_out, 1))
    return parameters
def relu(Z):
    """ReLU activation: element-wise max(0, Z).

    :param Z: linear-layer output
    :return: activated output, same shape as Z
    """
    return np.maximum(Z, 0)
def sigmoid(Z):
    """Logistic sigmoid activation, applied element-wise.

    :param Z: linear-layer output
    :return: activated output in (0, 1), same shape as Z
    """
    return 1 / (1 + np.exp(-Z))
def forward_propagation(X, parameters):
    """
    Forward pass: (L-1) ReLU hidden layers followed by a sigmoid output layer.

    :param X: input data, shape (input size, number of examples)
    :param parameters: dict holding "W1", "b1", ..., "WL", "bL"
    :return: (AL, caches); AL is the output-layer activation (y_predict) and
             caches[l] = (W, b, z, A) for layer l, with caches[0] = (None, None, None, X)
    """
    L = len(parameters) // 2  # number of weight layers
    A = X
    # dummy layer-0 entry keeps cache indices aligned with layer numbers
    caches = [(None, None, None, X)]
    for l in range(1, L):
        W, b = parameters["W" + str(l)], parameters["b" + str(l)]
        z = np.dot(W, A) + b  # z = Wx + b
        A = relu(z)
        caches.append((W, b, z, A))
    WL, bL = parameters["W" + str(L)], parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches
def compute_cost(AL, Y):
    """
    Binary cross-entropy cost averaged over the batch.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :return: scalar cost
    """
    m = Y.shape[1]
    # nansum drops nan terms such as 0 * log(0), treating them as 0
    losses = -(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
    # squeeze collapses any singleton dimensions, e.g. [[[2]]] -> 2
    return np.squeeze(1. / m * np.nansum(losses))
# derivative of relu
def relu_backward(Z):
    """Derivative of ReLU w.r.t. its input: 1 where Z > 0, else 0 (int64 mask).

    :param Z: pre-activation input of the layer
    :return: int64 array, same shape as Z
    """
    return (Z > 0).astype(np.int64)
def backward_propagation(AL, Y, caches):
    """
    Backward pass for the ReLU-hidden / sigmoid-output network.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :param caches: list from forward_propagation(); caches[l] = (W, b, z, A) for layer l,
                   with caches[0] = (None, None, None, X)
    :return: gradients dict with keys "dW1".."dWL" and "db1".."dbL"
    """
    m = Y.shape[1]
    L = len(caches) - 1  # number of weight layers (caches[0] is the input)
    # Output (Lth) layer: for sigmoid + cross-entropy, dZL = (AL - Y) averaged over m
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # Hidden layers, from L-1 down to 1
    for l in reversed(range(1, L)):
        post_W = caches[l + 1][0]  # weights of the layer after l
        dz = dzL  # dz of the layer after l
        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # this layer's pre-activation
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # activation of the previous layer
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)
        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # carry dz down to the next (shallower) layer
    return gradients
def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    Shuffle (X, Y) consistently and split into mini-batches along the example axis.

    :param X: input data, shape (input size, number of examples)
    :param Y: labels, shape (1, number of examples)
    :param mini_batch_size: batch size; the last batch may be smaller
    :param seed: RNG seed so each epoch can reshuffle differently
    :return: list of (mini_batch_X, mini_batch_Y) tuples
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    # shuffle both arrays with the same permutation so pairs stay aligned
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))
    # one slice every mini_batch_size columns; the tail batch is the remainder
    return [
        (shuffled_X[:, start:start + mini_batch_size],
         shuffled_Y[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]
def initialize_velocity(parameters):
    """
    Create zero-filled velocity buffers matching every W and b.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :return: dict v with keys "dW1".."dWL", "db1".."dbL", all zero arrays
             shaped like the corresponding parameters
    """
    v = {}
    for l in range(1, len(parameters) // 2 + 1):
        v["dW" + str(l)] = np.zeros(parameters["W" + str(l)].shape)
        v["db" + str(l)] = np.zeros(parameters["b" + str(l)].shape)
    return v
# momentum
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    One Momentum update step:

        VdW = beta * VdW + (1 - beta) * dW
        W   = W - learning_rate * VdW        (and likewise for b)

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param v: velocity dict "dW1".."dWL", "db1".."dbL" (updated in place)
    :param beta: momentum decay hyperparameter, scalar
    :param learning_rate: step size, scalar
    :return: updated parameters dict
    """
    L = len(parameters) // 2
    for l in range(L):
        # exponentially weighted average of past gradients
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads["db" + str(l + 1)]
        # BUG FIX: step along the velocity, not the raw gradient -- otherwise
        # the computed v is never used and momentum has no effect
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
    return parameters
# nesterov momentum
def update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate):
    """
    One Nesterov-momentum update step:

        VdW = beta * VdW - learning_rate * dW
        W   = W + beta * VdW - learning_rate * dW      (and likewise for b)

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param v: velocity dict "dW1".."dWL", "db1".."dbL" (updated in place)
    :param beta: momentum decay hyperparameter, scalar
    :param learning_rate: step size, scalar
    :return: updated parameters dict
    """
    for l in range(1, len(parameters) // 2 + 1):
        # same update rule for the weight and the bias of layer l
        for p_key, g_key in (("W" + str(l), "dW" + str(l)), ("b" + str(l), "db" + str(l))):
            v[g_key] = beta * v[g_key] - learning_rate * grads[g_key]
            parameters[p_key] = parameters[p_key] + beta * v[g_key] - learning_rate * grads[g_key]
    return parameters
def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta, mini_batch_size=64):
    """
    Train the DNN with mini-batch gradient descent using momentum or Nesterov momentum.

    :param X: input data, shape (input size, number of examples)
    :param Y: labels, shape (1, number of examples)
    :param layer_dims: units per layer, list
    :param learning_rate: step size
    :param num_iterations: number of epochs
    :param optimizer: 'momentum' or 'nesterov_momentum'
    :param beta: momentum decay hyperparameter
    :param mini_batch_size: mini-batch size
    :return: trained parameters dict (W, b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    if optimizer == "momentum" or optimizer == "nesterov_momentum":
        v = initialize_velocity(parameters)
    t = 0  # counter kept for parity with the Adam variant of this trainer (unused here)
    seed = 0
    for i in range(0, num_iterations):
        # Increment the seed so each epoch reshuffles the dataset differently
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            if optimizer == "momentum":
                parameters = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "nesterov_momentum":
                parameters = update_parameters_with_nesterov_momentum(parameters, grads, v, beta, learning_rate)
        if i % 100 == 0:
            costs.append(cost)  # sample the cost curve every 100 epochs
    return parameters
# DNN model
def DNN(X_train, y_train, layer_dims, learning_rate=0.0005, num_iterations=5000,
        optimizer='momentum', beta=0.9, mini_batch_size=64):
    """Convenience wrapper: train an L-layer DNN and return the learned parameters."""
    return L_layer_model(X_train, y_train, layer_dims, learning_rate, num_iterations,
                         optimizer, beta, mini_batch_size)
第3关:自适应学习率算法
本关任务:编写一个分别应用Adagrad、Adadelta、RMSProp、Adam优化方法的3层DNN,使用sklearn的breast_cancer数据集作训练集和测试集。
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# initialize parameters (w, b)
def initialize_parameters(layer_dims):
    """
    He-initialize the weights and zero the biases for every layer.

    :param layer_dims: units per layer, e.g. [n_x, n_h, n_y], type list
    :return: dict with keys "W1".."WL" and "b1".."bL"
    """
    np.random.seed(3)  # fixed seed so every run is reproducible
    parameters = {}
    for layer, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        # He initialization: scale by sqrt(2 / fan_in), well suited to ReLU units
        parameters["W" + str(layer)] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        parameters["b" + str(layer)] = np.zeros((fan_out, 1))
    return parameters
def relu(Z):
    """ReLU activation: element-wise max(0, Z).

    :param Z: linear-layer output
    :return: activated output, same shape as Z
    """
    return np.maximum(Z, 0)
# activation function (sigmoid)
def sigmoid(Z):
    """Logistic sigmoid activation, applied element-wise.

    :param Z: linear-layer output
    :return: activated output in (0, 1), same shape as Z
    """
    return 1 / (1 + np.exp(-Z))
def forward_propagation(X, parameters):
    """
    Forward pass: (L-1) ReLU hidden layers followed by a sigmoid output layer.

    :param X: input data, shape (input size, number of examples)
    :param parameters: dict holding "W1", "b1", ..., "WL", "bL"
    :return: (AL, caches); AL is the output-layer activation (y_predict) and
             caches[l] = (W, b, z, A) for layer l, with caches[0] = (None, None, None, X)
    """
    L = len(parameters) // 2  # number of weight layers
    A = X
    # dummy layer-0 entry keeps cache indices aligned with layer numbers
    caches = [(None, None, None, X)]
    for l in range(1, L):
        W, b = parameters["W" + str(l)], parameters["b" + str(l)]
        z = np.dot(W, A) + b  # z = Wx + b
        A = relu(z)
        caches.append((W, b, z, A))
    WL, bL = parameters["W" + str(L)], parameters["b" + str(L)]
    zL = np.dot(WL, A) + bL
    AL = sigmoid(zL)
    caches.append((WL, bL, zL, AL))
    return AL, caches
# cost function
def compute_cost(AL, Y):
    """
    Binary cross-entropy cost averaged over the batch.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :return: scalar cost
    """
    m = Y.shape[1]
    # nansum drops nan terms such as 0 * log(0), treating them as 0
    losses = -(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
    # squeeze collapses any singleton dimensions, e.g. [[[2]]] -> 2
    return np.squeeze(1. / m * np.nansum(losses))
# derivative of relu
def relu_backward(Z):
    """Derivative of ReLU w.r.t. its input: 1 where Z > 0, else 0 (int64 mask).

    :param Z: pre-activation input of the layer
    :return: int64 array, same shape as Z
    """
    return (Z > 0).astype(np.int64)
def backward_propagation(AL, Y, caches):
    """
    Backward pass for the ReLU-hidden / sigmoid-output network.

    :param AL: output-layer activations (predictions), shape (1, number of examples)
    :param Y: ground-truth labels, shape (1, number of examples)
    :param caches: list from forward_propagation(); caches[l] = (W, b, z, A) for layer l,
                   with caches[0] = (None, None, None, X)
    :return: gradients dict with keys "dW1".."dWL" and "db1".."dbL"
    """
    m = Y.shape[1]
    L = len(caches) - 1  # number of weight layers (caches[0] is the input)
    # Output (Lth) layer: for sigmoid + cross-entropy, dZL = (AL - Y) averaged over m
    prev_AL = caches[L - 1][3]
    dzL = 1. / m * (AL - Y)
    dWL = np.dot(dzL, prev_AL.T)
    dbL = np.sum(dzL, axis=1, keepdims=True)
    gradients = {"dW" + str(L): dWL, "db" + str(L): dbL}
    # Hidden layers, from L-1 down to 1
    for l in reversed(range(1, L)):
        post_W = caches[l + 1][0]  # weights of the layer after l
        dz = dzL  # dz of the layer after l
        dal = np.dot(post_W.T, dz)
        z = caches[l][2]  # this layer's pre-activation
        dzl = np.multiply(dal, relu_backward(z))
        prev_A = caches[l - 1][3]  # activation of the previous layer
        dWl = np.dot(dzl, prev_A.T)
        dbl = np.sum(dzl, axis=1, keepdims=True)
        gradients["dW" + str(l)] = dWl
        gradients["db" + str(l)] = dbl
        dzL = dzl  # carry dz down to the next (shallower) layer
    return gradients
def random_mini_batches(X, Y, mini_batch_size=64, seed=1):
    """
    Shuffle (X, Y) consistently and split into mini-batches along the example axis.

    :param X: input data, shape (input size, number of examples)
    :param Y: labels, shape (1, number of examples)
    :param mini_batch_size: batch size; the last batch may be smaller
    :param seed: RNG seed so each epoch can reshuffle differently
    :return: list of (mini_batch_X, mini_batch_Y) tuples
    """
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    # shuffle both arrays with the same permutation so pairs stay aligned
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))
    # one slice every mini_batch_size columns; the tail batch is the remainder
    return [
        (shuffled_X[:, start:start + mini_batch_size],
         shuffled_Y[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]
def initialize_velocity(parameters):
    """
    Create zero-filled velocity buffers matching every W and b.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :return: dict v with keys "dW1".."dWL", "db1".."dbL", all zero arrays
             shaped like the corresponding parameters
    """
    v = {}
    for l in range(1, len(parameters) // 2 + 1):
        v["dW" + str(l)] = np.zeros(parameters["W" + str(l)].shape)
        v["db" + str(l)] = np.zeros(parameters["b" + str(l)].shape)
    return v
# AdaGrad initialization
def initialize_adagrad(parameters):
    """
    Create zero-filled accumulators for AdaGrad's running sum of squared gradients.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :return: dict G with keys "dW1".."dWL", "db1".."dbL", all zero arrays
             shaped like the corresponding parameters
    """
    G = {}
    for l in range(1, len(parameters) // 2 + 1):
        G["dW" + str(l)] = np.zeros(parameters["W" + str(l)].shape)
        G["db" + str(l)] = np.zeros(parameters["b" + str(l)].shape)
    return G
# AdaGrad
def update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon=1e-7):
    """
    One AdaGrad update step:

        GW += dW ** 2
        W  -= learning_rate / sqrt(GW + epsilon) * dW      (and likewise for b)

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param G: running sum of squared gradients per parameter (updated in place)
    :param learning_rate: base step size, scalar
    :param epsilon: small constant preventing division by zero
    :return: updated parameters dict
    """
    L = len(parameters) // 2
    for l in range(1, L + 1):
        # accumulate squared gradients
        G["dW" + str(l)] += grads["dW" + str(l)] ** 2
        G["db" + str(l)] += grads["db" + str(l)] ** 2
        # FIX: epsilon belongs inside the sqrt, matching the documented formula
        # lr / sqrt(G + epsilon) * grad (previously lr / (sqrt(G) + epsilon))
        parameters["W" + str(l)] -= learning_rate / np.sqrt(G["dW" + str(l)] + epsilon) * grads["dW" + str(l)]
        parameters["b" + str(l)] -= learning_rate / np.sqrt(G["db" + str(l)] + epsilon) * grads["db" + str(l)]
    return parameters
# initialize_adadelta
def initialize_adadelta(parameters):
    """
    Create the three zero-filled state dicts Adadelta needs.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :return: (s, v, delta) dicts, each with keys "dW1".."dWL", "db1".."dbL":
             s     -- exponentially weighted average of squared gradients
             v     -- per-step update (RMS-scaled gradient)
             delta -- exponentially weighted average of squared updates
    """
    s, v, delta = {}, {}, {}
    for l in range(1, len(parameters) // 2 + 1):
        for key in ("dW" + str(l), "db" + str(l)):
            shape = parameters[("W" if key.startswith("dW") else "b") + str(l)].shape
            s[key] = np.zeros(shape)
            v[key] = np.zeros(shape)
            delta[key] = np.zeros(shape)
    return s, v, delta
# adadelta
def update_parameters_with_adadelta(parameters, grads, rho, s, v, delta, epsilon=1e-6):
    """
    One Adadelta update step:

        Sdw     = rho * Sdw + (1 - rho) * dW^2
        Vdw     = sqrt((delta_w + epsilon) / (Sdw + epsilon)) * dW
        W      -= Vdw
        delta_w = rho * delta_w + (1 - rho) * Vdw^2        (and likewise for b)

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param rho: decay constant, scalar
    :param s: EWA of squared gradients (updated in place)
    :param v: per-step updates (updated in place)
    :param delta: EWA of squared updates (updated in place)
    :param epsilon: small constant preventing division by zero
    :return: updated parameters dict
    """
    L = len(parameters) // 2
    for l in range(1, L + 1):
        kW, kb = "dW" + str(l), "db" + str(l)
        s[kW] = rho * s[kW] + (1 - rho) * grads[kW] ** 2
        s[kb] = rho * s[kb] + (1 - rho) * grads[kb] ** 2
        # BUG FIX: the W update must read delta["dW..."]; it previously read
        # delta["db..."], mixing the bias statistics into the weight step
        v[kW] = np.sqrt((delta[kW] + epsilon) / (s[kW] + epsilon)) * grads[kW]
        v[kb] = np.sqrt((delta[kb] + epsilon) / (s[kb] + epsilon)) * grads[kb]
        parameters["W" + str(l)] -= v[kW]
        parameters["b" + str(l)] -= v[kb]
        delta[kW] = rho * delta[kW] + (1 - rho) * v[kW] ** 2
        delta[kb] = rho * delta[kb] + (1 - rho) * v[kb] ** 2
    return parameters
# RMSprop
def update_parameters_with_rmsprop(parameters, grads, s, beta=0.9, learning_rate=0.01, epsilon=1e-6):
    """
    One RMSprop update step:

        SdW = beta * SdW + (1 - beta) * dW^2
        W   = W - learning_rate * dW / sqrt(SdW + epsilon)     (and likewise for b)

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param s: EWA of squared gradients (updated in place)
    :param beta: decay hyperparameter, scalar
    :param learning_rate: step size, scalar
    :param epsilon: small constant preventing division by zero
    :return: updated parameters dict
    """
    for l in range(1, len(parameters) // 2 + 1):
        for p_key, g_key in (("W" + str(l), "dW" + str(l)), ("b" + str(l), "db" + str(l))):
            s[g_key] = beta * s[g_key] + (1 - beta) * grads[g_key] ** 2
            parameters[p_key] = parameters[p_key] - learning_rate * grads[g_key] / np.sqrt(s[g_key] + epsilon)
    return parameters
# initialize adam
def initialize_adam(parameters):
    """
    Create zero-filled first- and second-moment buffers for Adam.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :return: (v, s) dicts with keys "dW1".."dWL", "db1".."dbL":
             v -- EWA of gradients, s -- EWA of squared gradients
    """
    v, s = {}, {}
    for l in range(1, len(parameters) // 2 + 1):
        v["dW" + str(l)] = np.zeros(parameters["W" + str(l)].shape)
        v["db" + str(l)] = np.zeros(parameters["b" + str(l)].shape)
        s["dW" + str(l)] = np.zeros(parameters["W" + str(l)].shape)
        s["db" + str(l)] = np.zeros(parameters["b" + str(l)].shape)
    return v, s
# adam
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    One Adam update step with bias correction.

    :param parameters: dict of model parameters "W1".."WL", "b1".."bL"
    :param grads: gradients dict "dW1".."dWL", "db1".."dbL"
    :param v: EWA of gradients (updated in place)
    :param s: EWA of squared gradients (updated in place)
    :param t: 1-based step counter used for bias correction
    :param learning_rate: step size, scalar
    :param beta1: first-moment decay hyperparameter
    :param beta2: second-moment decay hyperparameter
    :param epsilon: small constant preventing division by zero
    :return: updated parameters dict
    """
    v_hat, s_hat = {}, {}
    for l in range(1, len(parameters) // 2 + 1):
        for p_key, g_key in (("W" + str(l), "dW" + str(l)), ("b" + str(l), "db" + str(l))):
            g = grads[g_key]
            # biased moment estimates
            v[g_key] = beta1 * v[g_key] + (1 - beta1) * g
            s[g_key] = beta2 * s[g_key] + (1 - beta2) * np.power(g, 2)
            # bias-corrected estimates (correct the zero initialization)
            v_hat[g_key] = v[g_key] / (1 - np.power(beta1, t))
            s_hat[g_key] = s[g_key] / (1 - np.power(beta2, t))
            parameters[p_key] = parameters[p_key] - learning_rate * v_hat[g_key] / np.sqrt(s_hat[g_key] + epsilon)
    return parameters
def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, optimizer, beta=0.9, beta2=0.999, mini_batch_size=64,
                  epsilon=1e-8):
    """
    Train an L-layer DNN with mini-batch gradient descent and a selectable optimizer.

    :param X: input data, of shape (input size, number of examples)
    :param Y: true labels, shape (1, number of examples)
    :param layer_dims: number of units in each layer of the network, list
    :param learning_rate: learning rate
    :param num_iterations: number of training epochs
    :param optimizer: one of "sgd", "momentum", "nesterov_momentum", "rmsprop",
                      "adagrad", "adadelta", "adam"
    :param beta: momentum / first-moment decay rate (passed as beta1 to Adam)
    :param beta2: second-moment decay rate (Adam only)
    :param mini_batch_size: number of examples per mini-batch
    :param epsilon: small constant preventing division by zero
    :return:
        parameters: the final network parameters (W, b)
    """
    costs = []
    # initialize parameters
    parameters = initialize_parameters(layer_dims)
    # Per-optimizer state initialization (velocity / accumulators / moments).
    if optimizer == "sgd":
        pass  # no initialization required for gradient descent
    elif optimizer == "momentum" or optimizer == "nesterov_momentum" or optimizer == "rmsprop":
        v = initialize_velocity(parameters)
    elif optimizer == "adagrad":
        G = initialize_adagrad(parameters)
    elif optimizer == "adadelta":
        s, v, delta = initialize_adadelta(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)
    t = 0  # initializing the counter required for Adam update
    seed = 0
    for i in range(0, num_iterations):
        # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            AL, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(AL, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(AL, minibatch_Y, caches)
            # NOTE(review): there is no update branch for "sgd", "momentum" or
            # "nesterov_momentum" below, so with those optimizers the
            # parameters are never updated in this loop — confirm whether
            # those branches were removed intentionally or are missing.
            if optimizer == "adagrad":
                parameters = update_parameters_with_adagrad(parameters, grads, G, learning_rate, epsilon)
            elif optimizer == "adadelta":
                parameters = update_parameters_with_adadelta(parameters, grads, beta, s, v, delta, epsilon)
            elif optimizer == "rmsprop":
                parameters = update_parameters_with_rmsprop(parameters, grads, v, beta, learning_rate, epsilon)
            elif optimizer == "adam":
                t += 1  # Adam's bias correction needs the global step count
                parameters = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta, beta2,
                                                         epsilon)
        if i % 100 == 0:
            # Record the cost of the last mini-batch once every 100 epochs.
            #print("Cost after iteration {}: {}".format(i, cost))
            costs.append(cost)
    return parameters
# DNN model
def DNN(X_train, y_train, X_test, y_test, layer_dims, learning_rate=0.0005, num_iterations=5000, optimizer='adam',
        beta=0.9, beta2=0.999, mini_batch_size=64, epsilon=1e-8):
    """
    Thin wrapper: train an L-layer DNN on the training set and return the
    learned parameters. X_test/y_test are accepted for interface symmetry
    but are not used during training.
    """
    trained_parameters = L_layer_model(
        X_train,
        y_train,
        layer_dims,
        learning_rate,
        num_iterations,
        optimizer,
        beta,
        beta2,
        mini_batch_size,
        epsilon,
    )
    return trained_parameters
第4关:运用Keras优化器
本关任务:用Keras框架编写一个简单的DNN实现mnist数据集分类。
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(1337) # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout
from keras.optimizers import SGD,Adagrad,Adadelta,RMSprop,Adam
# input image dimensions
img_rows, img_cols = 28, 28
def dnn(X_train, Y_train, X_test, Y_test, choice):
    """
    Build and train a simple DNN classifying mnist with Keras; the point of
    the exercise is selecting between the different optimizers.

    Arguments:
    X_train -- training samples, shape (n_samples, img_rows * img_cols)
    Y_train -- one-hot training labels
    X_test -- test samples
    Y_test -- one-hot test labels
    choice -- integer 1..7 selecting the optimizer:
              1=SGD, 2=SGD+momentum, 3=Nesterov, 4=Adagrad,
              5=Adadelta, 6=RMSprop, 7=Adam

    Returns:
    model -- the trained Keras model

    Raises:
    ValueError -- if choice is not 1..7. (The original code silently skipped
                  model.compile for an unknown choice and crashed later with
                  a cryptic error inside model.fit.)
    """
    epoch = 2
    # One pre-configured optimizer per menu entry.
    optimizers = {
        1: SGD(lr=0.01, momentum=0., decay=0., nesterov=False),    # plain SGD
        2: SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False),  # momentum
        3: SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),   # NAG
        4: Adagrad(lr=0.01, epsilon=1e-6),
        5: Adadelta(lr=1.0, rho=0.95, epsilon=1e-6),
        6: RMSprop(lr=0.001, rho=0.9, epsilon=1e-6),
        7: Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8),
    }
    if choice not in optimizers:
        raise ValueError("choice must be an integer in 1..7, got %r" % (choice,))
    # Define the network: two 512-unit ReLU hidden layers with dropout,
    # followed by a 10-way softmax output.
    model = Sequential()
    model.add(Dense(512, input_shape=(img_cols * img_rows,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    # choose the optimizer
    model.compile(loss='categorical_crossentropy', optimizer=optimizers[choice], metrics=['accuracy'])
    model.fit(X_train, Y_train,
              batch_size=128, nb_epoch=epoch,
              verbose=0, validation_data=(X_test, Y_test))
    return model