在两层神经网络的设计与实现中,介绍了两层神经网络的工作原理。对于搭建多层神经网络,该方法依然适用。因此,本文不再推导公式,而是直接给出代码实现。
1. 定义激活函数
# Sigmoid activation function
def sigmoid(Z):
    """Element-wise sigmoid of Z.

    Returns (A, cache) where cache is Z itself, kept for back-propagation.
    """
    result = 1 / (1 + np.exp(-Z))
    assert result.shape == Z.shape
    return result, Z
# Rectified linear unit activation function
def relu(Z):
    """Element-wise ReLU: max(0, Z).

    Returns (A, cache) where cache is Z itself, kept for back-propagation.
    """
    rectified = np.maximum(0, Z)
    assert rectified.shape == Z.shape
    return rectified, Z
2. 定义初始化超参数
def initialize_parameters_deep(layer_dims):
    """Initialize weights and biases for an L-layer network.

    Parameters
    ----------
    layer_dims : list of int
        Size of every layer, input layer included, so the number of
        weight matrices is len(layer_dims) - 1.

    Returns
    -------
    dict
        Maps 'Wl' to a (layer_dims[l], layer_dims[l-1]) weight matrix and
        'bl' to a (layer_dims[l], 1) zero bias vector, for l = 1..L-1.
    """
    np.random.seed(3)  # fixed seed keeps runs reproducible
    L = len(layer_dims)  # layer count including the input layer
    parameters = {}
    for l in range(1, L):
        # Scaling by 1/sqrt(fan_in) keeps activations well-conditioned in
        # deeper nets; a flat * 0.01 made the cost converge very slowly.
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)
    return parameters
注意:随机生成的权值这一超参数是非常重要的,它影响着代价函数的收敛。在训练week2的样例时,发现当网络变深时,权值直接乘上0.01会使代价函数收敛缓慢;而如果设置成上面表达的那种形式(除以上一层维度的平方根),代价函数可以很好地收敛。
3. 定义前向计算过程
# Linear part of forward propagation
def linear_forward(A, W, b):
    """Compute the pre-activation Z = W.A + b.

    Returns (Z, cache) where cache = (A, W, b), kept so the backward
    pass can reuse the forward inputs.
    """
    linear_cache = (A, W, b)
    return np.dot(W, A) + b, linear_cache
# Forward step for one layer: linear transform then activation
def linear_activation_forward(A_prev, W, b, activation):
    """Compute A = activation(W.A_prev + b) for one layer.

    Parameters
    ----------
    A_prev : activations from the previous layer
    W, b : this layer's weights and biases
    activation : 'sigmoid' or 'relu'

    Returns
    -------
    (A, cache) where cache = (linear_cache, activation_cache) is kept
    for the backward pass.

    Raises
    ------
    ValueError
        For an unrecognized activation name (the original fell through
        silently and crashed later with a NameError).
    """
    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = relu(Z)
    else:
        raise ValueError("unknown activation: %r" % (activation,))
    return A, (linear_cache, activation_cache)
# Forward propagation through the whole network
def L_model_forward(X, parameters):
    """Run L-1 linear->relu layers followed by one linear->sigmoid layer.

    Parameters
    ----------
    X : input data, shape (n_features, n_examples)
    parameters : dict of 'Wl'/'bl' arrays from initialize_parameters_deep

    Returns
    -------
    (AL, caches) : final activations of shape (1, n_examples) and the
    list of per-layer caches needed for back-propagation.
    """
    A = X
    # Integer division: parameters holds one W and one b per layer.
    # '/' yields a float under Python 3 and breaks range() and the
    # 'W'+str(L) key lookups below.
    L = len(parameters) // 2
    caches = []
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation='relu')
        caches.append(cache)
    # Output layer uses sigmoid for binary classification.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation='sigmoid')
    caches.append(cache)
    assert AL.shape == (1, X.shape[1])
    return AL, caches
4. 定义代价函数
# Cross-entropy cost function
def compute_cost(AL, Y):
    """Average cross-entropy cost over the m examples.

    AL holds the sigmoid outputs and Y the 0/1 labels; both have
    shape (1, m). Returns a scalar.
    """
    m = Y.shape[1]
    log_loss = Y * np.log(AL) + (1 - Y) * np.log(1 - AL)
    total = -np.sum(log_loss, axis=1, keepdims=True) / m
    return np.squeeze(total)
注意:如果把语句“cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m”中的“m”放到前面,应该写成“cost = - 1.0 / m * np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True)”,否则在 Python 2 中“1 / m”会发生整数除法而得到 0。
5. 定义后向推导过程
# Backward pass through a layer's linear portion
def linear_backward(dZ, cache):
    """Given dZ (gradient w.r.t. the pre-activation Z) and the cached
    forward inputs (A_prev, W, b), return (dA_prev, dW, db).
    """
    prev_activations, weights, biases = cache
    batch_size = prev_activations.shape[1]
    grad_prev = np.dot(weights.T, dZ)
    grad_weights = np.dot(dZ, prev_activations.T) / batch_size
    grad_biases = np.sum(dZ, axis=1, keepdims=True) / batch_size
    assert grad_prev.shape == prev_activations.shape
    assert grad_weights.shape == weights.shape
    assert grad_biases.shape == biases.shape
    return grad_prev, grad_weights, grad_biases
def relu_backward(dA, cache):
    """Backward pass through a ReLU activation.

    The incoming gradient passes through unchanged where Z > 0 and is
    zeroed where Z <= 0.
    """
    Z = cache
    dZ = np.where(Z > 0, dA, 0)
    assert dZ.shape == Z.shape
    return dZ
def sigmoid_backward(dA, cache):
    """Backward pass through a sigmoid activation.

    Uses the identity sigmoid'(Z) = s * (1 - s) with s = sigmoid(Z).
    """
    Z = cache
    activation = 1 / (1 + np.exp(-Z))
    dZ = dA * activation * (1 - activation)
    assert dZ.shape == Z.shape
    return dZ
def linear_activation_backward(dA, cache, activation='sigmoid'):
    """Backward step for one layer: activation gradient, then linear gradient.

    Parameters
    ----------
    dA : gradient of the cost w.r.t. this layer's activations
    cache : (linear_cache, activation_cache) from the forward pass
    activation : 'sigmoid' or 'relu'

    Returns
    -------
    (dA_prev, dW, db)

    Raises
    ------
    ValueError
        For an unknown activation name (the original fell through and
        raised UnboundLocalError on the return instead).
    """
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        raise ValueError("unknown activation: %r" % (activation,))
    # The linear portion is identical for both activations, so it is
    # factored out of the branches (the original duplicated this call).
    return linear_backward(dZ, linear_cache)
# Back-propagation through the whole network
def L_model_backward(AL, Y, caches):
    """Compute gradients for every layer by back-propagation.

    Parameters
    ----------
    AL : final sigmoid activations, shape (1, m)
    Y : 0/1 labels; reshaped to match AL
    caches : per-layer caches collected by L_model_forward

    Returns
    -------
    dict of gradients. Naming convention: grads['dW'+str(l)] and
    grads['db'+str(l)] belong to layer l, while grads['dA'+str(l)] is
    the OUTPUT of layer l's backward step, i.e. the gradient w.r.t.
    the activations of layer l-1 (off by one from the usual notation).
    """
    grads = {}
    L = len(caches)  # number of layers
    Y = Y.reshape(AL.shape)  # make the label shape match the activations
    # Derivative of the cross-entropy cost w.r.t. AL.
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # Output layer uses the sigmoid activation.
    current_cache = caches[L - 1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # Hidden layers use relu; walk backwards from layer L-1 down to 1.
    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
6. 变更参数
# Gradient-descent parameter update
def update_parameters(parameters, grads, learning_rate):
    """Take one gradient-descent step on every W and b.

    Returns the parameters dict with each 'Wl'/'bl' rebound to
    its updated array.
    """
    # '//' not '/': true division yields a float under Python 3,
    # which range() rejects with a TypeError.
    L = len(parameters) // 2
    for l in range(L):
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * grads['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * grads['db' + str(l + 1)]
    return parameters
7. 整个神经网络训练过程
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """Train an L-layer network with batch gradient descent.

    Parameters
    ----------
    X : training data, shape (n_features, m)
    Y : labels, shape (1, m)
    layers_dims : layer sizes, input layer first
    learning_rate, num_iterations : gradient-descent hyper-parameters
    print_cost : when True, log the cost every 100 iterations and
        collect it for the plot drawn at the end.

    Returns
    -------
    The trained parameter dict.
    """
    np.random.seed(1)
    costs = []  # cost sampled every 100 iterations
    parameters = initialize_parameters_deep(layers_dims)
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters)       # forward pass
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)           # backward pass
        parameters = update_parameters(parameters, grads, learning_rate)
        # One combined check; the original tested the identical
        # condition twice in a row.
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
            costs.append(cost)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    # Label fixed: costs are sampled every 100 iterations, not every 10.
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters
8. 训练和测试
def load_dataset():
    """Load the cat / non-cat image dataset from local HDF5 files.

    Returns
    -------
    (train_x, train_y, test_x, test_y, classes); the label arrays are
    reshaped to row vectors of shape (1, m).
    """
    # Context managers close the HDF5 handles; the original left both
    # files open for the life of the process.
    with h5py.File('datasets/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train labels
    with h5py.File('datasets/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test labels
        classes = np.array(test_dataset["list_classes"][:])  # class names
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    """Thin wrapper around L_layer_model with cost printing enabled."""
    return L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
def predict_data(X, Y, parameters):
    """Predict labels for X and print the accuracy against Y.

    Sigmoid outputs are thresholded at 0.5. Returns the accuracy as a
    float (the original returned None).
    """
    AL, caches = L_model_forward(X, parameters)
    AL[AL >= 0.5] = 1
    AL[AL < 0.5] = 0
    accuracy = np.sum(AL == Y) * 1.0 / Y.shape[1]
    # %-formatting inside print() produces the same output on Python 2
    # and 3; the original Python-2 print statement is a Py3 syntax error.
    print('准确率: %s' % accuracy)
    return accuracy
if __name__ == '__main__':
    import sys  # local import kept next to its single use below
    # Load the raw image dataset.
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()
    # Flatten each (64, 64, 3) image into a column and scale to [0, 1];
    # columns are examples, rows are features. shape[0] / -1 replaces the
    # hard-coded sample counts 209 and 50, so any dataset size works.
    X = train_data_x.reshape(train_data_x.shape[0], -1).T * 1.0 / 255
    Y = train_data_y
    X2 = test_data_x.reshape(test_data_x.shape[0], -1).T * 1.0 / 255
    Y2 = test_data_y
    layers_dims = [12288, 20, 7, 5, 1]  # 64*64*3 inputs, three hidden layers, one output
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500)
    # sys.stdout.write emits the prefix without a newline on both
    # Python 2 and 3 (the original 'print x,' is Py2-only syntax).
    sys.stdout.write('训练 ')
    predict_data(X, Y, parameters)
    sys.stdout.write('测试 ')
    predict_data(X2, Y2, parameters)
9. 小结
通过测试发现,超参数的初始化非常重要;而且并非网络深度设置越深越好;也并非迭代次数越多越好。
10. 代码整合
import numpy as np
import math
import h5py
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets
from testCases_v2 import *
import matplotlib.pyplot as plt
def initialize_parameters_deep(layer_dims):
    """Initialize weights and biases for an L-layer network.

    Parameters
    ----------
    layer_dims : list of int
        Size of every layer, input layer included, so the number of
        weight matrices is len(layer_dims) - 1.

    Returns
    -------
    dict
        Maps 'Wl' to a (layer_dims[l], layer_dims[l-1]) weight matrix and
        'bl' to a (layer_dims[l], 1) zero bias vector, for l = 1..L-1.
    """
    np.random.seed(3)  # fixed seed keeps runs reproducible
    L = len(layer_dims)  # layer count including the input layer
    parameters = {}
    for l in range(1, L):
        # Scaling by 1/sqrt(fan_in) keeps activations well-conditioned in
        # deeper nets; a flat * 0.01 made the cost converge very slowly.
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)
    return parameters
# Sigmoid activation function
def sigmoid(Z):
    """Element-wise sigmoid of Z.

    Returns (A, cache) where cache is Z itself, kept for back-propagation.
    """
    result = 1 / (1 + np.exp(-Z))
    assert result.shape == Z.shape
    return result, Z
# Rectified linear unit activation function
def relu(Z):
    """Element-wise ReLU: max(0, Z).

    Returns (A, cache) where cache is Z itself, kept for back-propagation.
    """
    rectified = np.maximum(0, Z)
    assert rectified.shape == Z.shape
    return rectified, Z
# Linear part of forward propagation
def linear_forward(A, W, b):
    """Compute the pre-activation Z = W.A + b.

    Returns (Z, cache) where cache = (A, W, b), kept so the backward
    pass can reuse the forward inputs.
    """
    linear_cache = (A, W, b)
    return np.dot(W, A) + b, linear_cache
# Forward step for one layer: linear transform then activation
def linear_activation_forward(A_prev, W, b, activation):
    """Compute A = activation(W.A_prev + b) for one layer.

    Parameters
    ----------
    A_prev : activations from the previous layer
    W, b : this layer's weights and biases
    activation : 'sigmoid' or 'relu'

    Returns
    -------
    (A, cache) where cache = (linear_cache, activation_cache) is kept
    for the backward pass.

    Raises
    ------
    ValueError
        For an unrecognized activation name (the original fell through
        silently and crashed later with a NameError).
    """
    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = relu(Z)
    else:
        raise ValueError("unknown activation: %r" % (activation,))
    return A, (linear_cache, activation_cache)
# Forward propagation through the whole network
def L_model_forward(X, parameters):
    """Run L-1 linear->relu layers followed by one linear->sigmoid layer.

    Parameters
    ----------
    X : input data, shape (n_features, n_examples)
    parameters : dict of 'Wl'/'bl' arrays from initialize_parameters_deep

    Returns
    -------
    (AL, caches) : final activations of shape (1, n_examples) and the
    list of per-layer caches needed for back-propagation.
    """
    A = X
    # Integer division: parameters holds one W and one b per layer.
    # '/' yields a float under Python 3 and breaks range() and the
    # 'W'+str(L) key lookups below.
    L = len(parameters) // 2
    caches = []
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation='relu')
        caches.append(cache)
    # Output layer uses sigmoid for binary classification.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation='sigmoid')
    caches.append(cache)
    assert AL.shape == (1, X.shape[1])
    return AL, caches
# Cross-entropy cost function
def compute_cost(AL, Y):
    """Average cross-entropy cost over the m examples.

    AL holds the sigmoid outputs and Y the 0/1 labels; both have
    shape (1, m). Returns a scalar.
    """
    m = Y.shape[1]
    log_loss = Y * np.log(AL) + (1 - Y) * np.log(1 - AL)
    total = -np.sum(log_loss, axis=1, keepdims=True) / m
    return np.squeeze(total)
# Backward pass through a layer's linear portion
def linear_backward(dZ, cache):
    """Given dZ (gradient w.r.t. the pre-activation Z) and the cached
    forward inputs (A_prev, W, b), return (dA_prev, dW, db).
    """
    prev_activations, weights, biases = cache
    batch_size = prev_activations.shape[1]
    grad_prev = np.dot(weights.T, dZ)
    grad_weights = np.dot(dZ, prev_activations.T) / batch_size
    grad_biases = np.sum(dZ, axis=1, keepdims=True) / batch_size
    assert grad_prev.shape == prev_activations.shape
    assert grad_weights.shape == weights.shape
    assert grad_biases.shape == biases.shape
    return grad_prev, grad_weights, grad_biases
def relu_backward(dA, cache):
    """Backward pass through a ReLU activation.

    The incoming gradient passes through unchanged where Z > 0 and is
    zeroed where Z <= 0.
    """
    Z = cache
    dZ = np.where(Z > 0, dA, 0)
    assert dZ.shape == Z.shape
    return dZ
def sigmoid_backward(dA, cache):
    """Backward pass through a sigmoid activation.

    Uses the identity sigmoid'(Z) = s * (1 - s) with s = sigmoid(Z).
    """
    Z = cache
    activation = 1 / (1 + np.exp(-Z))
    dZ = dA * activation * (1 - activation)
    assert dZ.shape == Z.shape
    return dZ
def linear_activation_backward(dA, cache, activation='sigmoid'):
    """Backward step for one layer: activation gradient, then linear gradient.

    Parameters
    ----------
    dA : gradient of the cost w.r.t. this layer's activations
    cache : (linear_cache, activation_cache) from the forward pass
    activation : 'sigmoid' or 'relu'

    Returns
    -------
    (dA_prev, dW, db)

    Raises
    ------
    ValueError
        For an unknown activation name (the original fell through and
        raised UnboundLocalError on the return instead).
    """
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        raise ValueError("unknown activation: %r" % (activation,))
    # The linear portion is identical for both activations, so it is
    # factored out of the branches (the original duplicated this call).
    return linear_backward(dZ, linear_cache)
# Back-propagation through the whole network
def L_model_backward(AL, Y, caches):
    """Compute gradients for every layer by back-propagation.

    Parameters
    ----------
    AL : final sigmoid activations, shape (1, m)
    Y : 0/1 labels; reshaped to match AL
    caches : per-layer caches collected by L_model_forward

    Returns
    -------
    dict of gradients. Naming convention: grads['dW'+str(l)] and
    grads['db'+str(l)] belong to layer l, while grads['dA'+str(l)] is
    the OUTPUT of layer l's backward step, i.e. the gradient w.r.t.
    the activations of layer l-1 (off by one from the usual notation).
    """
    grads = {}
    L = len(caches)  # number of layers
    Y = Y.reshape(AL.shape)  # make the label shape match the activations
    # Derivative of the cross-entropy cost w.r.t. AL.
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # Output layer uses the sigmoid activation.
    current_cache = caches[L - 1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # Hidden layers use relu; walk backwards from layer L-1 down to 1.
    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
# Gradient-descent parameter update
def update_parameters(parameters, grads, learning_rate):
    """Take one gradient-descent step on every W and b.

    Returns the parameters dict with each 'Wl'/'bl' rebound to
    its updated array.
    """
    # '//' not '/': true division yields a float under Python 3,
    # which range() rejects with a TypeError.
    L = len(parameters) // 2
    for l in range(L):
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * grads['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * grads['db' + str(l + 1)]
    return parameters
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """Train an L-layer network with batch gradient descent.

    Parameters
    ----------
    X : training data, shape (n_features, m)
    Y : labels, shape (1, m)
    layers_dims : layer sizes, input layer first
    learning_rate, num_iterations : gradient-descent hyper-parameters
    print_cost : when True, log the cost every 100 iterations and
        collect it for the plot drawn at the end.

    Returns
    -------
    The trained parameter dict.
    """
    np.random.seed(1)
    costs = []  # cost sampled every 100 iterations
    parameters = initialize_parameters_deep(layers_dims)
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters)       # forward pass
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)           # backward pass
        parameters = update_parameters(parameters, grads, learning_rate)
        # One combined check; the original tested the identical
        # condition twice in a row.
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
            costs.append(cost)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    # Label fixed: costs are sampled every 100 iterations, not every 10.
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters
def load_dataset():
    """Load the cat / non-cat image dataset from local HDF5 files.

    Returns
    -------
    (train_x, train_y, test_x, test_y, classes); the label arrays are
    reshaped to row vectors of shape (1, m).
    """
    # Context managers close the HDF5 handles; the original left both
    # files open for the life of the process.
    with h5py.File('datasets/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train labels
    with h5py.File('datasets/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test labels
        classes = np.array(test_dataset["list_classes"][:])  # class names
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    """Thin wrapper around L_layer_model with cost printing enabled."""
    return L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
def predict_data(X, Y, parameters):
    """Predict labels for X and print the accuracy against Y.

    Sigmoid outputs are thresholded at 0.5. Returns the accuracy as a
    float (the original returned None).
    """
    AL, caches = L_model_forward(X, parameters)
    AL[AL >= 0.5] = 1
    AL[AL < 0.5] = 0
    accuracy = np.sum(AL == Y) * 1.0 / Y.shape[1]
    # %-formatting inside print() produces the same output on Python 2
    # and 3; the original Python-2 print statement is a Py3 syntax error.
    print('准确率: %s' % accuracy)
    return accuracy
if __name__ == '__main__':
    import sys  # local import kept next to its single use below
    # Load the raw image dataset.
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()
    # Flatten each (64, 64, 3) image into a column and scale to [0, 1];
    # columns are examples, rows are features. shape[0] / -1 replaces the
    # hard-coded sample counts 209 and 50, so any dataset size works.
    X = train_data_x.reshape(train_data_x.shape[0], -1).T * 1.0 / 255
    Y = train_data_y
    X2 = test_data_x.reshape(test_data_x.shape[0], -1).T * 1.0 / 255
    Y2 = test_data_y
    layers_dims = [12288, 20, 7, 5, 1]  # 64*64*3 inputs, three hidden layers, one output
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500)
    # sys.stdout.write emits the prefix without a newline on both
    # Python 2 and 3 (the original 'print x,' is Py2-only syntax).
    sys.stdout.write('训练 ')
    predict_data(X, Y, parameters)
    sys.stdout.write('测试 ')
    predict_data(X2, Y2, parameters)