Building a Deep Neural Network

 

       The earlier article on designing and implementing a two-layer neural network explained how a two-layer network works. The same approach carries over to building networks with more layers, so this article does not rederive the formulas and instead gives the code implementation directly.


1. Define the activation functions

# Sigmoid activation function
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    assert(A.shape == Z.shape)
    cache = Z  # cache Z for use in backpropagation
    return A, cache

# Rectified linear unit (ReLU) activation function
def relu(Z):
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    cache = Z  # cache Z for use in backpropagation
    return A, cache



2. Initialize the parameters

''' Initialize the parameters; the argument is the list of layer dimensions '''
def initialize_parameters_deep(layer_dims):
    np.random.seed(3) # fix the random seed for reproducibility
    L = len(layer_dims) # number of entries in layer_dims (one more than the number of weight layers as we usually count them)
    parameters = {} # dictionary holding the parameters
    
    for l in range(1, L): # create the weights W and biases b for each layer
        parameters['W'+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1]) # * 0.01
        parameters['b'+str(l)] = np.zeros((layer_dims[l], 1))
        
        assert(parameters['W'+str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b'+str(l)].shape == (layer_dims[l], 1))
    
    return parameters

Note: how the random weights are initialized matters a great deal, because it directly affects how the cost function converges. When training the week2 example, I found that once the network gets deeper, simply multiplying the random weights by 0.01 makes the cost converge very slowly, whereas scaling them by 1/sqrt(layer_dims[l-1]) as above lets the cost converge nicely; a small comparison is sketched below.
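
To make this concrete, here is a minimal sketch of my own (not part of the course code) that pushes random data through a stack of ReLU layers with both scalings and prints the standard deviation of the activations. The layer sizes are simply the ones used later in this article, and numpy is assumed to be imported as np. With the plain * 0.01 scaling the activations shrink rapidly after the first layer, while the 1/sqrt(n) scaling keeps them at a usable magnitude.

# Comparison sketch only: naive "* 0.01" scaling vs. "/ sqrt(n_prev)" scaling
np.random.seed(0)
dims = [12288, 20, 7, 5, 1]
A_small = np.random.randn(dims[0], 100)   # 100 fake input examples
A_scaled = A_small.copy()

for l in range(1, len(dims)):
    W_small = np.random.randn(dims[l], dims[l-1]) * 0.01                 # naive scaling
    W_scaled = np.random.randn(dims[l], dims[l-1]) / np.sqrt(dims[l-1])  # scaling used above
    A_small = np.maximum(0, np.dot(W_small, A_small))    # ReLU forward step
    A_scaled = np.maximum(0, np.dot(W_scaled, A_scaled)) # ReLU forward step
    print("layer %d: std with *0.01 = %.2e, std with /sqrt(n) = %.2e"
          % (l, A_small.std(), A_scaled.std()))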



3. Define the forward propagation

# Linear part of forward propagation
def linear_forward(A, W, b):
    
    Z = np.dot(W, A) + b # compute the linear value Z = WA + b
    cache = (A, W, b) # cache A, W, b for use in the backward pass
    
    return Z, cache

# Linear step followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b) # compute the linear value
    
    # choose the activation function as required
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = relu(Z)
        
    cache = (linear_cache, activation_cache)
    return A, cache

# Forward propagation through the whole network
def L_model_forward(X, parameters):
    
    A = X # X is the input data
    L = len(parameters) // 2 # number of weight layers (matches the layer count as we usually define it)
    caches = []
    
    for l in range(1, L): # ReLU for the hidden layers
        A_prev = A
        
        A, cache = linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation='relu')
        caches.append(cache)
    
    # sigmoid for the output layer
    AL, cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation='sigmoid')
    caches.append(cache)
    
    assert(AL.shape == (1, X.shape[1]))
        
    return AL, caches


4. Define the cost function

# Cross-entropy cost function
def compute_cost(AL, Y):
    
    m = Y.shape[1] # number of training examples
    cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m
    cost = np.squeeze(cost) # turn the (1,1) array into a scalar
    return cost

Note: if you move the division by m to the front of the expression, it must be written as "cost = - 1.0 / m * np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True)", because under Python 2's integer division "- 1 / m" evaluates to -1 rather than -1/m and silently scales the cost.
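
A related, optional tweak of my own (not in the original assignment): if the sigmoid output ever saturates to exactly 0 or 1, np.log returns -inf and the cost becomes nan. Clipping AL slightly away from the boundaries keeps the cost finite without noticeably changing it. compute_cost_stable below is a hypothetical helper for illustration only.

# Numerically safer variant of the cost (sketch, not part of the course code)
def compute_cost_stable(AL, Y, eps=1e-12):
    m = Y.shape[1]
    AL = np.clip(AL, eps, 1 - eps)  # keep log() finite when AL saturates
    cost = - np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)) / m
    return np.squeeze(cost)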


5. Define the backward propagation

# Linear part of backward propagation
def linear_backward(dZ, cache):
    
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db

def relu_backward(dA, cache):    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def linear_activation_backward(dA, cache, activation='sigmoid'):
    
    linear_cache, activation_cache = cache
    
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache) 
    
    return dA_prev, dW, db

# Backward propagation through the whole network
def L_model_backward(AL, Y, caches):
    
    grads = {}
    L = len(caches) # number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # make Y the same shape as AL
    
    # derivative of the cost with respect to AL, the output of the last layer
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # gradients of the last (sigmoid) layer; note that the key "dA"+str(L) actually
    # stores the gradient passed back to layer L-1
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")
   
    # walk backwards through the ReLU layers
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+2)], current_cache, activation = "relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads


6. Update the parameters

# Update the parameters with one step of gradient descent
def update_parameters(parameters, grads, learning_rate):
    
    L = len(parameters) // 2 # number of weight layers (integer division)
    for l in range(L):
        parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW'+str(l+1)]
        parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db'+str(l+1)]
    
    return parameters


7. The full training loop

def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False): # lr was 0.009
   
    np.random.seed(1)
    costs = []  # record the cost values
    
    parameters = initialize_parameters_deep(layers_dims) # initialize the parameters
    
    # gradient descent loop
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters) # forward propagation
        cost = compute_cost(AL, Y) # compute the cost
        grads = L_model_backward(AL, Y, caches) # backward propagation
        parameters = update_parameters(parameters, grads, learning_rate) # update the parameters
        # print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
            costs.append(cost)
        
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
   
    return parameters


8. Training and testing

def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    
    parameters = L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
    
    return parameters

def predict_data(X, Y, parameters):
    AL, caches = L_model_forward(X, parameters)
    
    AL[AL >= 0.5] = 1  # threshold the sigmoid output at 0.5
    AL[AL < 0.5] = 0
    print('Accuracy:', np.sum(AL == Y) * 1.0 / Y.shape[1])
    
    
if __name__ == '__main__':
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()  # load the dataset
    X = train_data_x.reshape(209, 64*64*3).T * 1.0 / 255 # flatten the training images into a matrix: rows are features, columns are examples
    Y = train_data_y
    X2 = test_data_x.reshape(50, 64*64*3).T * 1.0 / 255 # flatten the test images the same way
    Y2 = test_data_y
    
    row_count = 64*64*3 # number of features per example
    examples_count = 209 # number of training examples
    
    layers_dims = [12288, 20, 7, 5, 1] # [12288, 4, 1]
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500) # train the parameters
    print('Train', end=' ')
    predict_data(X, Y, parameters) # predict with the trained parameters
    print('Test', end=' ')
    predict_data(X2, Y2, parameters) # predict with the trained parameters


9. Summary


       Testing shows that the way the parameters are initialized is very important, that a deeper network is not necessarily better, and that more iterations are not necessarily better either; a rough way to compare configurations is sketched below.
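
As a rough check of those observations, here is a sketch of my own that reuses train_model and predict_data from above and assumes X, Y, X2, Y2 have already been prepared as in the __main__ block; it simply retrains with a few depth settings and reports train/test accuracy. Note that each call pops up its own cost plot.

# Compare a few network depths (sketch only; each call retrains from scratch)
for dims in ([12288, 4, 1], [12288, 7, 5, 1], [12288, 20, 7, 5, 1]):
    print('layers_dims =', dims)
    params = train_model(X, Y, dims, 0.0075, 1500)
    print('Train', end=' ')
    predict_data(X, Y, params)
    print('Test', end=' ')
    predict_data(X2, Y2, params)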


10. Complete code

import numpy as np
import math
import h5py
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets
from testCases_v2 import *
import matplotlib.pyplot as plt

''' Initialize the parameters; the argument is the list of layer dimensions '''
def initialize_parameters_deep(layer_dims):
    np.random.seed(3) # fix the random seed for reproducibility
    L = len(layer_dims) # number of entries in layer_dims (one more than the number of weight layers as we usually count them)
    parameters = {} # dictionary holding the parameters
    
    for l in range(1, L): # create the weights W and biases b for each layer
        parameters['W'+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1]) # * 0.01
        parameters['b'+str(l)] = np.zeros((layer_dims[l], 1))
        
        assert(parameters['W'+str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b'+str(l)].shape == (layer_dims[l], 1))
    
    return parameters

# Sigmoid activation function
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    assert(A.shape == Z.shape)
    cache = Z  # cache Z for use in backpropagation
    return A, cache

# Rectified linear unit (ReLU) activation function
def relu(Z):
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    cache = Z  # cache Z for use in backpropagation
    return A, cache

# Linear part of forward propagation
def linear_forward(A, W, b):
    
    Z = np.dot(W, A) + b # compute the linear value Z = WA + b
    cache = (A, W, b) # cache A, W, b for use in the backward pass
    
    return Z, cache

# Linear step followed by the activation
def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b) # compute the linear value
    
    # choose the activation function as required
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = relu(Z)
        
    cache = (linear_cache, activation_cache)
    return A, cache

# Forward propagation through the whole network
def L_model_forward(X, parameters):
    
    A = X # X is the input data
    L = len(parameters) // 2 # number of weight layers (matches the layer count as we usually define it)
    caches = []
    
    for l in range(1, L): # ReLU for the hidden layers
        A_prev = A
        
        A, cache = linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation='relu')
        caches.append(cache)
    
    # sigmoid for the output layer
    AL, cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation='sigmoid')
    caches.append(cache)
    
    assert(AL.shape == (1, X.shape[1]))
        
    return AL, caches
    
# Cross-entropy cost function
def compute_cost(AL, Y):
    
    m = Y.shape[1] # number of training examples
    cost = - np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL), axis=1, keepdims=True) / m
    cost = np.squeeze(cost) # turn the (1,1) array into a scalar
    return cost

# Linear part of backward propagation
def linear_backward(dZ, cache):
    
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db

def relu_backward(dA, cache):    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def linear_activation_backward(dA, cache, activation='sigmoid'):
    
    linear_cache, activation_cache = cache
    
    if activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache) 
    
    return dA_prev, dW, db

# Backward propagation through the whole network
def L_model_backward(AL, Y, caches):
    
    grads = {}
    L = len(caches) # number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # make Y the same shape as AL
    
    # derivative of the cost with respect to AL, the output of the last layer
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # gradients of the last (sigmoid) layer; note that the key "dA"+str(L) actually
    # stores the gradient passed back to layer L-1
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")
   
    # walk backwards through the ReLU layers
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+2)], current_cache, activation = "relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

# Update the parameters with one step of gradient descent
def update_parameters(parameters, grads, learning_rate):
    
    L = len(parameters) // 2 # number of weight layers (integer division)
    for l in range(L):
        parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW'+str(l+1)]
        parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db'+str(l+1)]
    
    return parameters

def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False): # lr was 0.009
   
    np.random.seed(1)
    costs = []  # record the cost values
    
    parameters = initialize_parameters_deep(layers_dims) # initialize the parameters
    
    # gradient descent loop
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters) # forward propagation
        cost = compute_cost(AL, Y) # compute the cost
        grads = L_model_backward(AL, Y, caches) # backward propagation
        parameters = update_parameters(parameters, grads, learning_rate) # update the parameters
        # print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
            costs.append(cost)
        
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
   
    return parameters

def load_dataset():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def train_model(X, Y, layers_dims, learning_rate, num_iterations):
    
    parameters = L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True)
    
    return parameters

def predict_data(X, Y, parameters):
    AL, caches = L_model_forward(X, parameters)
    
    AL[AL >= 0.5] = 1  # threshold the sigmoid output at 0.5
    AL[AL < 0.5] = 0
    print('Accuracy:', np.sum(AL == Y) * 1.0 / Y.shape[1])
    
    
if __name__ == '__main__':
    train_data_x, train_data_y, test_data_x, test_data_y, classes = load_dataset()  # load the dataset
    X = train_data_x.reshape(209, 64*64*3).T * 1.0 / 255 # flatten the training images into a matrix: rows are features, columns are examples
    Y = train_data_y
    X2 = test_data_x.reshape(50, 64*64*3).T * 1.0 / 255 # flatten the test images the same way
    Y2 = test_data_y
    
    row_count = 64*64*3 # number of features per example
    examples_count = 209 # number of training examples
    
    layers_dims = [12288, 20, 7, 5, 1] # [12288, 4, 1]
    parameters = train_model(X, Y, layers_dims, 0.0075, 1500) # train the parameters
    print('Train', end=' ')
    predict_data(X, Y, parameters) # predict with the trained parameters
    print('Test', end=' ')
    predict_data(X2, Y2, parameters) # predict with the trained parameters

