In the previous post on Logistic Regression, we noted that Logistic Regression can be viewed as a neural network with zero hidden layers. This post introduces the shallow neural network, which has one hidden layer. Understanding the shallow network is a big help for understanding deep networks, because optimizing each layer of a deep network uses the same theory. As before, we will walk through the network in terms of forward and backward propagation.
1. Forward Propagation
A shallow neural network is shown in the figure below (figure from Andrew Ng's notebook); it differs from the Logistic Regression network only by the one hidden layer.
From the network diagram we can write out the forward-propagation equations:
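$$
\begin{aligned}
Z^{[1]} &= W^{[1]} X + b^{[1]} \\
A^{[1]} &= g^{[1]}(Z^{[1]}) \\
Z^{[2]} &= W^{[2]} A^{[1]} + b^{[2]} \\
A^{[2]} &= g^{[2]}(Z^{[2]}) = \sigma(Z^{[2]})
\end{aligned}
$$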
Here the superscript $[i]$ marks the $i$-th layer, and $g^{[i]}$ is the activation function of layer $i$. The hidden layer's activation is usually set to ReLU, for the reasons given in the previous post; the code below uses tanh instead, as in Andrew Ng's notebook.
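Since the tanh derivative shows up in the backpropagation formulas below, it is worth writing both activations down:

$$
\mathrm{ReLU}(z) = \max(0, z), \qquad \tanh'(z) = 1 - \tanh^2(z)
$$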
2. Loss Function
The loss function is the same as in Logistic Regression:
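$$
\mathcal{L}(a^{[2]}, y) = -\left(y \log a^{[2]} + (1 - y)\log(1 - a^{[2]})\right)
$$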
For all $m$ training samples, the cost function is likewise the same as the one in Logistic Regression:
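$$
J = -\frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} \log a^{[2](i)} + (1 - y^{(i)})\log(1 - a^{[2](i)})\right)
$$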
3. Backward Propagation
Backpropagation is similar to Logistic Regression: it is nothing more than chain-rule differentiation. The results for a single sample are given directly:
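$$
\begin{aligned}
dz^{[2]} &= a^{[2]} - y \\
dW^{[2]} &= dz^{[2]} \, a^{[1]T} \\
db^{[2]} &= dz^{[2]} \\
dz^{[1]} &= W^{[2]T} dz^{[2]} \ast g^{[1]\prime}(z^{[1]}) \\
dW^{[1]} &= dz^{[1]} \, x^{T} \\
db^{[1]} &= dz^{[1]}
\end{aligned}
$$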
For multiple samples, vectorized over the whole batch:
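$$
\begin{aligned}
dZ^{[2]} &= A^{[2]} - Y \\
dW^{[2]} &= \frac{1}{m} dZ^{[2]} A^{[1]T} \\
db^{[2]} &= \frac{1}{m} \sum_{i=1}^{m} dZ^{[2](i)} \\
dZ^{[1]} &= W^{[2]T} dZ^{[2]} \ast g^{[1]\prime}(Z^{[1]}) \\
dW^{[1]} &= \frac{1}{m} dZ^{[1]} X^{T} \\
db^{[1]} &= \frac{1}{m} \sum_{i=1}^{m} dZ^{[1](i)}
\end{aligned}
$$

With tanh as the hidden activation, $g^{[1]\prime}(Z^{[1]}) = 1 - (A^{[1]})^2$, which is exactly the `(1 - np.power(A1, 2))` term in the code below.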
4. Updating the Parameters
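Each gradient-descent step moves every parameter against its gradient, scaled by the learning rate $\alpha$:

$$
W^{[i]} := W^{[i]} - \alpha \, dW^{[i]}, \qquad b^{[i]} := b^{[i]} - \alpha \, db^{[i]}
$$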
5. Python Source Code
import numpy as np
import matplotlib.pyplot as plt
import h5py
from testCases import *
import sklearn
import sklearn.datasets
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets
%matplotlib inline
np.random.seed(1)
def layer_sizes(X, Y):
    """Return (n_x, n_y): sizes of the input and output layers."""
    n_x = X.shape[0]  # number of input features
    n_y = Y.shape[0]  # number of output units
    return (n_x, n_y)
def initialize_parameters(n_x, n_h, n_y):
    """Initialize weights with small random values (to break symmetry) and biases with zeros."""
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters
def forward_propagation(X, parameters):
    """Compute the forward pass and cache the intermediate values for backprop."""
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)          # hidden-layer activation: tanh
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)          # output-layer activation: sigmoid
    assert A2.shape == (1, X.shape[1])
    cache = {"Z1": Z1,
             "Z2": Z2,
             "A1": A1,
             "A2": A2}
    return A2, cache
def compute_cost(A2, Y):
    """Cross-entropy cost J averaged over all m samples."""
    m = Y.shape[1]
    logprobs = np.dot(Y, np.log(A2).T) + np.dot(1 - Y, np.log(1 - A2).T)
    cost = -1.0 / m * np.sum(logprobs)
    cost = float(np.squeeze(cost))  # strip extra dimensions -> plain Python float
    assert isinstance(cost, float)
    return cost
def back_propagation(X, Y, cache, parameters):
    """Compute the gradients using the vectorized formulas from section 3."""
    m = Y.shape[1]
    A1 = cache["A1"]
    A2 = cache["A2"]
    W2 = parameters["W2"]
    dZ2 = A2 - Y
    dW2 = 1.0 / m * np.dot(dZ2, A1.T)
    db2 = 1.0 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))  # (1 - A1^2) is tanh'(Z1)
    dW1 = 1.0 / m * np.dot(dZ1, X.T)
    db1 = 1.0 / m * np.sum(dZ1, axis=1, keepdims=True)
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    return grads
def update_parameters(grads, parameters, learning_rate=1.2):
    """One gradient-descent step: each parameter moves against its gradient."""
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    b1 = parameters["b1"]
    b2 = parameters["b2"]
    dW1 = grads["dW1"]
    dW2 = grads["dW2"]
    db1 = grads["db1"]
    db2 = grads["db2"]
    W1 -= learning_rate * dW1
    W2 -= learning_rate * dW2
    b1 -= learning_rate * db1
    b2 -= learning_rate * db2
    parameters = {"W1": W1,
                  "W2": W2,
                  "b1": b1,
                  "b2": b2}
    return parameters
def model(X, Y, n_h, num_iterations=10000, learning_rate=1.2, show_costs=False):
    """Tie everything together: initialize, then loop forward / cost / backward / update."""
    n_x, n_y = layer_sizes(X, Y)
    params = initialize_parameters(n_x, n_h, n_y)
    for i in range(num_iterations):
        A2, cache = forward_propagation(X, parameters=params)
        cost = compute_cost(A2, Y)
        grads = back_propagation(X, Y, cache, params)
        params = update_parameters(grads, params, learning_rate)
        if show_costs and i % 1000 == 0:
            print("cost after iteration %i: %f" % (i, cost))
    return params
def predict(parameters, X):
    """Predict labels by thresholding the output activation at 0.5."""
    A2, _ = forward_propagation(X, parameters)
    predictions = (A2 > 0.5)
    return predictions
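To try the model end to end, here is a minimal usage sketch. It assumes load_planar_dataset (imported above from planar_utils) returns X with shape (2, m) and Y with shape (1, m), as in the course assignment:

# Minimal usage sketch -- assumes load_planar_dataset() returns
# X of shape (2, m) and Y of shape (1, m), as in the course materials.
X, Y = load_planar_dataset()
parameters = model(X, Y, n_h=4, num_iterations=10000, show_costs=True)
predictions = predict(parameters, X)
print("train accuracy: %.2f%%" % (100 * np.mean(predictions == Y)))
plot_decision_boundary(lambda x: predict(parameters, x.T), X, Y)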