深度学习初始化，正则化，梯度检验

最新推荐文章于 2024-08-08 20:39:10 发布

热爱技术的小曹

最新推荐文章于 2024-08-08 20:39:10 发布

阅读量672

点赞数

文章标签：深度学习 机器学习 python

本文链接：https://blog.csdn.net/jianjiange1/article/details/131375520

版权

1.init_utils

# -*- coding: utf-8 -*-

#init_utils.py

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets

def sigmoid(x):
    """
    Compute the sigmoid of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x) = 1 / (1 + exp(-x)), evaluated element-wise for arrays
    """
    s = 1 / (1 + np.exp(-x))
    return s

def relu(x):
    """
    Compute the relu of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x) = max(0, x), evaluated element-wise for arrays
    """
    s = np.maximum(0, x)
    return s

def compute_loss(a3, Y):
    """
    Implement the loss function (binary cross-entropy averaged over examples).

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    loss -- value of the loss function
    """
    # Y.shape[1] is used as the number of examples.
    m = Y.shape[1]
    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    # nansum skips the NaN terms produced by 0 * log(0) when a3 is exactly 0 or 1.
    loss = 1. / m * np.nansum(logprobs)

    return loss

def forward_propagation(X, parameters):
    """
    Implement the forward pass of the three-layer network
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing the parameters
                  "W1", "b1", "W2", "b2", "W3", "b3"; the shapes must be
                  compatible with np.dot(W1, X), np.dot(W2, a1) and np.dot(W3, a2)

    Returns:
    a3 -- sigmoid activation of the last layer
    cache -- tuple (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) consumed by
             backward_propagation()
    """
    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    return a3, cache

def backward_propagation(X, Y, cache):
    """
    Implement the backward pass matching forward_propagation().

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true label vector, subtracted element-wise from a3
    cache -- cache output from forward_propagation():
             (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter,
                 activation and pre-activation variable (keys "dz3", "dW3", "db3",
                 "da2", "dz2", "dW2", "db2", "da1", "dz1", "dW1", "db1")
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    # The 1/m factor is applied once here and carried through every later gradient.
    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    # ReLU derivative: pass the gradient only where the activation was positive.
    dz2 = np.multiply(da2, a2 > 0)
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, a1 > 0)
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}

    return gradients

def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent.

    Arguments:
    parameters -- python dictionary containing your parameters ("W1", "b1", ...)
    grads -- python dictionary containing your gradients ("dW1", "db1", ...)
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- the same dictionary object, with updated entries
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    # Each layer contributes one "W" and one "b" entry.
    L = len(parameters) // 2

    # Update rule for each parameter
    for k in range(1, L + 1):
        layer = str(k)
        parameters["W" + layer] = parameters["W" + layer] - learning_rate * grads["dW" + layer]
        parameters["b" + layer] = parameters["b" + layer] - learning_rate * grads["db" + layer]

    return parameters

def predict(X, y, parameters):
    """
    Predict 0/1 labels with the three-layer network and print the accuracy.

    Arguments:
    X -- data set of examples you would like to label, one example per column
    y -- true labels; row 0 is compared with the predictions
    parameters -- parameters of the trained model, passed to forward_propagation()

    Returns:
    p -- integer array of shape (1, X.shape[1]) holding the predictions
    """
    m = X.shape[1]
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    p = np.zeros((1, m), dtype=int)

    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probas to 0/1 predictions (threshold 0.5, strict)
    p[0, :] = a3[0, :] > 0.5

    # print results
    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))

    return p

def load_dataset(is_plot=True):
    """
    Generate the "circles" train and test sets with sklearn.datasets.make_circles.

    Arguments:
    is_plot -- when true, draw a scatter plot of the training points

    Returns:
    train_X -- training inputs, transposed so that each column is one example
    train_Y -- training labels reshaped to (1, number of training examples)
    test_X -- test inputs, transposed so that each column is one example
    test_Y -- test labels reshaped to (1, number of test examples)
    """
    # Fixed seeds so both sets are reproducible between runs.
    np.random.seed(1)
    train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
    np.random.seed(2)
    test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)
    # Visualize the data
    if is_plot:
        plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y

def plot_decision_boundary(model, X, y):
    """
    Plot the decision regions of a model together with the data points.

    Arguments:
    model -- callable that receives an array with one grid point per row
             (two columns: x1, x2) and returns one value per grid point
    X -- data points; row 0 is plotted on the x axis and row 1 on the y axis
    y -- values used to colour the scattered data points
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()

def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data, passed unchanged to forward_propagation()

    Returns:
    predictions -- boolean array, True where the network output exceeds 0.5
    """
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions

2. reg_utils

# -*- coding: utf-8 -*-

#reg_utils.py

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio

def sigmoid(x):
    """
    Compute the sigmoid of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x) = 1 / (1 + exp(-x)), evaluated element-wise for arrays
    """
    s = 1 / (1 + np.exp(-x))
    return s

def relu(x):
    """
    Compute the relu of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x) = max(0, x), evaluated element-wise for arrays
    """
    s = np.maximum(0, x)
    return s

def initialize_parameters(layer_dims):
    """
    Create randomly initialised weights and zero biases for every layer.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)

    Tips:
    - For example, layer_dims = [2, 2, 1] gives W1 of shape (2, 2), b1 of shape (2, 1),
      W2 of shape (1, 2) and b2 of shape (1, 1).
    - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
    """
    # Reseeds NumPy's global generator so every call returns the same weights.
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network, input layer included

    for l in range(1, L):
        # Standard-normal weights scaled by 1/sqrt(fan-in).
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        # Compare against real shape tuples: "assert(cond, x)" tests a non-empty
        # tuple and can never fail.
        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters

def forward_propagation(X, parameters):
    """
    Implement the forward pass of the three-layer network
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing the parameters
                  "W1", "b1", "W2", "b2", "W3", "b3"; the shapes must be
                  compatible with np.dot(W1, X), np.dot(W2, a1) and np.dot(W3, a2)

    Returns:
    a3 -- sigmoid activation of the last layer
    cache -- tuple (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) consumed by
             backward_propagation()
    """
    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    z1 = np.dot(W1, X) + b1
    a1 = relu(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    return a3, cache

def compute_cost(a3, Y):
    """
    Implement the cost function (binary cross-entropy averaged over examples).

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost -- value of the cost function
    """
    # Y.shape[1] is used as the number of examples.
    m = Y.shape[1]

    logprobs = np.multiply(-np.log(a3), Y) + np.multiply(-np.log(1 - a3), 1 - Y)
    # nansum skips the NaN terms produced by 0 * log(0) when a3 is exactly 0 or 1.
    cost = 1. / m * np.nansum(logprobs)

    return cost

def backward_propagation(X, Y, cache):
    """
    Implement the backward pass matching forward_propagation().

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true label vector, subtracted element-wise from a3
    cache -- cache output from forward_propagation():
             (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter,
                 activation and pre-activation variable (keys "dz3", "dW3", "db3",
                 "da2", "dz2", "dW2", "db2", "da1", "dz1", "dW1", "db1")
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    # The 1/m factor is applied once here and carried through every later gradient.
    dz3 = 1. / m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    da2 = np.dot(W3.T, dz3)
    # ReLU derivative: pass the gradient only where the activation was positive.
    dz2 = np.multiply(da2, a2 > 0)
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    da1 = np.dot(W2.T, dz2)
    dz1 = np.multiply(da1, a1 > 0)
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
                 "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
                 "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}

    return gradients

def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent.

    Arguments:
    parameters -- python dictionary containing your parameters ("W1", "b1", ...)
    grads -- python dictionary containing your gradients ("dW1", "db1", ...)
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- the same dictionary object, with updated entries
                  parameters['W' + str(i)] = ...
                  parameters['b' + str(i)] = ...
    """
    # Each layer contributes one "W" and one "b" entry.
    L = len(parameters) // 2

    # Update rule for each parameter
    for k in range(1, L + 1):
        layer = str(k)
        parameters["W" + layer] = parameters["W" + layer] - learning_rate * grads["dW" + layer]
        parameters["b" + layer] = parameters["b" + layer] - learning_rate * grads["db" + layer]

    return parameters

def load_2D_dataset(is_plot=True):
    """
    Load the 2D data set stored in 'datasets/data.mat'.

    Arguments:
    is_plot -- when true, draw a scatter plot of the training points

    Returns:
    train_X -- transpose of the 'X' entry of the .mat file
    train_Y -- transpose of the 'y' entry
    test_X -- transpose of the 'Xval' entry
    test_Y -- transpose of the 'yval' entry
    """
    # The path is relative to the current working directory.
    data = sio.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T
    if is_plot:
        plt.scatter(train_X[0, :], train_X[1, :], c=train_Y, s=40, cmap=plt.cm.Spectral)

    return train_X, train_Y, test_X, test_Y

def predict(X, y, parameters):
    """
    Predict 0/1 labels with the three-layer network and print the accuracy.

    Arguments:
    X -- data set of examples you would like to label, one example per column
    y -- true labels; row 0 is compared with the predictions
    parameters -- parameters of the trained model, passed to forward_propagation()

    Returns:
    p -- integer array of shape (1, X.shape[1]) holding the predictions
    """
    m = X.shape[1]
    # np.int was removed in NumPy 1.24; the builtin int selects the same dtype.
    p = np.zeros((1, m), dtype=int)

    # Forward propagation
    a3, caches = forward_propagation(X, parameters)

    # convert probas to 0/1 predictions (threshold 0.5, strict)
    p[0, :] = a3[0, :] > 0.5

    # print results
    print("Accuracy: " + str(np.mean((p[0, :] == y[0, :]))))

    return p

def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data, passed unchanged to forward_propagation()

    Returns:
    predictions -- boolean array, True where the network output exceeds 0.5
    """
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = forward_propagation(X, parameters)
    predictions = (a3 > 0.5)
    return predictions

3. gc_utils

# -*- coding: utf-8 -*-

#gc_utils.py

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """
    Compute the sigmoid of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(x) = 1 / (1 + exp(-x)), evaluated element-wise for arrays
    """
    s = 1 / (1 + np.exp(-x))
    return s

def relu(x):
    """
    Compute the relu of x.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- relu(x) = max(0, x), evaluated element-wise for arrays
    """
    s = np.maximum(0, x)
    return s

def dictionary_to_vector(parameters):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.

    Arguments:
    parameters -- python dictionary with the entries "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    theta -- column vector of shape (total number of values, 1), the flattened
             parameters stacked in the order W1, b1, W2, b2, W3, b3
    keys -- list with one entry per row of theta naming the parameter it came from
    """
    keys = []
    columns = []
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
        # flatten parameter
        new_vector = np.reshape(parameters[key], (-1, 1))
        keys.extend([key] * new_vector.shape[0])
        columns.append(new_vector)

    theta = np.concatenate(columns, axis=0)

    return theta, keys

def vector_to_dictionary(theta):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.

    The layout is hard-coded for a 4 -> 5 -> 3 -> 1 network, so theta must hold
    at least 47 values along its first axis.

    Arguments:
    theta -- vector whose first 47 entries are W1, b1, W2, b2, W3, b3 flattened in that order

    Returns:
    parameters -- python dictionary with "W1" (5, 4), "b1" (5, 1), "W2" (3, 5),
                  "b2" (3, 1), "W3" (1, 3) and "b3" (1, 1)
    """
    parameters = {}
    parameters["W1"] = theta[:20].reshape((5, 4))
    parameters["b1"] = theta[20:25].reshape((5, 1))
    parameters["W2"] = theta[25:40].reshape((3, 5))
    parameters["b2"] = theta[40:43].reshape((3, 1))
    parameters["W3"] = theta[43:46].reshape((1, 3))
    parameters["b3"] = theta[46:47].reshape((1, 1))

    return parameters

def gradients_to_vector(gradients):
    """
    Roll all our gradients dictionary into a single vector satisfying our specific required shape.

    Arguments:
    gradients -- python dictionary containing at least "dW1", "db1", "dW2", "db2",
                 "dW3", "db3"; any other entries are ignored

    Returns:
    theta -- column vector of shape (total number of values, 1), the flattened
             gradients stacked in the order dW1, db1, dW2, db2, dW3, db3
    """
    columns = []
    for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]:
        # flatten parameter
        columns.append(np.reshape(gradients[key], (-1, 1)))

    theta = np.concatenate(columns, axis=0)

    return theta

4. 导入相关的库

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import init_utils #第一部分，初始化
import reg_utils #第二部分，正则化
import gc_utils #第三部分，梯度校验
# %matplotlib inline  # uncomment this line if you are using a Jupyter Notebook
plt.rcParams['figure.figsize'] = (7.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

5.初始化参数

1.读取并绘制数据集

# Load the train/test split; is_plot=True also draws the training points as a scatter plot.
train_X, train_Y, test_X, test_Y=init_utils.load_dataset(is_plot=True)

2.我们的模型

def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he", is_polt=True):
    """
    Train a three-layer neural network: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- labels (0 | 1), of shape (1, number of examples)
    learning_rate -- learning rate used by gradient descent
    num_iterations -- number of gradient-descent iterations
    print_cost -- whether to print the cost; it is printed once every 1000 iterations
    initialization -- string selecting the initializer: "zeros" | "random" | "he"
    is_polt -- whether to plot the cost curve after training (the misspelt name is
               kept because callers may pass it by keyword)

    Returns:
    parameters -- the parameters after training

    Raises:
    ValueError -- if initialization is not one of the supported names
    """
    costs = []
    layers_dims = [X.shape[0], 10, 5, 1]

    # Select the initialization scheme. The initialize_parameters_* helpers are
    # expected to be defined elsewhere in this script (not shown in this listing).
    if initialization == "zeros":
        parameters = initialize_parameters_zeros(layers_dims)
    elif initialization == "random":
        parameters = initialize_parameters_random(layers_dims)
    elif initialization == "he":
        parameters = initialize_parameters_he(layers_dims)
    else:
        print("错误的初始化参数！程序退出")
        # A bare "exit" does nothing; stop here instead of failing later on an
        # unbound "parameters".
        raise ValueError("unknown initialization: " + repr(initialization))

    # Training loop
    for i in range(0, num_iterations):
        # Forward propagation
        a3, cache = init_utils.forward_propagation(X, parameters)

        # Compute the cost
        cost = init_utils.compute_loss(a3, Y)

        # Backward propagation
        grads = init_utils.backward_propagation(X, Y, cache)

        # Update the parameters
        parameters = init_utils.update_parameters(parameters, grads, learning_rate)

        # Record the cost every 1000 iterations
        if i % 1000 == 0:
            costs.append(cost)
            # Print the cost
            if print_cost:
                print("第" + str(i) + "次迭代，成本值为：" + str(cost))

    # Training finished: plot the cost curve
    if is_polt:
        plt.plot(costs)
        plt.ylabel('cost')
        # One point is recorded per 1000 iterations, so the axis is in thousands.
        plt.xlabel('iterations (per thousands)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return the trained parameters
    return parameters

热爱技术的小曹

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
深度学习初始化，正则化，梯度检验

实现一个三层的神经网络：LINEAR ->RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID。#%matplotlib inline #如果你使用的是Jupyter Notebook，请取消注释。import gc_utils #第三部分，梯度校验。import init_utils #第一部分，初始化。import reg_utils #第二部分，正则化。
复制链接

扫一扫