#深度学习模型具有很高的灵活性和能力,如果训练数据集不够大,将会造成一个严重的问题--过拟合。尽管它在训练集上效果很好,但是学到的网络不能应用到测试集中!
# 你将学习: 在深度学习模型中使用正则化。
import numpy as np
import matplotlib.pyplot as plt
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases_L2W1 import *
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
#你刚刚被法国足球公司聘为AI专家。他们希望你推荐预测法国守门员将球踢出的位置,以便法国队的球员可以用头将球击中。
#守门员将球踢到空中,每支球队的球员都在尽力用头击球
# 他们为你提供了法国过去10场比赛的二维数据集。
#如果圆点为蓝色,则表示法国球员设法用头部将球击中
# 如果圆点为红色,则表示另一支球队的球员用头撞球
# Load the 2D "football" dataset: blue dots mark positions where the French
# player headed the ball, red dots where the opposing team's player did.
train_X, train_Y, test_X, test_Y=load_2D_dataset()
plt.show()
# 你将使用以下神经网络(已为你实现),可以如下使用此模型:
# 在regularization mode中,通过lambd将输入设置为非零值。我们使用lambd代替lambda,因为lambda是Python中的保留关键字。
# 在dropout mode中,将keep_prob设置为小于1的值
# 首先,你将尝试不进行任何正则化的模型。然后,你将实现:
# L2 正则化 函数:compute_cost_with_regularization()和backward_propagation_with_regularization()
# Dropout 函数:forward_propagation_with_dropout()和backward_propagation_with_dropout()
#1. 非正则化模型
def model(X,Y,learning_rate=0.3,num_iteration=30000,print_cost=True,lambd=0,keep_prob=1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.
    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iteration -- number of iterations of the optimization loop
    print_cost -- If True, print the cost every 10000 iterations
    lambd -- L2 regularization hyperparameter, scalar (0 disables L2 regularization)
    keep_prob -- probability of keeping a neuron active during drop-out, scalar (1 disables dropout)
    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.
    """
    grads={}
    costs=[]                        # cost recorded every 1000 iterations, for the learning-curve plot
    m=X.shape[1]                    # number of training examples
    layer_dims=[X.shape[0],20,3,1]  # 3-layer architecture: input -> 20 -> 3 -> 1
    parameters=initialize_parameters(layer_dims)
    for i in range(num_iteration):
        # Forward propagation: plain, or with inverted dropout when keep_prob < 1.
        if keep_prob==1:
            a3, cache=forward_propagation(X,parameters)
        elif keep_prob<1:
            a3,cache=forward_propagation_with_dropout(X, parameters, keep_prob)
        # Cost: add the L2 penalty term when lambd != 0.
        if lambd==0:
            cost=compute_cost(a3,Y)
        else:
            cost=compute_cost_with_regularization(a3, Y, parameters, lambd)
        # Backward propagation.
        assert (lambd == 0 or keep_prob == 1) # it is possible to use both L2 regularization and dropout,
        # but this assignment will only explore one at a time
        if lambd==0 and keep_prob==1:
            grads=backward_propagation(X,Y,cache)
        elif lambd!=0:
            grads=backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob<1:
            grads=backward_propagation_with_dropout(X, Y, cache, keep_prob)
        parameters=update_parameters(parameters,grads,learning_rate)
        if print_cost and i%10000==0:
            print("Cost after iteration %i: %f" % (i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)
    # Plot the learning curve (one point per 1000 iterations).
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters
# Train the model without any regularization and observe train/test accuracy.
parameters = model(train_X, train_Y)
print ("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)
# Plot the decision boundary of the unregularized model.
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
#2.L2正则化
def compute_cost_with_regularization(A3,Y,parameters,lambd):
    """
    Cross-entropy cost plus the L2 (Frobenius-norm) penalty on the weights.

    Arguments:
    A3 -- post-activation output of forward propagation, shape (output size, number of examples)
    Y -- "true" labels vector, shape (output size, number of examples)
    parameters -- dict containing the weight matrices W1, W2, W3 (and biases)
    lambd -- L2 regularization hyperparameter, scalar

    Returns:
    cost -- cross-entropy cost + (lambd / 2m) * sum of squared weight entries
    """
    m = Y.shape[1]
    cross_entropy_cost = compute_cost(A3, Y)
    # Sum the squared entries of every weight matrix; biases are not penalized.
    weight_norms = sum(np.sum(np.square(parameters[name])) for name in ("W1", "W2", "W3"))
    L2_regularization_cost = lambd / (2 * m) * weight_norms
    return cross_entropy_cost + L2_regularization_cost
# Sanity-check compute_cost_with_regularization on the course-provided test case.
A3, Y_assess, parameters = compute_cost_with_regularization_test_case()
print("cost = " + str(compute_cost_with_regularization(A3, Y_assess, parameters, lambd = 0.1)))
#因为你更改了损失,所以还必须更改反向传播! 必须针对新损失函数计算所有梯度
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Backward pass for LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID when the cost
    includes the L2 penalty: every dW gains an extra (lambd / m) * W term.

    Arguments:
    X -- input data, shape (input size, number of examples)
    Y -- true labels, shape (output size, number of examples)
    cache -- (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) from forward propagation
    lambd -- L2 regularization hyperparameter, scalar

    Returns:
    gradients -- dict with the gradients of each dZ, dW, db and hidden dA
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    # Output layer: sigmoid + cross-entropy simplifies to A3 - Y.
    dZ3 = A3 - Y
    dW3 = (1 / m) * np.dot(dZ3, A2.T) + (lambd / m) * W3
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    # Second hidden layer: ReLU passes gradient only where the activation was positive.
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2 = (1 / m) * np.dot(dZ2, A1.T) + (lambd / m) * W2
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # First hidden layer.
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1 = (1 / m) * np.dot(dZ1, X.T) + (lambd / m) * W1
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
            "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
            "dZ1": dZ1, "dW1": dW1, "db1": db1}
# Sanity-check the regularized backward pass on the course-provided test case.
X_assess, Y_assess, cache = backward_propagation_with_regularization_test_case()
grads = backward_propagation_with_regularization(X_assess, Y_assess, cache, lambd = 0.7)
# print ("dW1 = "+ str(grads["dW1"]))
# print ("dW2 = "+ str(grads["dW2"]))
# print ("dW3 = "+ str(grads["dW3"]))
# Now train the model with L2 regularization (lambda = 0.7); model() will call:
# compute_cost_with_regularization instead of compute_cost, and
# backward_propagation_with_regularization instead of backward_propagation.
parameters=model(train_X,train_Y,lambd=0.7)
print ("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)
plt.title("Model with L2-regularization")
axes=plt.gca()  # grab current axes so we can fix the plot limits
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
# lambda的值是你可以调整开发集的超参数。
# L2正则化使决策边界更平滑。如果lambda太大,则也可能“过度平滑”,从而使模型偏差较高。
# L2正则化的影响:
# 损失计算:
# - 正则化项被加入损失函数中
# 反向传播函数:
# - 权重矩阵的梯度中多出了额外的正则化项
# 权重最终变小("权重衰减"):
# - 权重被推向更小的值。
#3.Dropout
# - 朋友:“为什么你需要所有神经元来训练你的网络以分类图像?”。
# # - 你:“因为每个神经元都有权重,并且可以学习图像的特定特征/细节/形状。我拥有的神经元越多,模型学习的特征就越丰富!”
# # - 朋友:“我知道了,但是你确定你的神经元学习的是不同的特征而不是全部相同的特征吗?”
# # - 你:“这是个好问题……同一层中的神经元实际上并不关联。应该绝对有可能让他们学习相同的图像特征/形状/形式/细节...这是多余的。为此应该有一个解决方案。”
#带有Dropout的正向传播,关闭第一层和第二层中的某些神经元
def forward_propagation_with_dropout(X, parameters, keep_prob):
    """
    Forward pass: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Inverted dropout: for each hidden layer, draw a random mask D with
    P(keep) = keep_prob, zero out the dropped activations, then divide by
    keep_prob so the expected value of the activations is unchanged.

    Arguments:
    X -- input data, shape (input size, number of examples)
    parameters -- dict with W1, b1, W2, b2, W3, b3
    keep_prob -- probability of keeping a neuron active, scalar in (0, 1]

    Returns:
    A3 -- sigmoid output of the last layer
    cache -- values (including the masks D1, D2) needed for the backward pass
    """
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # Layer 1: linear -> ReLU -> dropout.
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    D1 = np.random.rand(*A1.shape) < keep_prob  # boolean keep-mask
    A1 = (A1 * D1) / keep_prob                  # drop neurons, rescale survivors

    # Layer 2: linear -> ReLU -> dropout.
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    D2 = np.random.rand(*A2.shape) < keep_prob
    A2 = (A2 * D2) / keep_prob

    # Output layer: linear -> sigmoid (no dropout on the output).
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
    return A3, cache
# Sanity-check dropout forward propagation on the course-provided test case.
X_assess, parameters = forward_propagation_with_dropout_test_case()
A3, cache = forward_propagation_with_dropout(X_assess, parameters, keep_prob = 0.7)
print ("A3 = " + str(A3))
#带有dropout的反向传播
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Backward pass for the network trained with dropout.

    The same masks D1, D2 drawn in the forward pass are re-applied to dA1, dA2
    (a dropped neuron receives no gradient), followed by the same 1/keep_prob
    rescaling used in the forward pass.

    Arguments:
    X -- input data, shape (input size, number of examples)
    Y -- true labels, shape (output size, number of examples)
    cache -- (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) from
             forward_propagation_with_dropout
    keep_prob -- probability of keeping a neuron active, scalar in (0, 1]

    Returns:
    gradients -- dict with the gradients of each dZ, dW, db and hidden dA
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    # Output layer: sigmoid + cross-entropy simplifies to A3 - Y.
    dZ3 = A3 - Y
    dW3 = (1 / m) * np.dot(dZ3, A2.T)
    db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)

    # Second hidden layer: apply mask and rescale, then the ReLU gradient.
    dA2 = np.dot(W3.T, dZ3)
    dA2 = (dA2 * D2) / keep_prob
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # First hidden layer.
    dA1 = np.dot(W2.T, dZ2)
    dA1 = (dA1 * D1) / keep_prob
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
            "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
            "dZ1": dZ1, "dW1": dW1, "db1": db1}
# Sanity-check dropout backprop on the course-provided test case.
X_assess, Y_assess, cache = backward_propagation_with_dropout_test_case()
gradients = backward_propagation_with_dropout(X_assess, Y_assess, cache, keep_prob = 0.8)
print ("dA1 = " + str(gradients["dA1"]))
print ("dA2 = " + str(gradients["dA2"]))
# Train the model with dropout (keep_prob = 0.86) and report train/test accuracy.
parameters=model(train_X,train_Y,keep_prob = 0.86)
print ("On the train set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)
plt.title("Model with dropout")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
#使用dropout时的常见错误是在训练和测试中都使用。你只能在训练中使用dropout(随机删除节点)。
#深度学习框架,例如tensorflow, PaddlePaddle, keras或者 caffe 附带dropout层的实现。不需强调-相信你很快就会学习到其中的一些框架。
# 1.dropout是一种正则化技术。
# 2.仅在训练期间使用dropout,在测试期间不要使用。
# 3.在正向和反向传播期间均应用dropout。
# 4.在训练期间,将每个dropout层除以keep_prob,以保持激活的期望值相同。例如,如果keep_prob为0.5,
# 则平均而言,我们将关闭一半的节点,因此输出将按0.5缩放,因为只有剩余的一半对解决方案有所贡献。
# 除以0.5等于乘以2,因此输出现在具有相同的期望值。你可以检查此方法是否有效,即使keep_prob的值不是0.5。
# 吴恩达神经网络学习-L2W1作业2 (Andrew Ng neural-network course, L2W1 assignment 2)
# NOTE: the two lines above/below this point were a pasted blog footer (plain text,
# not Python) and raised a SyntaxError; they are commented out to keep the file runnable.