使用Dropout防止神经网络过拟合（纯numpy实现）

1、原理简述

dropout简单来讲，就是在迭代的过程中，随机的丢弃掉某些神经元，使得其训练只包含部分神经元的网络，因为任何神经元都有可能消失，所以模型会变得对神经元不那么敏感，表现就是对参数W的压缩，起到与L2正则化类似的作用。

1->随机生成一个元素取值在 (0,1) 之间、与 A[l] 维数相同的随机矩阵 D[l]；其中 drop_prob 是神经元的保留概率

D[l] = np.random.rand(A[l].shape[0],A[l].shape[1])<drop_prob


2->丢弃部分神经元（前向传播）

A[l] = A[l]*D[l]
A[l] = A[l]/drop_prob  #保持期望值一样


3->丢弃部分神经元（后向传播）

dA[l] = dA[l]*D[l]
dA[l] = dA[l]/drop_prob  #保持期望值一样


2、实验结果

训练正确率：1.0



测试正确率：0.919431279620853



3、python实现

import numpy as np
import matplotlib.pyplot as plt
import nn_utils
# NOTE(review): train_x/train_y/test_x/test_y are assumed to be loaded
# elsewhere (via nn_utils) as (features, samples) arrays -- confirm
# against the data-loading code, which is not shown here.
nTrain = train_x.shape[1]
nTest = test_x.shape[1]
# Hyper-parameter initialisation ----------------------------------------
layers = [train_x.shape[0], 50, 35, 20, 10, 1]  # layer sizes: input dim, 4 hidden layers, 1 output
alpha = 0.02  # learning rate
drop_prob = 0.6   # dropout KEEP probability (fraction of neurons retained)
Iterations = 40000  # number of gradient-descent iterations
nLayers = len(layers)-1 # number of weight layers (excludes the input layer)
# Initialise W, b ---------------------------------------------------------
# Index 0 is a placeholder so that W[l]/b[l] line up with layer number l.
# (Original source had lost its indentation; loop body restored.)
W = [[] for i in range(nLayers+1)]
b = [[] for i in range(nLayers+1)]
for l in range(1, nLayers+1):
    # He-style initialisation: dividing by sqrt(fan_in/2) is the same as
    # scaling by sqrt(2/fan_in), which keeps ReLU activation variance
    # roughly constant across layers.
    W[l] = np.random.randn(layers[l], layers[l-1])/(np.sqrt(layers[l-1]/2))
    b[l] = np.zeros((layers[l], 1))
# Gradient holders with the same list layout. Shallow copies are fine
# because each slot is overwritten wholesale inside the training loop.
dW = W.copy()
db = b.copy()
# Initialise forward/backward caches --------------------------------------
# A[l]: activations, Z[l]: pre-activations, one (layers[l], nTrain) array
# per layer; index 0 again unused except A[0] = inputs.
# (Original source had lost its indentation; loop body restored.)
A = [[] for i in range(nLayers+1)]
Z = [[] for i in range(nLayers+1)]
for l in range(1, nLayers+1):
    A[l] = np.zeros((layers[l], nTrain))
    Z[l] = np.zeros((layers[l], nTrain))
    # Debug output: shape of each layer's cache.
    print(A[l].shape)
    print(Z[l].shape)
dA = A.copy()
dZ = Z.copy()
D = A.copy()  # per-layer dropout masks, filled in during the forward pass
A[0] = train_x
cost = []  # cost history, appended to every 2000 iterations
# Training loop -----------------------------------------------------------
# (Original source had lost its indentation; nesting reconstructed. The
# dropout lines are placed inside the hidden-layer branch because the
# backward pass only applies masks for l > 1 -- i.e. dropout is NOT
# applied to the sigmoid output layer.)
for i in range(Iterations):
    # ---- forward propagation with inverted dropout on hidden layers ----
    for l in range(1, nLayers+1):
        Z[l] = np.dot(W[l], A[l-1])+b[l]
        if l == nLayers:
            A[l] = nn_utils.sigmoid(Z[l])  # output layer, no dropout
        else:
            A[l] = nn_utils.relu(Z[l])
            # Keep each neuron independently with probability drop_prob.
            D[l] = np.random.rand(A[l].shape[0], A[l].shape[1]) < drop_prob
            A[l] = A[l]*D[l]   # randomly drop neurons
            A[l] /= drop_prob  # inverted dropout: keep expected activation unchanged
    # ---- backward propagation ----
    # Gradient of the (mean) binary cross-entropy w.r.t. Z at the sigmoid output.
    dZ[nLayers] = (A[nLayers]-train_y)/nTrain
    for l in np.arange(nLayers, 0, -1):
        dW[l] = np.dot(dZ[l], A[l-1].T)
        db[l] = np.sum(dZ[l], axis=1, keepdims=True)
        if l > 1:
            # Apply the SAME mask and rescaling used in the forward pass,
            # then back-prop through ReLU (gradient is zero where Z < 0).
            dA[l-1] = np.dot(W[l].T, dZ[l])*D[l-1]
            dA[l-1] /= drop_prob
            dZ[l-1] = dA[l-1].copy()
            dZ[l-1][Z[l-1] < 0] = 0
    # ---- gradient-descent parameter update ----
    for l in range(1, nLayers+1):
        W[l] -= alpha*dW[l]
        b[l] -= alpha*db[l]
    # ---- progress report every 2000 iterations ----
    if i % 2000 == 0:
        # Cross-entropy cost; the +0.0001 guards against log(0).
        cost_cur = -np.sum( train_y*np.log(A[nLayers]+0.0001)+(1-train_y)*np.log(1-A[nLayers]+0.0001) )/nTrain
        cost.append(cost_cur)
        print("迭代次数："+str(i)+"---cost："+str(cost_cur))

# Accuracy ----------------------------------------------------------------
# Misclassified training samples: positives predicted <= 0.5 plus
# negatives predicted >= 0.5.
train_err = np.sum(A[nLayers][train_y==1]<=0.5)+np.sum(A[nLayers][train_y==0]>=0.5)
print("训练正确率："+str(1-train_err/nTrain))
# Forward pass over the test set WITHOUT dropout; because training used
# inverted dropout (activations rescaled by 1/drop_prob), no extra
# scaling is needed at inference time.
# (Original source had lost its indentation; loop body restored.)
predict_A = test_x
for l in range(1, nLayers+1):
    predict_Z = np.dot(W[l], predict_A)+b[l]
    if l == nLayers:
        predict_A = nn_utils.sigmoid(predict_Z)
    else:
        predict_A = nn_utils.relu(predict_Z)
test_err = np.sum(predict_A[test_y==1]<=0.5)+np.sum(predict_A[test_y==0]>0.5)
# BUG FIX: 1-test_err/nTest is the test *accuracy* (test_err counts
# errors), so the label now says 正确率 to match the training print above.
print("测试正确率："+str(1-test_err/nTest))
nn_utils.plot_decision_boundary(W, b, train_x, train_y)


nn_utils.py

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio

def relu(x):
    """Element-wise rectified linear unit: max(x, 0).

    (Original source had lost its indentation; body restored.)
    """
    return np.maximum(x, 0)
def sigmoid(x):
    """Numerically stable element-wise logistic sigmoid.

    BUG FIX: the original chose one of the two algebraically equivalent
    forms for the WHOLE array based on ``(x > 0).all()``.  A mixed-sign
    array with large positive entries therefore fell into the
    ``exp(x)/(1+exp(x))`` branch and overflowed to inf/inf = NaN.  The
    form is now selected per element, so ``np.exp`` is never evaluated
    at a large positive argument.

    Parameters: x -- numpy array (any shape) of real values.
    Returns: float array of the same shape with values in (0, 1).
    """
    x = np.asarray(x, dtype=float)
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0/(1.0+np.exp(-x[pos]))        # stable for x >= 0
    ex = np.exp(x[~pos])                        # x < 0, so ex <= 1: no overflow
    out[~pos] = ex/(1.0+ex)
    return out
def plot_decision_boundary(W, b, X, Y):
    """Plot the network's decision regions over 2-D training data.

    Runs a plain forward pass (ReLU hidden layers, sigmoid output; no
    dropout) over a dense grid covering the data range, thresholds the
    output at 0.5, and draws the class regions with the samples
    scattered on top.

    (Original source had lost its indentation; body restored.)

    Parameters:
        W, b -- per-layer weight/bias lists, index 0 unused.
        X    -- (2, n_samples) input coordinates.
        Y    -- (1, n_samples) binary labels, used only for colouring.
    """
    x_min, x_max = X[0, :].min(), X[0, :].max()  # plotting range from the data
    y_min, y_max = X[1, :].min(), X[1, :].max()
    step = 0.01   # grid resolution
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))  # dense evaluation grid
    plot_samples = np.array([xx.ravel(), yy.ravel()])    # (2, n_grid) network inputs
    A = plot_samples.copy()
    for l in range(1, len(W)):
        Z = np.dot(W[l], A)+b[l]
        if l == len(W)-1:
            A = sigmoid(Z)
        else:
            A = relu(Z)
    # Threshold probabilities into hard class labels for contourf.
    A[A > 0.5] = 1
    A[A <= 0.5] = 0
    A = A.reshape(xx.shape)
    plt.contourf(xx, yy, A, cmap=plt.cm.Spectral)
    plt.xlabel('x1')
    plt.ylabel('y2')
    plt.scatter(X[0, :], X[1, :], c=Y[0, :])
    plt.show()

train_X = data['X'].T
train_Y = data['y'].T
test_X = data['Xval'].T
test_Y = data['yval'].T
if is_plot:
plt.scatter(train_X[0, :], train_X[1, :], c=train_Y[0,:], s=40, cmap=plt.cm.Spectral);
return train_X, train_Y, test_X, test_Y


©️2019 CSDN 皮肤主题: 1024 设计师: 上身试试