1. Save the following code as base_utils.py; it provides the helper functions for parameter initialization, the L2 penalty, and the dropout method.
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy.io
import sklearn.datasets
import sklearn.linear_model
#%matplotlib inline
def load_data_cat():
x_train_set = h5py.File('datasets/train_catvnoncat.h5','r')
x_train = np.array(x_train_set['train_set_x'][:])
y_train = np.array(x_train_set['train_set_y'][:])
x_test_set = h5py.File('datasets/test_catvnoncat.h5','r')
x_test = np.array(x_test_set['test_set_x'][:])
y_test = np.array(x_test_set['test_set_y'][:])
num_px = x_train.shape[1]
x_train = x_train.reshape(-1,num_px*num_px*3).T
x_test = x_test.reshape(-1,num_px*num_px*3).T
y_train = y_train.reshape(1,-1)
y_test = y_test.reshape(1,-1)
label_names = np.array(x_test_set['list_classes'][:])
x_train = x_train / 255.0
x_test = x_test / 255.0
return x_train,x_test,y_train,y_test,label_names
def load_2D_dataset():
data = scipy.io.loadmat('datasets/data.mat')
x_train = data['X'].T
y_train = data['y'].T
x_test = data['Xval'].T
y_test = data['yval'].T
plt.figure(1)
plt.scatter(x_train[0, :], x_train[1, :], c=y_train.ravel(), s=40, cmap=plt.cm.Spectral)
plt.show()
return x_train,x_test,y_train,y_test
def load_data():
from sklearn.datasets import make_circles
np.random.seed(1)
x_train,y_train = make_circles(n_samples=300, shuffle=True,noise = 0.05,factor=0.8)
np.random.seed(2)
x_test,y_test = make_circles(n_samples=100,shuffle=True,noise=0.05,factor=0.8)
#visualize data
plt.scatter(x_train[:,0],x_train[:,1],c=y_train,s=40,cmap = plt.cm.Spectral)
x_train = x_train.T
y_train = y_train.reshape(1,-1)
x_test = x_test.T
y_test = y_test.reshape(1,-1)
return x_train,x_test,y_train,y_test
def plot_decision_boundary(model,X,Y):
'''
Plot the decision boundary
'''
x_min,x_max = X[0,:].min()-1.0,X[0,:].max()+1.0
y_min,y_max = X[1,:].min()-1.0,X[1,:].max()+1.0
h = 0.01
xx,yy = np.meshgrid(np.arange(x_min,x_max,h),np.arange(y_min,y_max,h))
z = model(np.c_[xx.ravel(),yy.ravel()].T)
z = z.reshape(xx.shape)
plt.contourf(xx,yy,z,cmap=plt.cm.Spectral)
plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.xlim([-2.0,2.0])
plt.ylim([-2.0,2.0])
plt.scatter(X[0,:],X[1,:],c=Y.ravel(),cmap=plt.cm.Spectral)
plt.show()
def model_pred(X,parameters,activation_list):
'''
Run a forward pass with the learned parameters and return the output-layer predictions.
'''
n_layers = len(activation_list)
m_samples = X.shape[1]
caches = []
for i in range(n_layers):
cache_temp = {}
if (i == 0):
cache_temp['Z'] = X
cache_temp['A'] = X
caches.append(cache_temp)
else:
cache_temp['Z'] = np.random.randn(parameters[i]['b'].shape[0],m_samples)
cache_temp['A'] = np.random.randn(parameters[i]['b'].shape[0],m_samples)
caches.append(cache_temp)
caches = n_layers_forward(parameters,caches, activation_list)
predictions = caches[n_layers-1]['A'].reshape(-1,1)
return predictions
# image data (cat vs non-cat); uncomment to use it instead of the make_circles data
#x_train, x_test,y_train,y_test,label_names = load_data_cat()
# 2-D data generated by make_circles
x_train, x_test,y_train,y_test = load_data()
plt.show()
units_list = [x_train.shape[0],3,2,1]
activation_list = ['None','relu','relu','sigmoid']
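# activation_list[0] ('None') is only a placeholder for the input layer; it is never applied in the forward pass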
learning_rate = 0.01
def sigmoid_forward(Z):
'''
Sigmoid activation: A = 1.0/(1.0 + exp(-Z))
returns A
'''
A = 1.0/(1.0 + np.exp(-Z))
assert(Z.shape == A.shape)
return A
def sigmoid_backward(dA,Z):
'''
Inputs:
dA: the backprop derivations of A
Z: in forwardprop A = g(Z)
return:
dZ: the gradient of Z
'''
temp_A = sigmoid_forward(Z)
# dZ = dA*A(1-A)
dZ = np.multiply(dA,np.multiply(temp_A,(1.0-temp_A)))
assert(dA.shape == dZ.shape)
return dZ
def relu_forward(Z):
'''
ReLU activation: A = max(0, Z)
'''
A = np.maximum(0,Z)
assert(A.shape == Z.shape)
return A
def relu_backward(dA,Z):
'''
Backward pass of ReLU: dZ = dA where Z > 0, otherwise 0
'''
dZ = np.copy(dA)
dZ[Z<0] = 0.0
assert(dA.shape == dZ.shape)
return dZ
def init(X,units_list,initialization='random'):
'''
Initialize the parameters, caches, and gradients for every layer.
Inputs:
X: input features used to train the model
units_list: list whose length is the number of layers; each entry is the number of units in that layer (the input counts as layer 0)
Outputs:
parameters : W,b in every layers
caches: Z,A in every layers
gradients : dZ,dA,dW,db in every layers
'''
np.random.seed(1)
n_layers = len(units_list)
m_samples = X.shape[1]
parameters = []
caches = []
gradients = []
init_scaler = []
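# scale applied to the random initial weights: 0.01 for the layer-0 placeholder, 0.0 for 'zeros', sqrt(2/n_prev) for 'he', 1.0 otherwise ('random')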
for i in range(n_layers):
if i == 0:
init_scaler.append(0.01)
else:
if initialization == 'zeros':
init_scaler.append(0.0)
elif initialization == 'he':
init_scaler.append(np.sqrt(2.0/float(units_list[i-1]))) # He initialization: scale weights by sqrt(2/n_prev)
else :
init_scaler.append(1.0)
# init of matrix
for i in range(n_layers):
param_temp = {}
cache_temp = {}
grad_temp = {}
if (i==0):
print('per-layer weight initialization scales:',init_scaler)
param_temp['W'] = np.random.randn(units_list[i],units_list[i])*init_scaler[i] # will not used
param_temp['b'] = np.random.randn(units_list[i],1)*init_scaler[i] # will not used
cache_temp['Z'] = X # will not be used
cache_temp['A'] = X #!!!!!! trainning values important
grad_temp['dW'] = np.random.randn(units_list[i],units_list[i]) # will not used
grad_temp['db'] = np.random.randn(units_list[i]) # will not used
grad_temp['dA'] = np.random.randn(X.shape[0],X.shape[1]) # will not used
grad_temp['dZ'] = np.random.randn(X.shape[0],X.shape[1]) # will not used
parameters.append(param_temp)
caches.append(cache_temp)
gradients.append(grad_temp)
else:
param_temp['W'] = np.random.randn(units_list[i],units_list[i-1])*init_scaler[i]
param_temp['b'] = np.random.randn(units_list[i],1)*init_scaler[i]
cache_temp['Z'] = np.random.randn(units_list[i],m_samples)
cache_temp['A'] = np.random.randn(units_list[i],m_samples)
grad_temp['dW'] = np.random.randn(units_list[i],units_list[i-1])
grad_temp['db'] = np.random.randn(units_list[i],1)
grad_temp['dA'] = np.random.randn(units_list[i],m_samples)
grad_temp['dZ'] = np.random.randn(units_list[i],m_samples)
parameters.append(param_temp)
caches.append(cache_temp)
gradients.append(grad_temp)
return parameters, caches, gradients
def init_drop(X,units_list,drop_list,initialization='random'):
'''
Initialize parameters, caches, and gradients as in init(), plus a dropout keep-mask D for each layer.
Inputs:
X: input features used to train the model
units_list: list whose length is the number of layers; each entry is the number of units in that layer (the input counts as layer 0)
drop_list: list of (layer_index, keep_probability) tuples for the layers that use dropout
Outputs:
parameters : W,b in every layers
caches: Z,A in every layers
gradients : dZ,dA,dW,db in every layers
'''
drop_index = [drop_list[i][0] for i in range(len(drop_list))]
drop_prop = [drop_list[i][1] for i in range(len(drop_list))]
np.random.seed(1)
n_layers = len(units_list)
m_samples = X.shape[1]
parameters = []
caches = []
gradients = []
init_scaler = []
for i in range(n_layers):
if i == 0:
init_scaler.append(0.01)
else:
if initialization == 'zeros':
init_scaler.append(0.0)
elif initialization == 'he':
init_scaler.append(np.sqrt(2.0/float(units_list[i-1]))) # He initialization: scale weights by sqrt(2/n_prev)
else :
init_scaler.append(1.0)
init_scaler = np.array(init_scaler)
# init of matrix
for i in range(n_layers):
param_temp = {}
cache_temp = {}
grad_temp = {}
if (i==0):
print('per-layer weight initialization scales:',init_scaler)
param_temp['W'] = np.random.randn(units_list[i],units_list[i])*init_scaler[i] # will not used
param_temp['b'] = np.random.randn(units_list[i],1)*init_scaler[i] # will not used
cache_temp['Z'] = X # will not be used
cache_temp['A'] = X #!!!!!! trainning values important
cache_temp['D'] = np.random.rand(X.shape[0],X.shape[1])
grad_temp['dW'] = np.random.randn(units_list[i],units_list[i]) # will not used
grad_temp['db'] = np.random.randn(units_list[i]) # will not used
grad_temp['dA'] = np.random.randn(X.shape[0],X.shape[1]) # will not used
grad_temp['dZ'] = np.random.randn(X.shape[0],X.shape[1]) # will not used
parameters.append(param_temp)
caches.append(cache_temp)
gradients.append(grad_temp)
else:
param_temp['W'] = np.random.randn(units_list[i],units_list[i-1])*init_scaler[i]
param_temp['b'] = np.random.randn(units_list[i],1)*init_scaler[i]
cache_temp['Z'] = np.random.randn(units_list[i],m_samples)
cache_temp['A'] = np.random.randn(units_list[i],m_samples)
cache_temp['D'] = np.random.rand(units_list[i],m_samples)
if i in drop_index:
index = drop_index.index(i)
prop = drop_prop[index]
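# build the keep-mask: entries of D <= keep-probability prop become 1 (unit kept), the rest become 0 (unit dropped)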
cache_temp['D'][cache_temp['D']<=prop] = 1.0
cache_temp['D'][cache_temp['D']!=1.0] = 0.0
grad_temp['dW'] = np.random.randn(units_list[i],units_list[i-1])
grad_temp['db'] = np.random.randn(units_list[i],1)
grad_temp['dA'] = np.random.randn(units_list[i],m_samples)
grad_temp['dZ'] = np.random.randn(units_list[i],m_samples)
parameters.append(param_temp)
caches.append(cache_temp)
gradients.append(grad_temp)
return parameters, caches, gradients
def linear_forward(X,W,b):
'''
Compute the linear step Z = W·X + b
'''
Z = np.dot(W,X) + b
assert(Z.shape[0] == W.shape[0])
assert(Z.shape[1] == X.shape[1])
return Z
def linear_activation_forward(A_prev,W,b,activation='None'):
'''
Single-layer forward pass: Z = W·A_prev + b, A = g(Z).
Returns the cached values Z and A.
'''
Z = linear_forward(A_prev,W,b)
if(activation == 'relu'):
A = relu_forward(Z)
elif(activation == 'sigmoid'):
A = sigmoid_forward(Z)
else:
A = Z
print('wrong in activation function!!!')
assert(Z.shape == A.shape)
return Z,A
def n_layers_forward(parameters,caches,activation_list):
'''
Forward-propagate through all layers, filling caches[i]['Z'] and caches[i]['A'] from W, b, and the previous layer's A.
'''
n_layers = len(activation_list)
for i in range(1,n_layers):
A_prev = caches[i-1]['A']
W = parameters[i]['W']
b = parameters[i]['b']
activation = activation_list[i]
caches[i]['Z'], caches[i]['A'] = linear_activation_forward(A_prev,W,b,activation)
return caches
def n_layers_forward_drop(parameters,caches,activation_list,drop_list):
'''
Forward-propagate through all layers as in n_layers_forward, applying inverted dropout to the layers listed in drop_list.
'''
drop_index = [drop_list[i][0] for i in range(len(drop_list))]
drop_prop = [drop_list[i][1] for i in range(len(drop_list))]
n_layers = len(activation_list)
for i in range(1,n_layers):
A_prev = caches[i-1]['A']
W = parameters[i]['W']
b = parameters[i]['b']
activation = activation_list[i]
caches[i]['Z'], caches[i]['A'] = linear_activation_forward(A_prev,W,b,activation)
if i in drop_index:
index = drop_index.index(i)
prop = drop_prop[index]
# D = np.random.rand(caches[i]['A'].shape[0],caches[i]['A'].shape[1])
# D[D<=prop] = 1.0
# D[D !=1.0] = 0.0
D = caches[i]['D']
assert(D.shape == caches[i]['Z'].shape)
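# inverted dropout: zero out the dropped units and divide by the keep-probability so the expected activation is unchanged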
caches[i]['A'] = np.multiply(caches[i]['A'],D)/prop
#caches[i]['D'] = D
return caches
def linear_backward(dZ,Aprev):
'''
Backward pass of the linear step for a single layer.
Inputs:
dZ: gradients of loss to ith layers' Z
Aprev: cache values in (i-1) layers' matrix A
Outputs:
dW: gradients of loss to ith layers' W
db: gradients of loss to ith layers' b
'''
m_samples = dZ.shape[1]
dW = np.dot(dZ, Aprev.T)/float(m_samples)
db = np.sum(dZ,axis=1,keepdims=True)/float(m_samples)
return dW, db
def linear_activation_backward(Z,Aprev,Wplus,dZplus,activation):
'''
used to calc single layer's dZ,dA,dW,db
Inputs:
Z : matrix of i th layers
Aprev: matrix of previous layers
Wplus: parameters of W of i+1 th layers
dZplus: dz gradients of (i+1)th layers
activation: activation function
Outputs:
dA: dA gradients of i th layers
dZ: dZ gradients of i th layers
dW: dW gradients of i th layers
db: db gradients of i th layers
'''
dA = np.dot(Wplus.T,dZplus)
if (activation == 'sigmoid'):
dZ = sigmoid_backward(dA,Z)
elif(activation == 'relu'):
dZ = relu_backward(dA,Z)
else:
dZ = dA
print('Wrong in calc dz,da,dw,db')
dW,db = linear_backward(dZ,Aprev)
return dZ,dA,dW,db
def linear_activation_backward_drop(Z,Aprev,Wplus,dZplus,D,prop,activation):
'''
used to calc single layer's dZ,dA,dW,db
Inputs:
Z : matrix of i th layers
Aprev: matrix of previous layers
Wplus: parameters of W of i+1 th layers
dZplus: dz gradients of (i+1)th layers
activation: activation function
Outputs:
dA: dA gradients of i th layers
dZ: dZ gradients of i th layers
dW: dW gradients of i th layers
db: db gradients of i th layers
'''
dA = np.dot(Wplus.T,dZplus)
assert(dA.shape == D.shape)
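# apply the same keep-mask and 1/keep-probability scaling to dA as was applied to A in the forward pass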
dA = np.multiply(dA,D)/prop
if (activation == 'sigmoid'):
dZ = sigmoid_backward(dA,Z)
elif(activation == 'relu'):
dZ = relu_backward(dA,Z)
else:
dZ = dA
print('Wrong in calc dz,da,dw,db')
dW,db = linear_backward(dZ,Aprev)
return dZ,dA,dW,db
def n_layers_backward(Y,parameters,caches,gradients, activation_list):
'''
used to calc the n_layers gradients
Inputs:
parameters: w,b every layer model to learn
caches: Z,A every layers
gradients: used as inputs
activation_list: every layers activation_function
Outputs:
gradients: gradients of the cost with respect to dA, dZ, dW, db for every layer
'''
n_layers = len(activation_list)
for i in range(n_layers-1,0,-1):
activation = activation_list[i]
Z = caches[i]['Z']
A = caches[i]['A']
Aprev = caches[i-1]['A']
if (i == n_layers -1):
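# output layer: dJ/dA for binary cross-entropy is -y/a + (1-y)/(1-a)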
gradients[i]['dA'] = -np.divide(Y,A) + np.divide((1.0-Y),(1.0-A))
dA = gradients[i]['dA']
gradients[i]['dZ'] = sigmoid_backward(dA,Z)
dZ = gradients[i]['dZ']
gradients[i]['dW'],gradients[i]['db'] = linear_backward(dZ,Aprev)
else:
Wplus = parameters[i+1]['W']
dZplus = gradients[i+1]['dZ']
gradients[i]['dZ'],gradients[i]['dA'],gradients[i]['dW'],gradients[i]['db'] = \
linear_activation_backward(Z,Aprev,Wplus,dZplus,activation)
return gradients
def n_layers_backward_drop_l2(Y,parameters,caches,gradients,activation_list,drop_list,lambd=0.01):
'''
used to calc the n_layers gradients
Inputs:
parameters: w,b every layer model to learn
caches: Z,A every layers
gradients: used as inputs
activation_list: every layers activation_function
Outputs:
gradients: gradients of the cost with respect to dA, dZ, dW, db for every layer
'''
drop_index = [drop_list[i][0] for i in range(len(drop_list))]
drop_prop = [drop_list[i][1] for i in range(len(drop_list))]
m_samples = Y.shape[1]
n_layers = len(activation_list)
for i in range(n_layers-1,0,-1):
activation = activation_list[i]
Z = caches[i]['Z']
A = caches[i]['A']
Aprev = caches[i-1]['A']
if (i == n_layers -1):
gradients[i]['dA'] = -np.divide(Y,A) + np.divide((1.0-Y),(1.0-A))
dA = gradients[i]['dA']
gradients[i]['dZ'] = sigmoid_backward(dA,Z)
dZ = gradients[i]['dZ']
gradients[i]['dW'],gradients[i]['db'] = linear_backward(dZ,Aprev)
gradients[i]['dW'] += lambd/float(m_samples)*parameters[i]['W'] # gradient of the (lambd/2m)*||W||^2 term; b is not regularized
else:
Wplus = parameters[i+1]['W']
dZplus = gradients[i+1]['dZ']
if (i in drop_index):
index = drop_index.index(i)
D = caches[i]['D']
prop = drop_prop[index]
gradients[i]['dZ'],gradients[i]['dA'],gradients[i]['dW'],gradients[i]['db'] = \
linear_activation_backward_drop(Z,Aprev,Wplus,dZplus,D,prop,activation)
gradients[i]['dW'] += lambd/float(m_samples)*parameters[i]['W'] # gradient of the (lambd/2m)*||W||^2 term; b is not regularized
return gradients
def n_layers_backward_l2(Y,parameters,caches,gradients,activation_list,lambd=0.01):
'''
used to calc the n_layers gradients
Inputs:
parameters: w,b every layer model to learn
caches: Z,A every layers
gradients: used as inputs
activation_list: every layers activation_function
Outputs:
gradients: gradients of the cost with respect to dA, dZ, dW, db for every layer
'''
m_samples = Y.shape[1]
n_layers = len(activation_list)
for i in range(n_layers-1,0,-1):
activation = activation_list[i]
Z = caches[i]['Z']
A = caches[i]['A']
Aprev = caches[i-1]['A']
if (i == n_layers -1):
gradients[i]['dA'] = -np.divide(Y,A) + np.divide((1.0-Y),(1.0-A))
dA = gradients[i]['dA']
gradients[i]['dZ'] = sigmoid_backward(dA,Z)
dZ = gradients[i]['dZ']
gradients[i]['dW'],gradients[i]['db'] = linear_backward(dZ,Aprev)
gradients[i]['dW'] += lambd/float(m_samples)*parameters[i]['W'] # gradient of the (lambd/2m)*||W||^2 term; b is not regularized
else:
Wplus = parameters[i+1]['W']
dZplus = gradients[i+1]['dZ']
gradients[i]['dZ'],gradients[i]['dA'],gradients[i]['dW'],gradients[i]['db'] = \
linear_activation_backward(Z,Aprev,Wplus,dZplus,activation)
gradients[i]['dW'] += lambd/float(m_samples)*parameters[i]['W'] # gradient of the (lambd/2m)*||W||^2 term; b is not regularized
return gradients
def update_parameters(parameters,gradients,learning_rate):
'''
function used to update parameters w,b
Inputs:
parameters,gradients,learning_rate
Outputs:
parameters: updated parameters
'''
n_layers = len(parameters)
#print('shape of learning_rate',learning_rate)
for i in range(1,n_layers):
assert(parameters[i]['W'].shape == gradients[i]['dW'].shape)
assert(parameters[i]['b'].shape == gradients[i]['db'].shape)
parameters[i]['W'] += -learning_rate*gradients[i]['dW']
parameters[i]['b'] += -learning_rate*gradients[i]['db']
return parameters
def cost_function(AL,Y):
'''
function to calc cost values
Inputs: AL last layers' cache matrix A
Y: labeled samples targets
Outputs:
loss: total cost function values
'''
m_samples = Y.shape[1]
AL = AL.reshape(-1,1)
Y = Y.reshape(-1,1)
loss = np.dot(Y.T,np.log(AL)) + np.dot((1.0-Y).T,np.log(1.0-AL))
loss = -loss / float(m_samples)
loss = loss.reshape(-1,1)
loss = loss[0]
return loss
def cost_function_l2(AL,Y,parameters,lambd):
'''
function to calc cost values
Inputs: AL last layers' cache matrix A
Y: labeled samples targets
parameters: W in each layers with be used
lambd: coefficience of regulazition
Outputs:
loss: total cost function values
'''
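# J = -(1/m) * sum( y*log(a) + (1-y)*log(1-a) ) + (lambd/(2m)) * sum_l ||W_l||_F^2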
m_samples = Y.shape[1]
n_layers = len(parameters)
AL = AL.reshape(-1,1)
Y = Y.reshape(-1,1)
loss_norm = np.dot(Y.T,np.log(AL)) + np.dot((1.0-Y).T,np.log(1.0-AL))
loss_norm = -loss_norm / float(m_samples)
loss_norm = loss_norm.reshape(-1,1)
loss_norm = loss_norm[0]
loss_l2 = 0.0
for i in range(1,n_layers):
loss_l2 += np.sum(np.multiply(parameters[i]['W'],parameters[i]['W']))
loss_l2 = loss_l2 * lambd/2.0/float(m_samples)
loss = loss_norm + loss_l2
return loss
def predict(AL,Y):
'''
function use learned parameters to predict
Inputs:
AL: last layer cache matrix
Y: labeled datas
Outputs:
accuracy: the predict accuracy real number
'''
AL = AL.reshape(-1,1)
Y = Y.reshape(-1,1)
m_samples = Y.shape[0]
counts = 0.0
for i in range(m_samples):
if AL[i] >=0.5:
AL[i] = 1.0
else:
AL[i] = 0.0
accuracy = np.sum(AL == Y)/float(m_samples)
return accuracy
# def learning_process(X,Y,units_list,activation_list,learning_rate = 0.0075):
# '''
# function used to learn model
# Inputs:
# X: inputs data including features
# Y: labeled data
# learning_rate: learning_rate
# units_list : layers length and layers units number
# activation_list: activations in each layer
# Outputs:
# parameters: learned W b in all layers
# loss: total cost function in convergence
# '''
# n_layers = len(units_list)
# num_epoch = 30000
# loss_list = []
# accuracy_list = []
# accuracy_test = []
# steps = []
# #plt.ion()
# plt.figure(1)
# plt.figure(2)
# loss_temp = 0.0
# parameters, caches, gradients = init(X,units_list,initialization='he')
# for i in range(num_epoch):
# caches = n_layers_forward(parameters,caches,activation_list)
# loss = cost_function(caches[n_layers-1]['A'],Y)
# dloss = np.abs(loss-loss_temp)/(np.abs(loss)+1.0e-15)
# loss_temp = loss
# gradients = n_layers_backward(Y,parameters,caches,gradients,activation_list)
# parameters = update_parameters(parameters,gradients,learning_rate)
# if(i%200 == 0):
# steps.append(i)
# loss_list.append(loss)
# accuracy_list.append(predict(caches[n_layers-1]['A'],Y))
# test_predictions = model_pred(x_test,parameters,activation_list)
# accuracy_test.append(predict(test_predictions,y_test))
# print('The trainning steps is {0} total loss is: {1} residual is:{2}'.format(i,loss,dloss))
# print('The trainning accuracy is {0},test is:{1}'.format(accuracy_list[-1],accuracy_test[-1]))
# plt.figure(1)
# line1,=plt.plot(steps,loss_list,'r',linewidth=1.5)
# plt.xlabel('Trainning steps')
# plt.ylabel('Total loss values')
# plt.legend([line1],['total loss'],loc = 'best')
# plt.figure(2)
# line2, = plt.plot(steps,accuracy_list,'g',linewidth=1.5)
# line3, = plt.plot(steps,accuracy_test,'r',linewidth=1.5)
# plt.xlabel('Trainning steps')
# plt.ylabel('Trainning Accuracy')
# plt.legend([line2,line3],['Trainning Accuracy','Test Accuracy'],loc='best')
# #plt.pause(0.01)
# return parameters, loss
def grad_checking(Y,parameters,caches,gradients,activation_list):
'''
Check the analytic gradients from n_layers_backward against central-difference numerical gradients.
'''
caches = n_layers_forward(parameters,caches,activation_list)
gradients = n_layers_backward(Y,parameters,caches,gradients,activation_list)
# start gradient_appro calc
n_layers = len(activation_list)
print('number of layers in gradient checking:',n_layers)
m_samples = Y.shape[1]
epsol = 1.0e-7
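# central differences: each gradient is approximated by (J(theta+eps) - J(theta-eps)) / (2*eps)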
grad_appro = []
grad_diff = []
# this is 0 layer's grad_appro
grad_temp = {}
grad_temp['W'] = np.zeros([2,2])
grad_temp['b'] = np.zeros([2,1])
grad_appro.append(grad_temp)
grad_diff.append(grad_temp)
# from 1st layer calc
for k in range(1,n_layers):
nx,ny = parameters[k]['W'].shape[0],parameters[k]['W'].shape[1]
grad_temp = {}
diff_temp = {}
grad_temp['dW'] = np.zeros([nx,ny])
diff_temp['dW'] = np.zeros([nx,ny])
grad_temp['db'] = np.zeros([nx,1])
diff_temp['db'] = np.zeros([nx,1])
w_temp = parameters[k]['W']
b_temp = parameters[k]['b']
for i in range(nx):
parameters[k]['b'][i][0] += -1.0*epsol
caches_temp = n_layers_forward(parameters,caches,activation_list)
j_minus = cost_function(caches_temp[n_layers-1]['A'],Y)
parameters[k]['b'][i][0] += 2.0*epsol
caches_temp = n_layers_forward(parameters,caches,activation_list)
j_plus = cost_function(caches_temp[n_layers-1]['A'],Y)
grad_temp['db'][i][0] = (j_plus-j_minus)/2.0/epsol
parameters[k]['b'][i][0] += -1.0*epsol # undo the perturbation of b[i]
for j in range(ny):
parameters[k]['W'][i][j] += -1.0*epsol
caches_temp = n_layers_forward(parameters,caches,activation_list)
j_minus = cost_function(caches_temp[n_layers-1]['A'],Y)
parameters[k]['W'][i][j] += 2.0*epsol
caches_temp = n_layers_forward(parameters,caches,activation_list)
j_plus = cost_function(caches_temp[n_layers-1]['A'],Y)
grad_temp['dW'][i][j] = (j_plus - j_minus)/2.0/epsol
parameters[k]['W'][i][j] += -1.0*epsol # undo the perturbation of W[i][j]
grad_appro.append(grad_temp)
assert(grad_temp['dW'].shape == gradients[k]['dW'].shape)
assert(grad_temp['db'].shape == gradients[k]['db'].shape)
diff_temp['dW'] = np.abs(grad_appro[k]['dW']-gradients[k]['dW'])/(np.abs(grad_appro[k]['dW'])+np.abs(gradients[k]['dW']))
diff_temp['db'] = np.abs(grad_appro[k]['db']-gradients[k]['db'])/(np.abs(grad_appro[k]['db'])+np.abs(gradients[k]['db']))
grad_diff.append(diff_temp)
for k in range(1,n_layers):
print(len(grad_diff))
print('----- layer {0} outputs -----'.format(k))
print("layer {0} dW (relative difference / numerical / analytic):".format(k))
print(grad_diff[k]['dW'])
print(grad_appro[k]['dW'])
print(gradients[k]['dW'])
print("this is out of {0} th layers's db:".format(k))
print(grad_diff[k]['db'])
print(grad_appro[k]['db'])
print(gradients[k]['db'])
return grad_diff
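As a quick sanity check of the backprop code above, grad_checking can be run on a tiny network. The lines below are only a sketch (not part of the original file, and the layer sizes are illustrative); they could be appended to base_utils.py, reusing the make_circles data already loaded above:
# sketch: compare analytic and numerical gradients on a small one-hidden-layer network
check_units = [x_train.shape[0],2,1]
check_activations = ['None','relu','sigmoid']
check_params,check_caches,check_grads = init(x_train,check_units,initialization='he')
grad_diff = grad_checking(y_train,check_params,check_caches,check_grads,check_activations)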
2. Save the following code as initialize_l2_drop.py to run the corresponding experiments:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import h5py
import pdb
#matplotlib inline
from base_utils import sigmoid_forward,sigmoid_backward,relu_forward,relu_backward
from base_utils import linear_forward,linear_backward,linear_activation_forward,linear_activation_backward
from base_utils import n_layers_forward,n_layers_backward,cost_function,predict,model_pred, init
from base_utils import update_parameters,n_layers_backward_l2,cost_function_l2
from base_utils import load_2D_dataset,plot_decision_boundary,load_data_cat
from base_utils import init_drop,n_layers_forward_drop,n_layers_backward_drop_l2
from base_utils import grad_checking
learning_rate = 0.01
#x_train,x_test,y_train, y_test = load_2D_dataset()
x_train,x_test,y_train, y_test,label_names = load_data_cat()
units_list = [x_train.shape[0],15,5,1]
activation_list = ['None','relu','relu','sigmoid']
drop_list = [(1,0.86),(2,0.86)]
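# drop_list entries are (hidden-layer index, keep-probability) pairs: layers 1 and 2 keep about 86% of their units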
l2 = True
lambd = 0.0
initialization = 'he'
#print('first y_test',y_test)
def model(X,Y,units_list,activation_list,learning_rate=0.0075,l2 = False,lambd=0.0,initialization='random'):
'''
function used to learn model
Inputs:
X: inputs data including features
Y: labeled data
learning_rate: learning_rate
units_list : layers length and layers units number
activation_list: activations in each layer
Outputs:
parameters: learned W b in all layers
loss: total cost function in convergence
'''
epoch_num = 10000
accuracy = []
accuracy_test = []
loss_list = []
steps = []
#plt.ion()
plt.figure(2)
plt.figure(3)
loss_temp = 0.0
n_layers = len(units_list)
parameters,caches,gradients = init(X,units_list,initialization=initialization)
for i in range(epoch_num):
# #if (i == 3000):
# grad_diff = grad_checking(Y,parameters,caches,gradients,activation_list)
# pdb.set_trace()
caches = n_layers_forward(parameters,caches,activation_list)
if (not l2) :
loss = cost_function(caches[n_layers-1]['A'],Y)
gradients = n_layers_backward(Y,parameters,caches,gradients,activation_list)
else:
loss = cost_function_l2(caches[n_layers-1]['A'],Y,parameters,lambd)
gradients = n_layers_backward_l2(Y,parameters,caches,gradients,activation_list,lambd)
dloss = np.abs(loss - loss_temp)/(np.abs(loss)+1.0e-15)
loss_temp = loss
parameters = update_parameters(parameters,gradients,learning_rate)
if(i%200 == 0):
print('Step {0}: total loss is {1}, residual is {2}'.format(i,loss,dloss))
steps.append(i)
loss_list.append(loss)
accuracy.append(predict(caches[n_layers-1]['A'],Y))
predictions = model_pred(x_test,parameters,activation_list)
accuracy_test.append(predict(predictions,y_test))
print('Step {0}: training accuracy is {1}, test accuracy is {2}'.format(i,accuracy[-1],accuracy_test[-1]))
plt.figure(2)
line1, = plt.plot(steps,loss_list, 'g',linewidth=1.5)
plt.xlabel('Training steps')
plt.ylabel('Total Loss')
plt.title('Training loss vs steps (learning_rate: {0}, l2: {1})'.format(learning_rate,l2))
plt.legend([line1],['Training loss'],loc='best')
plt.figure(3)
line2, = plt.plot(steps,accuracy,'r',linewidth=1.5)
line3, = plt.plot(steps,accuracy_test,'b',linewidth=1.5)
plt.xlabel('Training steps')
plt.ylabel('Accuracy')
plt.title('Accuracy vs steps (learning_rate: {0}, l2: {1})'.format(learning_rate, l2))
plt.legend([line2,line3],['Training accuracy','Test accuracy'],loc='best')
#plt.pause(0.01)
return parameters, loss
def model_drop(X,Y,units_list,activation_list,drop_list,learning_rate=0.0075,l2 = False,lambd=0.0,initialization='random'):
'''
function used to learn model
Inputs:
X: inputs data including features
Y: labeled data
learning_rate: learning_rate
units_list : layers length and layers units number
activation_list: activations in each layer
Outputs:
parameters: learned W b in all layers
loss: total cost function in convergence
'''
epoch_num = 15000
accuracy = []
accuracy_test = []
loss_list = []
steps = []
#plt.ion()
plt.figure(2)
plt.figure(3)
loss_temp = 0.0
n_layers = len(units_list)
parameters,caches,gradients = init_drop(X,units_list,drop_list,initialization=initialization)
for i in range(epoch_num):
caches = n_layers_forward_drop(parameters,caches,activation_list,drop_list)
if (not l2) :
loss = cost_function(caches[n_layers-1]['A'],Y)
gradients = n_layers_backward(Y,parameters,caches,gradients,activation_list)
else:
loss = cost_function_l2(caches[n_layers-1]['A'],Y,parameters,lambd)
gradients = n_layers_backward_drop_l2(Y,parameters,caches,gradients,activation_list,drop_list,lambd)
dloss = np.abs(loss - loss_temp)/(np.abs(loss)+1.0e-15)
loss_temp = loss
parameters = update_parameters(parameters,gradients,learning_rate)
if(i%200 == 0):
print('Step {0}: total loss is {1}, residual is {2}'.format(i,loss,dloss))
steps.append(i)
loss_list.append(loss)
accuracy.append(predict(caches[n_layers-1]['A'],Y))
predictions = model_pred(x_test,parameters,activation_list)
accuracy_test.append(predict(predictions,y_test))
print('Step {0}: training accuracy is {1}, test accuracy is {2}'.format(i,accuracy[-1],accuracy_test[-1]))
plt.figure(2)
line1, = plt.plot(steps,loss_list, 'g',linewidth=1.5)
plt.xlabel('Training steps')
plt.ylabel('Total Loss')
plt.title('Training loss vs steps (learning_rate: {0}, l2: {1})'.format(learning_rate,l2))
plt.legend([line1],['Training loss'],loc='best')
plt.figure(3)
line2, = plt.plot(steps,accuracy,'r',linewidth=1.5)
line3, = plt.plot(steps,accuracy_test,'b',linewidth=1.5)
plt.xlabel('Training steps')
plt.ylabel('Accuracy')
plt.title('Accuracy vs steps (learning_rate: {0}, l2: {1})'.format(learning_rate, l2))
plt.legend([line2,line3],['Training accuracy','Test accuracy'],loc='best')
#plt.pause(0.01)
return parameters, loss
parameters, loss = model(x_train,y_train,units_list,activation_list,\
learning_rate=learning_rate,l2=l2,lambd=lambd,initialization=initialization)
plt.show()
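# note: the decision-boundary plot below only makes sense for the 2-D datasets (load_2D_dataset / load_data), not for the image data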
plt.figure(4)
plt.xlabel('X Feature 1')
plt.ylabel('X Feature 2')
plt.title('Training data and model decision boundary')
plot_decision_boundary(lambda x:model_pred(x,parameters,activation_list),x_train,y_train)
print('final loss is ',loss)
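model_drop is defined above but never called. To also exercise the dropout path, a call along the following lines (a sketch reusing the data, drop_list, and hyperparameters defined at the top of the script) could be appended:
# sketch: train the same network with inverted dropout on hidden layers 1 and 2
parameters_drop, loss_drop = model_drop(x_train,y_train,units_list,activation_list,drop_list,learning_rate=learning_rate,l2=l2,lambd=lambd,initialization=initialization)
plt.show()
print('final loss with dropout is ',loss_drop)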