Build a deep neural network with an arbitrary number of layers
- Use non-linear units such as ReLU to improve the model
- Build a deeper neural network (with more than one hidden layer)
- Implement an easy-to-use neural network class
import numpy as np
import matplotlib.pyplot as plt
import h5py
from testCases_v2 import *  # provides test cases used to check that the functions are correct
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward  # provides the activation helpers needed in this notebook

plt.rcParams['figure.figsize'] = (5.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

np.random.seed(1)  # keep all calls to random functions consistent
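# dnn_utils_v2 itself is not shown in this notebook. The sketch below is an assumed, minimal
# re-implementation of the four imported helpers, following the usual convention that each forward
# helper returns (A, cache) with cache = Z and each backward helper turns dA into dZ; the real
# file may differ in details.
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    return A, Z                      # cache Z for the backward pass

def relu(Z):
    A = np.maximum(0, Z)
    return A, Z

def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    return dA * s * (1 - s)          # dZ = dA * sigma'(Z)

def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0                   # the gradient passes through only where Z > 0
    return dZ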
# Initialization
# Write two helper functions to initialize the model's parameters.
# The first initializes the parameters of a two-layer model.
# The second generalizes the initialization to an L-layer model.

# 2-layer model
# The model's structure is: LINEAR -> RELU -> LINEAR -> SIGMOID.
# Initialize the weight matrices randomly. Make sure the dimensions are exact; use np.random.randn(shape) * 0.01.
# Initialize the biases to zero, using np.zeros(shape).
def initialize_parameters(n_x, n_y, n_h):
    # n_x -- size of the input layer
    # n_h -- size of the hidden layer
    # n_y -- size of the output layer
    # (note: this helper takes its arguments in the order (n_x, n_y, n_h), which is the order used by the calls below)
    np.random.seed(1)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))

    parameters = {
        "W1": W1,
        "b1": b1,
        "W2": W2,
        "b2": b2
    }
    return parameters

parameters = initialize_parameters(2, 1, 2)
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

# We store n_l, the number of units in each layer, in a variable layer_dims. For example, last week's
# "planar data classification" model had layer_dims = [2, 4, 1]: two inputs, one hidden layer with 4 hidden
# units, and an output layer with 1 output unit (think of these as layer 0, layer 1, layer 2). Although
# len(layer_dims) is 3, the number of layers is L = 2.
# So W1 has shape (4, 2), b1 has shape (4, 1), W2 has shape (1, 4), and b2 has shape (1, 1).
# Now apply the same idea to L layers!
def initialize_parameters_deep(layer_dims):
    np.random.seed(3)
    parameters = {}
    for i in range(1, len(layer_dims)):
        # note: the indices are [i] and [i-1]; using [i+1] and [i] would run past the end of layer_dims
        parameters["W" + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1]) * 0.01
        parameters["b" + str(i)] = np.zeros((layer_dims[i], 1))

        assert (parameters['W' + str(i)].shape == (layer_dims[i], layer_dims[i - 1]))
        assert (parameters['b' + str(i)].shape == (layer_dims[i], 1))
    return parameters

parameters = initialize_parameters_deep([5, 4, 3])
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

# Forward propagation module
# 1. Linear forward
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    assert (Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    return Z, cache

A, W, b = linear_forward_test_case()
Z, linear_cache = linear_forward(A, W, b)
# print("Z = " + str(Z))

# 2. Linear-activation forward
# We implement a function that performs the LINEAR forward step followed by the ACTIVATION forward step,
# i.e. the two functions (linear and activation) are combined into one (LINEAR -> ACTIVATION).
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return A, cache

A_prev, W, b = linear_activation_forward_test_case()
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation="sigmoid")
# print("With sigmoid: A = " + str(A))
# print(linear_activation_cache[0])
# print(linear_activation_cache[1])
# A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation="relu")
# print("With ReLU: A = " + str(A))

# 3. L-layer model
# We need a function that replicates the previous one (linear_activation_forward with RELU) L-1 times,
# followed by one linear_activation_forward with SIGMOID.
# Use the functions you wrote previously.
# Use a for loop to replicate [LINEAR -> RELU] (L-1) times.
# Don't forget to keep track of the caches in the "caches" list.
def L_model_forward(X, parameters):
    caches = []
    L = len(parameters) // 2
    A = X
    for l in range(1, L):
        A_prev = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        A, linear_activation_cache = linear_activation_forward(A_prev, W, b, "relu")
        caches.append(linear_activation_cache)

    WL = parameters["W" + str(L)]
    bL = parameters["b" + str(L)]
    Y_hat, cache = linear_activation_forward(A, WL, bL, "sigmoid")
    caches.append(cache)

    assert (Y_hat.shape == (1, X.shape[1]))
    return Y_hat, caches

X, parameters = L_model_forward_test_case()
Y_hat, caches = L_model_forward(X, parameters)
# print("Y_hat = " + str(Y_hat))
# print("Length of caches list = " + str(len(caches)))
# L_model_forward takes the input X and outputs a row vector AL (here called Y_hat) containing the
# predictions. It also records all the intermediate values in "caches", which are needed later to
# compute the gradients of the loss.

# Cost function
def compute_cost(Y_hat, Y):
    m = Y.shape[1]
    cost = -1 / m * np.sum((Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)), axis=1, keepdims=True)
    cost = np.squeeze(cost)
    return cost

Y, Y_hat = compute_cost_test_case()
# print("cost = " + str(compute_cost(Y_hat, Y)))
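# For reference, compute_cost implements the cross-entropy cost
#     J = -(1/m) * sum_i [ y(i) * log(y_hat(i)) + (1 - y(i)) * log(1 - y_hat(i)) ]
# A quick, hand-checkable sanity check (illustrative values, not part of the original notebook):
# predicting 0.5 for every example gives a cost of log(2) ~= 0.693.
# Y_demo = np.array([[1, 0, 1]])
# print(compute_cost(np.array([[0.5, 0.5, 0.5]]), Y_demo))     # ~0.6931 = log(2)
# print(compute_cost(np.array([[0.99, 0.01, 0.99]]), Y_demo))  # ~0.0101, nearly perfect predictions

# The backward propagation module below implements the standard gradients for layer l:
#     dW[l]   = (1/m) * dZ[l] . A[l-1].T
#     db[l]   = (1/m) * (sum of dZ[l] over the m examples)
#     dA[l-1] = W[l].T . dZ[l]
# starting from dAL = -(Y / AL - (1 - Y) / (1 - AL)) at the output layer.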
# Backward propagation module
# 1. Linear backward
def linear_backward(dZ, cache):
    # Arguments:
    # dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    # cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
    A_prev, W, b = cache
    m = dZ.shape[1]
    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    return dA_prev, dW, db

dZ, linear_cache = linear_backward_test_case()
dA_prev, dW, db = linear_backward(dZ, linear_cache)
# print("dA_prev = " + str(dA_prev))
# print("dW = " + str(dW))
# print("db = " + str(db))

# 2. Linear-activation backward
# Create a function that merges the two helper steps: linear_backward and the backward step of the
# activation, giving linear_activation_backward.
def linear_activation_backward(dA, cache, activation):
    # Arguments:
    # dA -- post-activation gradient for current layer l
    # cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    # activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    # Returns:
    # dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    linear_cache, activation_cache = cache
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

AL, linear_activation_cache = linear_activation_backward_test_case()
dA_prev, dW, db = linear_activation_backward(AL, linear_activation_cache, activation="sigmoid")
# print("sigmoid:")
# print("dA_prev = " + str(dA_prev))
# print("dW = " + str(dW))
# print("db = " + str(db) + "\n")
dA_prev, dW, db = linear_activation_backward(AL, linear_activation_cache, activation="relu")
# print("relu:")
# print("dA_prev = " + str(dA_prev))
# print("dW = " + str(dW))
# print("db = " + str(db))

# 3. L-layer model backward
def L_model_backward(Y_hat, Y, caches):
    grads = {}
    L = len(caches)
    m = Y_hat.shape[1]
    Y = Y.reshape(Y_hat.shape)  # after this line, Y is the same shape as Y_hat
    dAL = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))  # derivative of the cost with respect to Y_hat (AL)

    # Output layer (SIGMOID -> LINEAR). Note the index shift used throughout this function:
    # grads["dA" + str(l + 1)] holds the gradient with respect to A[l], the activation of the previous layer.
    current_cache = caches[L - 1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "sigmoid")

    for l in reversed(range(L - 1)):
        # Inputs: grads["dA" + str(l + 2)], caches.
        # Outputs: grads["dA" + str(l + 1)], grads["dW" + str(l + 1)], grads["db" + str(l + 1)].
        current_cache = caches[l]  # l runs from L-2 down to 0
        # on the first pass (l = L-2), "dA" + str(l + 2) is "dA" + str(L), which was stored by the
        # sigmoid step above (it is the gradient with respect to A[L-1], not dAL itself)
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, "relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads

AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)
# print("dW1 = " + str(grads["dW1"]))
# print("db1 = " + str(grads["db1"]))
# print("dA1 = " + str(grads["dA1"]))

# Update parameters
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    return parameters

parameters, grads = update_parameters_test_case()
parameters = update_parameters(parameters, grads, 0.1)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

# def two_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
#     costs = []
#     grads = {}
#     n_x, n_h, n_y = layers_dims
#     parameters = initialize_parameters(n_x, n_y, n_h)
#     W1 = parameters["W1"]
#     b1 = parameters["b1"]
#     W2 = parameters["W2"]
#     b2 = parameters["b2"]
#     for i in range(num_iterations):
#         A, cache1 = linear_activation_forward(X, W1, b1, "relu")
#         Y_hat, cache2 = linear_activation_forward(A, W2, b2, "sigmoid")
#         cost = compute_cost(Y_hat, Y)
#
#         # Backward propagation. Inputs: "dA2, cache2, cache1". Outputs: "dA1, dW2, db2; also dA0 (not used), dW1, db1".
#         dA2 = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))
#         dA1, dW2, db2 = linear_activation_backward(dA2, cache2, "sigmoid")
#         dA0, dW1, db1 = linear_activation_backward(dA1, cache1, "relu")
#
#         grads['dW1'] = dW1
#         grads['db1'] = db1
#         grads['dW2'] = dW2
#         grads['db2'] = db2
#
#         parameters = update_parameters(parameters, grads, learning_rate)
#
#         W1 = parameters["W1"]
#         b1 = parameters["b1"]
#         W2 = parameters["W2"]
#         b2 = parameters["b2"]
#
#         if print_cost and i % 100 == 0:
#             print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
#         if print_cost and i % 100 == 0:
#             costs.append(cost)
#
#     plt.plot(np.squeeze(costs))
#     plt.ylabel('cost')
#     plt.xlabel('iterations (per hundreds)')
#     plt.title("Learning rate =" + str(learning_rate))
#     plt.show()
#
#     return parameters
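# The two-layer model above is kept commented out; the same training loop generalizes directly to
# L layers using the helpers defined in this notebook. The sketch below is illustrative (the name
# L_layer_model and the default hyperparameters are assumptions, not part of the original notebook).
def L_layer_model(X, Y, layer_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    costs = []
    parameters = initialize_parameters_deep(layer_dims)  # layer_dims = [n_x, n_h1, ..., n_y]
    for i in range(num_iterations):
        Y_hat, caches = L_model_forward(X, parameters)    # [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID
        cost = compute_cost(Y_hat, Y)
        grads = L_model_backward(Y_hat, Y, caches)        # gradients for every layer
        parameters = update_parameters(parameters, grads, learning_rate)
        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            costs.append(cost)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters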