Table of Contents
Building a Deep Neural Network for Image Classification
Some notes
Doing the assignment
Import the common packages and define helper functions
import numpy as np
import h5py
import matplotlib.pyplot as plt
from dnn_app_utils_v2 import *
%matplotlib inline
%load_ext autoreload
%autoreload 2
np.random.seed(1)
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
def sigmoid(Z):
    # Sigmoid activation; Z is cached for the backward pass
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache

def relu(Z):
    # ReLU activation; Z is cached for the backward pass
    A = np.maximum(0, Z)
    cache = Z
    return A, cache

def relu_backward(dA, cache):
    # dZ = dA * g'(Z); the ReLU derivative is 1 where Z > 0 and 0 elsewhere
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ

def sigmoid_backward(dA, cache):
    # dZ = dA * s * (1 - s), where s = sigmoid(Z)
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    return dZ
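As a quick sanity check (my own toy example, not part of the original notebook), the four helpers can be exercised on a tiny array:
# Toy values, just to see the helpers behave as expected
Z = np.array([[1.0, -2.0, 0.0]])
A_sig, cache_sig = sigmoid(Z)               # values squashed into (0, 1)
A_rel, cache_rel = relu(Z)                  # negative entries clipped to 0
dA = np.ones_like(Z)
print(relu_backward(dA, cache_rel))         # [[1. 0. 0.]] -- gradient passes only where Z > 0
print(sigmoid_backward(dA, cache_sig))      # dA * s * (1 - s)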
Load the data
The dataset is the same one used in the earlier assignment (Andrew Ng Deep Learning L1W2 — implementing simple logistic regression), and loading and preprocessing it is essentially the same as before.
# Training and test sets
train_data_set = h5py.File('./train_catvnoncat.h5', "r")
test_data_set = h5py.File('./test_catvnoncat.h5', "r")

# An .h5 file behaves like a dictionary
for key in train_data_set.keys():
    print(key)
# list_classes -> the class names
# train_set_x  -> the cat images
# train_set_y  -> the labels

# Pull the images and labels out of the "dictionary"
train_data_org = train_data_set["train_set_x"][:]
train_label_org = train_data_set["train_set_y"][:]
test_data_org = test_data_set["test_set_x"][:]
test_label_org = test_data_set["test_set_y"][:]
classes = np.array(test_data_set["list_classes"][:])

# Inspect the class labels
for i in classes:
    print(i.decode("utf-8"))
# non-cat
# cat

# Look at an arbitrary image
plt.imshow(test_data_org[48])
# Reshape the data
m_train = train_data_org.shape[0]
num_px = train_data_org.shape[1]
m_test = test_data_org.shape[0]
train_data_flatten = train_data_org.reshape(m_train, -1).T
test_data_flatten = test_data_org.reshape(m_test, -1).T
print(train_data_flatten.shape, test_data_flatten.shape, num_px)
# (12288, 209) (12288, 50) 64

test_label = test_label_org[np.newaxis, :]
train_label = train_label_org[np.newaxis, :]
# np.newaxis adds a leading axis, turning the original (50,) into (1, 50).
# When a single row is pulled out of a 2-D array it comes back as a 1-D array;
# this is a handy way to restore the 2-D shape (reshape would also work).
print(test_label.shape, train_label.shape)
# (1, 50) (1, 209)

# Standardize the data
# The three RGB channels hold values in 0-255 while the labels are 0/1,
# so we normalize the pixels simply by dividing by 255.
train_x = train_data_flatten / 255
test_x = test_data_flatten / 255
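A quick optional check of my own that the scaling did what we expect:
print(train_x.shape, train_x.min(), train_x.max())   # pixel values should now lie in [0, 1]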
Build the functions needed to create the model
As noted in the last assignment, creating a model takes three steps:
1. Define the network architecture
2. Initialize the model parameters
3. Loop: forward propagation, cost, backward propagation, gradient-descent update
With last time's 2-layer network most of this could be written out by hand, but with more layers we need loops. Here a variable layer_dims stores n[l], the number of units in each layer, and each step then iterates over the layers.
Create and initialize the parameters of a two-layer network
Let's first look at a two-layer example; the initialization is the same as before. With more layers we would need layer_dims plus a loop, as implemented further below.
The model can be summarized as: INPUT -> LINEAR -> RELU -> LINEAR -> SIGMOID -> OUTPUT
Initialize the weight matrices randomly with np.random.randn(shape) * 0.01.
Initialize the biases to zero with np.zeros(shape).
This is only a two-layer network; initializing a deeper L-layer network is more involved, because there are many weight matrices and bias vectors to set up, one per layer (see the dimensions figure in my notes).
When we compute WX + b in Python, broadcasting takes care of adding b, for example (a small sketch is given below):
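Here is a minimal sketch of that broadcasting with made-up numbers (my own example, not from the original assignment):
# b has shape (n_h, 1); NumPy broadcasts it across the m columns of np.dot(W, X)
W = np.array([[1., 2., 3.], [4., 5., 6.]])    # shape (2, 3)
X = np.array([[1., 2.], [3., 4.], [5., 6.]])  # shape (3, 2)
b = np.array([[10.], [20.]])                  # shape (2, 1)
print(np.dot(W, X) + b)   # b is added to every column -> [[32. 38.] [69. 84.]]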
def initialize_parameters(n_x, n_h, n_y):
    np.random.seed(1)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    parameters = {
        "W1": W1,
        "b1": b1,
        "W2": W2,
        "b2": b2
    }
    return parameters
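A quick shape check of this initializer (my own example values):
params = initialize_parameters(3, 2, 1)
print(params["W1"].shape, params["b1"].shape)  # (2, 3) (2, 1)
print(params["W2"].shape, params["b2"].shape)  # (1, 2) (1, 1)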
Implement the initialization function for an L-layer network
The model can be summarized as: [LINEAR -> RELU] × (L-1) -> LINEAR -> SIGMOID
## For a one-layer network (L == 1) the initialization would be:
if L == 1:
    parameters["W" + str(L)] = np.random.randn(layer_dims[1], layer_dims[0]) * 0.01
    parameters["b" + str(L)] = np.zeros((layer_dims[1], 1))
The variable layer_dims stores n[l], the number of units in each layer.
For example, the "2-D data classification" model from the previous exercise had layer_dims = [2, 4, 1]: two inputs, one hidden layer with 4 units, and an output layer with 1 unit. So W1 has shape (4, 2), b1 has shape (4, 1), W2 has shape (1, 4), and b2 has shape (1, 1).
def initialize_parameters_deep(layer_dims):
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)  # number of layers including the input; with 3 entries, layer 0 is the input, so only layers 1 and 2 get parameters
    for l in range(1, L):
        # note: W is scaled by 1/sqrt(layer_dims[l-1]) rather than 0.01 here (see the closing note at the end of this post)
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
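Checking the dimensions against the [2, 4, 1] example above (my own quick test):
params = initialize_parameters_deep([2, 4, 1])
print(params["W1"].shape, params["b1"].shape)  # (4, 2) (4, 1)
print(params["W2"].shape, params["b2"].shape)  # (1, 4) (1, 1)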
Implement forward propagation
The linear part of forward propagation
Z[l] = W[l] · A[l-1] + b[l]
where A[0] = X.
def linear_forward(A_pre, W, b):
    # The Heywhale (和鲸) community code calls this argument A, but I find A_pre easier to follow here
    Z = np.dot(W, A_pre) + b
    cache = (A_pre, W, b)
    return Z, cache
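A small shape check for linear_forward with made-up dimensions (my own example):
A_demo = np.random.randn(3, 5)   # 3 units in the previous layer, 5 examples
W_demo = np.random.randn(2, 3)   # 2 units in the current layer
b_demo = np.zeros((2, 1))
Z_demo, cache_demo = linear_forward(A_demo, W_demo, b_demo)
print(Z_demo.shape)              # (2, 5): one row per current-layer unit, one column per example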
The activation part of forward propagation
A, activation_cache = sigmoid(Z)
A, activation_cache = relu(Z)
This wraps the linear function above to obtain Z; for layer l the return value is (A[l], ((A[l-1], W[l], b[l]), Z[l])).
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    cache = (linear_cache, activation_cache)
    return A, cache
Forward propagation for the L-layer model
For the full L-layer forward pass, the W and b returned by the initialization function are fed into the two forward functions implemented above, layer by layer.
def L_model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2  # each layer has a W and a b, so // 2 gives the number of layers
    # all hidden layers use ReLU
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation="relu")
        caches.append(cache)
    # the final (output) layer uses sigmoid
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation="sigmoid")
    caches.append(cache)
    # each layer's cache holds ((A_prev, W, b), Z), which the backward pass will need
    return AL, caches
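A quick end-to-end shape check of the forward pass, using my own assumed layers_dims of [5, 4, 3, 1]:
params = initialize_parameters_deep([5, 4, 3, 1])
X_demo = np.random.randn(5, 10)        # 5 features, 10 examples
AL_demo, caches_demo = L_model_forward(X_demo, params)
print(AL_demo.shape, len(caches_demo)) # (1, 10) and 3 caches, one per parameter layer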
Cost function
Compute the cross-entropy cost J from the AL returned by the L-layer forward pass and the labels Y.
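For reference, the cost that compute_cost implements is
J = -(1/m) · Σ_{i=1..m} [ y(i) · log(AL(i)) + (1 - y(i)) · log(1 - AL(i)) ]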
def compute_cost(AL, Y):
    m = Y.shape[1]
    cost = -1 / m * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL), axis=1, keepdims=True)
    cost = np.squeeze(cost)  # drop the length-1 dimensions so cost is a scalar
    return cost
Backward propagation
Derivatives of the linear part
Assuming we already have dZ[l] = dJ/dZ[l] for a layer, the formulas needed to compute dW[l], db[l] and dA[l-1] are shown in the notes figure and written out below:
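dW[l] = (1/m) · dZ[l] · A[l-1].T
db[l] = (1/m) · Σ_i dZ[l](i)   (sum across the m examples)
dA[l-1] = W[l].T · dZ[l]
These correspond one-to-one to the three lines inside linear_backward.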
# From the dZ returned by the activation backward step we compute dW, db and dA_prev;
# dA_prev then serves as the dA input when differentiating the previous layer's activation.
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db
Derivatives of the activation part
dZ[l] = dA[l] * g'(Z[l])
def linear_activation_backward(dA, cache, activation):
    # linear_cache is (A_prev, W, b); activation_cache is Z
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db
Backward propagation for the L-layer model
To start the backward pass we need dA for the output layer, which for the cross-entropy cost is
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
dAL is then fed into linear_activation_backward, and the gradients flow backwards layer by layer.
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)   # number of parameter layers; with 5 entries in layer_dims there are only 4 caches (layers 1-4)
    m = AL.shape[1]   # the m in AL's shape (n_L, m)
    Y = Y.reshape(AL.shape)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))  # dA of the last layer, input to the activation backward step
    # the last layer uses sigmoid, so handle it separately
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # the remaining layers all use ReLU
    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
Update the parameters with gradient descent
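For every layer l the update is the usual gradient-descent step, with α the learning_rate passed in:
W[l] = W[l] - α · dW[l]
b[l] = b[l] - α · db[l]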
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2  # each layer has a W and a b, so // 2 gives the number of layers
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l + 1)]
    return parameters
Other helpers
def predict(X, y, parameters):
    m = X.shape[1]
    n = len(parameters) // 2
    p = np.zeros((1, m))
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0
    print("Accuracy: " + str(np.sum((p == y) / m)))
    return p
def print_mislabeled_images(classes, X, y, p):
    # p + y equals 1 exactly where the prediction and the true label disagree
    a = p + y
    mislabeled_indices = np.asarray(np.where(a == 1))
    plt.rcParams['figure.figsize'] = (40.0, 40.0)  # set default size of plots
    num_images = len(mislabeled_indices[0])
    for i in range(num_images):
        index = mislabeled_indices[1][i]
        plt.subplot(2, num_images, i + 1)
        plt.imshow(X[:, index].reshape(64, 64, 3), interpolation='nearest')
        plt.axis('off')
        plt.title("Prediction: " + classes[int(p[0, index])].decode("utf-8") + " \n Class: " + classes[y[0, index]].decode("utf-8"))
Implement the two-layer neural network
# Roughly, model() needs the following building blocks:
def initialize_parameters(n_x, n_h, n_y):
    ...
    return parameters
def linear_activation_forward(A_prev, W, b, activation):
    ...
    return A, cache
def compute_cost(AL, Y):
    ...
    return cost
def linear_activation_backward(dA, cache, activation):
    ...
    return dA_prev, dW, db
def update_parameters(parameters, grads, learning_rate):
    ...
    return parameters

n_x = 12288  # num_px * num_px * 3
n_h = 7
n_y = 1
layers_dims = (n_x, n_h, n_y)
def two_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    grads = {}
    costs = []  # keep track of the cost
    m = X.shape[1]  # number of examples
    (n_x, n_h, n_y) = layers_dims
    parameters = initialize_parameters(n_x, n_h, n_y)
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    for i in range(0, num_iterations):
        # forward propagation
        A1, cache1 = linear_activation_forward(X, W1, b1, activation="relu")
        A2, cache2 = linear_activation_forward(A1, W2, b2, activation="sigmoid")
        cost = compute_cost(A2, Y)
        # initial dAL fed into the backward pass
        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        # backward propagation to get the gradients
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, activation="sigmoid")
        dA0, dW1, db1 = linear_activation_backward(dA1, cache1, activation="relu")
        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2
        # gradient-descent update
        parameters = update_parameters(parameters, grads, learning_rate)
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        # Print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if print_cost and i % 100 == 0:
            costs.append(cost)
    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters
Train the model
parameters = two_layer_model(train_x, train_label, layers_dims=(n_x, n_h, n_y), num_iterations=2500, print_cost=True)
Accuracy on the training and test sets
predictions_train = predict(train_x, train_label, parameters)
# Accuracy: 0.9999999999999998
predictions_test = predict(test_x, test_label, parameters)
# Accuracy: 0.72
Implement the L-layer neural network
[LINEAR -> RELU] × (L-1) -> LINEAR -> SIGMOID
def initialize_parameters_deep(layer_dims):
    ...
    return parameters
def L_model_forward(X, parameters):
    ...
    return AL, caches
def compute_cost(AL, Y):
    ...
    return cost
def L_model_backward(AL, Y, caches):
    ...
    return grads
def update_parameters(parameters, grads, learning_rate):
    ...
    return parameters

layers_dims = [12288, 20, 7, 5, 1]
def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):  # lr was 0.009
    np.random.seed(1)
    costs = []
    parameters = initialize_parameters_deep(layers_dims)
    for i in range(0, num_iterations):
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)
        # print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)
    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    return parameters
Train the model
parameters = L_layer_model(train_x, train_label, layers_dims, num_iterations=2500, print_cost=True)
Accuracy on the training and test sets
pred_train = predict(train_x, train_label, parameters)
# Accuracy: 0.9856459330143539
pred_test = predict(test_x, test_label, parameters)
# Accuracy: 0.8
Look at the misclassified images
print_mislabeled_images(classes, test_x, test_label, pred_test)
That's roughly it for this assignment.
This write-up merges two versions of the assignment I found online, and a few of the small functions I wrote myself following the same idea. One bug took me a long time to find: the helper used in the second assignment initializes W differently from the first one (dividing by the square root of the previous layer's size rather than multiplying by 0.01), and with the wrong initialization the test accuracy was only about 40%, which left me sad for quite a while, haha.