# Notebook setup: numerical stack, HDF5 I/O, and matplotlib display defaults.
import numpy as np
import h5py
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# IPython magics: automatically reload edited modules before running a cell.
%load_ext autoreload
%autoreload 2
# Fix the RNG seed so the printed results recorded below are reproducible.
np.random.seed(1)
np.pad: 前面填几层,后面填几层,前面层的值为多少,后面层的值为多少,一个元组对应一个维度
def zero_pad(X, pad):
    """Pad the height and width axes of X with `pad` rings of zeros.

    X has shape (m, n_H, n_W, n_C); only axes 1 and 2 are padded, so the
    result has shape (m, n_H + 2*pad, n_W + 2*pad, n_C).
    """
    widths = ((0, 0), (pad, pad), (pad, pad), (0, 0))
    return np.pad(X, widths, 'constant', constant_values=(0, 0))
# Demo cell: exercise zero_pad on a small random batch and visualize one
# channel of the original versus the padded sample.  Kept byte-identical so
# the recorded outputs below still match (RNG order and stdout format).
np.random.seed(1)
x = np.random.randn(4, 3, 3, 2)
x_pad = zero_pad(x, 2)
print ("x.shape =", x.shape)
print ("x_pad.shape =", x_pad.shape)
print ("x[1,1] =", x[1,1])
print ("x_pad[1,1] =", x_pad[1,1])
# Side-by-side view of channel 0 of sample 0, before and after padding.
fig, axarr = plt.subplots(1, 2)
axarr[0].set_title('x')
axarr[0].imshow(x[0,:,:,0])
axarr[1].set_title('x_pad')
axarr[1].imshow(x_pad[0,:,:,0])
结果:
x.shape = (4, 3, 3, 2)
x_pad.shape = (4, 7, 7, 2)
x[1,1] = [[ 0.90085595 -0.68372786]
[-0.12289023 -0.93576943]
[-0.26788808 0.53035547]]
x_pad[1,1] = [[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]]
计算卷积后的值
def conv_single_step(a_slice_prev, W, b):
    """Apply one filter to a single slice of the previous layer's activations.

    Parameters
    ----------
    a_slice_prev : ndarray, shape (f, f, n_C_prev)
        One window of the (padded) input.
    W : ndarray, shape (f, f, n_C_prev)
        Weights of a single filter.
    b : ndarray, shape (1, 1, 1)
        Bias of that filter (a single scalar wrapped in an array).

    Returns
    -------
    float
        The scalar pre-activation value: sum(a_slice_prev * W) + b.
    """
    s = a_slice_prev * W
    # BUG FIX: the original computed np.sum(a_slice_prev*W + b), which adds
    # the bias once PER ELEMENT (f*f*n_C_prev times).  The convolution bias
    # must be added exactly once per window.
    Z = np.sum(s) + float(np.squeeze(b))
    return Z
# Demo cell: run conv_single_step on one random window/filter pair and print
# the scalar result.  Byte-identical to preserve the recorded output.
np.random.seed(1)
a_slice_prev = np.random.randn(4, 4, 3)
W = np.random.randn(4, 4, 3)
b = np.random.randn(1, 1, 1)
Z = conv_single_step(a_slice_prev, W, b)
print("Z =", Z)
结果:
Z = -23.16021220252078
卷积前向传播:定义stride,padding数,filter数,根据公式计算卷积线性输出结果大小Z,遍历每一个样本,针对每一个样本,定义corner坐标,选出slice,再遍历每一个filter卷积计算得到Z一个位置的值,遍历结束,Z值即为所求值
def conv_forward(A_prev, W, b, hparameters):
    """Forward pass of a convolution layer.

    Parameters
    ----------
    A_prev : ndarray, shape (m, n_H_prev, n_W_prev, n_C_prev)
        Activations of the previous layer.
    W : ndarray, shape (f, f, n_C_prev, n_C)
        n_C filters of size f x f x n_C_prev.
    b : ndarray, shape (1, 1, 1, n_C)
        One scalar bias per filter.
    hparameters : dict
        Must contain "stride" and "pad".

    Returns
    -------
    Z : ndarray, shape (m, n_H, n_W, n_C)
        Linear (pre-activation) convolution output.
    cache : tuple
        (A_prev, W, b, hparameters), saved for the backward pass.
    """
    m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape
    f, f, n_C_prev, n_C = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']
    # Output spatial size: floor((n_prev + 2*pad - f) / stride) + 1.
    n_H = int((n_H_prev + 2 * pad - f) / stride) + 1
    n_W = int((n_W_prev + 2 * pad - f) / stride) + 1
    Z = np.zeros((m, n_H, n_W, n_C))
    # Zero-pad height and width only (inlined here so the block stands alone).
    A_prev_pad = np.pad(A_prev, ((0, 0), (pad, pad), (pad, pad), (0, 0)),
                        'constant', constant_values=(0, 0))
    for i in range(m):
        a_prev_pad = A_prev_pad[i]
        # BUG FIX: the original iterated range(n_H_prev+3) x range(n_W_prev+3),
        # which only equals (n_H, n_W) for pad=2, f=2, stride=1, and it never
        # multiplied the window origin by the stride.
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                # Window is shared by all filters; hoisted out of the c loop.
                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                for c in range(n_C):
                    # BUG FIX: add the bias once per window (the original
                    # helper broadcast it over every element before summing).
                    Z[i, h, w, c] = np.sum(a_slice * W[:, :, :, c]) + float(b[0, 0, 0, c])
    assert Z.shape == (m, n_H, n_W, n_C)
    cache = (A_prev, W, b, hparameters)
    return Z, cache
# Demo cell: run conv_forward on a random batch (8 filters of 2x2x3, pad 2,
# stride 1) and print summary statistics.  Z and cache_conv are reused by the
# conv_backward demo further down, so names must not change.
np.random.seed(1)
A_prev = np.random.randn(10,4,4,3)
W = np.random.randn(2,2,3,8)
b = np.random.randn(1,1,1,8)
hparameters = {"pad" : 2,
"stride": 1}
Z, cache_conv = conv_forward(A_prev, W, b, hparameters)
print("Z's mean =", np.mean(Z))
print("cache_conv[0][1][2][3] =", cache_conv[0][1][2][3])
结果:
Z's mean = 0.15585932488906465
cache_conv[0][1][2][3] = [-0.20075807 0.18656139 0.41005165]
池化前向传播
def pool_forward(A_prev, hparameters, mode = "max"):
    """Forward pass of a pooling layer.

    Parameters
    ----------
    A_prev : ndarray, shape (m, n_H_prev, n_W_prev, n_C_prev)
        Input activations.
    hparameters : dict
        Must contain "f" (window size) and "stride".
    mode : str
        "max" for max pooling; any other value falls through to average
        pooling (kept for backward compatibility with existing callers).

    Returns
    -------
    A : ndarray, shape (m, n_H, n_W, n_C_prev)
        Pooled output.
    cache : tuple
        (A_prev, hparameters), saved for the backward pass.
    """
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    f = hparameters["f"]
    stride = hparameters["stride"]
    n_H = int(1 + (n_H_prev - f) / stride)
    n_W = int(1 + (n_W_prev - f) / stride)
    n_C = n_C_prev    # pooling is applied per channel; depth is unchanged
    A = np.zeros((m, n_H, n_W, n_C))
    for i in range(m):
        for h in range(n_H):
            # BUG FIX: window origin must advance by `stride` per output
            # position; the original used h/w directly (stride ignored).
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    a_slice = A_prev[i, vert_start:vert_end, horiz_start:horiz_end, c]
                    if mode == "max":
                        A[i, h, w, c] = np.max(a_slice)
                    else:
                        A[i, h, w, c] = np.mean(a_slice)
    cache = (A_prev, hparameters)
    assert A.shape == (m, n_H, n_W, n_C)
    return A, cache
# Demo cell: pool a random batch with a 4x4 window (the full spatial extent),
# in both max and average modes.  Byte-identical so the recorded outputs match.
np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
hparameters = {"stride" : 1, "f": 4}
A, cache = pool_forward(A_prev, hparameters)
print("mode = max")
print("A =", A)
print()
A, cache = pool_forward(A_prev, hparameters, mode = "average")
print("mode = average")
print("A =", A)
结果:
mode = max
A = [[[[1.74481176 1.6924546 2.10025514]]]
[[[1.19891788 1.51981682 2.18557541]]]]
mode = average
A = [[[[-0.09498456 0.11180064 -0.14263511]]]
[[[-0.09525108 0.28325018 0.33035185]]]]
卷积反向传播
cache保存着前向传播中的参数:包括上一层的输出值A_prev,当前卷积层的W,b,stride,pad参数,在前向传播中会得到Z,反向传播过程中,初始化参数均为0,其中,dA_prev和A_prev,W和dW,b和db的形状相同;
在求导公式中需要注意的是,遍历所有样本,
da_prev(需要知道W,dZ)需要分成卷积大小的矩阵(三维矩阵),其每一块的值是由每个卷积层相应的dZ累加(一个向量,几个卷积层,dZ深度就有几维可以这么理解,第一个卷积对应着dZ深度上第一层的值,如此对应)
dW(需要A_prev和dZ),将a_prev_pad分成卷积核大小的矩阵,与相应的dZ相乘(a_prev_pad * dZ,深度上1,2,3…累加)
db(需要知道dZ,相当于所有的卷积核对应的Z相加),卷积过程对A_prev用到了pad,求导数则也需dA_prev进行pad,再转化为dA_prev
def conv_backward(dZ, cache):
    """Backward pass of a convolution layer.

    Parameters
    ----------
    dZ : ndarray, shape (m, n_H, n_W, n_C)
        Gradient of the cost with respect to the conv layer's linear output.
    cache : tuple
        (A_prev, W, b, hparameters) as stored by conv_forward.

    Returns
    -------
    dA_prev : ndarray, shape of A_prev
        Gradient with respect to the layer input.
    dW : ndarray, shape of W
        Gradient with respect to the filters.
    db : ndarray, shape (1, 1, 1, n_C)
        Gradient with respect to the biases.
    """
    A_prev, W, b, hparameters = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    f, f, n_C_prev, n_C = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']
    m, n_H, n_W, n_C = dZ.shape
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))
    # Zero-pad height/width, mirroring the forward pass (inlined so the
    # block stands alone).
    pad_spec = ((0, 0), (pad, pad), (pad, pad), (0, 0))
    A_prev_pad = np.pad(A_prev, pad_spec, 'constant', constant_values=(0, 0))
    dA_prev_pad = np.pad(dA_prev, pad_spec, 'constant', constant_values=(0, 0))
    for i in range(m):
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]   # view: accumulates into dA_prev_pad[i]
        for h in range(n_H):
            # BUG FIX: window origin must advance by `stride`; the original
            # used h/w directly and was only correct for stride == 1.
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                for c in range(n_C):
                    # Each output gradient distributes back through the
                    # filter weights, and accumulates into dW and db.
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:, :, :, c] * dZ[i, h, w, c]
                    dW[:, :, :, c] += a_slice * dZ[i, h, w, c]
                    db[:, :, :, c] += dZ[i, h, w, c]
        # BUG FIX: with pad == 0 the original slice [0:-0] is empty; only
        # strip the border when there actually is one.
        if pad > 0:
            dA_prev[i] = da_prev_pad[pad:-pad, pad:-pad, :]
        else:
            dA_prev[i] = da_prev_pad
    assert dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev)
    return dA_prev, dW, db
# Demo cell: back-propagate through the conv layer using Z and cache_conv
# produced by the conv_forward demo above (dZ := Z here, as in the original
# exercise).  Byte-identical to preserve the recorded outputs.
np.random.seed(1)
dA, dW, db = conv_backward(Z, cache_conv)
print("dA_mean =", np.mean(dA))
print("dW_mean =", np.mean(dW))
print("db_mean =", np.mean(db))
结果:
dA_mean = 9.608990675868995
dW_mean = 10.581741275547566
db_mean = 76.37106919563735
创建掩码矩阵
def create_mask_from_window(x):
    """Return a boolean mask that is True wherever x attains its maximum.

    Ties all receive True, matching element-wise comparison against the peak.
    Used by max-pooling backprop to route gradient to the winning entries.
    """
    peak = np.max(x)
    return x == peak
# Demo cell: show the mask for a small random matrix.  Byte-identical to
# preserve the recorded outputs.
np.random.seed(1)
x = np.random.randn(2,3)
mask = create_mask_from_window(x)
print('x = ', x)
print("mask = ", mask)
结果:
x = [[ 1.62434536 -0.61175641 -0.52817175]
[-1.07296862 0.86540763 -2.3015387 ]]
mask = [[ True False False]
[False False False]]
def distribute_value(dz, shape):
    """Spread the scalar gradient dz uniformly over a matrix of `shape`.

    Used by average-pooling backprop: every cell of an (n_H, n_W) window
    contributed equally, so each receives dz / (n_H * n_W).
    """
    rows, cols = shape
    return np.full(shape, dz / (rows * cols))
结果:
distributed value = [[0.5 0.5]
[0.5 0.5]]
池化层反向传播
找到dA一个值对应的dA_prev区域,dA_prev区域取最大值或者最小值(矩阵),用dA*该区域,就是此区域的dA值,累加即可
def pool_backward(dA, cache, mode = "max"):
    """Backward pass of a pooling layer.

    Parameters
    ----------
    dA : ndarray, shape (m, n_H, n_W, n_C)
        Gradient of the cost with respect to the pooling output.
    cache : tuple
        (A_prev, hparameters) as stored by pool_forward.
    mode : str
        "max" routes each gradient to the max entry of its window; any other
        value spreads it uniformly (average pooling), matching pool_forward.

    Returns
    -------
    dA_prev : ndarray, same shape as A_prev
        Gradient with respect to the pooling input.
    """
    A_prev, hparameters = cache
    f = hparameters["f"]
    stride = hparameters["stride"]
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (m, n_H, n_W, n_C) = dA.shape
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
    for i in range(m):
        a_prev = A_prev[i]
        for h in range(n_H):
            # BUG FIX: window origin must advance by `stride`; the original
            # used h/w directly and was only correct for stride == 1.
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    if mode == 'max':
                        # Mask of the window's maxima (inlined helper so the
                        # block stands alone); gradient flows only to them.
                        window = a_prev[vert_start:vert_end, horiz_start:horiz_end, c]
                        mask = (window == np.max(window))
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += mask * dA[i, h, w, c]
                    else:
                        # Average pooling: each of the f*f cells contributed
                        # equally, so each gets an equal share of dA.
                        dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += dA[i, h, w, c] / (f * f)
    assert dA_prev.shape == A_prev.shape
    return dA_prev
# Demo cell: forward-pool a random batch, then back-propagate a random dA in
# both modes and inspect one sample's gradient.  Byte-identical so the RNG
# sequence and recorded outputs below still match.
np.random.seed(1)
A_prev = np.random.randn(5, 5, 3, 2)
hparameters = {"stride" : 1, "f": 2}
A, cache = pool_forward(A_prev, hparameters)
dA = np.random.randn(5, 4, 2, 2)
dA_prev = pool_backward(dA, cache, mode = "max")
print("mode = max")
print('mean of dA = ', np.mean(dA))
print('dA_prev[1,1] = ', dA_prev[1,1])
print()
dA_prev = pool_backward(dA, cache, mode = "average")
print("mode = average")
print('mean of dA = ', np.mean(dA))
print('dA_prev[1,1] = ', dA_prev[1,1])
结果:
mode = max
mean of dA = 0.14571390272918056
dA_prev[1,1] = [[ 0. 0. ]
[ 5.05844394 -1.68282702]
[ 0. 0. ]]
mode = average
mean of dA = 0.14571390272918056
dA_prev[1,1] = [[ 0.08485462 0.2787552 ]
[ 1.26461098 -0.25749373]
[ 1.17975636 -0.53624893]]