Import the required modules
import numpy as np
import h5py
import matplotlib.pyplot as plt
%matplotlib inline
# Configure matplotlib defaults
plt.rcParams['figure.figsize'] = (5.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
%load_ext autoreload
%autoreload 2
np.random.seed(1)
Convolution layer functions
Zero padding
This is built on np.pad; see the short demo below for its usage.
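A minimal sketch of how np.pad behaves on its own (the array v is just an illustrative value):

# Pad a 1-D array with one zero before and two zeros after
v = np.array([1, 2, 3])
print(np.pad(v, (1, 2), 'constant', constant_values=(0, 0)))
# -> [0 1 2 3 0 0]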
def zero_pad(X, pad):
"""
使用零填充数据集X的所有图像。填充应用于图像的高度和宽度
参数X:
X -- numpy数组(m, n_H, n_W, n_C)
pad -- 垂直和水平尺寸上每个图像周围的填充量个数
返回值:
X_pad -- 填充后的维度(m, n_H + 2*pad, n_W + 2*pad, n_C)
"""
X_pad = np.pad(X, ((0,0), (pad,pad), (pad,pad), (0,0)), 'constant', constant_values = (0,0))
return X_pad
Test:
np.random.seed(1)
x = np.random.randn(4, 3, 3, 2)
x_pad = zero_pad(x, 2)
print ("x.shape =", x.shape)
print ("x_pad.shape =", x_pad.shape)
print ("x[1,1] =", x[1,1])
print ("x_pad[1,1] =", x_pad[1,1])
fig, axarr = plt.subplots(1, 2)
axarr[0].set_title('x')
axarr[0].imshow(x[0,:,:,0])
axarr[1].set_title('x_pad')
axarr[1].imshow(x_pad[0,:,:,0])
out:
x.shape = (4, 3, 3, 2)
x_pad.shape = (4, 7, 7, 2)
x[1,1] = [[ 0.90085595 -0.68372786]
 [-0.12289023 -0.93576943]
 [-0.26788808  0.53035547]]
x_pad[1,1] = [[ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]
 [ 0.  0.]]
A single convolution step
(Figure: a 2x2 kernel sliding over the input with stride = 1)
def conv_single_step(a_slice_prev, W, b):
"""
对前一层的输出应用以下公式(类似于神经网络的线性函数):
Z = WA + b
参数:
a_slice_prev -- 上一层输出的切片,如上图所示,维度为(f, f, n_C_prev),n_C_prev表示上一个卷积层或者池化层的通道数
W -- 权重(就是一个卷积核) 维度为(f, f, n_C_prev)
b -- 偏置,维度为(1, 1, 1)
返回值:
Z -- 是一个标量值,是一次卷积后所得到的输出
"""
    s = W * a_slice_prev  # element-wise product of the kernel and the slice of the previous layer's output
    Z = np.sum(s)
    Z = np.squeeze(Z + b)  # add the bias and reduce to a scalar
return Z
Test:
# Perform one convolution step, assuming a 4x4 slice and kernel with 3 channels
np.random.seed(1)
a_slice_prev = np.random.randn(4, 4, 3)
W = np.random.randn(4, 4, 3)
b = np.random.randn(1, 1, 1)
Z = conv_single_step(a_slice_prev, W, b)
print("Z =", Z)
#out: Z = -6.999089450680221
Forward propagation of the convolution layer
Each sample is convolved with n_C kernels. The output dimensions are:
n_H = \lfloor \frac{n_{H_{prev}} - f + 2 \times pad}{stride} \rfloor + 1
n_W = \lfloor \frac{n_{W_{prev}} - f + 2 \times pad}{stride} \rfloor + 1
n_C = \text{the number of convolution kernels used in this layer}
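As a quick sanity check on these formulas, a short sketch using the same values as the test further below (my numbers, chosen to match that test's shapes):

# n_H_prev = 4, f = 2, pad = 2, stride = 2
n_H = (4 + 2*2 - 2) // 2 + 1  # floor(6/2) + 1 = 4
print(n_H)  # -> 4, so a (10, 4, 4, 3) input with 8 kernels yields Z of shape (10, 4, 4, 8)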
def conv_forward(A_prev, W, b, hparameters):
"""
参数:
A_prev -- 上一层的激活函数的输出,维度为(m, n_H_prev, n_W_prev, n_C_prev)
W -- 卷积核,维度为(f, f, n_C_prev, n_C)
b -- 偏置(对n_C个卷积核有n_C个不同的偏置),维度为(1, 1, 1, n_C)
hparameters -- 超参数,即填充pad和步幅stride
返回值:
Z -- 卷积层的输出(应用了WA+b)(m, n_H, n_W, n_C)
cache -- 为反向传播而缓存的数据,(A_prev, W, b, hparameters)
"""
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
(f, f, n_C_prev, n_C) = W.shape
stride = hparameters["stride"]
pad = hparameters["pad"]
n_H = (n_H_prev+2*pad-f)//stride + 1
n_W = (n_W_prev+2*pad-f)//stride + 1
    # Initialize the output of the convolution layer
Z = np.zeros((m, n_H, n_W, n_C))
    # Pad the previous layer's output
A_prev_pad = zero_pad(A_prev,pad)
    for i in range(m):  # loop over the samples
        a_prev_pad = A_prev_pad[i, :, :, :]
        for h in range(n_H):  # slide the window over the sample
            for w in range(n_W):
                for c in range(n_C):  # convolve each sample with n_C different kernels
                    # corners of the current slice (vert_start/horiz_start as in the figure above)
vert_start = h*stride
vert_end = vert_start+f
horiz_start = w*stride
horiz_end = horiz_start+f
a_slice_prev = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]
Z[i, h, w, c] = conv_single_step(a_slice_prev, W[:,:,:,c], b[:,:,:,c])
assert(Z.shape == (m, n_H, n_W, n_C))
cache = (A_prev, W, b, hparameters)
return Z, cache
Test:
np.random.seed(1)
A_prev = np.random.randn(10,4,4,3) # 10 samples, each of shape (4, 4, 3)
W = np.random.randn(2,2,3,8) # 8 kernels, each of shape (2, 2, 3)
b = np.random.randn(1,1,1,8)
hparameters = {"pad" : 2, "stride": 2}
Z, cache_conv = conv_forward(A_prev, W, b, hparameters)
print(cache_conv[3])  # prints the hparameters dict
print("Z's mean =", np.mean(Z))
print("Z[3,2,1] =", Z[3,2,1])
print("cache_conv[0][1][2][3] =", cache_conv[0][1][2][3])
#out:
"""
Z's mean = 0.0489952035289
Z[3,2,1] = [-0.61490741 -6.7439236 -2.55153897 1.75698377 3.56208902 0.53036437 5.18531798 8.75898442]
cache_conv[0][1][2][3] = [-0.20075807 0.18656139 0.41005165]
"""
Backward propagation of the convolution layer
Computing dA:
dA += \sum_{h=0}^{n_H} \sum_{w=0}^{n_W} W_c \times dZ_{hw}
Computing dW_c (W_c denotes a single convolution kernel):
dW_c += \sum_{h=0}^{n_H} \sum_{w=0}^{n_W} a_{slice} \times dZ_{hw}
Computing db:
db = \sum_h \sum_w dZ_{hw}
def conv_backward(dZ, cache):
"""
参数:
dZ -- 本层卷积层输出的代价的梯度,维度(与卷积层前向输出维度一致)为(m, n_H, n_W, n_C)
cache -- 在卷积层前向传播函数中的缓存
返回值:
dA_prev -- 本层卷积层输入代价的梯度,维度为(m, n_H_prev, n_W_prev, n_C_prev)
dW -- 卷积核的梯度(f, f, n_C_prev, n_C)
db -- 偏置的梯度(1, 1, 1, n_C)
"""
    # Unpack the cache: the previous layer's output, the kernels, the biases, and the hyperparameters
    (A_prev, W, b, hparameters) = cache
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
(f, f, n_C_prev, n_C) = W.shape
stride = hparameters["stride"]
pad = hparameters["pad"]
(m, n_H, n_W, n_C) = dZ.shape
    # Initialize the gradients to be computed
dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
dW = np.zeros((f, f, n_C_prev, n_C))
db = np.zeros((1, 1, 1, n_C))
    # Pad the previous layer's output and its gradient
A_prev_pad = zero_pad(A_prev,pad)
dA_prev_pad = zero_pad(dA_prev,pad)
    for i in range(m):  # loop over the samples
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        for h in range(n_H):  # slide the window over the sample
            for w in range(n_W):
                for c in range(n_C):
                    # locate the current slice
vert_start = h*stride
vert_end = vert_start+f
horiz_start = w*stride
horiz_end = horiz_start+f
a_slice = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]
                    # Accumulate the gradients
da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
db[:,:,:,c] += dZ[i, h, w, c]
        # Remove the padding (note: this slicing assumes pad > 0)
        dA_prev[i, :, :, :] = da_prev_pad[pad:-pad, pad:-pad, :]
assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))
return dA_prev, dW, db
Test:
# Reuse Z and cache_conv from the forward-propagation test; Z serves as the upstream gradient
np.random.seed(1)
dA, dW, db = conv_backward(Z, cache_conv)
print("dA_mean =", np.mean(dA))
print("dW_mean =", np.mean(dW))
print("db_mean =", np.mean(db))
#out:
#dA_mean = 1.45243777754
#dW_mean = 1.72699145831
#db_mean = 7.83923256462
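The means above only give a coarse check. Here is a minimal finite-difference sketch (my addition, not part of the original test): assuming the cost is np.sum(Z), so that dZ is all ones, the analytic db should match a numerical estimate.

np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
W = np.random.randn(2, 2, 3, 8)
b = np.random.randn(1, 1, 1, 8)
hparameters = {"pad": 1, "stride": 1}
Z, cache = conv_forward(A_prev, W, b, hparameters)
_, _, db = conv_backward(np.ones_like(Z), cache)  # dZ = 1 everywhere since cost = sum(Z)
eps = 1e-7
b_plus = b.copy()
b_plus[0, 0, 0, 0] += eps  # perturb a single bias entry
Z_plus, _ = conv_forward(A_prev, W, b_plus, hparameters)
db_numeric = (np.sum(Z_plus) - np.sum(Z)) / eps
print(db[0, 0, 0, 0], db_numeric)  # the two values should agree closely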
Pooling layer functions
Forward propagation of the pooling layer
(Figure: max-pooling illustration)
(Figure: average-pooling illustration)
def pool_forward(A_prev, hparameters, mode = "max"):
"""
参数:
A_prev -- 前一层的输出,维度为(m, n_H_prev, n_W_prev, n_C_prev)
hparameters -- 超参数
mode -- 选择池化处理的方式("max" or "average")
返回值:
A -- 池化处理后的输出, 维度为(m, n_H, n_W, n_C)
cache -- 缓存用于池化层反向传播的参数,
"""
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
f = hparameters["f"]
stride = hparameters["stride"]
n_H = int(1 + (n_H_prev - f) / stride)
n_W = int(1 + (n_W_prev - f) / stride)
    n_C = n_C_prev  # pooling does not change the number of channels
    # Initialize the output
    A = np.zeros((m, n_H, n_W, n_C))
    for i in range(m):  # pool each sample
for h in range(n_H):
for w in range(n_W):
                for c in range(n_C):
vert_start = h*stride
vert_end = vert_start+f
horiz_start = w*stride
horiz_end = horiz_start+f
a_prev_slice = A_prev[i,vert_start:vert_end,horiz_start:horiz_end,c]
if mode == "max":
A[i, h, w, c] = np.max(a_prev_slice)
elif mode == "average":
A[i, h, w, c] = np.mean(a_prev_slice)
cache = (A_prev, hparameters)
assert(A.shape == (m, n_H, n_W, n_C))
return A, cache
Test:
np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3) # 2 samples to pool, each 4x4 with 3 channels
hparameters = {"stride" : 2, "f": 3}
A, cache = pool_forward(A_prev, hparameters)
print("mode = max")
print("A =", A)
print()
A, cache = pool_forward(A_prev, hparameters, mode = "average")
print("mode = average")
print("A =", A)
"""
out:
mode = max
A = [[[[ 1.74481176 0.86540763 1.13376944]]]
[[[ 1.13162939 1.51981682 2.18557541]]]]
mode = average
A = [[[[ 0.02105773 -0.20328806 -0.40389855]]]
[[[-0.22154621 0.51716526 0.48155844]]]]
"""
Backward propagation of the pooling layer
Although a pooling layer has no parameters to learn, it still affects the final output, so gradients must be propagated backward through it.
For max pooling
X = \begin{bmatrix} 1 & 3 \\ 4 & 2 \end{bmatrix} \quad \rightarrow \quad M = \begin{bmatrix} 0 & 0 \\ 1 & 0 \end{bmatrix}
As shown above, we build a mask matrix that marks the position of the maximum: 1/True at the maximum, 0/False everywhere else.
def create_mask_from_window(x):
"""
参数:
x -- 维度为(f, f)
返回值:
mask -- 标记x数组的最大值位置的数组
"""
    mask = x >= np.max(x)  # True at the maximum (ties mark every maximal entry)
return mask
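A quick check, reusing the X from the example above:

x = np.array([[1., 3.], [4., 2.]])
print(create_mask_from_window(x))
# -> [[False False]
#     [ True False]]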
For average pooling
In max pooling, all of the "influence" on each output value comes from a single input value, the maximum. In average pooling, every element of the input window contributes equally to the output. To implement backprop, we therefore create an array that distributes the gradient accordingly:
dZ = 1 \quad \rightarrow \quad dZ = \begin{bmatrix} 1/4 & 1/4 \\ 1/4 & 1/4 \end{bmatrix}
def distribute_value(dz, shape):
"""
参数:
dz -- 标量,是本层输出的其中一个元素
shape -- (n_H, n_W) ,也等于单通道卷积核的维度
返回值:
a -- 维度为(n_H, n_W)
"""
    (n_H, n_W) = shape
    average = n_H * n_W  # number of elements the gradient is spread over
    a = np.full((n_H, n_W), dz / average)
return a
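A quick check, matching the 2x2 example above:

print(distribute_value(2, (2, 2)))
# -> [[0.5 0.5]
#     [0.5 0.5]]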
The pooling layer's backward propagation function:
def pool_backward(dA, cache, mode = "max"):
"""
参数:
dA -- 本层输出的代价的梯度
cache -- 本层池化层前向传播时的缓存数据: (A_prev, hparameters)
mode -- 池化处理的模式("max" or "average")
返回值:
dA_prev -- 本层池化层输入的梯度
Note:池化处理不改变通道数
"""
    (A_prev, hparameters) = cache
stride = hparameters["stride"]
f = hparameters["f"]
m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape
m, n_H, n_W, n_C = dA.shape
    # Initialize the gradient to be computed
dA_prev = np.zeros((A_prev.shape))
    for i in range(m):  # loop over the samples
        a_prev = A_prev[i]  # the current sample
for h in range(n_H):
for w in range(n_W):
for c in range(n_C):
vert_start = h*stride
vert_end = vert_start+f
horiz_start = w*stride
horiz_end = horiz_start+f
if mode == "max":
                        # the slice of this layer's input covered by the window
a_prev_slice = a_prev[vert_start:vert_end,horiz_start:horiz_end,c]
                        # build the mask marking the position of the maximum
mask = create_mask_from_window(a_prev_slice)
dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += mask*dA[i,h,w,c]
elif mode == "average":
                        # the gradient of one output element of this layer
                        da = dA[i, h, w, c]
                        shape = (f, f)
dA_prev[i, vert_start: vert_end, horiz_start: horiz_end, c] += distribute_value(da, shape)
assert(dA_prev.shape == A_prev.shape)
return dA_prev
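Test (a minimal sketch in the style of the tests above; it reuses pool_forward and treats a random array as the upstream gradient dA):

np.random.seed(1)
A_prev = np.random.randn(5, 5, 3, 2)
hparameters = {"stride": 1, "f": 2}
A, cache = pool_forward(A_prev, hparameters)
dA = np.random.randn(*A.shape)  # stand-in for the gradient flowing back from the next layer
dA_prev_max = pool_backward(dA, cache, mode="max")
dA_prev_avg = pool_backward(dA, cache, mode="average")
print("max:     mean of dA_prev =", np.mean(dA_prev_max))
print("average: mean of dA_prev =", np.mean(dA_prev_avg))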