This also draws on the approaches shared by various people online. For fc_net.py, I find it most convenient to first combine the affine, batch-norm, and ReLU layers into composite "sandwich" layers in layers.py and then reuse them here. Recording the solution for now; I will edit it later.
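For reference, the composite layers added to layers.py look roughly like this. It is a minimal sketch, assuming the assignment's existing helpers (affine_forward/backward, batchnorm_forward/backward, relu_forward/backward) with their usual (out, cache) return convention; affine_ln_relu_forward/backward follow the same pattern with the layernorm helpers swapped in.

def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    # affine -> batchnorm -> relu; keep all three caches for the backward pass
    a, fc_cache = affine_forward(x, w, b)
    a_norm, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_norm)
    return out, (fc_cache, bn_cache, relu_cache)

def affine_bn_relu_backward(dout, cache):
    # unpack the caches and backprop in reverse order: relu -> batchnorm -> affine
    fc_cache, bn_cache, relu_cache = cache
    da_norm = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_norm, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta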
Initialization:
# Hidden layers: each layer i gets W(i+1), b(i+1), and gamma/beta when normalization is enabled.
for i in range(len(hidden_dims)):
    D = hidden_dims[i]
    self.params['W' + str(i + 1)] = weight_scale * np.random.randn(input_dim, D)
    self.params['b' + str(i + 1)] = np.zeros(D)
    if self.normalization:
        self.params['gamma' + str(i + 1)] = np.ones(D)
        self.params['beta' + str(i + 1)] = np.zeros(D)
    input_dim = D  # this layer's output size is the next layer's input size
# Output layer: plain affine, no gamma/beta.
self.params['W' + str(self.num_layers)] = weight_scale * np.random.randn(hidden_dims[-1], num_classes)
self.params['b' + str(self.num_layers)] = np.zeros(num_classes)
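As a quick sanity check on the bookkeeping above (self.num_layers equals len(hidden_dims) + 1, and input_dim is reused as the running input size of the current layer), here is a small standalone sketch with hypothetical sizes:

import numpy as np

# Hypothetical sizes: flattened CIFAR-10 input, two hidden layers, 10 classes.
input_dim, hidden_dims, num_classes = 3 * 32 * 32, [100, 50], 10
weight_scale, params, dim = 1e-2, {}, input_dim
for i, D in enumerate(hidden_dims):
    params['W' + str(i + 1)] = weight_scale * np.random.randn(dim, D)
    params['b' + str(i + 1)] = np.zeros(D)
    dim = D
params['W3'] = weight_scale * np.random.randn(hidden_dims[-1], num_classes)
params['b3'] = np.zeros(num_classes)

print({k: v.shape for k, v in params.items()})
# -> W1: (3072, 100), W2: (100, 50), W3: (50, 10), i.e. num_layers = 3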
Forward pass:
cache = {}
cache_dropout = {}
input_X = X
# Hidden layers: sandwich layer (affine + optional norm + relu), then optional dropout.
for i in range(self.num_layers - 1):
    W, b = self.params['W' + str(i + 1)], self.params['b' + str(i + 1)]
    if self.normalization == 'batchnorm':
        gamma, beta = self.params['gamma' + str(i + 1)], self.params['beta' + str(i + 1)]
        bn_param = self.bn_params[i]
        input_X, cache[i] = affine_bn_relu_forward(input_X, W, b, gamma, beta, bn_param)
    elif self.normalization == 'layernorm':
        gamma, beta = self.params['gamma' + str(i + 1)], self.params['beta' + str(i + 1)]
        bn_param = self.bn_params[i]
        input_X, cache[i] = affine_ln_relu_forward(input_X, W, b, gamma, beta, bn_param)
    else:
        input_X, cache[i] = affine_relu_forward(input_X, W, b)
    if self.use_dropout:
        input_X, cache_dropout[i] = dropout_forward(input_X, self.dropout_param)
# The output layer is a plain affine layer, handled separately
W, b = self.params['W' + str(self.num_layers)], self.params['b' + str(self.num_layers)]
scores, cache[self.num_layers - 1] = affine_forward(input_X, W, b)
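(In the loss() skeleton provided by the assignment, the mode of dropout_param and of each entry in bn_params is switched to 'train' or 'test' before this forward pass, and when no labels are passed the method returns right here, roughly:)

# Test mode: y is None, so just return the class scores without computing the loss.
if y is None:
    return scores

Only when y is given does execution continue into the backward pass below.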
Backward pass:
loss, dscores = softmax_loss(scores, y)
# Output layer: add its L2 regularization to the loss and backprop through the affine layer.
W = self.params['W' + str(self.num_layers)]
loss += self.reg * 0.5 * np.sum(W ** 2)
dout_relu, dW, db = affine_backward(dscores, cache[self.num_layers - 1])
grads['W' + str(self.num_layers)] = dW + self.reg * W
grads['b' + str(self.num_layers)] = db
# Hidden layers, walked in reverse order.
for i in range(self.num_layers - 1, 0, -1):
    W = self.params['W' + str(i)]
    loss += self.reg * 0.5 * np.sum(W ** 2)
    if self.use_dropout:
        dout_relu = dropout_backward(dout_relu, cache_dropout[i - 1])
    # gamma and beta are not regularized when using BN/LN
    if self.normalization == 'batchnorm':
        dout_relu, dW, db, dgamma, dbeta = affine_bn_relu_backward(dout_relu, cache[i - 1])
        grads['gamma' + str(i)] = dgamma
        grads['beta' + str(i)] = dbeta
    elif self.normalization == 'layernorm':
        dout_relu, dW, db, dgamma, dbeta = affine_ln_relu_backward(dout_relu, cache[i - 1])
        grads['gamma' + str(i)] = dgamma
        grads['beta' + str(i)] = dbeta
    else:
        dout_relu, dW, db = affine_relu_backward(dout_relu, cache[i - 1])
    grads['W' + str(i)] = dW + self.reg * W
    grads['b' + str(i)] = db
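To verify loss and grads, the usual numeric gradient check from the notebooks can be run against this implementation. A sketch, assuming the assignment's usual module layout (cs231n.gradient_check, cs231n.classifiers.fc_net) and that the FullyConnectedNet constructor accepts these keyword arguments (names differ slightly between assignment versions):

import numpy as np
from cs231n.gradient_check import eval_numerical_gradient
from cs231n.classifiers.fc_net import FullyConnectedNet

def rel_error(x, y):
    # relative error, as defined in the assignment notebooks
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C, reg=1e-2,
                          weight_scale=5e-2, dtype=np.float64,
                          normalization='batchnorm')
loss, grads = model.loss(X, y)
for name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
    print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))

All relative errors should come out small (roughly 1e-6 or below for W/b, with gamma/beta usually even smaller).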