视频里 Andrej Karpathy 上课的时候说，这次的作业 meaty but educational，确实很 meaty。作业一般由 .ipynb 文件和 .py 文件组成；这次因为每个 .ipynb 文件涉及到的 .py 文件较多，且互相之间有交叉，所以每篇博客只贴出一个 .ipynb 或者一个 .py 文件。（之前的作业是一个 .ipynb 文件对应一个 .py 文件，所以整合到了一篇博客里。）
还是那句话,有错误希望帮我指出来,多多指教,谢谢
第二部分编写任意层数的全连接网络的类时，我的前向计算和反向计算都写得太繁琐；这是后来写任意层数的 conv_net 时才发现的，所以就懒得改了。
fc_net.py内容:
import numpy as np
from cs231n.layers import *
from cs231n.layer_utils import *
class TwoLayerNet(object):
    """
    A two-layer fully-connected neural network with ReLU nonlinearity and
    softmax loss that uses a modular layer design. We assume an input dimension
    of D, a hidden dimension of H, and perform classification over C classes.

    The architecture is affine - relu - affine - softmax.

    Note that this class does not implement gradient descent; instead, it
    will interact with a separate Solver object that is responsible for running
    optimization.

    The learnable parameters of the model are stored in the dictionary
    self.params that maps parameter names to numpy arrays.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        """
        Initialize a new network.

        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        self.params = {}
        self.reg = reg

        # Weights are drawn from a zero-mean Gaussian with standard deviation
        # weight_scale; biases start at zero. Keys follow the 'W<k>'/'b<k>'
        # convention with 1-based layer numbers.
        self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b2'] = np.zeros(num_classes)

    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # Forward pass: affine - relu - affine.
        hidden, hidden_cache = affine_relu_forward(X, W1, b1)
        scores, scores_cache = affine_forward(hidden, W2, b2)

        # If y is None then we are in test mode so just return scores.
        if y is None:
            return scores

        grads = {}

        # Softmax data loss plus L2 penalty on the weights. The 0.5 factor makes
        # the regularization gradient simply reg * W.
        data_loss, dscores = softmax_loss(scores, y)
        loss = data_loss + 0.5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))

        # Backward pass. The regularization term uses the weights taken from
        # self.params rather than reaching into the layer caches, so this code
        # does not depend on how the layer helpers lay out their caches.
        dhidden, dW2, grads['b2'] = affine_backward(dscores, scores_cache)
        grads['W2'] = dW2 + self.reg * W2

        _, dW1, grads['b1'] = affine_relu_backward(dhidden, hidden_cache)
        grads['W1'] = dW1 + self.reg * W1

        return loss, grads
class FullyConnectedNet(object):
    """
    A fully-connected neural network with an arbitrary number of hidden layers,
    ReLU nonlinearities, and a softmax loss function. This will also implement
    dropout and batch normalization as options. For a network with L layers,
    the architecture will be

    {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch normalization and dropout are optional, and the {...} block is
    repeated L - 1 times.

    Similar to the TwoLayerNet above, learnable parameters are stored in the
    self.params dictionary and will be learned using the Solver class.
    """

    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """
        Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: A list of integers giving the size of each hidden layer.
          May be empty, in which case the net is a single affine layer followed
          by softmax.
        - input_dim: An integer giving the size of the input.
        - num_classes: An integer giving the number of classes to classify.
        - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
          the network should not use dropout at all.
        - use_batchnorm: Whether or not the network should use batch normalization.
        - reg: Scalar giving L2 regularization strength.
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - dtype: A numpy datatype object; all computations will be performed using
          this datatype. float32 is faster but less accurate, so you should use
          float64 for numeric gradient checking.
        - seed: If not None, then pass this random seed to the dropout layers. This
          will make the dropout layers deterministic so we can gradient check the
          model.
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # Sizes of every layer boundary: input, each hidden layer, output.
        # Building this list once removes the first/middle/last special cases
        # and makes an empty hidden_dims work.
        layer_dims = [input_dim] + list(hidden_dims) + [num_classes]
        for idx in range(self.num_layers):
            name = str(idx + 1)
            fan_in, fan_out = layer_dims[idx], layer_dims[idx + 1]
            self.params['W' + name] = weight_scale * np.random.randn(fan_in, fan_out)
            self.params['b' + name] = np.zeros(fan_out)
            # Only hidden layers are batch-normalized; the final affine layer
            # has no gamma/beta. Scale starts at one and shift at zero.
            if self.use_batchnorm and idx < self.num_layers - 1:
                self.params['gamma' + name] = np.ones(fan_out)
                self.params['beta' + name] = np.zeros(fan_out)

        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the
        # mode (train / test). The same dictionary is shared by every layer.
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # With batch normalization each hidden layer gets its own bn_param
        # dictionary: self.bn_params[0] belongs to the first batch normalization
        # layer, self.bn_params[1] to the second, etc.
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype. Iterate over a snapshot
        # of the keys because the dictionary is reassigned inside the loop.
        for key in list(self.params):
            self.params[key] = self.params[key].astype(dtype)

    def _hidden_forward(self, x, idx):
        """
        Run the forward pass of hidden block number idx (0-based).

        Picks the layer helper that matches the batchnorm/dropout configuration
        and returns the (output, cache) pair expected by _hidden_backward.
        """
        name = str(idx + 1)
        W, b = self.params['W' + name], self.params['b' + name]

        if self.use_batchnorm:
            gamma = self.params['gamma' + name]
            beta = self.params['beta' + name]
            bn_param = self.bn_params[idx]
            if self.use_dropout:
                return affine_bn_relu_dp_forward(x, W, b, gamma, beta, bn_param,
                                                 self.dropout_param)
            return affine_bn_relu_forward(x, W, b, gamma, beta, bn_param)

        out, relu_cache = affine_relu_forward(x, W, b)
        if self.use_dropout:
            out, dp_cache = dropout_forward(out, self.dropout_param)
            # Keep both caches so the backward pass can undo dropout first.
            return out, (relu_cache, dp_cache)
        return out, relu_cache

    def _hidden_backward(self, dout, idx, cache, grads):
        """
        Run the backward pass of hidden block number idx (0-based).

        Stores the parameter gradients of the block in grads (including the L2
        term for the weights) and returns the gradient with respect to the
        block's input.
        """
        name = str(idx + 1)

        if self.use_batchnorm:
            if self.use_dropout:
                dx, dw, db, dgamma, dbeta = affine_bn_relu_dp_backward(dout, cache)
            else:
                dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dout, cache)
            # Scale and shift parameters are not regularized.
            grads['gamma' + name] = dgamma
            grads['beta' + name] = dbeta
        elif self.use_dropout:
            relu_cache, dp_cache = cache
            dx, dw, db = affine_relu_backward(dropout_backward(dout, dp_cache),
                                              relu_cache)
        else:
            dx, dw, db = affine_relu_backward(dout, cache)

        # The regularization gradient uses the weights from self.params so this
        # code does not depend on the internal layout of the layer caches.
        grads['W' + name] = dw + self.reg * self.params['W' + name]
        grads['b' + name] = db
        return dx

    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                # The key is the string literal 'mode', not the variable mode.
                bn_param['mode'] = mode

        # Forward pass: L - 1 hidden blocks followed by a plain affine layer.
        last_name = str(self.num_layers)
        activation = X
        hidden_caches = []
        for idx in range(self.num_layers - 1):
            activation, block_cache = self._hidden_forward(activation, idx)
            hidden_caches.append(block_cache)
        scores, scores_cache = affine_forward(activation,
                                              self.params['W' + last_name],
                                              self.params['b' + last_name])

        # If test mode return early
        if mode == 'test':
            return scores

        grads = {}

        # Softmax data loss plus L2 penalty on every weight matrix. The 0.5
        # factor makes the regularization gradient simply reg * W.
        loss, dscores = softmax_loss(scores, y)
        for idx in range(self.num_layers):
            loss += 0.5 * self.reg * np.sum(self.params['W' + str(idx + 1)]**2)

        # Backward pass: final affine layer first, then the hidden blocks in
        # reverse order.
        dout, dw, db = affine_backward(dscores, scores_cache)
        grads['W' + last_name] = dw + self.reg * self.params['W' + last_name]
        grads['b' + last_name] = db
        for idx in reversed(range(self.num_layers - 1)):
            dout = self._hidden_backward(dout, idx, hidden_caches[idx], grads)

        return loss, grads