cs231n assignment1 Two-Layer Neural Network

Goals of this assignment:

  • Understand the structure of a NN and how it is assembled:

         The two-layer NN in this exercise has the structure (the original diagram is not reproduced here):

                 input -> fully-connected -> ReLU -> fully-connected -> softmax scores

         The hidden-layer nonlinearity max(0, x) is equivalent to using ReLU,

        and the last layer uses softmax to produce the loss.
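
Written out (my reconstruction of what the missing formula images showed; this is the standard two-layer-net formulation, not the original screenshots), the scores and the regularized softmax loss are:

    s = \max(0,\, X W_1 + b_1)\, W_2 + b_2

    L = \frac{1}{N}\sum_i -\log\frac{e^{s_{i,y_i}}}{\sum_j e^{s_{i,j}}} \;+\; \frac{\text{reg}}{2}\left(\lVert W_1\rVert^2 + \lVert W_2\rVert^2\right)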

 

  • Understand backpropagation:

         Its purpose is to compute the gradient of an arbitrary function (here, of the loss with respect to every parameter).

         The differentiation itself shows up in the code below; the main thing is to understand the

         derivation (the original derivation image is not reproduced here; a short sketch follows):
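
A minimal sketch of that derivation (the standard softmax gradient; my own summary, not the original image): with

    p_j = \frac{e^{s_j}}{\sum_k e^{s_k}}, \qquad L = -\log p_y, \qquad \frac{\partial L}{\partial s_j} = p_j - \mathbb{1}[j = y]

which is exactly the dscores = probs; dscores[range(N), y] -= 1 step in the backward-pass code further down.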

 

Coding:

If you spot any mistakes, please point them out.

  • First, build a two-layer NN, implement the forward and backward passes, and check correctness on simple toy data

Structure of the Two-Layer Neural Network (diagram omitted; it is the input / affine / ReLU / affine / softmax stack described above):

 

For the notebook's requirement (the original screenshot is not reproduced here), my understanding is as follows:

Code:

correct_scores can be regarded as a small test dataset for checking the forward pass:

import numpy as np
import matplotlib.pyplot as plt

from cs231n.classifiers.neural_net import TwoLayerNet

input_size = 4
hidden_size = 10
num_classes = 3
num_inputs = 5

def rel_error(x, y):
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

def init_toy_model():
    np.random.seed(0)
    return TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)

def init_toy_data():
    np.random.seed(1)
    X = 10 * np.random.randn(num_inputs, input_size)
    y = np.array([0, 1, 2, 2, 1])
    return X, y

net = init_toy_model()
X, y = init_toy_data()

# Forward pass: compute scores
scores = net.loss(X)
print('Your scores:')
print(scores)
print()
print('correct scores:')
correct_scores = np.asarray([
  [-0.81233741, -1.27654624, -0.70335995],
  [-0.17129677, -1.18803311, -0.47310444],
  [-0.51590475, -1.01354314, -0.8504215 ],
  [-0.15419291, -0.48629638, -0.52901952],
  [-0.00618733, -0.12435261, -0.15226949]])
print(correct_scores)
print()

# The difference should be very small. We get < 1e-7
print('Difference between your scores and correct scores:')
print(np.sum(np.abs(scores - correct_scores)))

loss, _ = net.loss(X, y, reg=0.05)
correct_loss = 1.30378789133

print('Difference between your loss and correct loss:')
print(np.sum(np.abs(loss - correct_loss)))
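
A quick sanity check I find useful (my own addition, not part of the notebook): with small, roughly similar scores the class probabilities are close to uniform, so the data loss should land in the ballpark of log(num_classes):

# rough sanity check: near-uniform class probabilities give a data loss of about log(C)
print('ballpark loss for an (almost) untrained net:', np.log(num_classes))   # log(3) ~ 1.0986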

1. First, the forward pass to compute the loss:

In the loss method of neural_net.py:

# Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    scores = None
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                             #
    #############################################################################
    # Compute the class scores
    z1 = X.dot(W1) + b1
    a1 = np.maximum(0, z1)
    scores = a1.dot(W2) + b2

    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################
    
    # If the targets are not given then jump out, we're done
    if y is None:
      return scores

    # Compute the loss
    loss = None
    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include  #
    # both the data loss and L2 regularization for W1 and W2. Store the result  #
    # in the variable loss, which should be a scalar. Use the Softmax           #
    # classifier loss.                                                          #
    #############################################################################
    # softmax on the final-layer scores
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    loss = np.sum(-np.log(probs[np.arange(N), y]))
    loss /= N
    loss += 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################
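
One caveat about the block above: np.exp(scores) can overflow when the scores get large (this is also why a very small reg combined with a large learning rate can blow up later). The toy data is fine, but a numerically stable variant subtracts the per-row maximum first; a minimal sketch (the shift cancels out, so probs is unchanged):

    # numerically stable softmax: subtracting the row max does not change probs
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)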

2. Next, the backward pass to compute the gradients, verified against numerical gradients:

# Backward pass
from cs231n.gradient_check import eval_numerical_gradient

loss, grads = net.loss(X, y, reg=0.05)

# these should all be less than 1e-8 or so
for param_name in grads:
    f = lambda W: net.loss(X, y, reg=0.05)[0]
    param_grad_num = eval_numerical_gradient(f, net.params[param_name], verbose=False)
    print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))
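
eval_numerical_gradient is provided with the assignment code; conceptually it is a centered-difference approximation. A minimal standalone sketch of the idea (not the course's exact implementation):

def numerical_gradient(f, x, h=1e-5):
    """Centered-difference estimate of df/dx for a scalar-valued f and an array x."""
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old_value = x[idx]
        x[idx] = old_value + h
        fxph = f(x)                      # f(x + h)
        x[idx] = old_value - h
        fxmh = f(x)                      # f(x - h)
        x[idx] = old_value               # restore the original value
        grad[idx] = (fxph - fxmh) / (2 * h)
        it.iternext()
    return grad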

In the loss method of neural_net.py (the derivation steps are noted in the comments):

# Backward pass: compute gradients
    grads = {}
    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################
    # Compute the gradients
    # softmax: dL/dscores = probs, minus 1 at the correct class
    dscores = probs
    dscores[range(N), y] -= 1
    dscores /= N                              # average over the minibatch

    # W2 and b2
    grads['W2'] = np.dot(a1.T, dscores)       # see the derivation at the top (or the lecture slides)
    grads['b2'] = np.sum(dscores, axis=0)     # sum over the batch so the shape matches b2
    # df/dx that is propagated backward to the hidden layer
    dhidden = np.dot(dscores, W2.T)
    # ReLU gradient: zero wherever the forward activation was zero
    dhidden[a1 <= 0] = 0

    # W1 and b1
    grads['W1'] = np.dot(X.T, dhidden)
    grads['b1'] = np.sum(dhidden, axis=0)

    # add the gradient of the L2 regularization term
    grads['W2'] += reg * W2
    grads['W1'] += reg * W1
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

 The gradients we want here are those of the parameters, i.e. df/dW and df/db; what gets propagated backward through the layers is df/dx.
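
For reference, all of the parameter gradients above come from the affine layer z = xW + b: with upstream gradient dL/dz,

    \frac{\partial L}{\partial W} = x^\top \frac{\partial L}{\partial z}, \qquad
    \frac{\partial L}{\partial b} = \sum_i \left(\frac{\partial L}{\partial z}\right)_{i,:}, \qquad
    \frac{\partial L}{\partial x} = \frac{\partial L}{\partial z}\, W^\top

which matches np.dot(a1.T, dscores), np.sum(dscores, axis=0) and np.dot(dscores, W2.T) in the code above.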

 

3. Train on the toy data and plot the loss curve:

# Train the network
net = init_toy_model()
stats = net.train(X, y, X, y,
            learning_rate=1e-1, reg=5e-6,
            num_iters=100, verbose=False)

print('Final training loss: ', stats['loss_history'][-1])

# plot the loss history
plt.plot(stats['loss_history'])
plt.xlabel('iteration')
plt.ylabel('training loss')
plt.title('Training Loss history')
plt.show()

predict

  def predict(self, X):

    y_pred = None

    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    scores = self.loss(X)                 # with y=None, loss() returns the raw class scores
    y_pred = np.argmax(scores, axis=1)
    ###########################################################################
    #                              END OF YOUR CODE                           #
    ###########################################################################

    return y_pred

 

Training still uses mini-batch SGD here:

  def train(self, X, y, X_val, y_val,
            learning_rate=1e-3, learning_rate_decay=0.95,
            reg=5e-6, num_iters=100,
            batch_size=200, verbose=False):

    num_train = X.shape[0]
    iterations_per_epoch = max(num_train / batch_size, 1)

    # Use SGD to optimize the parameters in self.model
    loss_history = []
    train_acc_history = []
    val_acc_history = []
  # Personally I think each epoch should cover all of the training data; sampling randomly as below should give roughly the same result (see the sketch after this code block)
    for it in range(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO: Create a random minibatch of training data and labels, storing  #
      # them in X_batch and y_batch respectively.                             #
      #########################################################################
      sample_indices = np.random.choice(np.arange(num_train), batch_size)
      X_batch = X[sample_indices]
      y_batch = y[sample_indices]
      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      # Compute loss and gradients using the current minibatch
      loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
      loss_history.append(loss)

      #########################################################################
      # TODO: Use the gradients in the grads dictionary to update the         #
      # parameters of the network (stored in the dictionary self.params)      #
      # using stochastic gradient descent. You'll need to use the gradients   #
      # stored in the grads dictionary defined above.                         #
      #########################################################################
      # gradient descent: step each parameter against its gradient
      self.params['W1'] += -learning_rate * grads['W1']
      self.params['b1'] += -learning_rate * grads['b1']
      self.params['W2'] += -learning_rate * grads['W2']
      self.params['b2'] += -learning_rate * grads['b2']


      #########################################################################
      #                             END OF YOUR CODE                          #
      #########################################################################

      if verbose and it % 100 == 0:
        print('iteration %d / %d: loss %f' % (it, num_iters, loss))

      # Every epoch, check train and val accuracy and decay learning rate.
      if it % iterations_per_epoch == 0:
        # Check accuracy
        train_acc = (self.predict(X_batch) == y_batch).mean()
        val_acc = (self.predict(X_val) == y_val).mean()
        train_acc_history.append(train_acc)
        val_acc_history.append(val_acc)

        # Decay learning rate
        learning_rate *= learning_rate_decay

    return {
      'loss_history': loss_history,
      'train_acc_history': train_acc_history,
      'val_acc_history': val_acc_history,
    }
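
As the comment inside train notes, np.random.choice samples with replacement, so an "epoch" there is not a clean pass over all of the data. A rough sketch of the alternative I had in mind (shuffle once per epoch and take contiguous slices); this is only an illustration of the idea, not the assignment's required approach, and it would replace the sampling block inside the loop above:

      # hypothetical per-epoch batching without replacement (sketch only)
      epoch_len = int(iterations_per_epoch)
      if it % epoch_len == 0:
          perm = np.random.permutation(num_train)          # reshuffle at the start of each epoch
      start = (it % epoch_len) * batch_size
      sample_indices = perm[start:start + batch_size]
      X_batch = X[sample_indices]
      y_batch = y[sample_indices]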

 

 This shows that the model and the computations are essentially correct.

  • Load the CIFAR-10 data, then train, test, and tune

1. Load the data

# Load the data
from cs231n.data_utils import load_CIFAR10
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    # Load the raw CIFAR-10 data
    cifar10_dir = r'F:\pycharmFile\KNN\cifar-10-batches-py'  # raw string so the Windows backslashes are not treated as escapes

    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image


    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

2. Train, test, and tune

# Train a network
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10
#net = TwoLayerNet(input_size, hidden_size, num_classes)

# Train the network
# stats = net.train(X_train, y_train, X_val, y_val,
#             num_iters=1000, batch_size=200,
#             learning_rate=1e-4, learning_rate_decay=0.95,
#             reg=0.25, verbose=True)
#
# # Predict on the validation set
# val_acc = (net.predict(X_val) == y_val).mean()
# print('Validation accuracy: ', val_acc)


# Plot the loss function and train / validation accuracies
def plot_loss_acc(stats):
    plt.subplot(2, 1, 1)
    plt.plot(stats['loss_history'])
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')

    plt.subplot(2, 1, 2)
    plt.plot(stats['train_acc_history'], label='train')
    plt.plot(stats['val_acc_history'], label='val')
    plt.title('Classification accuracy history')
    plt.xlabel('Epoch')
    plt.ylabel('Classification accuracy')
    plt.legend()
    plt.show()


from cs231n.vis_utils import visualize_grid

# Visualize the weights of the network
def show_net_weights(net):
    W1 = net.params['W1']
    W1 = W1.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
    plt.imshow(visualize_grid(W1, padding=3).astype('uint8'))
    plt.gca().axis('off')
    plt.show()
#show_net_weights(net)

# find the best nn
best_val = -1
best_nn = None
best_stats = None
results = {}
####----------- If reg is set too small here, some values may overflow during training ------------####
learning_rates = [1e-3]
regularization_strengths = [0.6, 0.8]
for lr in learning_rates:
    for rg in regularization_strengths:
        net = TwoLayerNet(input_size, hidden_size, num_classes)
        stats = net.train(X_train, y_train, X_val, y_val,
                    num_iters=2500, batch_size=200,
                    learning_rate=lr, learning_rate_decay=0.95,
                    reg=rg)
        y_train_pred = net.predict(X_train)
        acc_train = np.mean(y_train_pred == y_train)
        y_val_pred = net.predict(X_val)
        acc_val = np.mean(y_val_pred == y_val)
        results[(lr, rg)] = (acc_train, acc_val)

        if best_val < acc_val:
            best_nn = net
            best_val = acc_val
            best_stats = stats
for lr, reg in results:
    train_acc, val_acc = results[(lr, reg)]
    print('lr: ', lr, ' reg: ', reg, ' train acc: ', train_acc, ' val_acc: ', val_acc)

print('The best validation accuracy: ', best_val)

# Run on the test set
test_acc = (best_nn.predict(X_test) == y_test).mean()
print('Test accuracy: ', test_acc)

plot_loss_acc(best_stats)
show_net_weights(best_nn)

Besides the hyperparameters tuned above (lr, reg), the number of hidden units, the number of epochs, the batch size, and the learning-rate decay all have some influence on the final result; a broader random search is sketched below.
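
A minimal sketch of such a random search over those knobs (the ranges and candidate values are my own guesses, not values from the assignment):

# hypothetical random hyperparameter search (ranges are guesses)
best_val, best_nn = -1, None
for _ in range(10):
    hs = np.random.choice([50, 100, 150])             # hidden-layer size
    lr = 10 ** np.random.uniform(-3.5, -2.5)          # learning rate, log scale
    rg = 10 ** np.random.uniform(-1, 0.5)             # regularization strength, log scale
    net = TwoLayerNet(input_size, hs, num_classes)
    stats = net.train(X_train, y_train, X_val, y_val,
                      num_iters=2000, batch_size=200,
                      learning_rate=lr, learning_rate_decay=0.95, reg=rg)
    val_acc = (net.predict(X_val) == y_val).mean()
    print('hs %d lr %.2e reg %.2e -> val acc %.3f' % (hs, lr, rg, val_acc))
    if val_acc > best_val:
        best_val, best_nn = val_acc, net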

My results (the result screenshots are not reproduced here).
