CS231n Assignment 1: Softmax

Implementing the Softmax classifier


1. Computing the loss function and its gradient
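For reference, the loss that both implementations below compute is the averaged softmax cross-entropy plus an L2 regularizer, written here in LaTeX (the 0.5 factor matches the code, so the regularizer's gradient is simply reg * W):

L = -\frac{1}{N} \sum_{i} \log \frac{e^{f_{i,y_i}}}{\sum_{j} e^{f_{i,j}}} + \frac{1}{2}\,\mathrm{reg} \sum_{k} W_k^2, \qquad f = XW

\nabla_{W_j} L = -\frac{1}{N} \sum_{i} x_i^{T} \bigl( \mathbb{1}[j = y_i] - p_{i,j} \bigr) + \mathrm{reg}\, W_j, \qquad p_{i,j} = \frac{e^{f_{i,j}}}{\sum_{k} e^{f_{i,k}}}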

import numpy as np
from random import shuffle

def softmax_loss_naive(W, X, y, reg):  # version with explicit loops
  """
  Softmax loss function, naive implementation (with loops)

  Inputs have dimension D, there are C classes, and we operate on minibatches
  of N examples.

  Inputs:
  - W: A numpy array of shape (D, C) containing weights.
  - X: A numpy array of shape (N, D) containing a minibatch of data.
  - y: A numpy array of shape (N,) containing training labels; y[i] = c means
    that X[i] has label c, where 0 <= c < C.
  - reg: (float) regularization strength

  Returns a tuple of:
  - loss as single float
  - gradient with respect to weights W; an array of same shape as W
  """
  # Initialize the loss and gradient to zero.
  dW = np.zeros_like(W)
  dW_each=np.zeros_like(W)
  #############################################################################
  # TODO: Compute the softmax loss and its gradient using explicit loops.     #
  # Store the loss in loss and the gradient in dW. If you are not careful     #
  # here, it is easy to run into numeric instability. Don't forget the        #
  # regularization!                                                           #
  #############################################################################
  num_train=X.shape[0]
  num_class=W.shape[1]
  f=np.dot(X,W)  # (N, C) matrix of class scores
  f_max=np.reshape(np.max(f,axis=1),(num_train,1))  # row-wise max, reshaped so it can be broadcast and subtracted
  # Subtracting each row's max before exponentiating keeps the computation numerically stable
  f-=f_max
  p = np.exp(f) / np.sum(np.exp(f),axis=1,keepdims=True)  # (N, C); note: normalize by each sample's own sum, not a global sum
  # Cross-entropy loss
  loss=0.0
  y_true=np.zeros_like(p)
  y_true[np.arange(num_train),y]=1.0  # one-hot encoding of the labels
  for i in range(num_train):
    for j in range(num_class):
      loss+=-(y_true[i,j]*np.log(p[i,j]))  # L = -(1/N) sum_i log( exp(f_{y_i}) / sum_j exp(f_j) ) + 0.5*reg*sum(W^2)
      dW_each[:,j]=-(y_true[i,j]-p[i,j])*X[i,:]  # dL/dW_j = -(1/N) sum_i x_i^T (1[j == y_i] - p_{i,j}) + reg*W_j
    dW+=dW_each
  loss/=num_train
  loss+=0.5*reg*np.sum(W*W)  # add the regularization term
  dW/=num_train
  dW+=reg*W


  #############################################################################
  #                          END OF YOUR CODE                                 #
  #############################################################################

  return loss, dW


def softmax_loss_vectorized(W, X, y, reg):  # vectorized version
  """
  Softmax loss function, vectorized version.

  Inputs and outputs are the same as softmax_loss_naive.
  """
  # Initialize the loss and gradient to zero.
  loss = 0.0
  dW = np.zeros_like(W)#D by C

  #############################################################################
  # TODO: Compute the softmax loss and its gradient using no explicit loops.  #
  # Store the loss in loss and the gradient in dW. If you are not careful     #
  # here, it is easy to run into numeric instability. Don't forget the        #
  # regularization!                                                           #
  #############################################################################
  num_train = X.shape[0]
  num_class = W.shape[1]
  f = np.dot(X, W)  # (N, C) matrix of class scores
  f_max = np.reshape(np.max(f, axis=1), (num_train, 1))  # row-wise max, reshaped so it can be broadcast and subtracted
  # Subtracting each row's max before exponentiating keeps the computation numerically stable
  f -= f_max
  p = np.exp(f) / np.sum(np.exp(f), axis=1, keepdims=True)  # (N, C); normalize by each sample's own sum
  # Cross-entropy loss
  y_true = np.zeros_like(p)
  y_true[np.arange(num_train), y] = 1.0  # one-hot encoding of the labels
  loss += -np.sum(np.log(p[np.arange(num_train), y])) / num_train + 0.5 * reg * np.sum(W * W)
  dW += -np.dot(X.T, y_true - p) / num_train + reg * W  # vectorized form of the gradient

  #############################################################################
  #                          END OF YOUR CODE                                 #
  #############################################################################

  return loss, dW
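Both implementations subtract the per-row maximum of the scores before exponentiating. A minimal sketch of why this matters (the score values here are made up purely for illustration):

import numpy as np

scores = np.array([[1000.0, 1001.0, 999.0]])  # made-up, very large scores
# Naive softmax: exp(1000) overflows to inf, so the probabilities become nan
naive = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
# Shifted softmax: subtracting the row max gives the same probabilities without overflow
shifted = scores - np.max(scores, axis=1, keepdims=True)
stable = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
print(naive)   # [[nan nan nan]] plus an overflow warning
print(stable)  # [[0.2447... 0.6652... 0.0900...]]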

2. Loading the data (def get_CIFAR10_Data)

from cs231n.data_utils import load_CIFAR10

cifar10_dir='cs231n/datasets/cifar-10-batches-py'
X_train,y_train,X_test,y_test=load_CIFAR10(cifar10_dir)


2.1 Subsample the data

mask=range(num_train,num_train+num_validation)
X_val=X_train[mask]
y_val=y_train[mask]
mask=range(num_train)
X_train=X_train[mask]
y_train=y_train[mask]
mask=range(num_test)
X_test=X_test[mask]
y_test=y_test[mask]
mask=np.random.choice(num_train,num_dev,replace=False)  # pick num_dev samples from 0..num_train-1 without repetition
X_dev=X_train[mask]
y_dev=y_train[mask]


2.2 Reshape the data so that each image becomes a single row

X_train=np.reshape(X_train,(X_train.shape[0],-1))
X_val=np.reshape(X_val,(X_val.shape[0],-1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))


2.3 Normalize the data: subtract the mean image

Note: the mean image is computed on the training set only, and then subtracted from every split (train, val, test, dev).

#normalize the data: subtract the mean image
mean_image=np.mean(X_train,axis=0)  # column-wise mean over the training images gives the mean image
X_train-=mean_image
X_val-=mean_image
X_test-=mean_image
X_dev-=mean_image


2.4 Append the bias dimension

np.hstack(tup)

Stack arrays in sequence horizontally (column wise).

Take a sequence of arrays and stack them horizontally to make a single array

tup : sequence of ndarrays

All arrays must have the same shape along all but the second axis.

X_train=np.hstack([X_train,np.ones((X_train.shape[0],1))])
X_val=np.hstack([X_val,np.ones((X_val.shape[0],1))])
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])
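Putting steps 2.1 through 2.4 together, the whole preprocessing pipeline can be wrapped into the get_CIFAR10_Data function named in the section title. This is only a sketch: the split sizes used as default arguments and the dataset path are assumptions, and load_CIFAR10 is taken from cs231n.data_utils as in the assignment starter code.

import numpy as np
from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_Data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
    # Load the raw CIFAR-10 data (path is an assumption; point it at the cifar-10-batches-py folder)
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample: validation, training, test, and a small dev set
    mask = range(num_training, num_training + num_validation)
    X_val, y_val = X_train[mask], y_train[mask]
    mask = range(num_training)
    X_train, y_train = X_train[mask], y_train[mask]
    mask = range(num_test)
    X_test, y_test = X_test[mask], y_test[mask]
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev, y_dev = X_train[mask], y_train[mask]

    # Flatten each image into a single row
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

    # Subtract the mean image computed on the training set
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image

    # Append the bias dimension of ones
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev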



3. Comparing the two versions

3.1 Calling the naive version (with loops)

Initialize W with small random values:

W=np.random.randn(3073,10)*0.0001

Compute the loss (sanity check!):

loss,grad=softmax_loss_naive(W,X_dev,y_dev,0.0)

Note: check here that the loss is close to -log(0.1).

Why the sanity check: when initializing with small weights, make sure the resulting loss matches what you expect, and it is best to check the data loss by itself first (set the regularization strength to 0). For example, a softmax classifier on CIFAR-10 should have an initial loss of about 2.302: at initialization each of the 10 classes is predicted with probability roughly 0.1, so the negative log probability of the correct class is -ln(0.1) = 2.302.

For the Weston-Watkins SVM, assuming every margin is violated, the initial loss should be 9. If you do not see these values, there is probably a problem with the initialization.
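A minimal sketch of this check for the softmax case, assuming the W, X_dev and y_dev defined above (the tolerance value is an arbitrary choice):

loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)  # regularization turned off
expected = -np.log(0.1)  # about 2.302 for 10 classes with near-uniform predictions
print('loss: %f, expected roughly: %f' % (loss, expected))
assert abs(loss - expected) < 0.1, 'initial loss looks wrong'  # 0.1 tolerance is an arbitrary choice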


3.2 Gradient check

from cs231n.gradient_check import grad_check_sparse
f=lambda w: softmax_loss_naive(w,X_dev,y_dev,0.0)[0]
grad_numerical=grad_check_sparse(f,W,grad,10)


Add the regularization term and run the gradient check again:

loss,grad=softmax_loss_naive(W,X_dev,y_dev,1e2)
f=lambda w: softmax_loss_naive(w,X_dev,y_dev,1e2)[0]
grad_numerical=grad_check_sparse(f,W,grad,10)


3.3 Comparing the running time of the two versions

import time

tic=time.time()
loss_naive,grad_naive=softmax_loss_naive(W,X_dev,y_dev,0.00001)
toc=time.time()
print('naive loss: %e computed in %fs'%(loss_naive,toc - tic))

tic=time.time()
loss_vec,grad_vec=softmax_loss_vectorized(W,X_dev,y_dev,0.00001)
toc=time.time()
print('vectorized loss: %e computed in %fs'%(loss_vec,toc-tic))

#compare the two versions of the gradient
grad_difference=np.linalg.norm(grad_naive-grad_vec,ord='fro')
print('Loss difference: %f'%np.abs(loss_naive-loss_vec))
print('Gradient difference: %f'% grad_difference)


4. Tuning hyperparameters on the validation set
Here the softmax code is folded into a LinearClassifier class, which provides train(), predict() and loss(); loss() differs depending on which loss function is used.

So we also define a Softmax class that inherits from LinearClassifier and overrides the loss() method.

 

Here is LinearClassifier:


class LinearClassifier(object):

  def __init__(self):
    self.W = None

  def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
            batch_size=200, verbose=False):
    """
    Train this linear classifier using stochastic gradient descent.

    Inputs:
    - X: A numpy array of shape (N, D) containing training data; there are N
      training samples each of dimension D.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label 0 <= c < C for C classes.
    - learning_rate: (float) learning rate for optimization.
    - reg: (float) regularization strength.
    - num_iters: (integer) number of steps to take when optimizing
    - batch_size: (integer) number of training examples to use at each step.
    - verbose: (boolean) If true, print progress during optimization.

    Outputs:
    A list containing the value of the loss function at each training iteration.
    """
    num_train, dim = X.shape
    num_classes = np.max(y) + 1 # assume y takes values 0...K-1 where K is number of classes
    if self.W is None:
      # lazily initialize W
      self.W = 0.001 * np.random.randn(dim, num_classes)

    # Run stochastic gradient descent to optimize W
    loss_history = []
    for it in range(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO:                                                                 #
      # Sample batch_size elements from the training data and their           #
      # corresponding labels to use in this round of gradient descent.        #
      # Store the data in X_batch and their corresponding labels in           #
      # y_batch; after sampling X_batch should have shape (batch_size, dim)   #
      # and y_batch should have shape (batch_size,)                           #
      #                                                                       #
      # Hint: Use np.random.choice to generate indices. Sampling with         #
      # replacement is faster than sampling without replacement.              #
      #########################################################################
      #Sample a minibatch of training examples
      sample_index=np.random.choice(num_train,batch_size,replace=False)  # replace=False means no repeated indices (the hint above notes replace=True would be faster)
      X_batch=X[sample_index]
      y_batch=y[sample_index]
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      # evaluate loss and gradient
      loss, grad = self.loss(X_batch, y_batch, reg)
      loss_history.append(loss)

      # perform parameter update
      #########################################################################
      # TODO:                                                                 #
      # Update the weights using the gradient and the learning rate.          #
      #########################################################################
      #Vanilla SGD update on the weights
      self.W+=-learning_rate*grad
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      if verbose and it % 100 == 0:
        print ('iteration %d / %d: loss %f' % (it, num_iters, loss))

    return loss_history

  def predict(self, X):
    """
    Use the trained weights of this linear classifier to predict labels for
    data points.

    Inputs:
    - X: N x D array of data. Each row is a D-dimensional point.

    Returns:
    - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
      array of length N, and each element is an integer giving the predicted
      class.
    """
    y_pred = np.zeros(X.shape[0])
    ###########################################################################
    # TODO:                                                                   #
    # Implement this method. Store the predicted labels in y_pred.            #
    ###########################################################################
    y_pred=np.dot(X,self.W)# N by C
    y_pred=np.argmax(y_pred,axis=1)# N,
    ###########################################################################
    #                           END OF YOUR CODE                              #
    ###########################################################################
    return y_pred
  
  def loss(self, X_batch, y_batch, reg):
    """
    Compute the loss function and its derivative. 
    Subclasses will override this.

    Inputs:
    - X_batch: A numpy array of shape (N, D) containing a minibatch of N
      data points; each point has dimension D.
    - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
    - reg: (float) regularization strength.

    Returns: A tuple containing:
    - loss as a single float
    - gradient with respect to self.W; an array of the same shape as W
    """
    pass
 
Here is the Softmax class, which simply calls the softmax_loss_vectorized function defined earlier:
class Softmax(LinearClassifier):
  """ A subclass that uses the Softmax + Cross-entropy loss function """

  def loss(self, X_batch, y_batch, reg):
    return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
 
Hyperparameter search:
from cs231n.classifiers import Softmax
result={}
best_val=-1
best_softmax=None
learning_rate=[5e-6,1e-7,5e-7]
reg=[1e4,5e4,1e8]
#################################################
for each_rate in learning_rate:
    for each_reg in reg:
        softmax=Softmax()
        loss_hist=softmax.train(X_train,y_train,learning_rate=each_rate,reg=each_reg,num_iters=700,verbose=True)
        y_train_pred=softmax.predict(X_train)
        accuracy_train=np.mean(y_train==y_train_pred)

        y_val_pred=softmax.predict(X_val)
        accuracy_val=np.mean(y_val==y_val_pred)
        result[each_rate,each_reg]=(accuracy_train,accuracy_val)
        if(best_val<accuracy_val):
            best_val=accuracy_val
            best_softmax=softmax
####################################################
for lr,reg in sorted(result):
    train_accuracy, val_accuracy=result[(lr,reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f'%(lr,reg,train_accuracy,val_accuracy))

print("best validation accuracy achieved druring cross-validation: %f" % best_val)
 