cs231n assignment: implementing a two-layer network, and the problems I hit along the way

The previous two or three assignments went smoothly, no storms at sea, nothing I couldn't handle, but this one completely wrecked my sanity. Without further ado, here's the code.
This is the all-in-one version, with the fully-connected layers, ReLU, and softmax all mashed together.

import numpy as np
import pickle
import os


# Prepare the dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
# Initialization
dataset = []
labelset = []
file_location = 'cifar-10-batches-py'
file_name_list = os.listdir(file_location)
# Read the data and labels from each batch in turn and collect them into one array
for file_name in file_name_list:
    if file_name[0:10] == 'data_batch':
        data_batch = unpickle('cifar-10-batches-py/'+file_name)[b'data']
        label_batch = unpickle('cifar-10-batches-py/'+file_name)[b'labels']
        dataset.append(data_batch)
        labelset.append(label_batch)
# Strip the extra nesting from dataset and labelset; e.g. labelset starts as nested per-batch lists like [[...],[...],...] and is flattened into a single 1-D label vector
dataset = np.array(dataset)
xtrain = np.reshape(dataset,(50000,3072))
ytrain = np.array(labelset).reshape(-1,1).squeeze()


class twolayernet():
    def __init__(self,input_size,hidden_size,output_size,std=1e-4):
        self.w1 = np.random.randn(input_size,hidden_size)*std
        self.w2 = np.random.randn(hidden_size,output_size)*std
        self.b1 = np.zeros(hidden_size)
        self.b2 = np.zeros(output_size)

    def loss_function(self,X,y,reg):
        num,dim = X.shape
        loss = None
        w1 = self.w1
        w2 = self.w2
        b1 = self.b1
        b2 = self.b2
        Z = np.dot(X,w1) + b1
        A = np.maximum(0,Z)
        F = np.dot(A,w2) + b2
        softmax = F-np.max(F,axis=1).reshape(-1,1)
        F_softmax = np.exp(softmax)/np.sum(np.exp(softmax),axis=1).reshape(-1,1)
        loss = -np.sum(np.log(F_softmax[range(num),list(y)]))
        loss =  loss/num
        loss += 0.5*reg*(np.sum(w1*w1)+np.sum(w2*w2))

        # Core part: compute the gradients dw1 and dw2
        copy = F_softmax.copy()
        copy[range(num),list(y)] += -1
        copy /= num
        dw2 = np.dot(A.T,copy) + reg*w2
        db2 = np.sum(copy,axis=0)
        dA = np.dot(copy,w2.T)
        dZ = dA*(Z>0)   # ReLU backward: mask by the forward pre-activation Z, not by dA
        dw1 = np.dot(X.T,dZ) + reg*w1
        db1 = np.sum(dZ,axis=0)
        return loss,dw1,dw2,db1,db2

    def train(self, X, y, learning_rate=1e-3, num_iters=1000,
              batch_size=500, print_flag=False, reg=5e-6,learning_rate_decay=0.95):
        loss_history = []
        num_train = X.shape[0]
        for t in range(num_iters):
            idx_batch = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X[idx_batch]
            y_batch = y[idx_batch]
            loss,dw1,dw2,db1,db2 = self.loss_function(X_batch, y_batch,reg)
            loss_history.append(loss)
            self.w1 += -learning_rate * dw1
            self.w2 += -learning_rate * dw2
            self.b1 += -learning_rate * db1
            self.b2 += -learning_rate * db2

            if print_flag and t%100 == 0:
                print('iteration %d / %d: loss %f' % (t, num_iters, loss))
                w1 = self.w1
                w2 = self.w2
                b1 = self.b1
                b2 = self.b2
                Z1 = np.dot(X_batch, w1) + b1
                A1 = np.maximum(0, Z1)  # ReLU function
                scores = np.dot(A1, w2) + b2
                y_pred = np.argmax(scores, axis=1)
                N = 0
                new_num =X_batch.shape[0]
                for i in range(new_num):
                    if y_batch[i] == y_pred[i]:
                        N += 1
                print(N / new_num)
        return loss_history

    def predict(self,X):
        w1 = self.w1
        w2 = self.w2
        b1 = self.b1
        b2 = self.b2
        Z1 = np.dot(X, w1) + b1
        A1 = np.maximum(0, Z1)  # ReLU function
        scores = np.dot(A1, w2) + b2
        y_pred = np.argmax(scores, axis=1)
        return y_pred

if __name__ =='__main__':
    xtrain_data = xtrain[:5000]
    ytrain_data = ytrain[:5000]
    xtest_data = xtrain[30000:30200]
    ytest_data = ytrain[30000:30200]
    xtrain_data = (xtrain_data- xtrain_data.mean())/np.sqrt(xtrain_data.var()+1e-6)
    xtest_data = (xtest_data - xtest_data.mean())/np.sqrt(xtest_data.var()+1e-6)
    model = twolayernet(input_size=3072,hidden_size=100,output_size=10,std=0.04)
    model.train(X=xtrain_data,y=ytrain_data,print_flag=True,num_iters=1600,reg=1e-4,learning_rate=1e-3,batch_size=200)
    y_predict = model.predict(xtest_data)
    num = 0
    for i in range(200):
        if y_predict[i] == ytest_data[i]:
            num +=1
    print(num/200)

The previous two assignments also used the CIFAR-10 dataset, so I simply copy-pasted my data-loading code, and once the problem showed up I took it for granted that the data loading couldn't be at fault, since otherwise the earlier assignments wouldn't have run at all.
In the earlier assignments, ytrain existed as a list of arrays, [array([...]), array([...]), ...], whereas I assumed it was a flat label vector, something like the transpose of [1, 2, 3]. Those two things are fundamentally different; I did worry about it briefly at the time but didn't take it seriously.
The difference matters in the line copy[range(num),list(y)] += -1: with the first (nested) form the fancy indexing ends up subtracting 1 all over copy, whereas with the second (flat) form it subtracts 1 only at row range(num)[i], column y[i], which is exactly what the softmax gradient needs. The snippet below reproduces the two behaviours.
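To make this concrete, here is a small standalone reproduction (my own snippet, not part of the assignment code) contrasting a flat (N,) label array with one that carries an extra axis. The exact nested shape in my run was a bit different, but the mechanism is the same: once y is not a plain 1-D vector, list(y) becomes a list of length-1 arrays, the two index arrays broadcast against each other, and far more entries of the probability matrix get decremented than intended.

import numpy as np

num, classes = 3, 4
probs = np.zeros((num, classes))

# Case 1: labels as a flat (N,) vector -- exactly one entry per row gets -1
y_flat = np.array([0, 2, 1])
a = probs.copy()
a[range(num), list(y_flat)] += -1
print(np.count_nonzero(a))   # 3 entries changed, one per row

# Case 2: labels with an extra axis, shape (N, 1) -- list(y_col) is a list of
# length-1 arrays, the row and column indices broadcast to (N, N), and the -1
# lands on every (row, labelled-column) pair instead of one entry per row
y_col = y_flat.reshape(-1, 1)
b = probs.copy()
b[range(num), list(y_col)] += -1
print(np.count_nonzero(b))   # 9 entries changed

With a full 10-class batch this smears the softmax gradient across most of the matrix, which is consistent with a loss that plateaus instead of decreasing.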
Because of this I re-read the code several times a day and still couldn't find the mistake; the loss came out at around 460 every run and never moved. I asked a classmate majoring in AI (known as 丰神), but he had no constructive suggestions either. Luckily a certain c巨 took only about two hours to dig out such a subtle bug for me.
(To be fair, it really was hard to find.) A quick sanity check on what the initial loss should look like is sketched below.
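As a rough back-of-the-envelope check (my own addition, not part of the assignment): with small random weights the softmax outputs are nearly uniform over the 10 classes, so the average initial data loss should sit near -log(1/10) ≈ 2.3 plus a tiny regularization term. A per-example loss stuck around 460 therefore points at the loss/gradient bookkeeping rather than at the learning rate or the optimizer.

import numpy as np

# expected initial data loss for a 10-way softmax with near-uniform outputs
C = 10
print(-np.log(1.0 / C))                            # about 2.302585

# the same number falls out of random, nearly-zero scores
scores = 1e-4 * np.random.randn(200, C)
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
y = np.random.randint(C, size=200)
print(-np.log(probs[np.arange(200), y]).mean())    # also close to 2.3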
What this bug taught me is that debugging can't stop at checking the algorithmic idea. When the error appeared I repeatedly checked how the network was built and whether the backpropagation derivatives were correct, which is the right thing to do, but there is another side that, as a green hand, I hadn't considered: even when the idea is right, the data structures that carry it can be wrong, so the output you actually produce can differ from the output you intended. My advice is to write more code, hit more bugs, and get comfortable with your data structures. (My fundamentals are still weak.) For the derivative-checking half of the story, a small numerical gradient-check sketch follows.
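Since the lesson above is about checking the backprop derivatives, here is a minimal centered-difference gradient check, assuming the twolayernet class from the first listing is in scope; the helper name grad_check_w1 and the toy sizes are my own. It only spot-checks dw1 on a tiny random problem (the same pattern applies to w2, b1 and b2), and, as the story shows, it cannot catch a label-shape problem in the data pipeline.

import numpy as np

def grad_check_w1(net, X, y, reg=0.0, h=1e-5, num_checks=5):
    # analytic gradient from the network's own backward pass
    _, dw1, _, _, _ = net.loss_function(X, y, reg)
    for _ in range(num_checks):
        i = np.random.randint(net.w1.shape[0])
        j = np.random.randint(net.w1.shape[1])
        old = net.w1[i, j]
        net.w1[i, j] = old + h
        loss_plus, *_ = net.loss_function(X, y, reg)
        net.w1[i, j] = old - h
        loss_minus, *_ = net.loss_function(X, y, reg)
        net.w1[i, j] = old                        # restore the weight
        numeric = (loss_plus - loss_minus) / (2 * h)
        rel_err = abs(numeric - dw1[i, j]) / max(1e-12, abs(numeric) + abs(dw1[i, j]))
        print('w1[%d,%d]: analytic %f, numeric %f, rel err %e'
              % (i, j, dw1[i, j], numeric, rel_err))

# toy usage on random data; the shapes are arbitrary and only for the check
net = twolayernet(input_size=10, hidden_size=7, output_size=3, std=0.1)
X_toy = np.random.randn(6, 10)
y_toy = np.random.randint(3, size=6)
grad_check_w1(net, X_toy, y_toy, reg=0.1)

Relative errors around 1e-7 or smaller are a good sign; values near 1e-2 or worse usually mean the backward pass itself is broken.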

Below is the version that goes forward layer by layer and then back out layer by layer (much clearer structure; I was being a dunce before).

import numpy as np
import pickle
import os

# Prepare the dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
# Initialization
dataset = []
labelset = []
file_location = 'cifar-10-batches-py'
file_name_list = os.listdir(file_location)
# Read the data and labels from each batch in turn and collect them into one array
for file_name in file_name_list:
    if file_name[0:10] == 'data_batch':
        data_batch = unpickle('cifar-10-batches-py/'+file_name)[b'data']
        label_batch = unpickle('cifar-10-batches-py/'+file_name)[b'labels']
        dataset.append(data_batch)
        labelset.append(label_batch)
# Strip the extra nesting from dataset and labelset; e.g. labelset starts as nested per-batch lists like [[...],[...],...] and is flattened into a single 1-D label vector
dataset = np.array(dataset)
xtrain = np.reshape(dataset,(50000,3072))
ytrain = np.array(labelset).reshape(-1)   # flatten the per-batch label lists into one (50000,) vector

def affine_forward(x,w,b):
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    out = np.dot(x_reshape,w)+b
    cache = (x,w,b)
    return out,cache

def affine_backword(dout,cache):
    x,w,b = cache
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    dw = np.dot(x_reshape.T,dout)
    dx = np.dot(dout,w.T).reshape(x.shape)
    db = np.sum(dout,axis=0)
    return dx,dw,db

def relu_forward(x):
    out = None
    out = np.maximum(0,x)
    cache = x
    return out,cache

def relu_backward(dout,cache):
    x = cache
    dx = dout.copy()   # copy so the upstream gradient is not modified in place
    dx[x<0] = 0
    return dx

def softmax_loss(x, y):
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

class TwoLayerNet(object):
    def __init__(self,input_dim = 3*32*32,hidden_dim = 100,num_classes = 10, weight_scale = 1e-3,reg=0.0):
        self.params = {}
        self.reg = reg
        self.params['w1'] = weight_scale*np.random.randn(input_dim,hidden_dim)
        self.params['w2'] = weight_scale*np.random.randn(hidden_dim,num_classes)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['b2'] = np.zeros(num_classes)

    def loss(self,X,y=None):
        scores = None
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        y1,cache1 = affine_forward(X,w1,b1)
        y2,cache2 = relu_forward(y1)
        y3,cache3 = affine_forward(y2,w2,b2)
        scores = y3
        if y is None:
            return scores

        loss , grads = 0 , {}
        loss , dy3 = softmax_loss(scores,y)
        reg_loss = 0.5*self.reg*(np.sum(w1*w1)+np.sum(w2*w2))
        loss += reg_loss
        dy2 , grads['w2'] , grads['b2'] = affine_backword(dy3 , cache3)
        dy1 = relu_backward(dy2,cache2)
        dx , grads['w1'] , grads['b1'] = affine_backword(dy1,cache1)
        grads['w1'] += self.reg * w1
        grads['w2'] += self.reg * w2

        return loss ,grads

    def train(self, X, y, learning_rate=1e-3, num_iters=1000,
                  batch_size=500, print_flag=False, reg=5e-6, learning_rate_decay=0.95):
            loss_history = []
            num_train = X.shape[0]
            for t in range(num_iters):
                idx_batch = np.random.choice(num_train, batch_size, replace=False)
                X_batch = X[idx_batch]
                y_batch = y[idx_batch]
                loss, grads = self.loss(X_batch, y_batch)
                loss_history.append(loss)
                self.params['w1'] += -learning_rate * grads['w1']
                self.params['w2'] += -learning_rate * grads['w2']
                self.params['b1'] += -learning_rate * grads['b1']
                self.params['b2'] += -learning_rate * grads['b2']

                if print_flag and t % 100 == 0:
                    print('iteration %d / %d: loss %f' % (t, num_iters, loss))
                    w1 = self.params['w1']
                    w2 = self.params['w2']
                    b1 = self.params['b1']
                    b2 = self.params['b2']
                    Z1 = np.dot(X_batch, w1) + b1
                    A1 = np.maximum(0, Z1) # ReLU function
                    scores = np.dot(A1, w2) + b2
                    y_pred = np.argmax(scores, axis=1)
                    N = 0
                    new_num = X_batch.shape[0]
                    for i in range(new_num):
                        if y_batch[i] == y_pred[i]:
                            N += 1
                    print(N / new_num)
            return loss_history

    def predict(self, X):
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        Z1 = np.dot(X, w1) + b1
        A1 = np.maximum(0, Z1)# ReLU function
        scores = np.dot(A1, w2) + b2
        y_pred = np.argmax(scores, axis=1)
        return y_pred

if __name__ == '__main__':
        xtrain_data = xtrain[:30000]
        ytrain_data = ytrain[:30000]
        xtest_data = xtrain[30000:30200]
        ytest_data = ytrain[30000:30200]
        model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10)
        model.train(X=xtrain_data, y=ytrain_data, print_flag=True, num_iters=1600, reg=1e-4, learning_rate=1e-4,
                    batch_size=200)
        y_predict = model.predict(xtest_data)
        num = 0
        for i in range(200):
            if y_predict[i] == ytest_data[i]:
                num += 1
        print(num / 200)

Finally, thanks again to c巨; I hope he keeps being my debugging machine from now on.
