cs231n assignment: implementing a two-layer network, and the problems I hit along the way

The previous two or three assignments went smoothly, no storms at sea, nothing I couldn't handle, but this one completely wrecked my sanity. Without further ado, here's the code.
This is the all-in-one version, with the fully-connected layers, ReLU, and softmax all mashed together.

import numpy as np
import pickle
import os


# Prepare the dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
# Initialization
dataset = []
labelset = []
file_location = 'cifar-10-batches-py'
file_name_list = os.listdir(file_location)
# Read the data and labels from each batch in turn and collect them into one array
for file_name in file_name_list:
    if file_name[0:10] == 'data_batch':
        data_batch = unpickle('cifar-10-batches-py/'+file_name)[b'data']
        label_batch = unpickle('cifar-10-batches-py/'+file_name)[b'labels']
        dataset.append(data_batch)
        labelset.append(label_batch)
# Strip the extra nesting from dataset and labelset; e.g. labelset starts as nested per-batch lists like [[...],[...],...] and is flattened into a single 1-D label vector
dataset = np.array(dataset)
xtrain = np.reshape(dataset,(50000,3072))
ytrain = np.array(labelset).reshape(-1,1).squeeze()


class twolayernet():
    def __init__(self,input_size,hidden_size,output_size,std=1e-4):
        self.w1 = np.random.randn(input_size,hidden_size)*std
        self.w2 = np.random.randn(hidden_size,output_size)*std
        self.b1 = np.zeros(hidden_size)
        self.b2 = np.zeros(output_size)

    def loss_function(self,X,y,reg):
        num,dim = X.shape
        loss = None
        w1 = self.w1
        w2 = self.w2
        b1 = self.b1
        b2 = self.b2
        Z = np.dot(X,w1) + b1
        A = np.maximum(0,Z)
        F = np.dot(A,w2) + b2
        softmax = F-np.max(F,axis=1).reshape(-1,1)
        F_softmax = np.exp(softmax)/np.sum(np.exp(softmax),axis=1).reshape(-1,1)
        loss = -np.sum(np.log(F_softmax[range(num),list(y)]))
        loss =  loss/num
        loss += 0.5*reg*(np.sum(w1*w1)+np.sum(w2*w2))

        # Core part: compute the gradients dw1 and dw2
        copy = F_softmax.copy()
        copy[range(num),list(y)] += -1
        copy /= num
        dw2 = np.dot(A.T,copy) + reg*w2
        db2 = np.sum(copy,axis=0)
        dA = np.dot(copy,w2.T)
        dZ = dA*(Z>0)   # ReLU backward: mask by the forward pre-activation Z, not by dA
        dw1 = np.dot(X.T,dZ) + reg*w1
        db1 = np.sum(dZ,axis=0)
        return loss,dw1,dw2,db1,db2

    def train(self, X, y, learning_rate=1e-3, num_iters=1000,
              batch_size=500, print_flag=False, reg=5e-6,learning_rate_decay=0.95):
        loss_history = []
        num_train = X.shape[0]
        for t in range(num_iters):
            idx_batch = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X[idx_batch]
            y_batch = y[idx_batch]
            loss,dw1,dw2,db1,db2 = self.loss_function(X_batch, y_batch,reg)
            loss_history.append(loss)
            self.w1 += -learning_rate * dw1
            self.w2 += -learning_rate * dw2
            self.b1 += -learning_rate * db1
            self.b2 += -learning_rate * db2

            if print_flag and t%100 == 0:
                print('iteration %d / %d: loss %f' % (t, num_iters, loss))
                w1 = self.w1
                w2 = self.w2
                b1 = self.b1
                b2 = self.b2
                Z1 = np.dot(X_batch, w1) + b1
                A1 = np.maximum(0, Z1)  # ReLU function
                scores = np.dot(A1, w2) + b2
                y_pred = np.argmax(scores, axis=1)
                N = 0
                new_num =X_batch.shape[0]
                for i in range(new_num):
                    if y_batch[i] == y_pred[i]:
                        N += 1
                print(N / new_num)
        return loss_history

    def predict(self,X):
        w1 = self.w1
        w2 = self.w2
        b1 = self.b1
        b2 = self.b2
        Z1 = np.dot(X, w1) + b1
        A1 = np.maximum(0, Z1)  # ReLU function
        scores = np.dot(A1, w2) + b2
        y_pred = np.argmax(scores, axis=1)
        return y_pred

if __name__ =='__main__':
    xtrain_data = xtrain[:5000]
    ytrain_data = ytrain[:5000]
    xtest_data = xtrain[30000:30200]
    ytest_data = ytrain[30000:30200]
    xtrain_data = (xtrain_data- xtrain_data.mean())/np.sqrt(xtrain_data.var()+1e-6)
    xtest_data = (xtest_data - xtest_data.mean())/np.sqrt(xtest_data.var()+1e-6)
    model = twolayernet(input_size=3072,hidden_size=100,output_size=10,std=0.04)
    model.train(X=xtrain_data,y=ytrain_data,print_flag=True,num_iters=1600,reg=1e-4,learning_rate=1e-3,batch_size=200)
    y_predict = model.predict(xtest_data)
    num = 0
    for i in range(200):
        if y_predict[i] == ytest_data[i]:
            num +=1
    print(num/200)

The previous two assignments also used the CIFAR-10 dataset, so I simply copy-pasted my data-loading code, and once the problem showed up I took it for granted that the data loading couldn't be at fault, since otherwise the earlier assignments wouldn't have run at all.
In the earlier assignments, ytrain existed as a list of arrays, [array([...]), array([...]), ...], whereas I assumed it was a flat label vector, something like the transpose of [1, 2, 3]. Those two things are fundamentally different; I did worry about it briefly at the time but didn't take it seriously.
The difference matters in the line copy[range(num),list(y)] += -1: with the first (nested) form the fancy indexing ends up subtracting 1 all over copy, whereas with the second (flat) form it subtracts 1 only at row range(num)[i], column y[i], which is exactly what the softmax gradient needs. The snippet below reproduces the two behaviours.
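To make this concrete, here is a small standalone reproduction (my own snippet, not part of the assignment code) contrasting a flat (N,) label array with one that carries an extra axis. The exact nested shape in my run was a bit different, but the mechanism is the same: once y is not a plain 1-D vector, list(y) becomes a list of length-1 arrays, the two index arrays broadcast against each other, and far more entries of the probability matrix get decremented than intended.

import numpy as np

num, classes = 3, 4
probs = np.zeros((num, classes))

# Case 1: labels as a flat (N,) vector -- exactly one entry per row gets -1
y_flat = np.array([0, 2, 1])
a = probs.copy()
a[range(num), list(y_flat)] += -1
print(np.count_nonzero(a))   # 3 entries changed, one per row

# Case 2: labels with an extra axis, shape (N, 1) -- list(y_col) is a list of
# length-1 arrays, the row and column indices broadcast to (N, N), and the -1
# lands on every (row, labelled-column) pair instead of one entry per row
y_col = y_flat.reshape(-1, 1)
b = probs.copy()
b[range(num), list(y_col)] += -1
print(np.count_nonzero(b))   # 9 entries changed

With a full 10-class batch this smears the softmax gradient across most of the matrix, which is consistent with a loss that plateaus instead of decreasing.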
Because of this I re-read the code several times a day and still couldn't find the mistake; the loss came out at around 460 every run and never moved. I asked a classmate majoring in AI (known as 丰神), but he had no constructive suggestions either. Luckily a certain c巨 took only about two hours to dig out such a subtle bug for me.
(To be fair, it really was hard to find.) A quick sanity check on what the initial loss should look like is sketched below.
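As a rough back-of-the-envelope check (my own addition, not part of the assignment): with small random weights the softmax outputs are nearly uniform over the 10 classes, so the average initial data loss should sit near -log(1/10) ≈ 2.3 plus a tiny regularization term. A per-example loss stuck around 460 therefore points at the loss/gradient bookkeeping rather than at the learning rate or the optimizer.

import numpy as np

# expected initial data loss for a 10-way softmax with near-uniform outputs
C = 10
print(-np.log(1.0 / C))                            # about 2.302585

# the same number falls out of random, nearly-zero scores
scores = 1e-4 * np.random.randn(200, C)
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
y = np.random.randint(C, size=200)
print(-np.log(probs[np.arange(200), y]).mean())    # also close to 2.3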
What this bug taught me is that debugging can't stop at checking the algorithmic idea. When the error appeared I repeatedly checked how the network was built and whether the backpropagation derivatives were correct, which is the right thing to do, but there is another side that, as a green hand, I hadn't considered: even when the idea is right, the data structures that carry it can be wrong, so the output you actually produce can differ from the output you intended. My advice is to write more code, hit more bugs, and get comfortable with your data structures. (My fundamentals are still weak.) For the derivative-checking half of the story, a small numerical gradient-check sketch follows.
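Since the lesson above is about checking the backprop derivatives, here is a minimal centered-difference gradient check, assuming the twolayernet class from the first listing is in scope; the helper name grad_check_w1 and the toy sizes are my own. It only spot-checks dw1 on a tiny random problem (the same pattern applies to w2, b1 and b2), and, as the story shows, it cannot catch a label-shape problem in the data pipeline.

import numpy as np

def grad_check_w1(net, X, y, reg=0.0, h=1e-5, num_checks=5):
    # analytic gradient from the network's own backward pass
    _, dw1, _, _, _ = net.loss_function(X, y, reg)
    for _ in range(num_checks):
        i = np.random.randint(net.w1.shape[0])
        j = np.random.randint(net.w1.shape[1])
        old = net.w1[i, j]
        net.w1[i, j] = old + h
        loss_plus, *_ = net.loss_function(X, y, reg)
        net.w1[i, j] = old - h
        loss_minus, *_ = net.loss_function(X, y, reg)
        net.w1[i, j] = old                        # restore the weight
        numeric = (loss_plus - loss_minus) / (2 * h)
        rel_err = abs(numeric - dw1[i, j]) / max(1e-12, abs(numeric) + abs(dw1[i, j]))
        print('w1[%d,%d]: analytic %f, numeric %f, rel err %e'
              % (i, j, dw1[i, j], numeric, rel_err))

# toy usage on random data; the shapes are arbitrary and only for the check
net = twolayernet(input_size=10, hidden_size=7, output_size=3, std=0.1)
X_toy = np.random.randn(6, 10)
y_toy = np.random.randint(3, size=6)
grad_check_w1(net, X_toy, y_toy, reg=0.1)

Relative errors around 1e-7 or smaller are a good sign; values near 1e-2 or worse usually mean the backward pass itself is broken.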

Below is the version that goes forward layer by layer and then back out layer by layer (much clearer structure; I was being a dunce before).

import numpy as np
import pickle
import os

# Prepare the dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
# Initialization
dataset = []
labelset = []
file_location = 'cifar-10-batches-py'
file_name_list = os.listdir(file_location)
# Read the data and labels from each batch in turn and collect them into one array
for file_name in file_name_list:
    if file_name[0:10] == 'data_batch':
        data_batch = unpickle('cifar-10-batches-py/'+file_name)[b'data']
        label_batch = unpickle('cifar-10-batches-py/'+file_name)[b'labels']
        dataset.append(data_batch)
        labelset.append(label_batch)
# Strip the extra nesting from dataset and labelset; e.g. labelset starts as nested per-batch lists like [[...],[...],...] and is flattened into a single 1-D label vector
dataset = np.array(dataset)
xtrain = np.reshape(dataset,(50000,3072))
ytrain = np.array(labelset).reshape(-1)   # flatten the per-batch label lists into one (50000,) vector

def affine_forward(x,w,b):
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    out = np.dot(x_reshape,w)+b
    cache = (x,w,b)
    return out,cache

def affine_backword(dout,cache):
    x,w,b = cache
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    dw = np.dot(x_reshape.T,dout)
    dx = np.dot(dout,w.T).reshape(x.shape)
    db = np.sum(dout,axis=0)
    return dx,dw,db

def relu_forward(x):
    out = None
    out = np.maximum(0,x)
    cache = x
    return out,cache

def relu_backward(dout,cache):
    x = cache
    dx = dout.copy()   # copy so the upstream gradient is not modified in place
    dx[x<0] = 0
    return dx

def softmax_loss(x, y):
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

class TwoLayerNet(object):
    def __init__(self,input_dim = 3*32*32,hidden_dim = 100,num_classes = 10, weight_scale = 1e-3,reg=0.0):
        self.params = {}
        self.reg = reg
        self.params['w1'] = weight_scale*np.random.randn(input_dim,hidden_dim)
        self.params['w2'] = weight_scale*np.random.randn(hidden_dim,num_classes)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['b2'] = np.zeros(num_classes)

    def loss(self,X,y=None):
        scores = None
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        y1,cache1 = affine_forward(X,w1,b1)
        y2,cache2 = relu_forward(y1)
        y3,cache3 = affine_forward(y2,w2,b2)
        scores = y3
        if y is None:
            return scores

        loss , grads = 0 , {}
        loss , dy3 = softmax_loss(scores,y)
        reg_loss = 0.5*self.reg*(np.sum(w1*w1)+np.sum(w2*w2))
        loss += reg_loss
        dy2 , grads['w2'] , grads['b2'] = affine_backword(dy3 , cache3)
        dy1 = relu_backward(dy2,cache2)
        dx , grads['w1'] , grads['b1'] = affine_backword(dy1,cache1)
        grads['w1'] += self.reg * w1
        grads['w2'] += self.reg * w2

        return loss ,grads

    def train(self, X, y, learning_rate=1e-3, num_iters=1000,
                  batch_size=500, print_flag=False, reg=5e-6, learning_rate_decay=0.95):
            loss_history = []
            num_train = X.shape[0]
            for t in range(num_iters):
                idx_batch = np.random.choice(num_train, batch_size, replace=False)
                X_batch = X[idx_batch]
                y_batch = y[idx_batch]
                loss, grads = self.loss(X_batch, y_batch)
                loss_history.append(loss)
                self.params['w1'] += -learning_rate * grads['w1']
                self.params['w2'] += -learning_rate * grads['w2']
                self.params['b1'] += -learning_rate * grads['b1']
                self.params['b2'] += -learning_rate * grads['b2']

                if print_flag and t % 100 == 0:
                    print('iteration %d / %d: loss %f' % (t, num_iters, loss))
                    w1 = self.params['w1']
                    w2 = self.params['w2']
                    b1 = self.params['b1']
                    b2 = self.params['b2']
                    Z1 = np.dot(X_batch, w1) + b1
                    A1 = np.maximum(0, Z1) # ReLU function
                    scores = np.dot(A1, w2) + b2
                    y_pred = np.argmax(scores, axis=1)
                    N = 0
                    new_num = X_batch.shape[0]
                    for i in range(new_num):
                        if y_batch[i] == y_pred[i]:
                            N += 1
                    print(N / new_num)
            return loss_history

    def predict(self, X):
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        Z1 = np.dot(X, w1) + b1
        A1 = np.maximum(0, Z1)# ReLU function
        scores = np.dot(A1, w2) + b2
        y_pred = np.argmax(scores, axis=1)
        return y_pred

if __name__ == '__main__':
        xtrain_data = xtrain[:30000]
        ytrain_data = ytrain[:30000]
        xtest_data = xtrain[30000:30200]
        ytest_data = ytrain[30000:30200]
        model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10)
        model.train(X=xtrain_data, y=ytrain_data, print_flag=True, num_iters=1600, reg=1e-4, learning_rate=1e-4,
                    batch_size=200)
        y_predict = model.predict(xtest_data)
        num = 0
        for i in range(200):
            if y_predict[i] == ytest_data[i]:
                num += 1
        print(num / 200)

Finally, thanks again to c巨; I hope he keeps being my debugging machine from now on.
