First use of momentum in SGD (SGD_momentum)

During network training, adding momentum to SGD is an effective way to deal with two problems: near a saddle point the gradient is small, so the updates to w become very slow, and the loss can get stuck at a local extremum instead of the minimum.
The general idea is as follows:

v = p * v - learning_rate * dw
new_w = w + v

where p is a hyperparameter that you tune yourself.
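To make this concrete, here is a minimal, self-contained sketch (separate from the assignment code below; the values of p, learning_rate, and the constant gradient dw are made up for illustration) of what momentum buys you on a flat, saddle-like stretch where the gradient is small but consistent: the velocity builds up, so the weight moves roughly 1/(1-p) times farther than plain SGD in the same number of steps.

def momentum_step(w, dw, v, p=0.9, learning_rate=1e-2):
    v = p * v - learning_rate * dw   # velocity remembers past gradients
    return w + v, v                  # new_w = w + v

w_sgd, w_mom, v = 0.0, 0.0, 0.0
dw = 0.1                             # small, constant gradient (a saddle-like plateau)
for _ in range(100):
    w_sgd -= 1e-2 * dw               # plain SGD: 100 tiny steps, moves only -0.1
    w_mom, v = momentum_step(w_mom, dw, v)
print(w_sgd, w_mom)                  # momentum has moved about 9x farther (around -0.9)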

Below is the implementation. It builds on the two-layer network assignment completed earlier, and the only change is the weight-update step inside the train() function. After adding momentum, training not only converges much faster, but the final accuracy also goes up a bit.

import numpy as np
import pickle
import os

# Prepare the dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
# Initialization
dataset = []
labelset = []
file_location = 'cifar-10-batches-py'
file_name_list = os.listdir(file_location)
# Read the data and labels from each batch file in turn and collect them into one array
for file_name in file_name_list:
    if file_name[0:10] == 'data_batch':
        data_batch = unpickle('cifar-10-batches-py/'+file_name)[b'data']
        label_batch = unpickle('cifar-10-batches-py/'+file_name)[b'labels']
        dataset.append(data_batch)
        labelset.append(label_batch)
# Remove the extra nesting in dataset and labelset, e.g. labelset starts as a list of per-batch label lists and is flattened into a single array
dataset = np.array(dataset)
xtrain = np.reshape(dataset,(50000,3072))
ytrain = np.reshape(labelset,(1,-1))[0].T

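# Layer building blocks: forward/backward passes for the affine (fully connected) layer, the ReLU nonlinearity, and the softmax loss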
def affine_forward(x,w,b):
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    out = np.dot(x_reshape,w)+b
    cache = (x,w,b)
    return out,cache

def affine_backward(dout,cache):
    x,w,b = cache
    N = x.shape[0]
    x_reshape = x.reshape(N,-1)
    dw = np.dot(x_reshape.T,dout)
    dx = np.dot(dout,w.T).reshape(x.shape)
    db = np.sum(dout,axis=0)
    return dx,dw,db

def relu_forward(x):
    out = None
    out = np.maximum(0,x)
    cache = x
    return out,cache

def relu_backward(dout,cache):
    x = cache
    dx = dout.copy()  # copy so the upstream gradient is not modified in place
    dx[x<0] = 0
    return dx

def softmax_loss(x, y):
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

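# SGD with momentum: the velocity is stored in config['velocity'] and carried over
# between calls, so each parameter keeps its own running velocity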
def sgd_momentum(w, dw, config=None):
    if config is None: config = {}
    config.setdefault('learning_rate', 2e-4)
    config.setdefault('momentum', 0.09)
    v = config.get('velocity', np.zeros_like(w))

    next_w = None
    v = config['momentum'] * v - config['learning_rate'] * dw
    next_w = w + v
    config['velocity'] = v

    return next_w, config

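# Two-layer network: affine -> ReLU -> affine, scored with a softmax loss plus L2 regularization on w1 and w2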
class TwoLayerNet(object):
    def __init__(self,input_dim = 3*32*32,hidden_dim = 100,num_classes = 10, weight_scale = 1e-3,reg=0.0):
        self.params = {}
        self.reg = reg
        self.params['w1'] = weight_scale*np.random.randn(input_dim,hidden_dim)
        self.params['w2'] = weight_scale*np.random.randn(hidden_dim,num_classes)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['b2'] = np.zeros(num_classes)

    def loss(self,X,y=None):
        scores = None
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        y1,cache1 = affine_forward(X,w1,b1)
        y2,cache2 = relu_forward(y1)
        y3,cache3 = affine_forward(y2,w2,b2)
        scores = y3
        if y is None:
            return scores

        loss , grads = 0 , {}
        loss , dy3 = softmax_loss(scores,y)
        reg_loss = 0.5*self.reg*(np.sum(w1*w1)+np.sum(w2*w2))
        loss += reg_loss
        dy2 , grads['w2'] , grads['b2'] = affine_backward(dy3 , cache3)
        dy1 = relu_backward(dy2,cache2)
        dx , grads['w1'] , grads['b1'] = affine_backward(dy1,cache1)
        grads['w1'] += self.reg * w1
        grads['w2'] += self.reg * w2

        return loss ,grads

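    # Mini-batch training loop: w1 and w2 use the momentum update, b1 and b2 keep the plain SGD update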
    def train(self, X, y, learning_rate=1e-3, num_iters=1000,
                  batch_size=500, print_flag=False, reg=5e-6, learning_rate_decay=0.95):
            loss_history = []
            num_train = X.shape[0]
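            # one config dict per weight matrix so each keeps its own momentum velocity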
            config1 = {}
            config1['learning_rate'] = learning_rate
            config2 = {}
            config2['learning_rate'] = learning_rate
            for t in range(num_iters):
                idx_batch = np.random.choice(num_train, batch_size, replace=False)
                X_batch = X[idx_batch]
                y_batch = y[idx_batch]
                loss, grads = self.loss(X_batch, y_batch)
                loss_history.append(loss)
                self.params['w1'] ,config1 = sgd_momentum(self.params['w1'],grads['w1'],config1)
                self.params['w2'] ,config2 = sgd_momentum(self.params['w2'],grads['w2'],config2)
                self.params['b1'] += -learning_rate * grads['b1']
                self.params['b2'] += -learning_rate * grads['b2']

                if print_flag and t % 100 == 0:
                    print('iteration %d / %d: loss %f' % (t, num_iters, loss))
                    print(self.params['w1'][0, 0])
                    w1 = self.params['w1']
                    w2 = self.params['w2']
                    b1 = self.params['b1']
                    b2 = self.params['b2']
                    Z1 = np.dot(X_batch, w1) + b1
                    A1 = np.maximum(0, Z1) # ReLU function
                    scores = np.dot(A1, w2) + b2
                    y_pred = np.argmax(scores, axis=1)
                    N = 0
                    new_num = X_batch.shape[0]
                    for i in range(new_num):
                        if y_batch[i] == y_pred[i]:
                            N += 1
                    print(N / new_num)
                    learning_rate *= learning_rate_decay  # decay the learning rate every 100 iterations (only the bias updates use this variable)
            return loss_history

    def predict(self, X):
        w1 = self.params['w1']
        w2 = self.params['w2']
        b1 = self.params['b1']
        b2 = self.params['b2']
        Z1 = np.dot(X, w1) + b1
        A1 = np.maximum(0, Z1)# ReLU function
        scores = np.dot(A1, w2) + b2
        y_pred = np.argmax(scores, axis=1)
        return y_pred

if __name__ == '__main__':
        xtrain_data = xtrain[:30000]
        ytrain_data = ytrain[:30000]
        xtest_data = xtrain[30000:30200]
        ytest_data = ytrain[30000:30200]
        model = TwoLayerNet(input_dim=3072, hidden_dim=100, num_classes=10)
        model.train(X=xtrain_data, y=ytrain_data, print_flag=True, num_iters=4000, reg=1e-1, learning_rate=2e-4,
                    batch_size=200)
        y_predict = model.predict(xtest_data)
        num = 0
        for i in range(200):
            if y_predict[i] == ytest_data[i]:
                num += 1
        print(num / 200)