Deep Learning Notes (2): Parameter Norm Penalties L1 and L2

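In short, the two penalties implemented below add the following terms to the training loss $J$ (my own summary of the formulas given in the class docstrings; only the weight matrices $W$ are penalized, never the biases $b$):

$$J_{L1} = J + \lambda \sum_i |w_i|, \qquad \nabla_w = \lambda\,\mathrm{sign}(w)$$

$$J_{L2} = J + \tfrac{\lambda}{2}\, w^\top w, \qquad \nabla_w = \lambda\, w$$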

Custom L1 and L2 regularizer classes

# -*- coding: utf-8 -*- 
# author: 
# time : 2021/4/28 13:04
# task: custom regularizer classes.
""" 导入包"""
from abc import ABC, abstractmethod
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import math
import sys
import os
import time
import re
import progressbar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



class RegularizerBase(ABC):
    """ 基础类,init, loss, grad 两种方法,
    分别在反向传播的时候使用grad 进行使用,
    在计算loss的进行统计正则贡献。"""
    def __init__(self):
        super().__init__()

    @abstractmethod
    def loss(self, **kwargs):
        raise NotImplementedError
    @abstractmethod
    def grad(self, **kwargs):
        raise NotImplementedError

""" 正则l1,l2 类,都是集成基础类"""
class L1_Regularizer(RegularizerBase):
    """
    L1 regularization. Formula: j = j + lambd * |w|
    The only hyperparameter is lambd.
    """
    def __init__(self, lambd=0.001):
        super().__init__()
        self.lambd = lambd

    # Loss contribution of the penalty
    def loss(self, params):
        loss = 0
        pattern = re.compile(r"^W\d+")  # match keys like W1, W2, W3, ...; biases b are not regularized
        for key, val in params.items():
            if pattern.match(key):
                loss += np.sum(np.abs(val)) * self.lambd
        return loss

    # Gradient of the penalty w.r.t. W
    def grad(self, W):  # formula: lambd * sign(W)
        grad = self.lambd * np.sign(W)
        return grad

class L2_Regularizer(RegularizerBase):
    """
    L2 regularization. Formula: j = j + 0.5 * lambd * w.T @ w
    """
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd

    def loss(self, params):
        loss = 0
        pattern = re.compile(r"^W\d+")  # only weight matrices are penalized, same convention as L1
        for key, val in params.items():
            if pattern.match(key):
                loss += 0.5 * np.sum(np.square(val)) * self.lambd
                # np.square is the element-wise square (np.sqrt is the square root)
                # equivalent: loss += 0.5 * np.sum(val**2) * self.lambd
        return loss

    def grad(self, W):
        grad = self.lambd * W
        return grad

class RegularizerInitializer(object):
    """ """

    def __init__(self, regular_name = "l2"):
        self.regular_name = regular_name

    def __call__(self):
        r = r"([a-zA-Z]*)=([^,)]*)"
        regular_str = self.regular_name.lower()  # e.g. "l2(lambd=0.01)"
        # extract keyword arguments from the spec string
        kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, regular_str)])


        if "l1" in regular_str:
            regular = L1_Regularizer(**kwargs)
        elif "l2" in regular_str:
            regular = L2_Regularizer(**kwargs)
        else:
            raise ValueError(f"Unrecognized regular: {regular_str}")
        return regular  # return a regularizer instance
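
A quick sanity check of the spec parser (a minimal sketch; it only assumes the classes defined above and numpy):

# minimal usage sketch for RegularizerInitializer
reg = RegularizerInitializer("l2(lambd=0.01)")()   # parses lambd=0.01 and returns an L2_Regularizer
params = {"W1": np.ones((2, 2)), "b1": np.zeros((1, 2))}
print(reg.loss(params))        # 0.5 * 0.01 * 4 = 0.02; b1 is ignored
print(reg.grad(params["W1"]))  # 0.01 * W1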



Building the regularized fully connected layer

from abc import ABC, abstractmethod
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import math
import sys
import os
import time
import re
import progressbar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import the required classes from the earlier notes:
from utils import *
from activation import *


# import the regularizer classes
from Regular_Class import *

class FullyConnected_Regular(LayerBase):
    """ 需要一些参数,比如in,out....forward, backward, update, flash_gradient, _init_params"""
    def __init__(self, n_out, acti_fn, init_w, optimizer=None):
        super(FullyConnected_Regular, self).__init__(optimizer)  # pass the optimizer to LayerBase for initialization
        self.n_out = n_out
        self.acti_fn = ActivationInitializer(acti_fn)()
        self.init_w = init_w
        self.init_weights = WeightInitializer(mode=init_w)
        self.n_in = None
        self.is_initialized = False

    def _init_params(self,**kwargs):
        b = np.zeros((1, self.n_out))
        W = self.init_weights((self.n_in, self.n_out))
        self.params = {"W" : W, "b": b}
        self.gradients = {"W" : np.zeros_like(W), "b": np.zeros_like(b)}
        self.is_initialized = True

    def forward(self,X, retain_derived=True):
        """ 全连接网络的前向传播"""
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()
        W = self.params["W"]
        b = self.params["b"]

        z = X@W + b
        a = self.acti_fn.forward(z)
        # cache the input for the backward pass
        if retain_derived:
            self.X.append(X)
        return a

    def backward(self, dLda, retain=True, regular = None):
        """ 有正则项的反向传播"""
        if not isinstance(dLda, list):
            dLda = [dLda]

        dX = []
        X = self.X  # each cached x: (batch_size, n_in); each da: (batch_size, n_out)
        for da, x in zip(dLda, X):
            dx, dw, db = self._bwd(da, x, regular)
            dX.append(dx)
            if retain:
                self.gradients["W"] += dw
                self.gradients["b"] += db
        return dX[0] if len(X) == 1 else dX

    def _bwd(self,dLda,X,regular):
        W = self.params["W"]
        b = self.params["b"]
        z = X@W +b

        dz = dLda * self.acti_fn.grad(z)   # a = acti_fn(z)
        dX = dz @ W.T
        dW = X.T @ dz
        db = dz.sum(axis=0, keepdims=True)

        # add the penalty gradient if a regularizer is given
        if regular is not None:
            # n = X.shape[0]
            dW_norm = regular.grad(W)
            dW += dW_norm
        return dX,dW,db

    def hyperparams(self):
        return {
            "layer": "Fully_connected_Regularizer",
            "init_w": self.init_w,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "acti_fn": str(self.acti_fn),
            "optimizer": {
                "hyperparams": self.optimizer.hyperparams,
            },
            "components": {
                k: v for k, v in self.params.items()
            }
        }
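
To see what _bwd adds when a regularizer is passed in, here is a tiny stand-alone sketch with made-up numbers; it uses only the regularizer classes from above, not the full layer:

# toy sketch: how the penalty gradient is added to dW (numbers are illustrative only)
W = np.array([[0.5, -2.0], [0.0, 1.0]])
dW_data = np.zeros_like(W)                      # pretend the data gradient is zero, for clarity
l1, l2 = L1_Regularizer(lambd=0.1), L2_Regularizer(lambd=0.1)
print(dW_data + l1.grad(W))  # 0.1 * sign(W) -> [[ 0.1 -0.1] [ 0.   0.1]]
print(dW_data + l2.grad(W))  # 0.1 * W       -> [[ 0.05 -0.2] [ 0.   0.1]]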

A two-layer fully connected model with regularization

def minibatch(x, batchsize=256, shuffle=True):
    N = x.shape[0]
    idx = np.arange(N)
    n_batches = int(np.ceil(N/batchsize))

    if shuffle:
        np.random.shuffle(idx)
    def mb_generator():
        for i in range(n_batches):
            yield idx[i*batchsize:(i+1)*batchsize]
    return mb_generator(), n_batches  # return an index generator and the number of batches
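
What minibatch yields, on a toy array (each yielded item is an array of row indices into x):

# toy usage of the minibatch helper
x = np.arange(10).reshape(5, 2)
gen, n_batches = minibatch(x, batchsize=2, shuffle=False)
for idx in gen:
    print(idx)        # [0 1], then [2 3], then [4]
print(n_batches)      # 3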

""" 双层全连接层的模型"""
class DFN(object):
    def __init__(self,
                 hidden_dims_1 = None,
                 hidden_dims_2 = None,
                 optimizer = "sgd(lr=0.1)",
                 init_w = "std_normal",
                 regular_act=None,
                 loss = CrossEntropy()):
        self.optimizer = optimizer
        self.hidden_dims_1 = hidden_dims_1
        self.hidden_dims_2 = hidden_dims_2
        self.loss =loss
        self.regular = None
        self.regular_act = regular_act
        self.is_initialized = False
        self.init_w = init_w


    def _set_params(self):
        """ 模型初始化: FC1-> sigmoid -> FC2 -> softmax"""
        self.layers = OrderedDict()
        self.layers["FC1"] = FullyConnected_Regular(n_out=self.hidden_dims_1,
                                                    acti_fn="sigmoid",
                                                    init_w=self.init_w,
                                                    optimizer=self.optimizer)
        self.layers["FC2"] = FullyConnected_Regular(n_out=self.hidden_dims_2,
                                                    acti_fn="affine(slope=1.,intercept=0)", #slope=(.*),intercept=(.*)
                                                    init_w=self.init_w,
                                                    optimizer=self.optimizer)
        self.layers["Softmax"] = Softmax(optimizer=self.optimizer)   # 需要改变参数数量加上一个regular



        if self.regular_act is not None:
            self.regular = RegularizerInitializer(self.regular_act)()  # regularizer instance
        self.is_initialized = True

    def forward(self,X):
        Xs = {}
        out = X
        for k,v in self.layers.items():
            Xs[k] = out
            out = v.forward(out)
        return out, Xs

    def backward(self,grad):
        dXs = {}
        out = grad
        for k, v in reversed(list(self.layers.items())):
            dXs[k] = out
            out = v.backward(out, regular=self.regular)
        return out, dXs

    def update(self):
        """ 参数更新"""
        for k,v in reversed(list(self.layers.items())):
            v.update()
        self.flush_gradients()

    def flush_gradients(self, curr_loss=None):
        for k,v in self.layers.items():
            v.flush_gradients()

    def fit(self, X_train, y_train, n_epochs=20, batch_size=64, verbose=False):
        """
        Train the model with mini-batch gradient descent.
        :param X_train: training inputs, shape (N, n_features)
        :param y_train: one-hot training labels
        :param n_epochs: number of passes over the training set
        :param batch_size: mini-batch size
        :param verbose: if True, print the loss for every batch
        """

        self.verbose = verbose
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        if not self.is_initialized:
            self.n_features = X_train.shape[1]
            self._set_params()

        prev_loss = np.inf

        # softmax = Softmax()

        for i in range(n_epochs):
            loss, epoch_start = 0.0, time.time()
            batch_generator, n_batch = minibatch(X_train, self.batch_size, shuffle=True)
            # batch_generator yields index arrays into X_train
            for j, batch_idx in enumerate(batch_generator):
                batch_len, batch_start = len(batch_idx), time.time()
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                out, _ = self.forward(X_batch)  # forward pass
                # y_pred_batch = softmax(out)
                batch_loss = self.loss(y_batch, out)

                if self.regular is not None:
                    for _,layerparams in self.hyperparams["components"].items():
                        assert type(layerparams) is dict
                        batch_loss += self.regular.loss(layerparams)

                grad = self.loss.grad(y_batch, out)
                _, _ = self.backward(grad)  # backward pass to compute gradients
                self.update()
                loss += batch_loss
                if self.verbose:
                    fstr = f"\t [Batch {j+1}/{n_batch} Train loss :{batch_loss:.3f} ({(time.time() - batch_start):.1f}s/batch) ]"
                    print(fstr)

            loss /= n_batch
            fstr2 = f"[Epoch {i+1}/{n_epochs} avg.loss :{loss:.3f}, Delta:{(prev_loss-loss):.3f} ({(time.time() - epoch_start):.1f}s/epoch)]"
            print(fstr2)
            prev_loss = loss

    def evaluate(self, X_test, y_test, batch_size=128):
        acc = 0.0
        batch_generator, n_batch = minibatch(X_test, batchsize=batch_size, shuffle=True)
        for j, batch_idx in enumerate(batch_generator):
            batch_len, batch_start = len(batch_idx), time.time()
            X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
            out,_ = self.forward(X_batch)
            y_pred = np.argmax(out, axis=1)       # predicted class index
            y_batch = np.argmax(y_batch, axis=1)  # one-hot -> class index
            acc += np.sum(y_pred==y_batch)
        return acc / X_test.shape[0]



    @property
    def hyperparams(self):
        return {
            "init_w": self.init_w,
            "loss": str(self.loss),
            "optimizer": self.optimizer,
            "regular": str(self.regular_act),
            "hidden_dims_1": self.hidden_dims_1,
            "hidden_dims_2": self.hidden_dims_2,
            "components": {k: v.params for k, v in self.layers.items()}
        }

Training experiment:

""" Training test."""
def load_data(path="../data/mnist/mnist.npz"):
    f = np.load(path)
    X_train,y_train = f["x_train"], f["y_train"]
    X_test, y_test = f["x_test"], f["y_test"]
    f.close()
    return (X_train,y_train),(X_test,y_test)

(X_train, y_train), (X_test, y_test) = load_data()
y_train = np.eye(10)[y_train.astype(int)]
y_test = np.eye(10)[y_test.astype(int)]
X_train = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2]).astype('float32')
X_test = X_test.reshape(-1, X_test.shape[1]*X_test.shape[2]).astype('float32')

print(X_train.shape, y_train.shape)
N = 20000  # use 20,000 samples for training
indices = np.random.permutation(range(X_train.shape[0]))[:N]
X_train, y_train = X_train[indices], y_train[indices]
print(X_train.shape, y_train.shape)
X_train /= 255
X_train = (X_train - 0.5) * 2
X_test /= 255
X_test = (X_test - 0.5) * 2

# without regularization
model = DFN(hidden_dims_1=200, hidden_dims_2=10)
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("without regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))
(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.284, Delta:inf (1.9s/epoch)]
[Epoch 2/20 avg.loss :2.181, Delta:0.103 (1.9s/epoch)]
[Epoch 3/20 avg.loss :1.827, Delta:0.354 (1.9s/epoch)]
[Epoch 4/20 avg.loss :1.338, Delta:0.489 (1.8s/epoch)]
[Epoch 5/20 avg.loss :0.935, Delta:0.403 (1.9s/epoch)]
[Epoch 6/20 avg.loss :0.704, Delta:0.231 (1.9s/epoch)]
[Epoch 7/20 avg.loss :0.578, Delta:0.126 (1.9s/epoch)]
[Epoch 8/20 avg.loss :0.509, Delta:0.070 (1.9s/epoch)]
[Epoch 9/20 avg.loss :0.464, Delta:0.045 (1.9s/epoch)]
[Epoch 10/20 avg.loss :0.434, Delta:0.030 (1.9s/epoch)]
[Epoch 11/20 avg.loss :0.411, Delta:0.023 (1.9s/epoch)]
[Epoch 12/20 avg.loss :0.393, Delta:0.018 (1.9s/epoch)]
[Epoch 13/20 avg.loss :0.380, Delta:0.014 (1.9s/epoch)]
[Epoch 14/20 avg.loss :0.368, Delta:0.011 (1.9s/epoch)]
[Epoch 15/20 avg.loss :0.357, Delta:0.011 (1.9s/epoch)]
[Epoch 16/20 avg.loss :0.348, Delta:0.009 (1.9s/epoch)]
[Epoch 17/20 avg.loss :0.341, Delta:0.008 (1.9s/epoch)]
[Epoch 18/20 avg.loss :0.335, Delta:0.006 (1.9s/epoch)]
[Epoch 19/20 avg.loss :0.328, Delta:0.007 (1.9s/epoch)]
[Epoch 20/20 avg.loss :0.322, Delta:0.006 (1.9s/epoch)]
without regularization -- accuracy:0.9188

With L2 regularization:

model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l2(lambd=0.01)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L2 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))



(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.290, Delta:inf (2.0s/epoch)]
[Epoch 2/20 avg.loss :2.259, Delta:0.031 (2.0s/epoch)]
[Epoch 3/20 avg.loss :2.173, Delta:0.086 (2.0s/epoch)]
[Epoch 4/20 avg.loss :1.971, Delta:0.201 (2.1s/epoch)]
[Epoch 5/20 avg.loss :1.767, Delta:0.205 (2.0s/epoch)]
[Epoch 6/20 avg.loss :1.570, Delta:0.197 (2.0s/epoch)]
[Epoch 7/20 avg.loss :1.398, Delta:0.172 (2.0s/epoch)]
[Epoch 8/20 avg.loss :1.280, Delta:0.118 (2.4s/epoch)]
[Epoch 9/20 avg.loss :1.191, Delta:0.089 (2.1s/epoch)]
[Epoch 10/20 avg.loss :1.115, Delta:0.076 (2.1s/epoch)]
[Epoch 11/20 avg.loss :1.058, Delta:0.057 (2.2s/epoch)]
[Epoch 12/20 avg.loss :1.016, Delta:0.042 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.974, Delta:0.042 (2.2s/epoch)]
[Epoch 14/20 avg.loss :0.936, Delta:0.038 (2.1s/epoch)]
[Epoch 15/20 avg.loss :0.901, Delta:0.034 (2.1s/epoch)]
[Epoch 16/20 avg.loss :0.876, Delta:0.025 (2.0s/epoch)]
[Epoch 17/20 avg.loss :0.857, Delta:0.019 (2.1s/epoch)]
[Epoch 18/20 avg.loss :0.844, Delta:0.013 (2.1s/epoch)]
[Epoch 19/20 avg.loss :0.834, Delta:0.010 (2.1s/epoch)]
[Epoch 20/20 avg.loss :0.826, Delta:0.009 (2.0s/epoch)]
with L2 regularization -- accuracy:0.8514


With L1 regularization:

model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l1(lambd=0.0001)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L1 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))

[Epoch 1/20 avg.loss :2.288, Delta:inf (2.4s/epoch)]
[Epoch 2/20 avg.loss :2.261, Delta:0.027 (2.3s/epoch)]
[Epoch 3/20 avg.loss :2.197, Delta:0.063 (2.3s/epoch)]
[Epoch 4/20 avg.loss :2.002, Delta:0.195 (2.3s/epoch)]
[Epoch 5/20 avg.loss :1.750, Delta:0.251 (2.3s/epoch)]
[Epoch 6/20 avg.loss :1.494, Delta:0.256 (2.3s/epoch)]
[Epoch 7/20 avg.loss :1.222, Delta:0.272 (2.3s/epoch)]
[Epoch 8/20 avg.loss :1.002, Delta:0.219 (2.3s/epoch)]
[Epoch 9/20 avg.loss :0.848, Delta:0.155 (2.3s/epoch)]
[Epoch 10/20 avg.loss :0.729, Delta:0.118 (2.3s/epoch)]
[Epoch 11/20 avg.loss :0.641, Delta:0.088 (2.3s/epoch)]
[Epoch 12/20 avg.loss :0.582, Delta:0.059 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.541, Delta:0.041 (2.3s/epoch)]
[Epoch 14/20 avg.loss :0.512, Delta:0.029 (2.3s/epoch)]
[Epoch 15/20 avg.loss :0.491, Delta:0.021 (2.3s/epoch)]
[Epoch 16/20 avg.loss :0.475, Delta:0.016 (2.3s/epoch)]
[Epoch 17/20 avg.loss :0.462, Delta:0.013 (2.3s/epoch)]
[Epoch 18/20 avg.loss :0.452, Delta:0.010 (2.3s/epoch)]
[Epoch 19/20 avg.loss :0.442, Delta:0.010 (2.3s/epoch)]
[Epoch 20/20 avg.loss :0.435, Delta:0.007 (2.3s/epoch)]
with L1 regularization -- accuracy:0.8947

As the runs show, the lambd for L1 has to be chosen more carefully than for L2, because L1 sparsifies the weight matrix: once a weight is driven to zero, that parameter effectively stops being updated.
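
A small illustration of this sparsifying effect (a toy sketch that applies only the L1 penalty gradient from L1_Regularizer above, with no data term; not the real training loop):

# toy sketch: repeated L1 penalty steps drive small weights exactly to zero
np.random.seed(0)
W = np.random.randn(3, 3) * 0.1
reg, lr = L1_Regularizer(lambd=0.05), 0.1
for _ in range(50):
    step = lr * reg.grad(W)                                   # lr * lambd * sign(W)
    W = np.where(np.abs(W) <= np.abs(step), 0.0, W - step)    # clamp at zero instead of oscillating
print(W)  # the small entries end up exactly at zero and no longer move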
