Custom regularizer classes: L1 and L2
# -*- coding: utf-8 -*-
# author:
# time : 2021/4/28 13:04
# task: custom regularizer classes.
""" Imports """
from abc import ABC, abstractmethod
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import math
import sys
import os
import time
import re
import progressbar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
class RegularizerBase(ABC):
""" 基础类,init, loss, grad 两种方法,
分别在反向传播的时候使用grad 进行使用,
在计算loss的进行统计正则贡献。"""
def __init__(self):
super().__init__()
@abstractmethod
def loss(self, **kwargs):
raise NotImplementedError
@abstractmethod
def grad(self, **kwargs):
raise NotImplementedError
""" 正则l1,l2 类,都是集成基础类"""
class L1_Regularizer(RegularizerBase):
"""
L1 方法进行正则化,公式: j = j + lambd * |w|
所以参数是lambd
"""
def __init__(self, lambd = 0.001):
        super().__init__()
self.lambd = lambd
    # loss implementation
def loss(self, params):
loss = 0
        pattern = re.compile(r"^W\d+") # matches weight names like W1, W2, W3, ...; biases b are not regularized.
for key, val in params.items():
if pattern.match(key):
loss += np.sum(np.abs(val)) * self.lambd
return loss
    # grad implementation
    def grad(self, W): # formula: lambd * sign(W)
grad = self.lambd * np.sign(W)
return grad
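A quick numeric check of the L1 formulas above (a minimal sketch; the toy weight matrix is made up for illustration):

reg = L1_Regularizer(lambd=0.1)
params = {"W1": np.array([[1.0, -2.0], [0.0, 3.0]]), "b1": np.zeros((1, 2))}
print(reg.loss(params))        # 0.1 * (1 + 2 + 0 + 3) = 0.6; b1 is skipped
print(reg.grad(params["W1"]))  # 0.1 * sign(W1) -> [[ 0.1 -0.1], [ 0.   0.1]]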
class L2_Regularizer(RegularizerBase):
"""
L2 正则化,formular: j = j + 0.5* (w).t * lambd * (w)
"""
def __init__(self, lambd):
super().__init__()
self.lambd = lambd
def loss(self, params):
loss = 0
pattern = re.compile(r"W\d+")
for key, val in params.items():
if pattern.match(key):
loss += 0.5 * np.sum(np.square(val)) * self.lambd
        # np.sqrt() computes the square root, np.square() the element-wise square.
        # loss += 0.5 * np.sum(val**2) * self.lambd
return loss
def grad(self, W):
grad = self.lambd * W
return grad
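Since d/dW [0.5 * lambd * sum(W**2)] = lambd * W, the analytic grad can be checked against a finite difference (a minimal sketch; eps and the toy W are illustrative):

reg = L2_Regularizer(lambd=0.01)
W = np.random.randn(3, 3)
eps, i, j = 1e-6, 1, 2
Wp, Wm = W.copy(), W.copy()
Wp[i, j] += eps
Wm[i, j] -= eps
num = (reg.loss({"W1": Wp}) - reg.loss({"W1": Wm})) / (2 * eps)
print(np.isclose(num, reg.grad(W)[i, j]))  # True: lambd * W matches the numerical gradient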
class RegularizerInitializer(object):
""" """
def __init__(self, regular_name = "l2"):
self.regular_name = regular_name
def __call__(self):
r = r"([a-zA-Z]*)=([^,)]*)"
        regular_str = self.regular_name.lower() # e.g. "l2(lambd=0.01)"
        # extract keyword arguments from the spec string.
        kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, regular_str)])
if "l1" in regular_str:
regular = L1_Regularizer(**kwargs)
elif "l2" in regular_str:
regular = L2_Regularizer(**kwargs)
else:
raise ValueError(f"Unrecognized regular: {regular_str}")
        return regular # return a regularizer instance.
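RegularizerInitializer lets a model take the regularizer as a plain string, mirroring how the optimizer and activation are specified (a minimal usage sketch):

l2 = RegularizerInitializer("l2(lambd=0.01)")()
l1 = RegularizerInitializer("l1(lambd=0.0001)")()
print(type(l2).__name__, l2.lambd)  # L2_Regularizer 0.01
print(type(l1).__name__, l1.lambd)  # L1_Regularizer 0.0001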
Building the regularized fully connected layer
from abc import ABC, abstractmethod
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import math
import sys
import os
import time
import re
import progressbar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# import the classes needed from the earlier files:
from utils import *
from activation import *
# import the regularizer classes
from Regular_Class import *
class FullyConnected_Regular(LayerBase):
""" 需要一些参数,比如in,out....forward, backward, update, flash_gradient, _init_params"""
def __init__(self, n_out,acti_fn,init_w,optimizer=None):
        super(FullyConnected_Regular, self).__init__(optimizer) # pass optimizer up to LayerBase for initialization.
self.n_out = n_out
self.acti_fn = ActivationInitializer(acti_fn)()
self.init_w = init_w
self.init_weights = WeightInitializer(mode=init_w)
self.n_in = None
self.is_initialized = False
def _init_params(self,**kwargs):
b = np.zeros((1, self.n_out))
W = self.init_weights((self.n_in, self.n_out))
self.params = {"W" : W, "b": b}
self.gradients = {"W" : np.zeros_like(W), "b": np.zeros_like(b)}
self.is_initialized = True
def forward(self,X, retain_derived=True):
""" 全连接网络的前向传播"""
if not self.is_initialized:
self.n_in = X.shape[1]
self._init_params()
W = self.params["W"]
b = self.params["b"]
z = X@W + b
a = self.acti_fn.forward(z)
        # cache the input for the backward pass.
if retain_derived:
self.X.append(X)
return a
def backward(self, dLda, retain=True, regular = None):
""" 有正则项的反向传播"""
if not isinstance(dLda, list):
dLda = [dLda]
dX = []
        X = self.X # each cached x: (batch_size, n_in); each da in dLda: (batch_size, n_out).
for da, x in zip(dLda,X):
dx,dw,db = self._bwd(da,x,regular)
dX.append(dx)
if retain:
self.gradients["W"] += dw
self.gradients["b"] += db
return dX[0] if len(X)==1 else dX
def _bwd(self,dLda,X,regular):
W = self.params["W"]
b = self.params["b"]
z = X@W +b
dz = dLda * self.acti_fn.grad(z) # a = acti_fn(z)
dX = dz @ W.T
dW = X.T @ dz
db = dz.sum(axis=0, keepdims=True)
        # add the regularizer's gradient, if one is given
if regular is not None:
# n = X.shape[0]
dW_norm = regular.grad(W)
dW += dW_norm
return dX,dW,db
def hyperparams(self):
return {
"layer":"Fully_connected_Regularizer",
"init_w":self.init_w,
"n_in": self.n_in,
"n_out": self.n_out,
"acti_fn":str(self.acti_fn),
"optimizer":{
"hyperparams": self.optimizer.hyperparams,
},
"componets":{
k:v for k,v in self.params.items()
}
}
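A quick shape check of the layer (a minimal sketch; it assumes the LayerBase, ActivationInitializer and WeightInitializer from the earlier utils/activation files, and the dummy dimensions are arbitrary):

layer = FullyConnected_Regular(n_out=3, acti_fn="sigmoid", init_w="std_normal", optimizer="sgd(lr=0.1)")
out = layer.forward(np.random.randn(4, 5))    # lazily initializes W: (5, 3), b: (1, 3)
dX = layer.backward(np.ones_like(out), regular=L2_Regularizer(lambd=0.01))
print(out.shape, dX.shape)                    # (4, 3) (4, 5)
print(layer.gradients["W"].shape, layer.gradients["b"].shape)  # (5, 3) (1, 3)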
Two-layer fully connected model with regularization
def minibatch(x, batchsize=256, shuffle=True):
N = x.shape[0]
idx = np.arange(N)
n_batches = int(np.ceil(N/batchsize))
if shuffle:
np.random.shuffle(idx)
def mb_generator():
for i in range(n_batches):
yield idx[i*batchsize:(i+1)*batchsize]
    return mb_generator(), n_batches # return an index generator and the number of batches.
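Usage of minibatch on dummy data (a minimal sketch):

x = np.arange(10).reshape(10, 1)
gen, n = minibatch(x, batchsize=4, shuffle=False)
print(n)                   # 3 batches: 4 + 4 + 2
for idx in gen:
    print(x[idx].ravel())  # [0 1 2 3], then [4 5 6 7], then [8 9]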
""" 双层全连接层的模型"""
class DFN(object):
def __init__(self,
hidden_dims_1 = None,
hidden_dims_2 = None,
optimizer = "sgd(lr=0.1)",
init_w = "std_normal",
regular_act=None,
loss = CrossEntropy()):
self.optimizer = optimizer
self.hidden_dims_1 = hidden_dims_1
self.hidden_dims_2 = hidden_dims_2
        self.loss = loss
self.regular = None
self.regular_act = regular_act
self.is_initialized = False
self.init_w = init_w
def _set_params(self):
""" 模型初始化: FC1-> sigmoid -> FC2 -> softmax"""
self.layers = OrderedDict()
self.layers["FC1"] = FullyConnected_Regular(n_out=self.hidden_dims_1,
acti_fn="sigmoid",
init_w=self.init_w,
optimizer=self.optimizer)
self.layers["FC2"] = FullyConnected_Regular(n_out=self.hidden_dims_2,
acti_fn="affine(slope=1.,intercept=0)", #slope=(.*),intercept=(.*)
init_w=self.init_w,
optimizer=self.optimizer)
self.layers["Softmax"] = Softmax(optimizer=self.optimizer) # 需要改变参数数量加上一个regular
if self.regular_act is not None:
            self.regular = RegularizerInitializer(self.regular_act)() # regularizer instance.
self.is_initialized = True
def forward(self,X):
Xs = {}
out = X
for k,v in self.layers.items():
Xs[k] = out
out = v.forward(out)
return out, Xs
def backward(self,grad):
dXs = {}
out = grad
for k,v in reversed(self.layers.items()):
dXs[k] = out
out = v.backward(out, regular=self.regular)
return out, dXs
def update(self):
""" 参数更新"""
for k,v in reversed(list(self.layers.items())):
v.update()
self.flush_gradients()
def flush_gradients(self, curr_loss=None):
for k,v in self.layers.items():
v.flush_gradients()
def fit(self, X_train,y_train,n_epochs=20, batch_size=64, verbose=False):
"""
:param X_train:
:param y_train:
:param n_epochs:
:param batch_size:
:param verbose:
:return:
"""
self.verbose = verbose
self.n_epochs = n_epochs
self.batch_size = batch_size
if not self.is_initialized:
self.n_features = X_train.shape[1]
self._set_params()
prev_loss = np.inf
# softmax = Softmax()
for i in range(n_epochs):
loss, epoch_start = 0.0, time.time()
batch_generator, n_batch = minibatch(X_train, self.batch_size, shuffle=True)
            # batch_generator yields batches of indices into X_train.
for j, batch_idx in enumerate(batch_generator):
batch_len, batch_start = len(batch_idx), time.time()
X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                out, _ = self.forward(X_batch) # forward pass.
# y_pred_batch = softmax(out)
batch_loss = self.loss(y_batch, out)
if self.regular is not None:
for _,layerparams in self.hyperparams["components"].items():
assert type(layerparams) is dict
batch_loss += self.regular.loss(layerparams)
grad = self.loss.grad(y_batch, out)
                _, _ = self.backward(grad) # backward pass to compute gradients.
self.update()
loss += batch_loss
if self.verbose:
fstr = f"\t [Batch {j+1}/{n_batch} Train loss :{batch_loss:.3f} ({(time.time() - batch_start):.1f}s/batch) ]"
print(fstr)
loss /= n_batch
fstr2 = f"[Epoch {i+1}/{n_epochs} avg.loss :{loss:.3f}, Delta:{(prev_loss-loss):.3f} ({(time.time() - epoch_start):.1f}s/epoch)]"
print(fstr2)
prev_loss = loss
def evaluate(self, X_test, y_test, batch_size=128):
acc = 0.0
batch_generator, n_batch = minibatch(X_test, batchsize=batch_size, shuffle=True)
for j, batch_idx in enumerate(batch_generator):
batch_len, batch_start = len(batch_idx), time.time()
X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
out,_ = self.forward(X_batch)
            y_pred = np.argmax(out, axis=1) # predicted class index
            y_batch = np.argmax(y_batch, axis=1) # one-hot -> class index
acc += np.sum(y_pred==y_batch)
return acc / X_test.shape[0]
@property
def hyperparams(self):
return {
"init_w": self.init_w,
"loss": str(self.loss),
"optimizer": self.optimizer,
"regular": str(self.regular_act),
"hidden_dims_1": self.hidden_dims_1,
"hidden_dims_2": self.hidden_dims_2,
"components": {k: v.params for k, v in self.layers.items()}
}
Training experiments:
""" 测试训练"""
def load_data(path = "..\data/mnist/mnist.npz"):
f = np.load(path)
X_train,y_train = f["x_train"], f["y_train"]
X_test, y_test = f["x_test"], f["y_test"]
f.close()
return (X_train,y_train),(X_test,y_test)
(X_train, y_train), (X_test, y_test) = load_data()
y_train = np.eye(10)[y_train.astype(int)]
y_test = np.eye(10)[y_test.astype(int)]
X_train = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2]).astype('float32')
X_test = X_test.reshape(-1, X_test.shape[1]*X_test.shape[2]).astype('float32')
print(X_train.shape, y_train.shape)
N = 20000 # sample 20,000 examples for training
indices = np.random.permutation(range(X_train.shape[0]))[:N]
X_train, y_train = X_train[indices], y_train[indices]
print(X_train.shape, y_train.shape)
X_train /= 255
X_train = (X_train - 0.5) * 2 # scale pixels to [-1, 1]
X_test /= 255
X_test = (X_test - 0.5) * 2
# without regularization
model = DFN(hidden_dims_1=200, hidden_dims_2=10)
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("without regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))
(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.284, Delta:inf (1.9s/epoch)]
[Epoch 2/20 avg.loss :2.181, Delta:0.103 (1.9s/epoch)]
[Epoch 3/20 avg.loss :1.827, Delta:0.354 (1.9s/epoch)]
[Epoch 4/20 avg.loss :1.338, Delta:0.489 (1.8s/epoch)]
[Epoch 5/20 avg.loss :0.935, Delta:0.403 (1.9s/epoch)]
[Epoch 6/20 avg.loss :0.704, Delta:0.231 (1.9s/epoch)]
[Epoch 7/20 avg.loss :0.578, Delta:0.126 (1.9s/epoch)]
[Epoch 8/20 avg.loss :0.509, Delta:0.070 (1.9s/epoch)]
[Epoch 9/20 avg.loss :0.464, Delta:0.045 (1.9s/epoch)]
[Epoch 10/20 avg.loss :0.434, Delta:0.030 (1.9s/epoch)]
[Epoch 11/20 avg.loss :0.411, Delta:0.023 (1.9s/epoch)]
[Epoch 12/20 avg.loss :0.393, Delta:0.018 (1.9s/epoch)]
[Epoch 13/20 avg.loss :0.380, Delta:0.014 (1.9s/epoch)]
[Epoch 14/20 avg.loss :0.368, Delta:0.011 (1.9s/epoch)]
[Epoch 15/20 avg.loss :0.357, Delta:0.011 (1.9s/epoch)]
[Epoch 16/20 avg.loss :0.348, Delta:0.009 (1.9s/epoch)]
[Epoch 17/20 avg.loss :0.341, Delta:0.008 (1.9s/epoch)]
[Epoch 18/20 avg.loss :0.335, Delta:0.006 (1.9s/epoch)]
[Epoch 19/20 avg.loss :0.328, Delta:0.007 (1.9s/epoch)]
[Epoch 20/20 avg.loss :0.322, Delta:0.006 (1.9s/epoch)]
without regularization -- accuracy:0.9188
With L2 regularization:
model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l2(lambd=0.01)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L2 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))
(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.290, Delta:inf (2.0s/epoch)]
[Epoch 2/20 avg.loss :2.259, Delta:0.031 (2.0s/epoch)]
[Epoch 3/20 avg.loss :2.173, Delta:0.086 (2.0s/epoch)]
[Epoch 4/20 avg.loss :1.971, Delta:0.201 (2.1s/epoch)]
[Epoch 5/20 avg.loss :1.767, Delta:0.205 (2.0s/epoch)]
[Epoch 6/20 avg.loss :1.570, Delta:0.197 (2.0s/epoch)]
[Epoch 7/20 avg.loss :1.398, Delta:0.172 (2.0s/epoch)]
[Epoch 8/20 avg.loss :1.280, Delta:0.118 (2.4s/epoch)]
[Epoch 9/20 avg.loss :1.191, Delta:0.089 (2.1s/epoch)]
[Epoch 10/20 avg.loss :1.115, Delta:0.076 (2.1s/epoch)]
[Epoch 11/20 avg.loss :1.058, Delta:0.057 (2.2s/epoch)]
[Epoch 12/20 avg.loss :1.016, Delta:0.042 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.974, Delta:0.042 (2.2s/epoch)]
[Epoch 14/20 avg.loss :0.936, Delta:0.038 (2.1s/epoch)]
[Epoch 15/20 avg.loss :0.901, Delta:0.034 (2.1s/epoch)]
[Epoch 16/20 avg.loss :0.876, Delta:0.025 (2.0s/epoch)]
[Epoch 17/20 avg.loss :0.857, Delta:0.019 (2.1s/epoch)]
[Epoch 18/20 avg.loss :0.844, Delta:0.013 (2.1s/epoch)]
[Epoch 19/20 avg.loss :0.834, Delta:0.010 (2.1s/epoch)]
[Epoch 20/20 avg.loss :0.826, Delta:0.009 (2.0s/epoch)]
with L2 regularization -- accuracy:0.8514
L1 regularization:
model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l1(lambd=0.0001)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L1 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))
[Epoch 1/20 avg.loss :2.288, Delta:inf (2.4s/epoch)]
[Epoch 2/20 avg.loss :2.261, Delta:0.027 (2.3s/epoch)]
[Epoch 3/20 avg.loss :2.197, Delta:0.063 (2.3s/epoch)]
[Epoch 4/20 avg.loss :2.002, Delta:0.195 (2.3s/epoch)]
[Epoch 5/20 avg.loss :1.750, Delta:0.251 (2.3s/epoch)]
[Epoch 6/20 avg.loss :1.494, Delta:0.256 (2.3s/epoch)]
[Epoch 7/20 avg.loss :1.222, Delta:0.272 (2.3s/epoch)]
[Epoch 8/20 avg.loss :1.002, Delta:0.219 (2.3s/epoch)]
[Epoch 9/20 avg.loss :0.848, Delta:0.155 (2.3s/epoch)]
[Epoch 10/20 avg.loss :0.729, Delta:0.118 (2.3s/epoch)]
[Epoch 11/20 avg.loss :0.641, Delta:0.088 (2.3s/epoch)]
[Epoch 12/20 avg.loss :0.582, Delta:0.059 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.541, Delta:0.041 (2.3s/epoch)]
[Epoch 14/20 avg.loss :0.512, Delta:0.029 (2.3s/epoch)]
[Epoch 15/20 avg.loss :0.491, Delta:0.021 (2.3s/epoch)]
[Epoch 16/20 avg.loss :0.475, Delta:0.016 (2.3s/epoch)]
[Epoch 17/20 avg.loss :0.462, Delta:0.013 (2.3s/epoch)]
[Epoch 18/20 avg.loss :0.452, Delta:0.010 (2.3s/epoch)]
[Epoch 19/20 avg.loss :0.442, Delta:0.010 (2.3s/epoch)]
[Epoch 20/20 avg.loss :0.435, Delta:0.007 (2.3s/epoch)]
with L1 regularization -- accuracy:0.8947
As these runs show, lambd for L1 must be chosen more carefully than for L2, because L1 sparsifies the weight matrix: once a weight is driven to exactly 0, sign(0) = 0, so the L1 term stops updating that parameter.
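A quick illustration of that zero-gradient point (a minimal sketch):

reg = L1_Regularizer(lambd=0.1)
W = np.array([[0.5, 0.0, -0.3]])
print(reg.grad(W))  # [[ 0.1  0.  -0.1]] -- the zero weight receives no L1 gradient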