Code implementation:
import time
from collections import OrderedDict

import numpy as np

from utils import *  # provides LayerBase, FullyConnected, Softmax, CrossEntropy, minibatch, load_data
class BatchNorm1D(LayerBase):
""" 集成基础的层,需要计算三个梯度,对x的梯度,以及线性变换的两个梯度"""
def __init__(self, momentum = 0.9, epsilon = 1e-6, optimizer=None):
        super().__init__(optimizer)  # the base class expects the optimizer attribute
self.momentum = momentum
self.epsilon = epsilon
        self.n_in = None
        self.n_out = None  # input/output feature dimension; X has shape (batch_size, n_in) and n_in == n_out
        self.params = {
            "scaler": None,
            "intercept": None,      # the two affine-transform parameters
            "running_mean": None,
            "running_var": None     # note: the first two are learned from backprop gradients, the running statistics are not
        }
self.is_initialized = False
    def _init_params(self, **kwargs):
        """ Initialize the layer parameters. """
scaler = np.random.rand(self.n_in)
intercept = np.zeros(self.n_in)
running_mean = np.zeros(self.n_in)
running_var = np.ones(self.n_in)
self.params = {
"scaler": scaler,
"intercept": intercept, # 线性变换的两个参数
"running_mean": running_mean,
"running_var": running_var
}
self.gradients = {
"scaler": np.zeros_like(scaler),
"intercept": np.zeros_like(intercept)
}
self.is_initialized = True
    def reset_running_stats(self):  # reset the running statistics, e.g. after an epoch has finished
self.params["running_mean"] = np.zeros(self.n_in)
self.params["running_var"] = np.ones(self.n_in)
@property
def hyperparams(self):
return {
"layer": "BatchNorm1D",
"acti_fn": None,
"n_in": self.n_in,
"n_out": self.n_out,
"epsilon": self.epsilon,
"momentum": self.momentum,
"optimizer": {
"cache": self.optimizer.cache,
"hyperparams": self.optimizer.hyperparams,
},
}
    def forward(self, X, is_train=True, retain_derived=True):
"""
正向传播:
:param X:
:param is_train:
:param retain_dedrived:
:return:
[train]: Y = scaler * norm(X) + intercept,其中 norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)
[test]: Y = scaler * running_norm(X) + intercept,
running_norm(X) = (X - running_mean) / sqrt(running_var + epsilon)
"""
        if not self.is_initialized:  # lazily initialize on the first forward call
            self.n_in = X.shape[1]   # feature dimension of the input
            self._init_params()
        # Pull out the hyperparameters needed below:
momentum, epsilon = self.hyperparams["momentum"], self.hyperparams["epsilon"]
rm, rv = self.params["running_mean"], self.params["running_var"]
scaler, intercept = self.params["scaler"], self.params["intercept"]
        X_mean, X_var = self.params["running_mean"], self.params["running_var"]  # at test time, use the running statistics directly
        if is_train:
            X_mean, X_var = np.mean(X, axis=0), np.var(X, axis=0)  # statistics of the current batch
            # Update the running mean and running variance.
            self.params["running_mean"] = momentum * rm + (1 - momentum) * X_mean
            self.params["running_var"] = momentum * rv + (1 - momentum) * X_var
            if retain_derived:
                self.X.append(X)
        # Forward pass:
        # Y = scaler * norm(X) + intercept, where
        # norm(X) = (X - mean(X)) / sqrt(var(X) + epsilon)
        X_hat = (X - X_mean) / np.sqrt(X_var + epsilon)
        y = scaler * X_hat + intercept
return y
    def backward(self, dLda, retain_grads=True):
        """
        Backward pass: compute and store the gradients w.r.t. scaler, intercept, and the input X.
        """
        if not isinstance(dLda, list):
            dLda = [dLda]  # dLda shape: (batch_size, n_in)
        dX = []
        X = self.X
        # Compute the gradients for each cached input batch.
for da, x in zip(dLda,X):
dx, dScaler, dIntercept = self._bwd(da,x)
dX.append(dx)
            if retain_grads:
                self.gradients["scaler"] = dScaler
                self.gradients["intercept"] = dIntercept
        return dX[0] if len(dX) == 1 else dX  # mirror dLda: return a single array if a single array was passed in, otherwise a list
    def _bwd(self, dLda, X):
        """ Compute the three gradients from the standard batch-norm backward formulas (written out below the class). """
scaler = self.params["scaler"]
epsi = self.hyperparams["epsilon"]
n_ex, n_in = X.shape
X_mean, X_var = X.mean(axis=0), X.var(axis=0)
X_hat = (X - X_mean) / np.sqrt(X_var + epsi)
dIntercept = dLda.sum(axis=0)
dScaler = np.sum(dLda * X_hat, axis=0)
dX_hat = dLda * scaler
dX = (n_ex * dX_hat - dX_hat.sum(axis=0) - X_hat * (dX_hat * X_hat).sum(axis=0)) / (
n_ex * np.sqrt(X_var + epsi)
)
return dX, dScaler, dIntercept
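For reference, _bwd implements the standard batch-normalization backward formulas. Writing \(\gamma\) for scaler, \(\beta\) for intercept, \(m\) for n_ex, and \(\hat{x}_i\) for the normalized input, the forward pass is

\[
\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}, \qquad y_i = \gamma\,\hat{x}_i + \beta,
\]

and the gradients computed above are

\[
\frac{\partial L}{\partial \beta} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}, \qquad
\frac{\partial L}{\partial \gamma} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad
\frac{\partial L}{\partial \hat{x}_i} = \gamma\,\frac{\partial L}{\partial y_i},
\]

\[
\frac{\partial L}{\partial x_i} = \frac{1}{m\sqrt{\sigma^2 + \epsilon}}
\left( m\,\frac{\partial L}{\partial \hat{x}_i}
- \sum_{j=1}^{m} \frac{\partial L}{\partial \hat{x}_j}
- \hat{x}_i \sum_{j=1}^{m} \frac{\partial L}{\partial \hat{x}_j}\,\hat{x}_j \right),
\]

which correspond to dIntercept, dScaler, dX_hat, and dX in the code.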
Data:
(X_train, y_train), (X_test, y_test) = load_data()
y_train = np.eye(10)[y_train.astype(int)]
y_test = np.eye(10)[y_test.astype(int)]
X_train = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2]).astype('float32')
X_test = X_test.reshape(-1, X_test.shape[1]*X_test.shape[2]).astype('float32')
print(X_train.shape, y_train.shape)
N = 20000  # take 20,000 samples for training
indices = np.random.permutation(range(X_train.shape[0]))[:N]
X_train, y_train = X_train[indices], y_train[indices]
print(X_train.shape, y_train.shape)
X_train /= 255
X_train = (X_train - 0.5) * 2
X_test /= 255
X_test = (X_test - 0.5) * 2
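The two rescaling steps map raw pixel values from [0, 255] into [-1, 1]: 0 maps to -1, 255 maps to 1, and a mid-gray value such as 128 lands near 0 (128/255 ≈ 0.502, (0.502 - 0.5) * 2 ≈ 0.004), so the inputs to FC1 are roughly zero-centered. A quick sanity check on the arrays prepared above:
# The preprocessed inputs should lie in [-1, 1], with the endpoints attained by pure black/white pixels.
print(X_train.min(), X_train.max())  # expected: about -1.0 and 1.0
print(X_test.min(), X_test.max())    # expected: about -1.0 and 1.0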
Define a two-layer fully connected network with a BatchNorm layer:
class DFN2(object):
def __init__(
self,
hidden_dims_1=None,
hidden_dims_2=None,
optimizer="sgd(lr=0.01)",
init_w="std_normal",
loss=CrossEntropy()
):
self.optimizer = optimizer
self.init_w = init_w
self.loss = loss
self.hidden_dims_1 = hidden_dims_1
self.hidden_dims_2 = hidden_dims_2
self.is_initialized = False
    def _set_params(self):
        """
        Initialize the model layers:
        FC1 -> Sigmoid -> BN -> FC2 -> Softmax
        """
self.layers = OrderedDict()
self.layers["FC1"] = FullyConnected(
n_out=self.hidden_dims_1,
acti_fn="sigmoid",
init_w=self.init_w,
optimizer=self.optimizer
)
self.layers["BN"] = BatchNorm1D(optimizer=self.optimizer)
self.layers["FC2"] = FullyConnected(
n_out=self.hidden_dims_2,
acti_fn="affine(slope=1,intercept=0)",
init_w=self.init_w,
optimizer=self.optimizer
)
self.layers["Softmax"] = Softmax(dim=-1, optimizer=self.optimizer)
self.softmax = Softmax(dim=-1, optimizer=self.optimizer)
self.is_initialized = True
def forward(self, X_train):
Xs = {}
out = X_train
for k, v in self.layers.items():
            Xs[k] = out  # cache the input fed to each layer
out = v.forward(out)
return out, Xs
def backward(self, grad):
dXs = {}
out = grad
for k, v in reversed(list(self.layers.items())):
dXs[k] = out
out = v.backward(out)
return out, dXs
    def update(self):
        """
        Apply the gradient update to every layer.
        """
for k, v in reversed(list(self.layers.items())):
v.update()
        self.flush_gradients()  # reset the gradients after the update
    def flush_gradients(self, curr_loss=None):
        """
        Reset all layer gradients after an update.
        """
for k, v in self.layers.items():
v.flush_gradients()
    def fit(self, X_train, y_train, n_epochs=20, batch_size=64, verbose=False):
        """
        Parameters:
            X_train: training data
            y_train: training labels
            n_epochs: number of epochs
            batch_size: minibatch size
            verbose: whether to print the loss for every batch
        """
self.verbose = verbose
self.n_epochs = n_epochs
self.batch_size = batch_size
if not self.is_initialized:
self.n_features = X_train.shape[1]
self._set_params()
prev_loss = np.inf
# softmax = self.softmax.forward #
for i in range(n_epochs):
loss, epoch_start = 0.0, time.time()
batch_generator, n_batch = minibatch(X_train, self.batch_size, shuffle=True)
for j, batch_idx in enumerate(batch_generator):
batch_len, batch_start = len(batch_idx), time.time()
X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
out, _ = self.forward(X_batch)
                # y_pred_batch = softmax(out)
                y_pred_batch = out
                batch_loss = self.loss(y_batch, y_pred_batch)
                grad = self.loss.grad(y_batch, y_pred_batch)  # gradient of the cross-entropy loss
                _, _ = self.backward(grad)                    # backpropagate to compute the layer gradients
                self.update()                                 # apply the gradients, then reset them to zero
                loss += batch_loss                            # accumulate the epoch loss
if self.verbose:
fstr = "\t[Batch {}/{}] Train loss: {:.3f} ({:.1f}s/batch)"
print(fstr.format(j + 1, n_batch, batch_loss, time.time() - batch_start))
loss /= n_batch
fstr = "[Epoch {}] Avg. loss: {:.3f} Delta: {:.3f} ({:.2f}m/epoch)"
print(fstr.format(i + 1, loss, prev_loss - loss, (time.time() - epoch_start) / 60.0))
prev_loss = loss
def evaluate(self, X_test, y_test, batch_size=128):
acc = 0.0
batch_generator, n_batch = minibatch(X_test, batch_size, shuffle=True)
for j, batch_idx in enumerate(batch_generator):
batch_len, batch_start = len(batch_idx), time.time()
X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
y_pred_batch, _ = self.forward(X_batch)
y_pred_batch = np.argmax(y_pred_batch, axis=1)
y_batch = np.argmax(y_batch, axis=1)
acc += np.sum(y_pred_batch == y_batch)
return acc / X_test.shape[0]
@property
def hyperparams(self):
return {
"init_w": self.init_w,
"loss": str(self.loss),
"optimizer": self.optimizer,
"hidden_dims_1": self.hidden_dims_1,
"hidden_dims_2": self.hidden_dims_2,
"components": {k: v.params for k, v in self.layers.items()}
}
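Note that DFN2.forward calls each layer's forward with its default arguments, so BatchNorm1D keeps normalizing with the statistics of the current batch even inside evaluate; the accuracy reported below was obtained with this behavior. If you instead want the BN layer to fall back to its running statistics at test time, a minimal sketch is shown next. It relies only on the is_train flag of the BatchNorm1D class defined above and leaves the other layers untouched:
def forward_with_mode(model, X, is_train=True):
    # Thread a train/eval flag through the model, switching only the BN layer;
    # with is_train=False, BatchNorm1D normalizes with running_mean / running_var.
    out = X
    for name, layer in model.layers.items():
        if isinstance(layer, BatchNorm1D):
            out = layer.forward(out, is_train=is_train)
        else:
            out = layer.forward(out)
    return out

# Example (after model.fit): scores = forward_with_mode(model, X_test[:128], is_train=False)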
Training and testing:
""" Training """
# model = DFN(hidden_dims_1=200, hidden_dims_2=10)
# model.fit(X_train, y_train, n_epochs=20)
# print(model.evaluate(X_test,y_test))
#
model = DFN2(hidden_dims_1=200, hidden_dims_2=10)
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("accuracy:{}".format(model.evaluate(X_test, y_test)))
Results:
[Epoch 1] Avg. loss: 1.863 Delta: inf (0.03m/epoch)
[Epoch 2] Avg. loss: 1.164 Delta: 0.699 (0.03m/epoch)
[Epoch 3] Avg. loss: 0.830 Delta: 0.334 (0.03m/epoch)
[Epoch 4] Avg. loss: 0.654 Delta: 0.176 (0.03m/epoch)
[Epoch 5] Avg. loss: 0.558 Delta: 0.095 (0.03m/epoch)
[Epoch 6] Avg. loss: 0.504 Delta: 0.055 (0.03m/epoch)
[Epoch 7] Avg. loss: 0.466 Delta: 0.038 (0.03m/epoch)
[Epoch 8] Avg. loss: 0.442 Delta: 0.024 (0.03m/epoch)
[Epoch 9] Avg. loss: 0.422 Delta: 0.021 (0.03m/epoch)
[Epoch 10] Avg. loss: 0.407 Delta: 0.014 (0.03m/epoch)
[Epoch 11] Avg. loss: 0.397 Delta: 0.010 (0.03m/epoch)
[Epoch 12] Avg. loss: 0.384 Delta: 0.013 (0.03m/epoch)
[Epoch 13] Avg. loss: 0.376 Delta: 0.009 (0.03m/epoch)
[Epoch 14] Avg. loss: 0.370 Delta: 0.005 (0.03m/epoch)
[Epoch 15] Avg. loss: 0.363 Delta: 0.008 (0.03m/epoch)
[Epoch 16] Avg. loss: 0.356 Delta: 0.007 (0.03m/epoch)
[Epoch 17] Avg. loss: 0.350 Delta: 0.006 (0.03m/epoch)
[Epoch 18] Avg. loss: 0.345 Delta: 0.005 (0.03m/epoch)
[Epoch 19] Avg. loss: 0.339 Delta: 0.006 (0.03m/epoch)
[Epoch 20] Avg. loss: 0.339 Delta: 0.000 (0.03m/epoch)
accuracy:0.9131