import copy
import numpy as np
from sklearn.datasets import load_digits
from Code_Session import deep_learning_A as dl


Review of Deep Neural Network


# Load the scikit-learn digits dataset (8x8 images plus integer targets).
nums_8x8 = load_digits()

# Three samples (indices 13..15), scaled by the largest pixel value of that
# slice, then flattened so each image becomes one row.
data = nums_8x8.images[13:16] / nums_8x8.images[13:16].max()
data = data.reshape(data.shape[0], -1)
labels = nums_8x8.target[13:16]


# Stage 1: layer -> batch norm -> ReLU.
# NOTE(review): the `dl` helpers come from Code_Session.deep_learning_A;
# presumably the two integers are input/output widths -- confirm there.
x_l1 = dl.layer(data, 64, 32)
x_b1 = dl.batch_norm(x_l1)
x_a1 = dl.relu(x_b1)

# Stage 2: same pattern, 32 -> 16.
x_l2 = dl.layer(x_a1, 32, 16)
x_b2 = dl.batch_norm(x_l2)
x_a2 = dl.relu(x_b2)

# Stage 3: 16 -> 10, finished with softmax instead of ReLU.
x_l3 = dl.layer(x_a2, 16, 10)
x_b3 = dl.batch_norm(x_l3)
y_pred = dl.softmax(x_b3)

为了评估神经网络预测的质量,所有ground truth标签都应该转换成 one-hot格式,这样它们就可以被放入一个损失函数。

def one_hot(labels, class_num=10):
    """Convert integer class labels into one-hot rows.

    Args:
        labels: Integer index (or array of indices) in ``[0, class_num)``.
        class_num: Number of classes, i.e. the length of each one-hot row.

    Returns:
        Float array obtained by selecting rows of the identity matrix.
    """
    identity = np.identity(class_num, dtype=float)
    return identity[labels]
# One-hot encode the three sample targets (`labels` holds target[13:16])
# and print the element-wise loss rounded to two decimals.
y = one_hot(labels, class_num=10)
loss = dl.cross_entropy(y, y_pred)
print(np.round(loss, 2))
[[0.2  0.24 0.02 1.59 0.03 0.03 0.11 0.22 0.02 0.01]
 [0.02 0.03 0.26 0.04 1.35 0.03 0.02 0.05 0.29 0.08]
 [0.07 0.04 0.07 0.03 0.04 1.19 0.16 0.03 0.05 0.26]]


1. Derivative of Loss Function

$$
\begin{aligned}
\underset{y_{gt},\, y_{\text{pred}}}{\text{Loss}} &= -y_{gt} \cdot \log\left(y_{\text{pred}}\right) - \left(1-y_{gt}\right) \cdot \log\left(1-y_{\text{pred}}\right) \\
\rightarrow \quad \nabla \underset{y_{gt},\, y_{\text{pred}}}{\operatorname{Loss}} &= -\frac{y_{gt}}{y_{\text{pred}}} + \frac{1-y_{gt}}{1-y_{\text{pred}}}
\end{aligned}
$$

class CrossEntropyLoss(object):
    """Element-wise binary cross-entropy and its derivative w.r.t. y_pred.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` first so that neither
    the logarithm nor the division ever sees exactly 0.
    """

    @staticmethod
    def _bounded(y_pred):
        """Return y_pred clipped away from 0 and 1."""
        return np.clip(y_pred, 1e-15, 1 - 1e-15)

    def __call__(self, y, y_pred):
        """Return the per-element loss for targets y and predictions y_pred."""
        p = self._bounded(y_pred)
        log_hit = np.log(p)
        log_miss = np.log(1 - p)
        return - y * log_hit - (1 - y) * log_miss

    def gradient(self, y, y_pred):
        """Return the per-element derivative of the loss w.r.t. y_pred."""
        p = self._bounded(y_pred)
        pull_down = y / p
        push_up = (1 - y) / (1 - p)
        return - pull_down + push_up

这是反向传播的起点。计算梯度时,ground truth 的 one-hot 标签和预测的概率都需要用到。

# Gradient of the loss with respect to the predicted probabilities,
# printed rounded to two decimals.
ce = CrossEntropyLoss()
loss_grad = ce.gradient(y, y_pred)
print(np.round(loss_grad, 2))
[[ 1.22  1.27  1.02 -4.92  1.03  1.03  1.12  1.24  1.02  1.01]
 [ 1.02  1.03  1.29  1.04 -3.87  1.03  1.02  1.05  1.34  1.08]
 [ 1.07  1.04  1.07  1.03  1.04 -3.29  1.18  1.03  1.05  1.3 ]]

2. Derivative of Softmax Function

$$
S_j=\frac{e^{y_j-c}}{\sum_{i=1}^{n} e^{y_{i}-c}} \quad \ldots \text{ where } c=\max\left(y_{1}, y_{2}, \ldots, y_{n}\right)
$$
我们在这里减去 𝑐,是因为当输入向量包含很大的数值时,它可以防止指数运算溢出(overflow);分子分母同时乘以 $e^{-c}$,结果不变。为了对 softmax 函数求导,我们将使用除法法则:
$$
\text{let } f(x)=\frac{g(x)}{h(x)} \quad \rightarrow \quad f^{\prime}(x)=\frac{g^{\prime}(x)\, h(x)-h^{\prime}(x)\, g(x)}{h^{2}(x)}
$$
$$
f^{\prime}(x)=\frac{e^{y_j} \sum_{i=1}^{n} e^{y_{i}}-e^{y_j} e^{y_{j}}}{\left(\sum_{i=1}^{n} e^{y_{i}}\right)^{2}}=\frac{e^{y_j}}{\sum_{i=1}^{n} e^{y_{i}}} \cdot \frac{\sum_{i=1}^{n} e^{y_{i}}-e^{y_{j}}}{\sum_{i=1}^{n} e^{y_{i}}}=S(1-S)
$$


class Softmax():
    """Softmax over the last axis, plus its element-wise derivative."""

    def __call__(self, x):
        """Return softmax probabilities computed along the last axis."""
        # Subtracting the row maximum keeps np.exp from overflowing.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def gradient(self, x):
        """Return S * (1 - S) evaluated element-wise at x."""
        probs = self(x)
        return probs * (1 - probs)


# Chain rule: scale the incoming loss gradient by the softmax derivative
# evaluated at the pre-softmax activations.
softmax = Softmax()
softmax_grad = loss_grad * softmax.gradient(x_b3)
print(np.round(softmax_grad, 2))
[[ 0.18  0.21  0.02 -0.8   0.02  0.03  0.11  0.2   0.02  0.01]
 [ 0.02  0.03  0.23  0.04 -0.74  0.03  0.02  0.05  0.25  0.08]
 [ 0.07  0.04  0.07  0.03  0.04 -0.7   0.15  0.03  0.05  0.23]]

Addition - Parameter Optimizer

至今为止我们做的反向传播只是确定每一步的梯度,还没有更新任何参数。当我们进行到第三步(批标准化)时,𝛾 和 𝐵 将需要被更新。在进入批标准化部分的反向传递之前,我们需要了解如何使用优化方法,利用传播回来的梯度去更新这些目标参数。下面列出了一些流行的优化方法。

Stochastic Gradient Descent + Momentum
Nesterov Accelerated Gradient
AdaGrad
RMSprop
Adam

一般来说,函数的参数可以通过简单的梯度下降来更新。然而,如果只使用小批量数据,优化过程中很容易出现 zigzag(之字形)轨迹,从而导致更新路径更长、部分更新无效。此外,如果更新到达鞍点,更新将会停滞。以上5种方法可以有效缓解这些问题。

A-1. Stochastic Gradient Descent + Momentum

zigzag trace 是由不同规模的梯度沿不同维度造成的。
$$
\begin{array}{c}
v_{t}=\text{momentum} \cdot v_{t-1}+\eta \cdot \nabla f\left(w_{t}\right) \\
\rightarrow \quad w_{t+1}=w_{t}-v_{t}
\end{array}
$$

class StochasticGradientDescent(object):
    """Gradient descent with a momentum term.

    Args:
        learning_rate: Scale applied to each incoming gradient.
        momentum: Fraction of the previous velocity that is carried over.
    """

    def __init__(self, learning_rate=0.01, momentum=0.8):
        self.lr = learning_rate
        self.momentum = momentum
        # Created on the first update, once the parameter shape is known.
        self.velocity = None

    def update(self, w, w_grad):
        """Return w moved one step against the accumulated velocity."""
        if self.velocity is None:
            previous = np.zeros(np.shape(w))
        else:
            previous = self.velocity
        self.velocity = self.momentum * previous + self.lr * w_grad
        return w - self.velocity

A-2. Nesterov Accelerated Gradient

$$
\begin{array}{c}
v_{t+1}=\text{momentum} \cdot v_{t}+\eta \cdot \nabla f\left(w_{t}-\text{momentum} \cdot v_{t}\right) \\
\rightarrow \quad w_{t+1}=w_{t}-v_{t+1}
\end{array}
$$

class NesterovAcceleratedGradient(object):
    """Nesterov accelerated gradient: momentum with a look-ahead gradient.

    Args:
        learning_rate: Scale applied to the (clipped) look-ahead gradient.
        momentum: Fraction of the previous velocity that is carried over.
    """

    def __init__(self, learning_rate=0.001, momentum=0.4):
        self.lr = learning_rate
        self.momentum = momentum
        # Empty until the first update reveals the parameter shape.
        self.velocity = np.array([])

    def update(self, w, grad_func):
        """Return w after one Nesterov step.

        Args:
            w: Current parameter values.
            grad_func: Callable returning the loss gradient at a given point.

        Returns:
            The updated parameters, ``w - velocity``.
        """
        # The velocity must exist before the look-ahead point is computed;
        # otherwise the first call subtracts an empty array from w.
        if self.velocity.size == 0:
            self.velocity = np.zeros(np.shape(w))
        # Gradient of the loss a bit further down the slope from w.
        look_ahead = w - self.momentum * self.velocity
        w_grad_ = np.clip(grad_func(look_ahead), -1, 1)
        self.velocity = self.momentum * self.velocity + self.lr * w_grad_
        # Move against the gradient to minimize loss.
        return w - self.velocity

A-3. AdaGrad

$$
\begin{aligned}
G_{t} &=\sum_{i=1}^{t}\left(\nabla f\left(w_{i}\right) \cdot \nabla f\left(w_{i}\right)\right) \\
\rightarrow \quad w_{t+1} &=w_{t}-\eta \cdot \frac{\nabla f\left(w_{t}\right)}{\sqrt{G_{t}+\epsilon}}
\end{aligned}
$$

class Adagrad():
    """AdaGrad: each step is scaled by the accumulated squared gradients.

    Args:
        learning_rate: Base step size.
    """

    def __init__(self, learning_rate=0.01):
        self.lr = learning_rate
        self.eps = 1e-8
        # Running sum of squared gradients; allocated on the first update.
        self.G = None

    def update(self, w, w_grad):
        """Accumulate the squared gradient and return the updated w."""
        if self.G is None:
            self.G = np.zeros(np.shape(w))
        squared = np.power(w_grad, 2)
        self.G += squared
        # Coordinates with a large gradient history take smaller steps.
        scale = np.sqrt(self.G + self.eps)
        return w - self.lr * w_grad / scale


A-4. RMSprop


$$
\begin{array}{r}
\mathbb{E}\left(G_{t}\right)=\rho\, \mathbb{E}\left(G_{t-1}\right)+(1-\rho)\, G_{t} \\
\rightarrow \quad w_{t+1}=w_{t}-\eta \cdot \frac{\nabla f\left(w_{t}\right)}{\sqrt{\mathbb{E}\left(G_{t}\right)+\epsilon}}
\end{array}
$$

class RMSprop(object):
    """RMSprop: steps are scaled by a running mean of squared gradients.

    Args:
        learning_rate: Base step size.
        rho: Decay rate of the running average.
    """

    def __init__(self, learning_rate=0.01, rho=0.9):
        self.lr = learning_rate
        self.rho = rho
        self.eps = 1e-8
        # Running average of squared gradients; allocated on first update.
        self.Eg = None

    def update(self, w, w_grad):
        """Refresh the running average and return the updated w."""
        if self.Eg is None:
            self.Eg = np.zeros(np.shape(w_grad))
        keep = self.rho * self.Eg
        fresh = (1 - self.rho) * np.power(w_grad, 2)
        self.Eg = keep + fresh
        # Divide the step for each weight by the root of the running
        # average of its recent squared gradients.
        return w - self.lr * w_grad / np.sqrt(self.Eg + self.eps)

A-5. Adam - Adaptive Moment Estimation


$$
\begin{array}{c}
m_{t}=\beta_{1} \cdot m_{t-1}+\left(1-\beta_{1}\right) \cdot \nabla f\left(w_{t}\right) \quad \rightarrow \quad \hat{m}_{t}=\frac{m_{t}}{1-\beta_{1}^{t}} \\
v_{t}=\beta_{2} \cdot v_{t-1}+\left(1-\beta_{2}\right) \cdot G_{t} \quad \rightarrow \quad \hat{v}_{t}=\frac{v_{t}}{1-\beta_{2}^{t}} \\
\rightarrow \quad w_{t+1}=w_{t}-\eta \cdot \frac{\hat{m}_{t}}{\sqrt{\hat{v}_{t}}+\epsilon}
\end{array}
$$
𝑚̂ 和 𝑣̂ 这两项都除以一个系数,这样的训练过程不会轻易受到0初始化𝑚𝑡和𝑣𝑡的影响。

class Adam(object):
    """Adam optimizer (adaptive moment estimation).

    Args:
        learning_rate: Base step size.
        b1: Decay rate of the first-moment (mean) estimate.
        b2: Decay rate of the second-moment (squared gradient) estimate.
    """

    def __init__(self, learning_rate=0.001, b1=0.9, b2=0.999):
        self.lr = learning_rate
        self.eps = 1e-8
        self.m, self.v = None, None
        self.b1, self.b2 = b1, b2    # Decay rates.
        # Number of updates performed so far; drives the bias correction.
        self.t = 0

    def update(self, w, w_grad):
        """Return w after one Adam step using gradient w_grad."""
        # Moment estimates are allocated lazily, matching the gradient shape.
        if self.m is None:
            self.m = np.zeros(np.shape(w_grad))
            self.v = np.zeros(np.shape(w_grad))
        self.t += 1
        self.m = self.b1 * self.m + (1 - self.b1) * w_grad
        self.v = self.b2 * self.v + (1 - self.b2) * np.power(w_grad, 2)
        # Bias correction: the zero-initialised moments are too small by a
        # factor of (1 - b**t) after t steps. Dividing by the constant
        # (1 - b) is only right at t == 1 and inflates later estimates.
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)
        return w - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

3. Derivative of Batch Normalization


$$
\begin{aligned}
\mu_{\beta} &=\frac{1}{n} \sum_{i=1}^{n} x_{i} & \text{Batch Mean} & \quad \ldots(1) \\
\sigma_{\beta}^{2} &=\frac{1}{n} \sum_{i=1}^{n}\left(x_{i}-\mu_{\beta}\right)^{2} & \text{Batch Variance} & \quad \ldots(2) \\
\hat{x}_{i} &=\frac{x_{i}-\mu_{\beta}}{\sqrt{\sigma_{\beta}^{2}+\epsilon}} & \text{Normalization} & \quad \ldots(3) \\
\hat{x}_{i}^{*} &=\gamma \hat{x}_{i}+B=\underset{\gamma, B}{BN\left(x_{i}\right)} & \text{Scale and Shift} & \quad \ldots(4)
\end{aligned}
$$


class BatchNorm(object):
    """Forward half of a batch-normalization layer.

    Args:
        insize: Number of features; gamma and beta have this length.
        momentum: Weight of the old value in the running mean/variance.
        eps: Constant added to the variance before taking the square root.
        trainable: When False, the running statistics are never updated and
            are used for normalization even if ``training`` is True.
    """

    def __init__(self, insize, momentum=0.99, eps=0.01, trainable=True):
        self.insize = insize
        self.mm = momentum
        self.eps = eps
        self.trainable = trainable
        # Running statistics, initialised from the first batch seen.
        self._mean = None
        self._var = None

    def initialize(self, optimizer):
        """Create gamma/beta and give each its own copy of the optimizer."""
        self.gamma = np.ones(self.insize)
        self.beta = np.zeros(self.insize)
        # Separate copies so gamma and beta keep independent optimizer state.
        self.gamma_opt = copy.copy(optimizer)
        self.beta_opt = copy.copy(optimizer)

    def parameters(self):
        """Return the number of entries in gamma plus those in beta."""
        return np.prod(self.gamma.shape) + np.prod(self.beta.shape)

    def forward_pass(self, X, training=True):
        """Normalize X per feature (axis 0), then scale and shift.

        Args:
            X: Input batch; statistics are taken over axis 0.
            training: When True (and the layer is trainable) the current
                batch statistics are used and folded into the running
                averages; otherwise the running averages are used.

        Returns:
            ``gamma * X_norm + beta``.
        """
        # Initialize running mean and variance on the first run.
        if self._mean is None:
            self._mean = np.mean(X, axis=0)
            self._var = np.var(X, axis=0)

        if training and self.trainable:
            # Normalise with the statistics of this batch; the backward
            # pass differentiates through exactly these quantities.
            mean = np.mean(X, axis=0)
            var = np.var(X, axis=0)
            self._mean = self.mm * self._mean + (1 - self.mm) * mean
            self._var = self.mm * self._var + (1 - self.mm) * var
        else:
            # Inference (or frozen layer): use the running estimates.
            mean = self._mean
            var = self._var

        # Statistics saved for the backward pass.
        self.X_centered = X - mean
        self.stddev_inv = 1 / np.sqrt(var + self.eps)
        X_norm = self.X_centered * self.stddev_inv
        output = self.gamma * X_norm + self.beta
        return output

在批标准化的情况下,需要考虑多变量的情形。假设我们有一个函数 𝑧=𝑓(𝑥,𝑦),其中 𝑥=𝑔(𝑡)、𝑦=ℎ(𝑡),链式法则可以应用如下:

$$
\frac{d z}{d t}=\frac{\partial z}{\partial x} \cdot \frac{d x}{d t}+\frac{\partial z}{\partial y} \cdot \frac{d y}{d t}
$$
类似地,如果 𝑥=𝑔(𝑢,𝑣) 和 𝑦=ℎ(𝑢,𝑣) 各有两个变量,链式法则将是这样的:

$$
\frac{\partial z}{\partial u}=\frac{\partial z}{\partial x} \cdot \frac{\partial x}{\partial u}+\frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial u} \quad \& \quad \frac{\partial z}{\partial v}=\frac{\partial z}{\partial x} \cdot \frac{\partial x}{\partial v}+\frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial v}
$$

3-1. Scale and Shift


$$
\begin{array}{llll}
\frac{\partial f}{\partial \gamma} & =\frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \frac{\partial \hat{x}_{i}^{*}}{\partial \gamma} & \xrightarrow{\text{batch}} \ \sum_{i=1}^{n} \frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \hat{x}_{i} & \ldots[A] \\
\frac{\partial f}{\partial B} & =\frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \frac{\partial \hat{x}_{i}^{*}}{\partial B} & \xrightarrow{\text{batch}} \ \sum_{i=1}^{n} \frac{\partial f}{\partial \hat{x}_{i}^{*}} & \ldots[B] \\
\frac{\partial f}{\partial \hat{x}_{i}} & =\frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \frac{\partial \hat{x}_{i}^{*}}{\partial \hat{x}_{i}} & = \ \frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \gamma & \ldots(5)
\end{array}
$$

3-2. Derivative of Input 𝑥𝑖

$$
\frac{\partial f}{\partial x_{i}}=\frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \frac{\partial \hat{x}_{i}^{*}}{\partial \hat{x}_{i}} \cdot \frac{\partial \hat{x}_{i}}{\partial x_{i}}=\left(\frac{\partial f}{\partial \hat{x}_{i}^{*}} \cdot \gamma\right) \cdot \frac{\partial \hat{x}_{i}}{\partial x_{i}}
$$

$$
\hat{x}_{i}=\frac{x_{i}-\mu_{\beta}}{\sqrt{\sigma_{\beta}^{2}+\epsilon}}=f\left(z, \mu_{\beta}, \sigma_{\beta}^{2}\right) \quad \xrightarrow{\text{chain}} \quad \frac{d f}{d x_{i}}=\frac{\partial f}{\partial z} \cdot \frac{\partial z}{\partial x_{i}}+\frac{\partial f}{\partial \mu_{\beta}} \cdot \frac{\partial \mu_{\beta}}{\partial x_{i}}+\frac{\partial f}{\partial \sigma_{\beta}^{2}} \cdot \frac{\partial \sigma_{\beta}^{2}}{\partial x_{i}}
$$
$$
\begin{aligned}
\frac{\partial f}{\partial z} \cdot \frac{\partial z}{\partial x_{i}} &=\frac{1}{\sqrt{\sigma_{\beta}^{2}+\epsilon}} \cdot 1 \\
\frac{\partial f}{\partial \mu_{\beta}} \cdot \frac{\partial \mu_{\beta}}{\partial x_{i}} &=\frac{-1}{\sqrt{\sigma_{\beta}^{2}+\epsilon}} \cdot \sum_{j=1}^{n} \frac{\partial f}{\partial \hat{x}_{j}^{*}} \\
\frac{\partial f}{\partial \sigma_{\beta}^{2}} \cdot \frac{\partial \sigma_{\beta}^{2}}{\partial x_{i}} &=\frac{-1}{2}\left(x_{i}-\mu_{\beta}\right)\left(\sigma_{\beta}^{2}+\epsilon\right)^{-\frac{3}{2}} \times \frac{2}{n} \sum_{j=1}^{n} \frac{\partial f}{\partial \hat{x}_{j}^{*}} \cdot\left(x_{j}-\mu_{\beta}\right)
\end{aligned}
$$

$$
\frac{\partial f}{\partial x_{i}}=\gamma \cdot \frac{1}{N \sqrt{\sigma_{\beta}^{2}+\epsilon}} \cdot\left(N \cdot \frac{\partial f}{\partial \hat{x}_{i}^{*}}-\sum_{j=1}^{N} \frac{\partial f}{\partial \hat{x}_{j}^{*}}-\frac{x_{i}-\mu_{\beta}}{\sigma_{\beta}^{2}+\epsilon} \cdot \sum_{j=1}^{N} \frac{\partial f}{\partial \hat{x}_{j}^{*}}\left(x_{j}-\mu_{\beta}\right)\right) \quad \ldots[C]
$$

class BatchNormalization(BatchNorm):
    """Batch-normalization layer: BatchNorm forward plus a backward pass."""

    def __init__(self, insize, momentum=0.99, eps=0.01, trainable=True):
        super(BatchNormalization, self).__init__(
            insize, momentum, eps, trainable)

    def backward_pass(self, accum_grad):
        """Update gamma/beta and return the gradient w.r.t. the layer input.

        Args:
            accum_grad: Gradient of the loss w.r.t. this layer's output;
                axis 0 is the batch axis.

        Returns:
            Gradient of the loss w.r.t. the layer input, same shape as
            ``accum_grad``.
        """
        # Keep the gamma used in the forward pass; the optimizer step below
        # rebinds self.gamma.
        gamma = self.gamma

        # If the layer is trainable the parameters are updated.
        if self.trainable:
            X_norm = self.X_centered * self.stddev_inv
            grad_gamma = np.sum(accum_grad * X_norm, axis=0)  # ... [A]
            grad_beta = np.sum(accum_grad, axis=0)            # ... [B]
            self.gamma = self.gamma_opt.update(self.gamma, grad_gamma)
            self.beta = self.beta_opt.update(self.beta, grad_beta)

        batch_size = accum_grad.shape[0]

        # The gradient of the loss with respect to the layer inputs
        # (use weights and statistics from the forward pass).
        accum_grad = (1 / batch_size) * gamma * self.stddev_inv * (
            batch_size * accum_grad -
            np.sum(accum_grad, axis=0) -
            self.X_centered * self.stddev_inv**2 *
            np.sum(accum_grad * self.X_centered, axis=0))     # ... [C]

        # Without this return the previous layer would receive None.
        return accum_grad

4. Derivative of Bias & Weight Matrices

$$
\text{neuron}(x)=x \cdot w+b \quad \xrightarrow{\text{batch}} \quad f(w, b)=\sum_{j} x_{i j} \cdot w_{j}+b_{i}=y_{i}
$$



class dense(object):
    """Forward half of a fully connected layer: ``X.dot(w) + b``.

    Args:
        insize: Number of input features (rows of w).
        outsize: Number of output features (columns of w, length of b).
    """

    def __init__(self, insize, outsize):
        self.insize, self.outsize = insize, outsize
        # Parameters are created by initialize(); the input is cached by
        # forward_pass().
        self.w = None
        self.b = None
        self.layer_input = None

    def initialize(self, optimizer):
        """Draw the weights, zero the bias, and copy the optimizer twice."""
        # Weights are uniform in [-1/sqrt(insize), 1/sqrt(insize)].
        bound = 1 / np.sqrt(self.insize)
        self.w = np.random.uniform(-bound, bound,
                                   (self.insize, self.outsize))
        self.b = np.zeros((1, self.outsize))
        # Independent optimizer state for weights and bias.
        self.w_opt = copy.copy(optimizer)
        self.b_opt = copy.copy(optimizer)

    def parameters(self):
        """Return the number of entries in w plus those in b."""
        weight_count = np.prod(self.w.shape)
        bias_count = np.prod(self.b.shape)
        return weight_count + bias_count

    def forward_pass(self, X, training=True):
        """Cache X for the backward pass and return ``X.dot(w) + b``."""
        self.layer_input = X
        return np.dot(X, self.w) + self.b


$$
\begin{aligned}
\frac{\partial f}{\partial w} &=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial y_{i}}{\partial w}=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial}{\partial w}(x w+b)=\frac{\partial f}{\partial y_{i}} \cdot x & \xrightarrow{\text{batch}} \quad & \mathbf{x}^{T} \cdot \frac{\partial f}{\partial \mathbf{y}} & \ldots[D] \\
\frac{\partial f}{\partial b} &=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial y_{i}}{\partial b}=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial}{\partial b}(x w+b)=\frac{\partial f}{\partial y_{i}} \cdot 1 & \xrightarrow{\text{batch}} \quad & \sum_{i=1}^{n} \frac{\partial f}{\partial y_{i}} & \ldots[E]
\end{aligned}
$$


$$
\frac{\partial f}{\partial x}=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial y_{i}}{\partial x}=\frac{\partial f}{\partial y_{i}} \cdot \frac{\partial}{\partial x}(x w+b)=\frac{\partial f}{\partial y_{i}} \cdot w \quad \xrightarrow{\text{batch}} \quad \frac{\partial f}{\partial \mathbf{y}} \cdot \mathbf{w}^{T} \quad \ldots(6)
$$


class Dense(dense):
    """Fully connected layer: the dense forward pass plus a backward pass.

    Args:
        insize: Number of input features.
        outsize: Number of output features.
        trainable: When False, backward_pass leaves w and b untouched.
    """

    def __init__(self, insize, outsize, trainable=True):
        super(Dense, self).__init__(insize, outsize)
        self.trainable = trainable

    def backward_pass(self, accum_grad):
        """Update w/b (if trainable) and return the gradient w.r.t. input."""
        # Hold on to the weights used in the forward pass; the optimizer
        # step below rebinds self.w.
        forward_w = self.w

        if self.trainable:
            # Gradients w.r.t. the layer parameters.
            grad_w = np.dot(self.layer_input.T, accum_grad)      # ... [D]
            grad_b = np.sum(accum_grad, axis=0, keepdims=True)   # ... [E]

            # Apply the optimizer step to each parameter.
            self.w = self.w_opt.update(self.w, grad_w)
            self.b = self.b_opt.update(self.b, grad_b)

        # Pass the gradient on to the previous layer using the
        # forward-pass weights.
        return np.dot(accum_grad, forward_w.T)                   # ... (6)

5. Derivative of Activation Functions


5-1. Sigmoid

$$
\frac{d}{d z}\left(\frac{1}{1+e^{-z}}\right)=\frac{d}{d z}\left(1+e^{-z}\right)^{-1}=\left(1+e^{-z}\right)^{-2} \cdot e^{-z}=\sigma(z)(1-\sigma(z))
$$

class Sigmoid(object):
    """Logistic sigmoid and its derivative."""

    def __call__(self, x):
        """Return 1 / (1 + exp(-x)) element-wise."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def gradient(self, x):
        """Return sigmoid(x) * (1 - sigmoid(x)) element-wise."""
        s = self(x)
        return s * (1 - s)

5-2. Tanh


$$
\frac{d}{d x} \tanh x=1-\tanh^{2} x=\operatorname{sech}^{2} x=\frac{1}{\cosh^{2} x}
$$

class Tanh(object):
    """Hyperbolic tangent, written via the exponential, and its derivative."""

    def __call__(self, x):
        """Return 2 / (1 + exp(-2x)) - 1 element-wise."""
        decay = np.exp(-2 * x)
        return 2 / (1 + decay) - 1

    def gradient(self, x):
        """Return 1 - tanh(x)^2 element-wise."""
        value = self(x)
        return 1 - np.power(value, 2)

5-3. ReLU


class ReLU(object):
    """Rectified linear unit and its derivative."""

    def __call__(self, x):
        """Return x where x >= 0 and 0 elsewhere."""
        keep = x >= 0
        return np.where(keep, x, 0)

    def gradient(self, x):
        """Return 1 where x >= 0 and 0 elsewhere."""
        keep = x >= 0
        return np.where(keep, 1, 0)

5-4. Leaky ReLU


class LeakyReLU(object):
    """Leaky ReLU: identity for x >= 0, slope ``alpha`` for x < 0.

    Args:
        alpha: Slope applied to negative inputs.
    """

    def __init__(self, alpha=0.2):
        self.alpha = alpha

    def __call__(self, x):
        """Return x where x >= 0 and alpha * x elsewhere."""
        leaked = self.alpha * x
        return np.where(x >= 0, x, leaked)

    def gradient(self, x):
        """Return 1 where x >= 0 and alpha elsewhere."""
        non_negative = x >= 0
        return np.where(non_negative, 1, self.alpha)

5-5. ELU


$$
\frac{d}{d x}\, \alpha\left(e^{x}-1\right)=\alpha \cdot e^{x}=\mathrm{ELU}(x)+\alpha \qquad (x<0)
$$

class ELU(object):
    """Exponential linear unit and its derivative.

    Args:
        alpha: Scale of the exponential branch used for negative inputs.
    """

    def __init__(self, alpha=0.1):
        self.alpha = alpha

    def __call__(self, x):
        """Return x where x >= 0 and alpha * (exp(x) - 1) elsewhere."""
        negative_branch = self.alpha * (np.exp(x) - 1)
        return np.where(x >= 0.0, x, negative_branch)

    def gradient(self, x):
        """Return 1 where x >= 0 and ELU(x) + alpha elsewhere."""
        # For x < 0, alpha * exp(x) equals ELU(x) + alpha.
        negative_slope = self(x) + self.alpha
        return np.where(x >= 0.0, 1, negative_slope)

$$
\text{activation}(x)=\cdots=\text{out} \quad \rightarrow \quad \frac{\partial\, \text{activation}(x)}{\partial x}=\frac{\partial\, \text{activation}(x)}{\partial\, \text{out}} \cdot \frac{\partial\, \text{out}}{\partial x}
$$

class Activation(object):
    """Layer wrapper around an activation object.

    Args:
        func: Object that is callable on an input and also provides a
            ``gradient(x)`` method.
    """

    def __init__(self, func):
        self.function = func
        self.trainable = True

    def layer_name(self):
        """Return a label that includes the wrapped object's class name."""
        wrapped = self.function.__class__.__name__
        return "Activation (%s)" % (wrapped)

    def forward_pass(self, X, training=True):
        """Cache X for the backward pass and return the activation of X."""
        self.layer_input = X
        return self.function(X)

    def backward_pass(self, accum_grad):
        """Scale the incoming gradient by the activation's derivative."""
        local_grad = self.function.gradient(self.layer_input)
        return accum_grad * local_grad

Network Construction


# Three stages of Dense -> BatchNormalization -> Activation. Every
# parameterised layer is initialised with its own Adam instance.

# Stage 1: 64 -> 32, sigmoid activation.
linear1 = Dense(64, 32)
linear1.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
bn1 = BatchNormalization(32, momentum=0.99, eps=0.01)
bn1.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
a1 = Activation(Sigmoid())

# Stage 2: 32 -> 16, ReLU activation.
linear2 = Dense(32, 16)
linear2.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
bn2 = BatchNormalization(16, momentum=0.99, eps=0.01)
bn2.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
a2 = Activation(ReLU())

# Stage 3: 16 -> 10, softmax output. The closing parenthesis of the
# Activation(...) call was missing, which made the listing a syntax error.
linear3 = Dense(16, 10)
linear3.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
bn3 = BatchNormalization(10, momentum=0.99, eps=0.01)
bn3.initialize(Adam(learning_rate=0.001, b1=0.9, b2=0.999))
a3 = Activation(Softmax())


1. Forward & Backward


def forward(data):
    """Run data through all nine module-level layers in construction order.

    Each layer's forward_pass is called with its default ``training``
    argument; the output of the final (softmax) activation is returned.
    """
    stages = (linear1, bn1, a1,
              linear2, bn2, a2,
              linear3, bn3, a3)
    out = data
    for stage in stages:
        out = stage.forward_pass(out)
    return out
(3, 10)


# Loss gradient and summed loss for the current predictions.
# NOTE(review): `y_pred` is whatever was assigned last above this point;
# presumably it should come from forward(data) -- confirm.
ce = CrossEntropyLoss()
y = one_hot(labels, class_num=10)
loss_grad = ce.gradient(y, y_pred)
print('loss sum: {}'.format(ce(y, y_pred).sum()))
loss sum: 6.5744415695544305


def backward(loss_grad):
    """Propagate loss_grad through the nine layers in reverse order.

    Each layer's backward_pass receives the gradient produced by the layer
    after it; the value returned by the first layer (linear1) is returned.
    """
    stages = (a3, bn3, linear3,
              a2, bn2, linear2,
              a1, bn1, linear1)
    grad = loss_grad
    for stage in stages:
        grad = stage.backward_pass(grad)
    return grad
(3, 64)

2. Train & Predict


# Whole dataset this time: scale by the global pixel maximum and flatten
# every image into one row.
data = nums_8x8.images / nums_8x8.images.max()
data = data.reshape(data.shape[0], -1)
labels = nums_8x8.target


# First 1500 samples for training, the remainder for validation.
data_train, data_val = data[:1500], data[1500:]
label_train, label_val = labels[:1500], labels[1500:]
# Accuracy of the network on the validation split before training.
print('Accuracy: {}%'.format(100 * np.sum(
    forward(data_val).argmax(axis=1) == label_val) / len(label_val)))
Accuracy: 11.784511784511784%


# The one-hot targets are the same in every iteration, so build them once.
y = one_hot(label_train, class_num=10)
for i in range(5000):
    # Forward pass through each layer.
    y_pred = forward(data_train)
    # Get the gradient of the loss function.
    loss_grad = ce.gradient(y, y_pred)
    # Backward pass to update parameters.
    backward(loss_grad)
    # Report the summed loss every 500 iterations. This `if` was fused onto
    # the previous line, which made the loop a syntax error.
    if i % 500 == 0:
        print('loss sum: {}'.format(np.sum(ce(y, y_pred))))
loss sum: 5320.774219742432
loss sum: 561.0516403743936
loss sum: 333.3261353080994
loss sum: 217.5184718949811
loss sum: 151.71408069091297
loss sum: 110.50518695024523
loss sum: 83.06328142549991
loss sum: 63.77920909272024
loss sum: 49.7062883092378
loss sum: 39.07595031622607

3. Evaluation


# Validation accuracy after training: share of rows whose arg-max class
# matches the label, as a percentage.
print('Accuracy: {}%'.format(100 * np.sum(
    forward(data_val).argmax(axis=1) == label_val) / len(label_val)))
Accuracy: 92.5925925925926%


4. Parameter Statistic


# Parameter counts per layer.
# NOTE(review): only the first two stages are listed and summed; linear3
# and bn3 are not part of the total -- confirm that is intended.
print('linear 1     : {}'.format(linear1.parameters()))
print('Batch Norm 1 : {}'.format(bn1.parameters()))
print('linear 2     : {}'.format(linear2.parameters()))
print('Batch Norm 2 : {}'.format(bn2.parameters()))
print('totally    --> {}'.format(sum(
    layer.parameters() for layer in (linear1, bn1, linear2, bn2))))
linear 1     : 2080
Batch Norm 1 : 64
linear 2     : 528
Batch Norm 2 : 32
totally    --> 2704


Batch Norm: What does the gradient flowing through batch normalization look like?
Batch Norm: Deriving the Gradient for the Backward Pass of Batch Normalization

