The Backpropagation Algorithm (Process and Derivation): Layer-by-Layer Backpropagation Formulas for Convolutional Neural Networks

1. Forward Propagation

Input layer:


Hidden layers:
Second layer:


Third layer:


Output layer:

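In formulas (a minimal sketch, assuming a fully-connected network where the input is counted as the first layer to match the layer labels above, followed by two hidden layers and the output layer; $W^{(l)}$, $b^{(l)}$, $f^{(l)}$ denote the weights, bias, and activation of layer $l$, as in the derivation at the end of this post), the forward pass is:

$$y^{(1)}=x$$

$$z^{(2)}=W^{(2)}y^{(1)}+b^{(2)},\qquad y^{(2)}=f^{(2)}(z^{(2)})$$

$$z^{(3)}=W^{(3)}y^{(2)}+b^{(3)},\qquad y^{(3)}=f^{(3)}(z^{(3)})$$

$$z^{(4)}=W^{(4)}y^{(3)}+b^{(4)},\qquad \hat{y}=y^{(4)}=f^{(4)}(z^{(4)})$$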

2. Backpropagation


The backpropagation algorithm


Reference: http://neuralnetworksanddeeplearning.com/chap2.html
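The referenced chapter summarizes backpropagation in four fundamental equations. As a quick recap (with $\delta^{l}$ the error of layer $l$, $a^{l}$ its activation, $z^{l}$ its weighted input, $\sigma$ the activation function, and $C$ the cost):

$$\delta^{L}=\nabla_{a}C\odot\sigma'(z^{L})$$

$$\delta^{l}=\left((w^{l+1})^{T}\delta^{l+1}\right)\odot\sigma'(z^{l})$$

$$\frac{\partial C}{\partial b_{j}^{l}}=\delta_{j}^{l}$$

$$\frac{\partial C}{\partial w_{jk}^{l}}=a_{k}^{l-1}\delta_{j}^{l}$$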

```python
import numpy as np
# im2col_indices / col2im_indices are helper functions (not defined here) that
# unfold image patches into columns and fold them back.

def conv_forward(X, W, b, stride=1, padding=1):
    n_filters, d_filter, h_filter, w_filter = W.shape
    n_x, d_x, h_x, w_x = X.shape
    h_out = (h_x - h_filter + 2 * padding) / stride + 1
    w_out = (w_x - w_filter + 2 * padding) / stride + 1

    if not h_out.is_integer() or not w_out.is_integer():
        raise Exception('Invalid output dimension!')

    h_out, w_out = int(h_out), int(w_out)

    # im2col: unfold every receptive field into a column so the convolution
    # becomes a single matrix multiplication
    X_col = im2col_indices(X, h_filter, w_filter, padding=padding, stride=stride)
    W_col = W.reshape(n_filters, -1)

    out = W_col @ X_col + b
    out = out.reshape(n_filters, h_out, w_out, n_x)
    out = out.transpose(3, 0, 1, 2)

    cache = (X, W, b, stride, padding, X_col)

    return out, cache


def conv_backward(dout, cache):
    X, W, b, stride, padding, X_col = cache
    n_filter, d_filter, h_filter, w_filter = W.shape

    # gradient w.r.t. the bias: sum over batch and spatial positions
    db = np.sum(dout, axis=(0, 2, 3))
    db = db.reshape(n_filter, -1)

    dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(n_filter, -1)

    # gradient w.r.t. the filters
    dW = dout_reshaped @ X_col.T
    dW = dW.reshape(W.shape)

    # gradient w.r.t. the input, folded back to image shape with col2im
    W_reshape = W.reshape(n_filter, -1)
    dX_col = W_reshape.T @ dout_reshaped
    dX = col2im_indices(dX_col, X.shape, h_filter, w_filter, padding=padding, stride=stride)

    return dX, dW, db
```
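A quick shape check for the two routines above (a sketch; it assumes `conv_forward`/`conv_backward` live in a module that also provides the `im2col_indices`/`col2im_indices` helpers):

```python
import numpy as np

X = np.random.randn(2, 3, 8, 8)     # batch of 2 images, 3 channels, 8x8
W = np.random.randn(4, 3, 3, 3)     # 4 filters of size 3x3 over 3 channels
b = np.random.randn(4, 1)           # one bias per filter, stored as a column

out, cache = conv_forward(X, W, b, stride=1, padding=1)
print(out.shape)                    # (2, 4, 8, 8): "same" padding keeps H and W

dout = np.random.randn(*out.shape)  # pretend upstream gradient
dX, dW, db = conv_backward(dout, cache)
print(dX.shape, dW.shape, db.shape) # (2, 3, 8, 8) (4, 3, 3, 3) (4, 1)
```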
```python
def sigmoid_forward(X):
    # util.sigmoid is an external helper: 1 / (1 + exp(-X))
    out = util.sigmoid(X)
    cache = out
    return out, cache


def sigmoid_backward(dout, cache):
    # sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)); sigmoid(x) is cached
    return cache * (1. - cache) * dout


def relu_forward(X):
    out = np.maximum(X, 0)
    cache = X
    return out, cache


def relu_backward(dout, cache):
    # the gradient passes through only where the input was positive
    dX = dout.copy()
    dX[cache <= 0] = 0
    return dX


def lrelu_forward(X, a=1e-3):
    out = np.maximum(a * X, X)
    cache = (X, a)
    return out, cache


def lrelu_backward(dout, cache):
    X, a = cache
    dX = dout.copy()
    dX[X < 0] *= a
    return dX


def tanh_forward(X):
    out = np.tanh(X)
    cache = out
    return out, cache


def tanh_backward(dout, cache):
    # tanh'(x) = 1 - tanh(x)^2
    dX = (1 - cache**2) * dout
    return dX
```
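A minimal numerical gradient check for the tanh pair above (a sketch; only NumPy is assumed):

```python
import numpy as np

def num_grad(f, X, dout, eps=1e-5):
    # central-difference estimate of d(sum(f(X) * dout)) / dX
    grad = np.zeros_like(X)
    for idx, old in np.ndenumerate(X):
        X[idx] = old + eps
        fp = np.sum(f(X) * dout)
        X[idx] = old - eps
        fm = np.sum(f(X) * dout)
        X[idx] = old
        grad[idx] = (fp - fm) / (2 * eps)
    return grad

X = np.random.randn(3, 4)
dout = np.random.randn(3, 4)

out, cache = tanh_forward(X)
analytic = tanh_backward(dout, cache)
numeric = num_grad(np.tanh, X, dout)
print(np.max(np.abs(analytic - numeric)))   # should be tiny (around 1e-9 or less)
```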
```python
def dropout_forward(X, p_dropout):
    # Inverted dropout: p_dropout is used here as the *keep* probability,
    # and surviving units are rescaled so no change is needed at test time.
    u = np.random.binomial(1, p_dropout, size=X.shape) / p_dropout
    out = X * u
    cache = u
    return out, cache


def dropout_backward(dout, cache):
    # the same mask that dropped units in the forward pass gates the gradient
    dX = dout * cache
    return dX
```
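A small usage sketch, showing that kept units are scaled up by `1 / p_dropout` so the expected activation is unchanged:

```python
import numpy as np

np.random.seed(0)
X = np.ones((4, 5))

out, mask = dropout_forward(X, p_dropout=0.8)   # keep each unit with prob. 0.8
print(out)                   # surviving entries are scaled up to 1 / 0.8 = 1.25
print(np.mean(out))          # close to 1 in expectation

dout = np.ones_like(X)
print(dropout_backward(dout, mask))   # gradient flows only through kept units
```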
```python
def bn_forward(X, gamma, beta, cache, momentum=.9, train=True):
    # c.eps and util.exp_running_avg are external helpers assumed by this
    # snippet: a small constant for numerical stability and an exponential
    # moving average, respectively.
    running_mean, running_var = cache

    if train:
        mu = np.mean(X, axis=0)
        var = np.var(X, axis=0)

        X_norm = (X - mu) / np.sqrt(var + c.eps)
        out = gamma * X_norm + beta

        cache = (X, X_norm, mu, var, gamma, beta)

        running_mean = util.exp_running_avg(running_mean, mu, momentum)
        running_var = util.exp_running_avg(running_var, var, momentum)
    else:
        # at test time, normalize with the running statistics
        X_norm = (X - running_mean) / np.sqrt(running_var + c.eps)
        out = gamma * X_norm + beta
        cache = None

    return out, cache, running_mean, running_var


def bn_backward(dout, cache):
    X, X_norm, mu, var, gamma, beta = cache

    N, D = X.shape

    X_mu = X - mu
    std_inv = 1. / np.sqrt(var + c.eps)

    dX_norm = dout * gamma
    dvar = np.sum(dX_norm * X_mu, axis=0) * -.5 * std_inv**3
    dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. * X_mu, axis=0)

    dX = (dX_norm * std_inv) + (dvar * 2 * X_mu / N) + (dmu / N)
    dgamma = np.sum(dout * X_norm, axis=0)
    dbeta = np.sum(dout, axis=0)

    return dX, dgamma, dbeta
```
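The backward pass above implements the standard batch-normalization gradients. Written out, with $\hat{x}_i$ the normalized input, $\mu$ and $\sigma^2$ the batch mean and variance, and $N$ the batch size (the symbols match `X_norm`, `mu`, `var`, `N` in the code):

$$\frac{\partial J}{\partial \gamma}=\sum_i \frac{\partial J}{\partial y_i}\,\hat{x}_i,\qquad \frac{\partial J}{\partial \beta}=\sum_i \frac{\partial J}{\partial y_i},\qquad \frac{\partial J}{\partial \hat{x}_i}=\frac{\partial J}{\partial y_i}\,\gamma$$

$$\frac{\partial J}{\partial \sigma^2}=\sum_i \frac{\partial J}{\partial \hat{x}_i}\,(x_i-\mu)\left(-\tfrac{1}{2}\right)(\sigma^2+\epsilon)^{-3/2}$$

$$\frac{\partial J}{\partial \mu}=\sum_i \frac{\partial J}{\partial \hat{x}_i}\,\frac{-1}{\sqrt{\sigma^2+\epsilon}}+\frac{\partial J}{\partial \sigma^2}\,\frac{1}{N}\sum_i -2(x_i-\mu)$$

$$\frac{\partial J}{\partial x_i}=\frac{\partial J}{\partial \hat{x}_i}\,\frac{1}{\sqrt{\sigma^2+\epsilon}}+\frac{\partial J}{\partial \sigma^2}\,\frac{2(x_i-\mu)}{N}+\frac{\partial J}{\partial \mu}\,\frac{1}{N}$$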
```python
def fc_forward(X, W, b):
    out = X @ W + b
    cache = (W, X)
    return out, cache


def fc_backward(dout, cache):
    W, h = cache

    dW = h.T @ dout            # gradient w.r.t. the weights
    db = np.sum(dout, axis=0)  # gradient w.r.t. the bias
    dX = dout @ W.T            # gradient passed to the previous layer

    return dX, dW, db
```
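In matrix form, for $\text{out}=XW+b$ with upstream gradient $G=\frac{\partial J}{\partial \text{out}}$, the backward pass above computes

$$\frac{\partial J}{\partial W}=X^{T}G,\qquad \frac{\partial J}{\partial b}=\sum_{i}G_{i,:},\qquad \frac{\partial J}{\partial X}=GW^{T}$$

which is exactly `dW = h.T @ dout`, `db = np.sum(dout, axis=0)` and `dX = dout @ W.T`, with `h` being the cached input `X`.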
```python
def maxpool_forward(X, size=2, stride=2):
    def maxpool(X_col):
        # remember which element of each pooling window was the maximum
        max_idx = np.argmax(X_col, axis=0)
        out = X_col[max_idx, range(max_idx.size)]
        return out, max_idx

    return _pool_forward(X, maxpool, size, stride)


def maxpool_backward(dout, cache):
    def dmaxpool(dX_col, dout_col, pool_cache):
        # route the gradient only to the position of the forward-pass maximum
        dX_col[pool_cache, range(dout_col.size)] = dout_col
        return dX_col

    return _pool_backward(dout, dmaxpool, cache)


def avgpool_forward(X, size=2, stride=2):
    def avgpool(X_col):
        out = np.mean(X_col, axis=0)
        cache = None
        return out, cache

    return _pool_forward(X, avgpool, size, stride)


def avgpool_backward(dout, cache):
    def davgpool(dX_col, dout_col, pool_cache):
        # spread the gradient evenly over every position in the window
        dX_col[:, range(dout_col.size)] = 1. / dX_col.shape[0] * dout_col
        return dX_col

    return _pool_backward(dout, davgpool, cache)


def _pool_forward(X, pool_fun, size=2, stride=2):
    n, d, h, w = X.shape
    h_out = (h - size) / stride + 1
    w_out = (w - size) / stride + 1

    if not w_out.is_integer() or not h_out.is_integer():
        raise Exception('Invalid output dimension!')

    h_out, w_out = int(h_out), int(w_out)

    # treat every channel as an independent single-channel image, then im2col
    X_reshaped = X.reshape(n * d, 1, h, w)
    X_col = im2col_indices(X_reshaped, size, size, padding=0, stride=stride)

    out, pool_cache = pool_fun(X_col)

    out = out.reshape(h_out, w_out, n, d)
    out = out.transpose(2, 3, 0, 1)

    cache = (X, size, stride, X_col, pool_cache)

    return out, cache


def _pool_backward(dout, dpool_fun, cache):
    X, size, stride, X_col, pool_cache = cache
    n, d, h, w = X.shape

    dX_col = np.zeros_like(X_col)
    dout_col = dout.transpose(2, 3, 0, 1).ravel()

    dX_col = dpool_fun(dX_col, dout_col, pool_cache)

    dX = col2im_indices(dX_col, (n * d, 1, h, w), size, size, padding=0, stride=stride)
    dX = dX.reshape(X.shape)

    return dX
```
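A quick shape check for max pooling (a sketch; as with the convolution example, it assumes the pooling functions above are defined alongside the im2col helpers):

```python
import numpy as np

X = np.random.randn(2, 3, 8, 8)
out, cache = maxpool_forward(X, size=2, stride=2)
print(out.shape)                    # (2, 3, 4, 4): each 2x2 window -> one value

dout = np.random.randn(*out.shape)
dX = maxpool_backward(dout, cache)
print(dX.shape)                     # (2, 3, 8, 8); gradient lands only on the maxima
```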
The backpropagation algorithm is used to train neural networks. Its core idea is to apply the chain rule to compute the gradient of the loss function with respect to every parameter, and then optimize those parameters with gradient descent. The procedure breaks down into four steps:

1. Forward propagation: push the input through the network and compute the network's output.
2. Compute the error: compare the network's output with the true label to obtain the loss.
3. Backpropagation: starting from the output layer, use the chain rule to compute the gradient of the loss with respect to every parameter.
4. Update the parameters: apply gradient descent to each parameter using its gradient.

Concretely, the derivation goes as follows. Assume a network with $L$ layers, where layer $l$ has weights $W^{(l)}$, bias $b^{(l)}$, and activation function $f^{(l)}$; the input is $x$ and the output is $y$.

The forward pass can be written as

$$z^{(l)}=W^{(l)}y^{(l-1)}+b^{(l)}$$

$$y^{(l)}=f^{(l)}(z^{(l)})$$

where $y^{(0)}=x$.

Computing the error: since the goal is to minimize the error at the output layer, define a loss function $J(y^{(L)},y^{*})$, where $y^{*}$ is the true label. We look for parameters $W^{(1)},\dots,W^{(L)},b^{(1)},\dots,b^{(L)}$ that minimize the loss:

$$\min_{W^{(1)},\dots,W^{(L)},b^{(1)},\dots,b^{(L)}} J(y^{(L)},y^{*})$$

By the chain rule, the gradient of the loss with respect to each parameter is

$$\frac{\partial J}{\partial W^{(l)}}=\frac{\partial J}{\partial z^{(L)}}\frac{\partial z^{(L)}}{\partial y^{(L-1)}}\frac{\partial y^{(L-1)}}{\partial z^{(L-1)}}\cdots\frac{\partial z^{(l)}}{\partial W^{(l)}}$$

$$\frac{\partial J}{\partial b^{(l)}}=\frac{\partial J}{\partial z^{(L)}}\frac{\partial z^{(L)}}{\partial y^{(L-1)}}\frac{\partial y^{(L-1)}}{\partial z^{(L-1)}}\cdots\frac{\partial z^{(l)}}{\partial b^{(l)}}$$

Here $\frac{\partial J}{\partial z^{(L)}}$ is the error at the output layer, obtained from the definition of the loss function; $\frac{\partial z^{(L)}}{\partial y^{(L-1)}}$ is the weight matrix of the output layer, known from the forward pass; $\frac{\partial y^{(L-1)}}{\partial z^{(L-1)}}$ is the derivative of the activation function of layer $L-1$, which depends on the chosen activation; and $\frac{\partial z^{(l)}}{\partial W^{(l)}}$ and $\frac{\partial z^{(l)}}{\partial b^{(l)}}$, the local gradients with respect to layer $l$'s weights and bias, also follow directly from the forward pass. With these gradients, every parameter can be updated by gradient descent. This is the core idea of the backpropagation algorithm.
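As a minimal, self-contained illustration of the four steps above (forward pass, error, backward pass, parameter update), here is a sketch of a gradient-descent loop for a tiny two-layer network on a toy regression problem; the architecture, data, and hyperparameters are made up for the example:

```python
import numpy as np

np.random.seed(0)

# toy data: y = sum(x) plus a little noise (made-up example)
X = np.random.randn(64, 3)
y = X.sum(axis=1, keepdims=True) + 0.01 * np.random.randn(64, 1)

# parameters of a 3 -> 8 -> 1 network
W1, b1 = 0.5 * np.random.randn(3, 8), np.zeros(8)
W2, b2 = 0.5 * np.random.randn(8, 1), np.zeros(1)
lr = 0.1

for step in range(500):
    # 1. forward propagation
    z1 = X @ W1 + b1
    h1 = np.maximum(z1, 0)                # ReLU hidden layer
    out = h1 @ W2 + b2

    # 2. error (mean squared loss)
    loss = np.mean((out - y) ** 2)

    # 3. backpropagation via the chain rule
    dout = 2 * (out - y) / len(X)         # dJ/dout
    dW2 = h1.T @ dout
    db2 = dout.sum(axis=0)
    dh1 = dout @ W2.T
    dz1 = dh1 * (z1 > 0)                  # ReLU derivative
    dW1 = X.T @ dz1
    db1 = dz1.sum(axis=0)

    # 4. gradient-descent update
    W1 -= lr * dW1; b1 -= lr * db1
    W2 -= lr * dW2; b2 -= lr * db2

    if step % 100 == 0:
        print(step, loss)                 # the loss should decrease steadily
```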