Comparing Numerical Differentiation and Error Backpropagation

Numerical differentiation is very slow to compute, but it is simple to implement and unlikely to contain mistakes. Error backpropagation, on the other hand, is more complex to implement and easy to get wrong, so the results of numerical differentiation and backpropagation are often compared in order to confirm that the backpropagation logic we wrote is correct. This procedure is called a gradient check.
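As a minimal standalone illustration of the idea (separate from the network code below; the function f and the helper numerical_grad here are purely for demonstration and not part of the original code), the sketch compares the analytic gradient of a toy function f(w) = Σ w² with a central-difference estimate; if the analytic gradient is correct, the average absolute difference is essentially zero.

import numpy as np

def f(w):
    # toy "loss": f(w) = sum(w ** 2); its analytic gradient is 2 * w
    return np.sum(w ** 2)

def numerical_grad(f, w, h=1e-4):
    # central-difference estimate of the gradient, one element at a time
    grad = np.zeros_like(w)
    for i in range(w.size):
        tmp = w.flat[i]
        w.flat[i] = tmp + h
        fxh1 = f(w)
        w.flat[i] = tmp - h
        fxh2 = f(w)
        grad.flat[i] = (fxh1 - fxh2) / (2 * h)
        w.flat[i] = tmp  # restore the original value
    return grad

w = np.random.randn(3)
analytic = 2 * w
diff = np.average(np.abs(analytic - numerical_grad(f, w)))
print(diff)  # very close to 0 when the analytic gradient is correct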

The implementation is as follows:

from collections import OrderedDict
import numpy as np
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch

# Load the MNIST data
train_datasets = dsets.MNIST('data',
                             train=True,
                             transform=transforms.ToTensor(),
                             download=False)
test_datasets = dsets.MNIST('data',
                            train=False,
                            transform=transforms.ToTensor(),
                            download=False)


class Affine:
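    # Fully connected (affine) layer: forward computes out = np.dot(x, W) + b;
    # backward returns dx = np.dot(dout, W.T) and stores dW = np.dot(x.T, dout)
    # and db = np.sum(dout, axis=0) for the gradient check / optimizer.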
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx


class Relu():
    def __init__(self):
        self.x = None

    def forward(self, x):
        self.x = np.maximum(0, x)
        out = self.x

        return out

    # dout is the gradient passed back from the next layer
    def backward(self, dout):
        dx = dout
        # the gradient is 0 wherever the ReLU output was 0 (i.e. the input was <= 0)
        dx[self.x <= 0] = 0
        return dx


class _sigmoid:
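    # Sigmoid layer: forward stores out = 1 / (1 + exp(-x));
    # backward uses the identity sigmoid'(x) = out * (1 - out).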
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * self.out * (1 - self.out)
        return dx


# Definition of the softmax activation function
def _softmax(x):
    if x.ndim == 2:
        # For a batch, x has shape (batch_size, num_classes).
        # Subtract the maximum of each row to avoid overflow in np.exp;
        # transposing lets the per-row maxima broadcast correctly.
        D = np.max(x, axis=1)
        x = x.T - D  # overflow countermeasure
        # axis=0 sums over the columns of the transposed array,
        # i.e. over the class scores of each sample
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    D = np.max(x)
    exp_x = np.exp(x - D)
    return exp_x / np.sum(exp_x)


def cross_entropy_error(p, y):
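    # average cross-entropy over the batch; delta keeps np.log away from log(0)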
    delta = 1e-7
    batch_size = p.shape[0]
    return np.sum(-y * np.log(p + delta)) / batch_size


def numberical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        # current index
        idx = it.multi_index
        # current value
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = float(tmp_val) - h
        fxh2 = f(x)  # f(x-h)
        # central difference formula for the derivative
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val  # restore the original value
        it.iternext()

    return grad


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.p = None
        self.y = None

    def forward(self, x, y):
        self.y = y
        self.p = _softmax(x)
        self.loss = cross_entropy_error(self.p, self.y)
        return self.loss

    def backward(self, dout=1):
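        # with softmax combined with cross-entropy, the gradient w.r.t. the input
        # simplifies to (p - y); dividing by batch_size matches the averaging
        # done in cross_entropy_error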
        batch_size = self.y.shape[0]
        dx = (self.p - self.y) / batch_size
        return dx


class TowLayerNet:
    # weight_init_std=0.01 keeps the initial weights small
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Initialize the weights
        self.params = {}
        # Weights between the input layer and the hidden layer
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        # Weights between the hidden layer and the output layer
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers in order
        self.layer = OrderedDict()
        self.layer['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layer['Relu1'] = Relu()
        self.layer['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.layer['Relu2'] = Relu()
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layer.values():
            # call each layer's forward method in insertion order (Affine1 first)
            x = layer.forward(x)
        return x

    # x: input data, y: supervised (label) data
    def loss(self, x, y):
        # forward pass to get the predictions
        p = self.predict(x)
        # the SoftmaxWithLoss layer computes the loss
        return self.lastLayer.forward(p, y)

    def accuracy(self, x, y):
        p = self.predict(x)
        # np.argmax with axis=1 returns the index of the maximum in each row,
        # i.e. the predicted class of each sample
        p = np.argmax(p, axis=1)
        if y.ndim != 1: y = np.argmax(y, axis=1)

        accuracy = np.sum(p == y) / float(x.shape[0]) * 100
        return accuracy

    # x: input data, y: supervised (label) data
    def numberical_gradient(self, x, y):
        loss_W = lambda W: self.loss(x, y)
        # equivalent to: def loss_W(W):
        #                    return self.loss(x, y)

        grads = {}
        grads['W1'] = numberical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numberical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numberical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numberical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, y):
        # forward
        self.loss(x, y)
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)

        layers = list(self.layer.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # collect the gradients computed by each Affine layer
        grads = {}
        grads['W1'], grads['b1'] = self.layer['Affine1'].dW, self.layer['Affine1'].db
        grads['W2'], grads['b2'] = self.layer['Affine2'].dW, self.layer['Affine2'].db

        return grads


x_train = train_datasets.train_data.numpy().reshape(-1, 28 * 28)
# reshape the labels into a column vector
y_train_tmp = train_datasets.train_labels.reshape(train_datasets.train_labels.shape[0], 1)
# convert to one-hot encoding
y_train = torch.zeros(y_train_tmp.shape[0], 10).scatter_(1, y_train_tmp, 1).numpy()
x_test = test_datasets.test_data.numpy().reshape(-1, 28 * 28)
y_test_tmp = test_datasets.test_labels.reshape(test_datasets.test_labels.shape[0], 1)
# convert to one-hot encoding
y_test = torch.zeros(y_test_tmp.shape[0], 10).scatter_(1, y_test_tmp, 1).numpy()

network = TowLayerNet(input_size=784, hidden_size=50, output_size=10)
x_batch = x_train[:100]
y_batch = y_train[:100]
grad_numberical = network.numberical_gradient(x_batch, y_batch)
grad_backprop = network.gradient(x_batch, y_batch)

# average absolute difference between the backprop gradient and the numerical
# gradient for each parameter; values close to 0 indicate backprop is correct
for key in grad_numberical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numberical[key]))
    print(key + ":" + str(diff))

Running results:

The OrderedDict class used in the code is an ordered dictionary: "ordered" means it remembers the order in which elements were added to it. Thanks to this, the forward pass of the network only needs to call each layer's forward method in the order the layers were added, and conversely, error backpropagation only needs to call each layer's backward method in the reverse of that order.
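As a small standalone illustration (the layer names and placeholder values below are arbitrary and not part of the network code above), this sketch shows how OrderedDict preserves insertion order, which is what lets predict walk the layers forwards and gradient walk them in reverse:

from collections import OrderedDict

layers = OrderedDict()
layers['Affine1'] = 'affine1'
layers['Relu1'] = 'relu1'
layers['Affine2'] = 'affine2'

# forward pass: iterate in the order the layers were added
print(list(layers.keys()))  # ['Affine1', 'Relu1', 'Affine2']

# backward pass: iterate in the reverse order, as gradient() does above
reversed_layers = list(layers.values())
reversed_layers.reverse()
print(reversed_layers)  # ['affine2', 'relu1', 'affine1']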
