Backpropagation Based on Numerical Differentiation
In this section we implement handwritten-digit recognition with gradients computed by numerical differentiation, using mini-batches to improve computational efficiency. The optimization method is stochastic gradient descent (SGD), where "stochastic" means that each small batch is drawn at random from the training data.
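Conceptually, one SGD step draws a random mini-batch and moves the parameters a small step against the gradient computed on that batch alone. A minimal sketch of the idea (theta, lr, and the toy data are placeholders, and the "gradient" is faked with a batch mean; the real gradient comes from the numerical differentiation in step 2):
import numpy as np

theta = np.zeros(5)                         # parameters to optimize (toy example)
lr = 0.001                                  # learning rate
data = np.random.randn(1000, 5)             # toy training data

indices = np.random.choice(len(data), 10)   # "stochastic": a random mini-batch of 10
batch = data[indices]
grad = batch.mean(axis=0)                   # placeholder for the gradient on this batch
theta -= lr * grad                          # the gradient-descent update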
Step 1: define the activation functions. We use softmax and ReLU; the code is as follows:
import numpy as np

# Definition of the ReLU activation function
def Relu(in_data):
    y = np.maximum(0, in_data)
    return y

# Definition of the softmax activation function
def _softmax(x):
    if x.ndim == 2:
        # x is a 2-D array of shape (rows, columns), one sample per row;
        # np.max(x, axis=1) takes the maximum of each row, i.e. the
        # largest score of each sample
        D = np.max(x, axis=1)
        # subtracting each sample's maximum avoids overflow in np.exp;
        # transposing x lets the per-row maxima broadcast correctly
        x = x.T - D  # overflow countermeasure
        # after the transpose each column is one sample, so summing with
        # axis=0 normalizes each sample's exponentials to 1
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    # 1-D case: a single sample
    D = np.max(x)
    exp_x = np.exp(x - D)
    return exp_x / np.sum(exp_x)
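A quick sanity check of the two activations (an illustrative snippet, not part of the training code):
x = np.array([[1.0, -2.0, 3.0],
              [0.5,  0.5, -1.0]])
print(Relu(x))          # negative entries are clipped to 0
p = _softmax(x)
print(p.sum(axis=1))    # each row of probabilities sums to 1.0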
Step 2: the loss function and the numerical-differentiation logic. The code is as follows:
# Cross-entropy error averaged over the mini-batch; delta prevents
# log(0) when a predicted probability is exactly 0
def cross_entropy_error(p, y):
    delta = 1e-7
    batch_size = p.shape[0]
    return np.sum(-y * np.log(p + delta)) / batch_size
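# Worked example (illustrative): with a one-hot label, only the predicted
# probability of the true class contributes, so the loss is -log(p_true):
#   p = np.array([[0.1, 0.8, 0.1]])   # predicted probabilities
#   y = np.array([[0, 1, 0]])         # one-hot ground truth
#   cross_entropy_error(p, y)         # -log(0.8 + 1e-7) ≈ 0.223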
# Numerical gradient via the central difference (f(x+h) - f(x-h)) / (2h)
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        # current index
        idx = it.multi_index
        # current value
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)
        x[idx] = float(tmp_val) - h
        fxh2 = f(x)  # f(x-h)
        # central-difference derivative
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
        it.iternext()
    return grad
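To convince yourself that the central-difference gradient is correct, compare it with a known analytic gradient, e.g. f(x) = sum(x**2), whose gradient is 2x (a standalone check, not part of the training code):
x = np.array([3.0, 4.0])
f = lambda v: np.sum(v ** 2)
print(numerical_gradient(f, x))  # approximately [6. 8.], i.e. 2*x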
Step 3: define the neural network. The code is as follows:
class TwoLayerNet():
    # weight_init_std=0.01 keeps the initial weights small
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # initialize the weights
        self.params = {}
        # weights between the input layer and the hidden layer
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        # weights between the hidden layer and the output layer
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = x.dot(W1) + b1
        z1 = Relu(a1)
        a2 = z1.dot(W2) + b2  # feed the ReLU output z1, not a1, into the second layer
        p = _softmax(a2)
        return p

    def loss(self, x, y):
        # get the predictions
        p = self.predict(x)
        return cross_entropy_error(p, y)

    def numerical_gradient(self, x, y):
        # the loss as a function of the weights; equivalent to
        #   def loss_W(W):
        #       return self.loss(x, y)
        loss_W = lambda W: self.loss(x, y)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def accuracy(self, x, y):
        p = self.predict(x)
        # np.argmax with axis=1 returns, for each row, the index of that
        # row's maximum, i.e. the predicted / true class of each sample
        p = np.argmax(p, axis=1)
        y = np.argmax(y, axis=1)
        accuracy = np.sum(p == y) / float(x.shape[0])
        return accuracy
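A quick smoke test of the class on random inputs (illustrative; the 784/50/10 sizes match the MNIST configuration used in step 4):
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
x = np.random.rand(5, 784)   # 5 fake flattened 28x28 images
p = net.predict(x)
print(p.shape)               # (5, 10): one row of class probabilities per image
print(p.sum(axis=1))         # each row sums to 1.0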
Step 4: inspect the loss values. The code is as follows:
# Step 4: prepare the data, train with SGD, and watch the loss
import torch

# train_datasets / test_datasets are assumed to be the MNIST datasets
# loaded in an earlier section; each 28x28 image is flattened into a
# 784-dimensional row
x_train = train_datasets.train_data.numpy().reshape(-1, 28 * 28)
# reshape the labels into a column vector of shape (N, 1)
y_train_tmp = train_datasets.train_labels.reshape(train_datasets.train_labels.shape[0], 1)
# convert to one-hot encoding
y_train = torch.zeros(y_train_tmp.shape[0], 10).scatter_(1, y_train_tmp, 1).numpy()
x_test = test_datasets.test_data.numpy().reshape(-1, 28 * 28)
y_test_tmp = test_datasets.test_labels.reshape(test_datasets.test_labels.shape[0], 1)
# convert to one-hot encoding
y_test = torch.zeros(y_test_tmp.shape[0], 10).scatter_(1, y_test_tmp, 1).numpy()
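# For reference, scatter_(1, idx, 1) writes a 1 into column idx of each row,
# which is exactly one-hot encoding. Illustrative example:
#   torch.zeros(2, 4).scatter_(1, torch.tensor([[2], [0]]), 1)
#   -> [[0., 0., 1., 0.],
#       [1., 0., 0., 0.]]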
# hyperparameters
iters_num = 100  # number of training iterations
train_size = x_train.shape[0]
batch_size = 10
lr = 0.001
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
for i in range(iters_num):
    # randomly draw batch_size indices from the train_size samples
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    y_batch = y_train[batch_mask]
    grad = network.numerical_gradient(x_batch, y_batch)
    # gradient-descent update
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= lr * grad[key]
    # record the learning progress
    loss = network.loss(x_batch, y_batch)
    if i % 9 == 0:
        print(loss)
print('accuracy: %f %%' % network.accuracy(x_test, y_test))
By observation, the accuracy is roughly 75%. This is plausible given only 100 iterations: the numerical gradient evaluates the loss twice for every single parameter, so each update is expensive and the iteration count has to stay small.