随机梯度下降分析以及画图实现

最新推荐文章于 2024-05-08 17:43:51 发布

看星河的兔子

最新推荐文章于 2024-05-08 17:43:51 发布

阅读量459

点赞数

文章标签： python 机器学习 开发语言

本文链接：https://blog.csdn.net/fcxgfdjy/article/details/126369252

版权

用数值偏导实现梯度下降：梯度指向函数值上升最快的方向，负梯度方向则是各点处函数值下降最快的方向，因此可以沿负梯度方向逐步更新参数，寻找 loss 函数的最小值。
在这里插入图片描述

import numpy as np
import matplotlib.pylab as plt

def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Every element of ``x`` is perturbed by ``+h`` and ``-h`` in turn and the
    partial derivative is approximated as ``(f(x+h) - f(x-h)) / (2*h)``.

    ``x`` is modified in place while probing and each element is restored
    before moving on, so ``f`` may read the perturbed values either from its
    argument or from another reference to the same array.

    Args:
        f: Callable that takes the array ``x`` and returns a scalar.
        x: NumPy array of any shape (0-D, 1-D or N-D). It should have a
            floating dtype: the perturbed values are written back into ``x``,
            so an integer array would truncate them.

    Returns:
        An array created with ``np.zeros_like(x)`` (same shape and dtype as
        ``x``) holding the estimated partial derivatives.
    """
    h = 1e-4
    grad = np.zeros_like(x)

    # np.ndindex yields one full index tuple per element, so the same loop
    # handles vectors, weight matrices and higher-rank arrays alike.
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]

        x[idx] = tmp_val + h
        fxh1 = f(x)  # f(x + h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x - h)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
    return grad


def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimize ``f`` by plain gradient descent starting from ``init_x``.

    On every step the gradient is estimated with ``numerical_gradient`` and
    the current point is moved by ``-lr * grad``.

    Args:
        f: Callable taking the current point and returning a scalar.
        init_x: Starting point (array-like). It is copied, so the caller's
            object is left untouched.
        lr: Learning rate applied to each gradient step.
        step_num: Number of update steps to perform.

    Returns:
        A tuple ``(x, x_history)``: the point reached after ``step_num``
        updates, and an array holding the point recorded before each update.
    """
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's init_x, and would fail outright on integer input.
    x = np.array(init_x, dtype=float)
    x_history = []

    for _ in range(step_num):
        # Snapshot before updating; x itself is mutated in place below.
        x_history.append(x.copy())

        grad = numerical_gradient(f, x)
        x -= lr * grad  # gradient-descent update rule

    return x, np.array(x_history)

#实现过程
def function_2(x):
    """Return the sum of the squares of the first two components of ``x``."""
    first_sq = x[0] ** 2
    second_sq = x[1] ** 2
    return first_sq + second_sq

# Starting point of the descent.
init_x = np.array([-3.0, 4.0])

lr = 0.1  # learning rate
step_num = 20  # number of update iterations (adjustable)
x, x_history = gradient_descent(
    function_2,
    init_x,
    lr=lr,
    step_num=step_num,
)

# Dashed blue guide lines through the origin, horizontal first, then vertical.
plt.plot([-5, 5], [0, 0], '--b')
plt.plot([0, 0], [-5, 5], '--b')

# One marker per recorded point of the descent path.
x0_path = x_history[:, 0]
x1_path = x_history[:, 1]
plt.plot(x0_path, x1_path, 'o')

# Restrict the visible area and label the axes.
plt.xlim(-3.5, 3.5)
plt.ylim(-4.5, 4.5)
plt.xlabel("X0")
plt.ylabel("X1")
plt.show()

在这里插入图片描述
从图中可以看出，通过梯度下降，x 逐步逼近函数的最小值点 (0, 0)。我们接下来看学习率过大和过小的结果。

lr=1e-10
[[-3.          4.        ]
 [-3.          4.        ]
 [-3.          4.        ]
 [-3.          4.        ]
 [-3.          4.        ]
 [-3.          4.        ]
 [-3.          4.        ]
 [-3.          3.99999999]
 [-3.          3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999999]
 [-2.99999999  3.99999998]]

lr=10
[[-3.00000000e+00  4.00000000e+00]
 [ 5.70000000e+01 -7.60000000e+01]
 [-1.08300000e+03  1.44400000e+03]
 [ 2.05770000e+04 -2.74360000e+04]
 [-3.90963008e+05  5.21284002e+05]
 [ 7.42829664e+06 -9.90439654e+06]
 [-1.41137328e+08  1.88183103e+08]
 [ 2.68126267e+09 -3.57501690e+09]
 [-5.09763373e+10  6.79001831e+10]
 [ 9.45170863e+11 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]
 [-2.58983747e+13 -1.29524862e+12]]

从上面两组各 20 个值可以发现：学习率过大时结果会发散，学习率过小时参数几乎没有更新就结束了。（lr=10 的后半段数值不再变化，是因为 x 的量级已经太大，h=1e-4 的扰动被浮点精度淹没，数值梯度变成了 0。）
在这里插入图片描述
发散得非常夸张，图中间的那个点是尚未进行梯度下降的初始点。

mini-batch实现

需要自行下载dataset哦，关于如何下载，在我另一篇文章中有
dataset
先说明：以下代码只是方法展示，不建议大家运行——这是最初版的方法（用数值微分 numerical_gradient 计算梯度），运行速度很慢。这里只是引入学习，主要看 mini-batch 是如何实现的。


import sys, os

sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.functions import *
from common.gradient import numerical_gradient


class TwoLayerNet:
    """Two-layer network: affine -> sigmoid -> affine -> softmax.

    All trainable arrays are stored in ``self.params`` under the keys
    'W1', 'b1', 'W2' and 'b2'.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters.

        Weights are standard-normal samples scaled by ``weight_init_std``;
        biases start at zero.
        """
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_size),
            'W2': second_weights,
            'b2': np.zeros(output_size),
        }

    def predict(self, x):
        """Run the forward pass on ``x`` and return the softmax output."""
        p = self.params
        hidden = sigmoid(np.dot(x, p['W1']) + p['b1'])
        scores = np.dot(hidden, p['W2']) + p['b2']
        return softmax(scores)

    def loss(self, x, t):
        """Return the cross-entropy error between the prediction for ``x`` and ``t``.

        x: input data, t: supervised (target) data.
        """
        return cross_entropy_error(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose predicted argmax equals the argmax of ``t``."""
        predicted = np.argmax(self.predict(x), axis=1)
        expected = np.argmax(t, axis=1)
        return np.sum(predicted == expected) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Compute the loss gradient for every parameter (slow, numerical version).

        x: input data, t: supervised (target) data.
        Returns a dict with the same keys as ``self.params``.
        """
        def loss_W(W):
            # W is ignored on purpose: the loss is re-evaluated from
            # self.params. This presumably relies on the module-level
            # numerical_gradient perturbing the passed array in place --
            # verify against its implementation.
            return self.loss(x, t)

        # Calls the module-level numerical_gradient, not this method.
        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }


# Load the training and test splits.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Training settings.
iters_num = 10000  # number of mini-batch updates; adjust as appropriate
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list, train_acc_list, test_acc_list = [], [], []

iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of row indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch, t_batch = x_train[batch_mask], t_train[batch_mask]

    # Gradient via the slow numerical method.
    grad = network.numerical_gradient(x_batch, t_batch)
    print(grad)

    # Update each parameter in place, dumping all parameters after every update.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        print(network.params)

    # Record the loss on the same batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

# Dump the last batch and gradient, each followed by a blank separator, then the loss history.
for item in (x_batch, t_batch, grad):
    print(item)
    print("\n")
print(train_loss_list)

下面的代码运行速度快（用 network.gradient 代替了 network.numerical_gradient 来计算梯度），大家可以自行对比两者的区别。
其中引入了 epoch 的概念。
epoch是指学习中所有训练数据均被使用过一次的更新次数。比如10000笔数据，batch_size=100，也就是说重复梯度下降100次，所有的数据都被“看过”了，此时100次就是一个epoch。
在以下例子中，每经过一个 epoch 才对所有训练数据和测试数据计算一次识别精度并记录结果，而不是每次迭代都计算，这样可以节省计算时间。


import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the training and test splits.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000  # number of mini-batch updates; adjust as appropriate
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch. Integer division keeps this an int, so the
# `i % iter_per_epoch == 0` check below still fires once per epoch when
# train_size is not an exact multiple of batch_size (a float such as 468.75
# would almost never divide i evenly).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of row indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradient. The slower alternative is
    # network.numerical_gradient(x_batch, t_batch).
    grad = network.gradient(x_batch, t_batch)

    # Update the parameters in place.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on the same batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

# Plot the per-epoch accuracies.
markers = {'train': 'o', 'test': 's'}  # defined but not passed to the plot calls below
x = np.arange(len(train_acc_list))
plt.plot(x, train_acc_list, label='train acc')
plt.plot(x, test_acc_list, label='test acc', linestyle='--')
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()

看星河的兔子

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
随机梯度下降分析以及画图实现

偏导实现梯度下降，梯度指的方向是各点函数值下降最多的方向，可用梯度寻找loss函数最小值从图中可看出通过梯度下降，最终值无限接近于0。我们接下来看学习率过大和过小的结果。上图为所示的20个值可以发现学习率过大会发散，学习率过小会基本没更新就结束了。发散的非常夸张，中间那个点是没有梯度下降的初始点。...
复制链接

扫一扫