【算法解析】随机梯度下降 SGD以及它的扩展

最新推荐文章于 2023-03-14 14:46:49 发布

Loong_DQX

最新推荐文章于 2023-03-14 14:46:49 发布

阅读量454

点赞数

文章标签：算法

本文链接：https://blog.csdn.net/weixin_43872912/article/details/123684205

版权

跟SGD随机梯度下降有关的还有两种算法，一种是mini-batch小批量梯度下降法MBGD,另一种是BGD批量梯度下降法，这三种之间的区别在于BGD是所有样本都要进行梯度求解，而SGD是随机采取一个样本进行求解，而MBGD是采取多个样本但不是全部样本。
假设线性回归函数为：
$$h_\theta(x) = \theta^T x = \sum_{j=1}^{n} \theta_j x_j$$
那么损失函数为：
$$J(\theta) = \frac{1}{2}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$$

优化的目标是让损失函数尽可能小，也就是让预测值 $h_\theta(x)$ 与真实值 $y$ 尽量接近；对差值取平方的好处是消除了正负号的影响。要知道 θ 应该怎样调整才能让两者更接近，可以对损失函数求导，然后让 θ 沿着使损失变小的方向（即负梯度方向）更新。
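求解一下导数：
$$\frac{\partial J(\theta)}{\partial \theta_j} = \sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$
这里 $h_\theta(x)$ 对 $\theta_j$ 的偏导数就是 $x_j$。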
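最后θ权重的变化公式（$\alpha$ 为学习率，$B$ 为本次更新所用的样本集合：BGD 取全部样本，SGD 随机取一个样本，MBGD 取一小批样本）：
$$\theta_j := \theta_j - \alpha\sum_{i \in B}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$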

这就是这三个算法的核心了。
代码例子：

# -*- coding: utf-8 -*-
import random


# Fit y = theta[0]*x1 + theta[1]*x2 to a tiny data set by updating the
# weights once per training sample, sweeping the whole set each iteration.
input_x = [[1, 4], [2, 5], [5, 1], [4, 2]]  # four samples, two features each
y = [19, 26, 19, 20]                        # one target value per sample
theta = [1, 1]          # initial weights
loss = 10               # any value above eps, so the loop body runs at least once
step_size = 0.001       # learning rate
eps = 0.0001            # stop once the summed loss is no longer above this
max_iters = 10000       # upper bound on the number of sweeps
error = 0
iter_count = 0
while loss > eps and iter_count < max_iters:
    loss = 0
    # One weight update per sample. Iterating over the data itself (instead
    # of a hard-coded sample count) guarantees that every sample is used.
    for features, target in zip(input_x, y):
        # Prediction with the current weights; both weights are updated from
        # the same residual, computed before either weight changes.
        pred_y = theta[0] * features[0] + theta[1] * features[1]
        residual = pred_y - target
        theta[0] = theta[0] - step_size * residual * features[0]
        theta[1] = theta[1] - step_size * residual * features[1]
    # Summed squared-error loss over the full training set after this sweep.
    for features, target in zip(input_x, y):
        pred_y = theta[0] * features[0] + theta[1] * features[1]
        error = 0.5 * (pred_y - target) ** 2
        loss = loss + error
    iter_count += 1
    print('iters_count', iter_count)


print('theta: ', theta)
print('final loss: ', loss)
print('iters: ', iter_count)

下面有一个更简单的代码。

#coding=utf-8
import numpy as np
import random


#下面实现的是批量梯度下降法
def batchGradientDescent(x, y, theta, alpha, m, maxIterations):
    """Fit linear-regression weights with batch gradient descent.

    Every iteration uses all rows of ``x``: the residuals ``x·theta - y`` are
    projected back through ``x`` transposed, divided by ``m``, and the result
    scaled by ``alpha`` is subtracted from the weights.

    Args:
        x: Design matrix, one row per sample.
        y: Target values, one per row of ``x``.
        theta: Starting weights; a new array is returned, the argument is
            not modified in place.
        alpha: Learning rate.
        m: Divisor applied to the summed gradient (the caller in this file
            passes the number of rows of ``x``).
        maxIterations: Number of update steps to run.

    Returns:
        The weights after ``maxIterations`` updates.
    """
    features_t = x.transpose()
    for _ in range(0, maxIterations):
        residual = np.dot(x, theta) - y
        # Gradient summed over every sample, then divided by m.
        theta = theta - alpha * (np.dot(features_t, residual) / m)
    return theta


#下面实现的是随机梯度下降法
def StochasticGradientDescent(x, y, theta, alpha, m, maxIterations):
    """Fit linear-regression weights with stochastic gradient descent.

    Each iteration picks one sample index uniformly at random from
    ``range(m)`` and updates the weights using only that sample's residual.

    Args:
        x: Design matrix, one row per sample.
        y: Target values, one per row of ``x``.
        theta: Starting weights; a new array is returned, the argument is
            not modified in place.
        alpha: Learning rate.
        m: Number of samples to draw from; indices are taken from
            ``range(m)``, so it must not exceed the number of rows of ``x``.
        maxIterations: Number of single-sample update steps to run.

    Returns:
        The weights after ``maxIterations`` updates.

    Raises:
        ValueError: If ``m`` is not positive and at least one iteration runs
            (raised by ``random.randrange``).
    """
    for _ in range(maxIterations):
        # Draw the sample from the actual data set size instead of a
        # hard-coded pool of 10 indices.
        idx = random.randrange(m)
        # Only the chosen sample's residual is needed for this update.
        residual = np.dot(x[idx], theta) - y[idx]
        theta = theta - alpha * residual * x[idx]
    return theta


def predict(x, theta):
    """Evaluate the linear model on ``x`` after appending a column of ones.

    Args:
        x: 2-D array of inputs, one row per sample, without the constant
            column.
        theta: Weights; the last entry multiplies the appended column of
            ones.

    Returns:
        The product of the augmented input matrix with ``theta``.
    """
    rows, cols = np.shape(x)
    augmented = np.empty((rows, cols + 1))
    augmented[:, :cols] = x      # leading columns are the inputs themselves
    augmented[:, cols] = 1.0     # final column is the constant 1
    return np.dot(augmented, theta)


# Training set: each row is [x1, x2, 1]; the third column is the constant 1
# that predict() also appends to its inputs.
trainData = np.array([
    [1.1, 1.5, 1],
    [1.3, 1.9, 1],
    [1.5, 2.3, 1],
    [1.7, 2.7, 1],
    [1.9, 3.1, 1],
    [2.1, 3.5, 1],
    [2.3, 3.9, 1],
    [2.5, 4.3, 1],
    [2.7, 4.7, 1],
    [2.9, 5.1, 1],
])
trainLabel = np.array([2.5, 3.2, 3.9, 4.6, 5.3, 6, 6.7, 7.4, 8.1, 8.8])
m, n = trainData.shape
theta = np.ones(n)          # one weight per column of trainData
alpha = 0.1
maxIteration = 5000

# Batch gradient descent; the returned theta is the learned weight vector.
theta = batchGradientDescent(trainData, trainLabel, theta, alpha, m, maxIteration)
print("theta = ",theta)
x = np.array([[3.1, 5.5], [3.3, 5.9], [3.5, 6.3], [3.7, 6.7], [3.9, 7.1]])
print(predict(x, theta))

# Stochastic gradient descent; note that it starts from the theta returned
# by the batch run above, not from the initial all-ones vector.
theta = StochasticGradientDescent(trainData, trainLabel, theta, alpha, m, maxIteration)
print("theta = ",theta)
x = np.array([[3.1, 5.5], [3.3, 5.9], [3.5, 6.3], [3.7, 6.7], [3.9, 7.1]])
print(predict(x, theta))