Machine Learning -- Basic Algorithms -- Gradient Descent

1 What Is Gradient Descent

Notes:
● Gradient descent is not itself a machine learning algorithm.
● It is a search-based optimization method.
● Its purpose: minimize a loss function.
● Gradient ascent, its counterpart, maximizes a utility function.

● η is called the learning rate (see the update rule sketched below).
● The value of η affects how quickly the optimal solution is reached.
● If η is chosen badly, the algorithm may not reach the optimal solution at all.
● η is a hyperparameter of gradient descent.
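The figure from the original post is not reproduced here. For reference, the one-dimensional update rule that the plots and the code below are based on is:

\theta \leftarrow \theta - \eta \, \frac{dJ}{d\theta}

Gradient ascent simply flips the sign of the step, moving uphill on the utility function instead of downhill on the loss.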

Not every function has a unique extremum, so gradient descent may stop at a local minimum.
Solutions (see the sketch below):
● Run the algorithm several times with randomized initial points.
● The initial point of gradient descent is therefore another hyperparameter.
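A minimal random-restart sketch; the toy function g, its derivative dg, and all other names here are illustrative and not from the original post:

import numpy as np

def g(theta):
    # a toy function with two local minima
    return theta ** 4 - 3 * theta ** 2 + theta

def dg(theta):
    return 4 * theta ** 3 - 6 * theta + 1

def descend(theta, eta=0.01, n_iter=1000, epsilon=1e-8):
    for _ in range(int(n_iter)):
        last_theta = theta
        theta = theta - eta * dg(theta)  # standard gradient descent step
        if abs(g(theta) - g(last_theta)) < epsilon:
            break
    return theta

np.random.seed(666)
# restart from several random initial points and keep the best result
candidates = [descend(t0) for t0 in np.random.uniform(-2, 2, size=10)]
best_theta = min(candidates, key=g)
print(best_theta, g(best_theta))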

2 Simulating Gradient Descent

import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1, 6, 141)
plot_y = (plot_x - 2.5) ** 2 - 1
# Visualize the loss curve:
# the x-axis is theta, the y-axis is the loss
plt.plot(plot_x, plot_y)
plt.show()

Output:
[Figure: the parabola (theta - 2.5)^2 - 1 plotted over theta in [-1, 6]]

# Derivative of the loss function with respect to theta
def dJ(theta):
    return 2 * (theta - 2.5)

# The loss function
def J(theta):
    return (theta - 2.5) ** 2 - 1

Simulate the gradient descent process:

eta = 0.1 # learning rate
epsilon = 1e-8 # convergence tolerance
theta = 0.0 # starting point
while True:
    gradient = dJ(theta) # compute the gradient
    last_theta = theta
    theta = theta - eta * gradient # update theta
    
    if abs(J(theta) - J(last_theta)) < epsilon:
        break
        
print(theta)
print(J(theta))
>>>2.499891109642585
>>>-0.99999998814289

The result is essentially the analytic minimum at theta = 2.5 with J = -1. Plot the descent path on the loss curve:

eta = 0.1 # learning rate
epsilon = 1e-8 # convergence tolerance
theta = 0.0 # starting point
theta_history = [theta]
while True:
    gradient = dJ(theta) # compute the gradient
    last_theta = theta
    theta = theta - eta * gradient # update theta
    theta_history.append(theta)
    
    if abs(J(theta) - J(last_theta)) < epsilon:
        break
        
plt.plot(plot_x, J(plot_x))
plt.plot(theta_history, J(np.array(theta_history)), color = 'r', marker = '+')
plt.show()

Output:
[Figure: the descent path marked with red '+' symbols on the loss curve]

# Number of values recorded in theta_history
len(theta_history)
>>>46

Wrap the descent loop and the plotting of its history into functions:

def gradient_descent(initial_theta, eta, epsilon = 1e-8):
    theta = initial_theta # starting point
    theta_history.append(theta)
    while True:
        gradient = dJ(theta) # compute the gradient
        last_theta = theta
        theta = theta - eta * gradient # update theta
        theta_history.append(theta)
    
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
            
def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(theta_history, J(np.array(theta_history)), color = 'r', marker = '+')
    plt.show()

Lower the learning rate eta and rerun the simulation:

eta = 0.01
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
plot_theta_history()

Output:
[Figure: with eta = 0.01 the descent takes many more, smaller steps along the curve]
To avoid an endless loop when the algorithm fails to converge, cap the number of iterations:

def gradient_descent(initial_theta, eta, n_iter = 1e3, epsilon = 1e-8):
    theta = initial_theta # starting point
    theta_history.append(theta)
    i_iter = 0
    while i_iter < n_iter:
        gradient = dJ(theta) # compute the gradient
        last_theta = theta
        theta = theta - eta * gradient # update theta
        theta_history.append(theta)
    
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
            
        i_iter += 1

Now try a large learning rate:

eta = 1.1
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
len(theta_history)
>>>1001
# Check the last value of theta_history
theta_history[-1]
>>>-3.79477522293131e+79
eta = 1.1
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
plot_theta_history()

Output:
[Figure: with eta = 1.1 theta moves farther from the minimum on every step]
With eta = 1.1 each update overshoots the minimum: the distance to theta = 2.5 is multiplied by |1 - 2*eta| = 1.2 per step, so theta blows up instead of converging.

3 Gradient Descent in Linear Regression

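The formula figures from the original post are not reproduced here. For reference, the loss and its gradient that the code in the next section implements take the standard form, where X_b is the sample matrix with a prepended column of ones, m is the number of samples, and theta_0 is the intercept:

J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left( y^{(i)} - X_b^{(i)} \theta \right)^2

\nabla J(\theta) = \frac{2}{m}
\begin{pmatrix}
\sum_{i=1}^{m} \left( X_b^{(i)} \theta - y^{(i)} \right) \\
\sum_{i=1}^{m} \left( X_b^{(i)} \theta - y^{(i)} \right) X_1^{(i)} \\
\vdots \\
\sum_{i=1}^{m} \left( X_b^{(i)} \theta - y^{(i)} \right) X_n^{(i)}
\end{pmatrix}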

4 Implementing Gradient Descent for Linear Regression

Use gradient descent to fit a linear regression model, starting with simulated data:

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size = 100)
y = x * 3 + 4 + np.random.normal(size = 100)
x = x.reshape(-1, 1)
print(x.shape)
print(y.shape)
>>>(100, 1)
>>>(100,)
# Visualize the data
plt.scatter(x, y)
plt.show()

Output:
[Figure: scatter plot of the simulated data]

Train with gradient descent. First wrap the loss function and its gradient in functions:

def J(theta, x_b, y):
    try:
        return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
    except:
        # if the loss overflows (e.g. theta diverges), treat it as infinite
        return float('inf')

def dJ(theta, x_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(x_b.dot(theta) - y)
    for i in range(1, len(theta)):
        res[i] = (x_b.dot(theta) - y).dot(x_b[:,i])
    return res * 2 / len(x_b)

Next, run the gradient descent process:

def gradient_descent(x_b, y, initial_theta, eta, epsilon = 1e-8):
    theta = initial_theta # starting point
    while True:
        gradient = dJ(theta, x_b, y) # compute the gradient
        last_theta = theta
        theta = theta - eta * gradient # update theta
    
        if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
            break
            
    return theta
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
eta = 0.01
theta = gradient_descent(x_b, y, initial_theta, eta)
theta    
>>>array([4.02145786, 3.00706277])

The result is close to the true intercept (4) and slope (3) used to generate the data. Now add a gradient descent method to our own LinearRegression.py:

    def fit_gd(self, x_train, y_train, eta = 0.01, n_iter = 1e3):
        """根据x_train和y_train,使用梯度下降法训练LinearRegression模型"""
        assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'

        def J(theta, x_b, y):
            try:
                return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
            except:
                return float('inf')

        def dJ(theta, x_b, y):
            res = np.empty(len(theta))
            res[0] = np.sum(x_b.dot(theta) - y)
            for i in range(1, len(theta)):
                res[i] = (x_b.dot(theta) - y).dot(x_b[:, i])
            return res * 2 / len(x_b)

        def gradient_descent(x_b, y, initial_theta, eta, epsilon=1e-8):
            theta = initial_theta  # starting point
            while True:
                gradient = dJ(theta, x_b, y)  # compute the gradient
                last_theta = theta
                theta = theta - eta * gradient  # update theta

                if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
                    break

            return theta

        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_theta = np.zeros(x_b.shape[1])
        self._theta = gradient_descent(x_b, y_train, initial_theta, eta)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

Use the gradient descent method of our own LinearRegression:

from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_gd(x, y)
>>>LinearRegression()
# Check the fitted parameters
print(lin_reg.coef_)
print(lin_reg.interception_)
>>>[3.00706277]
>>>4.021457858204859

5 Vectorization and Data Standardization in Gradient Descent

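The figures from the original post are not reproduced here; the point they make is that the component-wise gradient from section 3 collapses into a single matrix product, which is exactly what the vectorized code below computes:

\nabla J(\theta) = \frac{2}{m} \, X_b^{T} \left( X_b \theta - y \right)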
Use the vectorized derivative in the fit_gd method of LinearRegression.py:

    def fit_gd(self, x_train, y_train, eta = 0.01, n_iter = 1e3):
        """根据x_train和y_train,使用梯度下降法训练LinearRegression模型"""
        assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'

        def J(theta, x_b, y):
            try:
                return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
            except:
                return float('inf')

        def dJ(theta, x_b, y):
            # derivative computed with an explicit for loop:
            # res = np.empty(len(theta))
            # res[0] = np.sum(x_b.dot(theta) - y)
            # for i in range(1, len(theta)):
            #     res[i] = (x_b.dot(theta) - y).dot(x_b[:, i])
            # return res * 2 / len(x_b)

            # vectorized derivative:
            return x_b.T.dot(x_b.dot(theta) - y) * 2 / len(x_b)

        def gradient_descent(x_b, y, initial_theta, eta, epsilon=1e-8):
            theta = initial_theta  # starting point
            while True:
                gradient = dJ(theta, x_b, y)  # compute the gradient
                last_theta = theta
                theta = theta - eta * gradient  # update theta

                if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
                    break

            return theta

        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_theta = np.zeros(x_b.shape[1])
        self._theta = gradient_descent(x_b, y_train, initial_theta, eta)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

Now test it on real data. Load and preprocess the Boston housing data:

import numpy as np
from sklearn import datasets
# note: load_boston was removed in newer scikit-learn versions (1.2+)
boston = datasets.load_boston()
x = boston.data
y = boston.target
# Remove the outlier samples (targets capped at 50)
x = x[y < 50]
y = y[y < 50]
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed = 666)
from playML.LinearRegression import LinearRegression
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(x_train, y_train)
>>>Wall time: 257 ms
>>>LinearRegression()
lin_reg1.score(x_test, y_test)
>>>0.8129794056212793

Standardize the data before using gradient descent:

from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_standard = standardscaler.transform(x_train)
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(x_train_standard, y_train)
>>>Wall time: 444 ms
>>>LinearRegression()
x_test_standard = standardscaler.transform(x_test)
lin_reg3.score(x_test_standard, y_test)
>>>0.8129873310487505

6 Stochastic Gradient Descent

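The figures from the original post are not reproduced here. The idea: instead of the full gradient over all m samples, each step uses the gradient of a single randomly chosen sample i, and the learning rate decays over time so the iterates can settle down. The per-sample gradient and the learning rate schedule used in the code below are:

\nabla J_i(\theta) = 2 \, (X_b^{(i)})^{T} \left( X_b^{(i)} \theta - y^{(i)} \right),
\qquad
\eta(t) = \frac{t_0}{t + t_1}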
Use simulated data:

import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size = m)
y = 4. * x + 3. + np.random.normal(size = m)
def J(theta, x_b, y):
    try:
        return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
    except:
        return float('inf')
    
def dJ(theta, x_b, y):
    return x_b.T.dot(x_b.dot(theta) - y) * 2 / len(x_b)

# non-default parameters must come before default parameters
def gradient_descent(x_b, y, initial_theta, eta, n_iter = 1e4, epsilon=1e-8):
    theta = initial_theta  # starting point
    cur_iter = 0
    while cur_iter < n_iter:
        gradient = dJ(theta, x_b, y)  # compute the gradient
        last_theta = theta
        theta = theta - eta * gradient  # update theta

        if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
            break
            
        cur_iter += 1

    return theta
%%time
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
eta = 0.01
theta = gradient_descent(x_b, y, initial_theta, eta) 
>>>Wall time: 1.37 s   
theta
>>>array([3.00392812, 4.00011491])

Stochastic gradient descent, which uses a single randomly chosen sample per update:

def dJ_sgd(theta, x_b_i, y_i):
    return x_b_i.T.dot(x_b_i.dot(theta) - y_i) * 2 

def sgd(x_b, y, initial_theta, n_iter):
    t0 = 5
    t1 = 50
    
    def learning_rate(t):
        return t0 / (t + t1)
    
    theta = initial_theta
    for cur_iter in range(n_iter):
        rand_i = np.random.randint(len(x_b))
        gradient = dJ_sgd(theta, x_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    
    return theta
%%time
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
theta = sgd(x_b, y, initial_theta, n_iter = len(x_b) // 3)   
>>>Wall time: 877 ms
theta
>>>array([3.0107388 , 4.00465842]) 

7 Stochastic Gradient Descent in scikit-learn

First add a stochastic gradient descent training method to our LinearRegression.py:

    def fit_sgd(self, x_train, y_train, n_iter = 5, t0 = 5, t1 = 50):
        """根据x_train和y_train,使用随机梯度下降法训练LinearRegression模型"""
        assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'
        assert n_iter >= 1

        def dJ_sgd(theta, x_b_i, y_i):
            return x_b_i.T.dot(x_b_i.dot(theta) - y_i) * 2

        def sgd(x_b, y, initial_theta, n_iter):

            def learning_rate(t):
                return t0 / (t + t1)

            theta = initial_theta
            m = len(x_b)

            for cur_iter in range(n_iter):  # n_iter full passes over the training set
                indexes = np.random.permutation(m)  # shuffle the sample indexes for this pass
                x_b_new = x_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, x_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient

            return theta

        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_theta = np.zeros(x_b.shape[1])
        self._theta = sgd(x_b, y_train, initial_theta, n_iter)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

Use our SGD on the real data:

import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data
y = boston.target
# Remove the outlier samples
x = x[y < 50]
y = y[y < 50]
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed = 666)
# Standardize the data
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_standard = standardscaler.transform(x_train)
x_test_standard = standardscaler.transform(x_test)

Call our own fit_sgd method:

from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_sgd(x_train_standard, y_train, n_iter = 2)
%time lin_reg.score(x_test_standard, y_test)
>>>Wall time: 266 ms
>>>0.8129906331888908

SGD in sklearn:

from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor()
%time sgd_reg.fit(x_train_standard, y_train)
sgd_reg.score(x_test_standard, y_test)
>>>Wall time: 450 ms
>>>0.8130539886776001
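SGDRegressor makes only a limited number of passes over the data by default. In recent scikit-learn versions the number of passes is controlled by the max_iter parameter; the value below is an illustrative sketch, not from the original post:

from sklearn.linear_model import SGDRegressor
# increase the number of passes over the training set
sgd_reg = SGDRegressor(max_iter=100)
sgd_reg.fit(x_train_standard, y_train)
sgd_reg.score(x_test_standard, y_test)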