Machine Learning: Gradient Descent

Gradient Descent

  • It is not a machine learning algorithm in itself
  • It is a search-based optimization method
  • Purpose: minimize a loss function
This 2D plot is different from all the 2D plots discussed earlier: pay attention to what the axes are (the parameter theta on the horizontal axis and the loss J on the vertical axis).

For the equation of a straight line, the derivative is the slope.
For a curve, the derivative is the slope of the tangent line.
In gradient descent, the derivative tells us how much J changes for a unit change in theta.
The derivative therefore gives a direction: the direction in which J increases.
In gradient descent, theta should move in the direction opposite to the derivative (its negative direction).
For a function of several variables, we differentiate with respect to each component separately; the vector of these partial derivatives is the gradient.
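
The update described above can be written compactly (a LaTeX restatement of the text; η is the learning rate discussed below):

\theta \leftarrow \theta - \eta\,\frac{\mathrm{d}J}{\mathrm{d}\theta}
\qquad\text{and, for several parameters,}\qquad
\theta \leftarrow \theta - \eta\,\nabla J(\theta)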

The figure below illustrates the value of the loss function J for each value of the parameter theta once a loss function has been defined. We want to find the theta at which J attains its minimum (this is a 2D plot, i.e. there is only one parameter).


If η is too small, convergence becomes very slow.

If η is too large, gradient descent may overshoot and even fail to converge.
Not every function has a unique extremum; there may be several local minima.
Remedies (see the short sketch below):

Run gradient descent several times with randomized initial points.
The initial point of gradient descent is itself a hyperparameter.

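A minimal sketch of the random-restart idea on a toy loss with two minima; the names (J_toy, dJ_toy, run_gd) and the toy loss itself are illustrative choices of mine, not from the original post:

import numpy as np

def J_toy(theta):
    # a toy loss with two local minima
    return 0.25 * theta**4 - 2.0 * theta**2 + theta

def dJ_toy(theta):
    # derivative of the toy loss
    return theta**3 - 4.0 * theta + 1.0

def run_gd(theta, eta=0.01, n_iters=10000, epsilon=1e-8):
    # plain gradient descent from one initial point
    for _ in range(n_iters):
        last_theta = theta
        theta = theta - eta * dJ_toy(theta)
        if abs(J_toy(theta) - J_toy(last_theta)) < epsilon:
            break
    return theta

# run from several random initial points and keep the theta with the smallest loss
np.random.seed(666)
candidates = [run_gd(t) for t in np.random.uniform(-3., 3., size=10)]
best_theta = min(candidates, key=J_toy)
print(best_theta, J_toy(best_theta))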

Simulating gradient descent

import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1., 6., 141)
plot_x
array([-1.  , -0.95, -0.9 , -0.85, -0.8 , -0.75, -0.7 , -0.65, -0.6 ,
       -0.55, -0.5 , -0.45, -0.4 , -0.35, -0.3 , -0.25, -0.2 , -0.15,
       -0.1 , -0.05,  0.  ,  0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,
        0.35,  0.4 ,  0.45,  0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,
        0.8 ,  0.85,  0.9 ,  0.95,  1.  ,  1.05,  1.1 ,  1.15,  1.2 ,
        1.25,  1.3 ,  1.35,  1.4 ,  1.45,  1.5 ,  1.55,  1.6 ,  1.65,
        1.7 ,  1.75,  1.8 ,  1.85,  1.9 ,  1.95,  2.  ,  2.05,  2.1 ,
        2.15,  2.2 ,  2.25,  2.3 ,  2.35,  2.4 ,  2.45,  2.5 ,  2.55,
        2.6 ,  2.65,  2.7 ,  2.75,  2.8 ,  2.85,  2.9 ,  2.95,  3.  ,
        3.05,  3.1 ,  3.15,  3.2 ,  3.25,  3.3 ,  3.35,  3.4 ,  3.45,
        3.5 ,  3.55,  3.6 ,  3.65,  3.7 ,  3.75,  3.8 ,  3.85,  3.9 ,
        3.95,  4.  ,  4.05,  4.1 ,  4.15,  4.2 ,  4.25,  4.3 ,  4.35,
        4.4 ,  4.45,  4.5 ,  4.55,  4.6 ,  4.65,  4.7 ,  4.75,  4.8 ,
        4.85,  4.9 ,  4.95,  5.  ,  5.05,  5.1 ,  5.15,  5.2 ,  5.25,
        5.3 ,  5.35,  5.4 ,  5.45,  5.5 ,  5.55,  5.6 ,  5.65,  5.7 ,
        5.75,  5.8 ,  5.85,  5.9 ,  5.95,  6.  ])
plot_y = (plot_x-2.5)**2 - 1.
plt.plot(plot_x, plot_y)
plt.show()


epsilon = 1e-8  # 10 to the power of -8, the convergence threshold
eta = 0.1
def J(theta):
    return (theta-2.5)**2 - 1.

def dJ(theta):
    return 2*(theta-2.5)

theta = 0.0
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    
    if(abs(J(theta) - J(last_theta)) < epsilon):
        break
    
print(theta)
print(J(theta))
2.499891109642585
-0.99999998814289
theta = 0.0
theta_history = [theta]
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    theta_history.append(theta)
    
    if(abs(J(theta) - J(last_theta)) < epsilon):
        break

plt.plot(plot_x, J(plot_x))
plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
plt.show()


len(theta_history)
46
theta_history = []

def gradient_descent(initial_theta, eta, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)

    while True:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
    
        if(abs(J(theta) - J(last_theta)) < epsilon):
            break
            
def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
    plt.show()
eta = 0.01
theta_history = []
gradient_descent(0, eta)
plot_theta_history()


len(theta_history)
424
eta = 0.001
theta_history = []
gradient_descent(0, eta)
plot_theta_history()


len(theta_history)
3682
eta = 0.8
theta_history = []
gradient_descent(0, eta)
plot_theta_history()


eta = 1.1
theta_history = []
gradient_descent(0, eta)
---------------------------------------------------------------------------

OverflowError                             Traceback (most recent call last)

<ipython-input-41-81ad09e9d3e0> in <module>()
      1 eta = 1.1
      2 theta_history = []
----> 3 gradient_descent(0, eta)


<ipython-input-35-37822183f4df> in gradient_descent(initial_theta, eta, epsilon)
     11         theta_history.append(theta)
     12 
---> 13         if(abs(J(theta) - J(last_theta)) < epsilon):
     14             break
     15 


<ipython-input-32-e8b6a6f56c24> in J(theta)
      1 def J(theta):
----> 2     return (theta-2.5)**2 - 1.
      3 
      4 def dJ(theta):
      5     return 2*(theta-2.5)


OverflowError: (34, 'Result too large')
def J(theta):
    try:
        return (theta-2.5)**2 - 1.
    except:
        return float('inf')
def gradient_descent(initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    
    theta = initial_theta
    i_iter = 0
    theta_history.append(initial_theta)

    while i_iter < n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
    
        if(abs(J(theta) - J(last_theta)) < epsilon):
            break
            
        i_iter += 1
        
    return
eta = 1.1
theta_history = []
gradient_descent(0, eta)
len(theta_history)
10001
eta = 1.1
theta_history = []
gradient_descent(0, eta, n_iters=10)
plot_theta_history()


Gradient descent for multivariate linear regression


Gradient descent in a 3D space (x and y are the coefficients, z is the loss function)


Derivation


The expression derived above depends on the number of samples: the larger m is, the larger the result, which is unreasonable. We want the gradient to be independent of m, so we divide it by m.

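The derivation figures are not reproduced here, but the result can be reconstructed from the code that follows. With the loss J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\bigl(y^{(i)} - X_b^{(i)}\theta\bigr)^2, the gradient after dividing by m is

\nabla J(\theta) = \frac{2}{m}\begin{pmatrix}\sum_{i=1}^{m}\bigl(X_b^{(i)}\theta - y^{(i)}\bigr)\\ \sum_{i=1}^{m}\bigl(X_b^{(i)}\theta - y^{(i)}\bigr)X_1^{(i)}\\ \vdots\\ \sum_{i=1}^{m}\bigl(X_b^{(i)}\theta - y^{(i)}\bigr)X_n^{(i)}\end{pmatrix}

where X_b^{(i)} is the i-th row of X_b (with a leading 1) and X_j^{(i)} is the j-th feature of the i-th sample. This is exactly what the dJ function below computes, component by component.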

Using gradient descent in a linear regression model
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
X = x.reshape(-1, 1)
X[:20]
array([[ 1.40087424],
       [ 1.68837329],
       [ 1.35302867],
       [ 1.45571611],
       [ 1.90291591],
       [ 0.02540639],
       [ 0.8271754 ],
       [ 0.09762559],
       [ 0.19985712],
       [ 1.01613261],
       [ 0.40049508],
       [ 1.48830834],
       [ 0.38578401],
       [ 1.4016895 ],
       [ 0.58645621],
       [ 1.54895891],
       [ 0.01021768],
       [ 0.22571531],
       [ 0.22190734],
       [ 0.49533646]])
y[:20]
array([ 8.91412688,  8.89446981,  8.85921604,  9.04490343,  8.75831915,
        4.01914255,  6.84103696,  4.81582242,  3.68561238,  6.46344854,
        4.61756153,  8.45774339,  3.21438541,  7.98486624,  4.18885101,
        8.46060979,  4.29706975,  4.06803046,  3.58490782,  7.0558176 ])
plt.scatter(x, y)
plt.show()


Training with gradient descent

Because real data, whose features differ greatly in scale, will also be passed to this code, overflow can still occur, so J catches it and returns infinity.

def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')
def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)  
    for i in range(1, len(theta)):
        # the dot product is already vectorized over the samples, so wrapping it in np.sum gives the same result
        res[i] = (X_b.dot(theta) - y).dot(X_b[:,i])
    return res * 2 / len(X_b)
def gradient_descent(X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
            
        cur_iter += 1

    return theta
X_b = np.hstack([np.ones((len(x), 1)), x.reshape(-1,1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

theta = gradient_descent(X_b, y, initial_theta, eta)
theta
array([ 4.02145786,  3.00706277])
Wrapping our linear regression algorithm in a class
import numpy as np
from .metrics import r2_score

class LinearRegression:

    def __init__(self):
        """初始化Linear Regression模型"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """根据训练数据集X_train, y_train, 使用梯度下降法训练Linear Regression模型"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')

        def dJ(theta, X_b, y):
            res = np.empty(len(theta))
            res[0] = np.sum(X_b.dot(theta) - y)
            for i in range(1, len(theta)):
                res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            return res * 2 / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):

            theta = initial_theta
            cur_iter = 0

            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break

                cur_iter += 1

            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """给定待预测数据集X_predict,返回表示X_predict的结果向量"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """根据测试数据集 X_test 和 y_test 确定当前模型的准确度"""

        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
Testing our linear regression algorithm
from playML.LinearRegression import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit_gd(X, y)
LinearRegression()
lin_reg.coef_
array([ 3.00706277])
lin_reg.intercept_
4.021457858204859
4.3 Vectorization

  • The row vector expanded above has m entries, one per term in the sum; the block it is dotted with is X_b, and expanding the dot product recovers the original expression on the left. It is a 1×m vector dotted with an m×(n+1) matrix, which yields a 1×(n+1) row vector. Although numpy does not distinguish row vectors from column vectors, the final formula is obtained by transposing the whole expression and then expanding it.

  • (AB)^T = B^T A^T
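
Spelled out, the vectorization described in the bullets above is (consistent with the revised code below):

\nabla J(\theta) = \frac{2}{m}\,\bigl(X_b\theta - y\bigr)^{T} X_b \quad\xrightarrow{\ \text{transpose}\ }\quad \nabla J(\theta) = \frac{2}{m}\, X_b^{T}\bigl(X_b\theta - y\bigr)

The first form is the 1×(n+1) row vector obtained from the expansion; transposing it gives the column form used in the code, and since numpy does not distinguish the two, either can be implemented directly.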

Revising the earlier gradient function

def dJ(theta, X_b, y):
    # previous element-by-element implementation:
    # res = np.empty(len(theta))
    # res[0] = np.sum(X_b.dot(theta) - y)
    # for i in range(1, len(theta)):
    #     res[i] = np.sum((X_b.dot(theta) - y).dot(X_b[:, i]))
    # return res * 2 / len(X_b)
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)
Vectorizing gradient descent
import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target

X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from playML.LinearRegression import LinearRegression

lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
lin_reg1.score(X_test, y_test)
CPU times: user 45.6 ms, sys: 5.74 ms, total: 51.3 ms
Wall time: 56.4 ms
0.81298026026584658
Using gradient descent
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:32: RuntimeWarning: overflow encountered in square
  return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:48: RuntimeWarning: invalid value encountered in double_scalars
  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
LinearRegression()
lin_reg2.coef_
array([ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan])
lin_reg2.fit_gd(X_train, y_train, eta=0.000001)
LinearRegression()
lin_reg2.score(X_test, y_test)
0.27556634853389195
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)
CPU times: user 48.4 s, sys: 265 ms, total: 48.6 s
Wall time: 49.9 s
LinearRegression()
lin_reg2.score(X_test, y_test)
0.75418523539807636

With real data, tuning eta and n_iters is awkward: with the default eta the training overflows and produces no usable result, while with a very small eta we need a huge number of iterations, which makes training very slow. The root cause is that the features are on very different scales, so we need to normalize the data first.

Data normalization

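The StandardScaler used below performs mean-variance standardization: for each feature,

x' = \frac{x - \mu}{\sigma}

where μ and σ are the mean and standard deviation of that feature computed on the training set (the test set is then transformed with the training-set statistics).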

Normalize the data before using gradient descent
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)

lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)
CPU times: user 237 ms, sys: 4.37 ms, total: 242 ms
Wall time: 258 ms
LinearRegression()
X_test_standard = standardScaler.transform(X_test)
lin_reg3.score(X_test_standard, y_test)
0.81298806201222351
The advantage of gradient descent
m = 1000
n = 5000

big_X = np.random.normal(size=(m, n))

true_theta = np.random.uniform(0.0, 100.0, size=n+1)

big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)
CPU times: user 18.8 s, sys: 899 ms, total: 19.7 s
Wall time: 10.9 s
LinearRegression()
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)
CPU times: user 9.51 s, sys: 121 ms, total: 9.63 s
Wall time: 5.76 s
LinearRegression()

If the number of samples is very large, even gradient descent becomes slow, because every sample takes part in every gradient computation. In that case we turn to stochastic gradient descent.

Stochastic gradient descent

Introduction to stochastic gradient descent


One problem with batch gradient descent is that η must be set fairly small, which still makes training quite slow when there are many samples. Looking at the gradient formula for the loss function, we sum over every row X_b^(i) and then divide by m at the end. We can cancel the sum against the division by m and instead use a single randomly chosen row X_b^(i) for each update.

Because the gradient is now stochastic, the final descent no longer heads accurately in one direction the way batch gradient descent does; it follows a noisy, winding path downward. We therefore want η to shrink as the number of updates grows, so that the result becomes more precise near the minimum.


This borrows the idea of simulated annealing.
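
In the code below this is implemented with the decaying schedule

\eta(t) = \frac{t_0}{t + t_1}

where t is the number of updates performed so far and t0, t1 are hyperparameters (t0 = 5, t1 = 50 in the examples).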

Stochastic gradient descent (SGD): each parameter update uses just one sample, i.e. m in the formula equals 1. Every update uses a single sample, and many updates are performed. If the data set is very large (say hundreds of thousands of samples), a few thousand or a few tens of thousands of them may already bring theta close to the optimum. By contrast, one iteration of the batch gradient descent above needs all of those training samples, a single iteration cannot reach the optimum, and ten iterations mean scanning the whole training set ten times, which is computationally very expensive.
However, SGD is noisier than BGD, so not every iteration moves toward the overall optimum.
Advantages: fast training.
Disadvantages: lower accuracy and not necessarily the global optimum; not easy to parallelize.
In terms of iteration count, SGD needs more iterations, and its search through the solution space looks rather blind.

Batch gradient descent (BGD) is the original form of gradient descent: each parameter update uses all of the samples, i.e. m in formula (1) is the total number of samples.
Advantages: reaches the global optimum (for a convex loss); easy to parallelize.
Disadvantages: training is very slow when the number of samples is large.

Mini-batch gradient descent (MBGD): each parameter update uses a subset of the samples, i.e. m in formula (1) is greater than 1 and smaller than the total number of samples. It overcomes the drawbacks of the two methods above while keeping their advantages (a short sketch follows this list).

  • From the formula we can see that each step of batch gradient descent uses all the training samples, and the final result is a global optimum (for a convex loss).
  • In the stochastic gradient update rule the adjustment term carries no summation sign, meaning only one training sample is used per update; the final result may be the global optimum or only a local one.
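
Mini-batch gradient descent is described above but not implemented in this post; here is a minimal NumPy sketch in the same style as the other code (the mbgd and dJ_batch names and the batch_size parameter are my own, not from the original):

import numpy as np

def dJ_batch(theta, X_b_batch, y_batch):
    # gradient of the MSE loss on one mini-batch
    return X_b_batch.T.dot(X_b_batch.dot(theta) - y_batch) * 2. / len(y_batch)

def mbgd(X_b, y, initial_theta, n_iters=5, batch_size=16, t0=5, t1=50):
    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    m = len(X_b)
    t = 0
    for _ in range(n_iters):                   # n_iters passes (epochs) over the data
        indexes = np.random.permutation(m)     # reshuffle once per epoch
        for start in range(0, m, batch_size):
            batch = indexes[start:start + batch_size]
            gradient = dJ_batch(theta, X_b[batch], y[batch])
            theta = theta - learning_rate(t) * gradient
            t += 1
    return theta

# usage, assuming X_b and y are prepared as in the SGD example:
# theta = mbgd(X_b, y, np.zeros(X_b.shape[1]))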
Stochastic gradient descent
import numpy as np
import matplotlib.pyplot as plt
m = 100000

x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
plt.scatter(x, y)
plt.show()


Ordinary (batch) gradient descent

def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except:
        return float('inf')
    
def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):

    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break

        cur_iter += 1

    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
CPU times: user 2.81 s, sys: 134 ms, total: 2.94 s
Wall time: 1.88 s
theta
array([ 3.00383464,  4.01706856])
Stochastic gradient descent
def dJ_sgd(theta, X_b_i, y_i):
    return 2 * X_b_i.T.dot(X_b_i.dot(theta) - y_i)

def sgd(X_b, y, initial_theta, n_iters):

    t0, t1 = 5, 50
    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=m//3)
CPU times: user 559 ms, sys: 22.6 ms, total: 582 ms
Wall time: 647 ms
theta
array([ 3.04732375,  4.03214249])
Wrapping up and testing stochastic gradient descent
def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """
        根据训练数据集X_train, y_train, 使用随机梯度下降法训练Linear Regression模型
        :param X_train:
        :param y_train:
        :param n_iters: 在随机梯度下降法中,n_iters代表所有的样本会被看几圈
        :param t0:
        :param t1:
        :return:
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            """
            去X_b,y 中的随机一个元素进行导数公式的计算
            :param theta:
            :param X_b_i:
            :param y_i:
            :return:
            """
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):

            def learning_rate(t):
                """
                计算学习率,t1 为了减慢变化速度,t0为了增加随机性
                :param t: 第t次循环
                :return:
                """
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)

            for cur_iter in range(n_iters):
                ## shuffle the indices so the samples are visited in a random order
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]

                ## go through the whole data set once (one epoch)
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient

            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self
Using our own SGD wrapper
import numpy as np
import matplotlib.pyplot as plt
m = 100000

x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_bgd(X, y)
print(lin_reg.intercept_, lin_reg.coef_)
3.01644183437 [ 3.99995653]
lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)
print(lin_reg.intercept_, lin_reg.coef_)
2.99558568395 [ 4.02610767]
Using our own SGD on real data
from sklearn import datasets

boston = datasets.load_boston()
X = boston.data
y = boston.target

X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
from playML.LinearRegression import LinearRegression

lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg.score(X_test_standard, y_test)
CPU times: user 11.9 ms, sys: 4.13 ms, total: 16.1 ms
Wall time: 13.5 ms
0.78651716204682975
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg.score(X_test_standard, y_test)
CPU times: user 155 ms, sys: 8.11 ms, total: 163 ms
Wall time: 158 ms
0.80857287165738345
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)
lin_reg.score(X_test_standard, y_test)
CPU times: user 282 ms, sys: 5.11 ms, total: 287 ms
Wall time: 287 ms
0.81294846132723497
SGD in scikit-learn
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 631 µs, sys: 54 µs, total: 685 µs
Wall time: 690 µs
0.80478459701573024
sgd_reg = SGDRegressor(n_iter=50)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 4.5 ms, sys: 1.04 ms, total: 5.54 ms
Wall time: 3.94 ms
0.81199073931878407
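Note that n_iter is the parameter name of an older scikit-learn release; in newer versions (an assumption about the installed version) the equivalent parameter is max_iter:

from sklearn.linear_model import SGDRegressor

# newer scikit-learn releases use max_iter (and tol) instead of n_iter
sgd_reg = SGDRegressor(max_iter=50)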

Note that the gradient descent implementations in sklearn are far more sophisticated than ours and beat it in both performance and accuracy. Our algorithm is only meant to illustrate the process; for production use, prefer what scikit-learn provides.

Debugging gradient descent

The idea behind debugging the gradient

We may work out the gradient formula and implement it in Python, and the program may run without errors during prediction, yet the gradient it computes can still be wrong. How do we find such a bug?

Start with the 2D case: the derivative of a curve at a point O is the slope of the tangent at that point. Take one point on each side of O, call them A and B; the slope of the line through A and B is approximately the slope of the tangent at O, and the closer A and B are to each other, the closer the two slopes become. In fact this is exactly the definition of the derivative: when the argument x of y = f(x) receives an increment Δx at a point x0, the limit of the ratio Δy/Δx as Δx tends to 0, if it exists, is the derivative at x0, written f'(x0) or df(x0)/dx.


Extended to several dimensions, this becomes the following.

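In symbols, the approximation described above, together with its per-component extension to a multivariate θ (this is what dJ_debug below implements):

\frac{\mathrm{d}J}{\mathrm{d}\theta} \approx \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon},
\qquad
\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \varepsilon e_i) - J(\theta - \varepsilon e_i)}{2\varepsilon}

where e_i is the unit vector along the i-th coordinate.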

Debugging gradient descent
Debugging the computation of the gradient
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
X = np.random.random(size=(1000, 10))

true_theta = np.arange(1, 12, dtype=float)
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)
true_theta
array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.])
X.shape
(1000, 10)
y.shape
(1000,)
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')
def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)
def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res
def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
            
        cur_iter += 1

    return theta
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

%time theta = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
theta
CPU times: user 13.8 s, sys: 283 ms, total: 14.1 s
Wall time: 7.6 s
array([  1.1251597 ,   2.05312521,   2.91522497,   4.11895968,
         5.05002117,   5.90494046,   6.97383745,   8.00088367,
         8.86213468,   9.98608331,  10.90529198])
%time theta = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
theta
CPU times: user 1.57 s, sys: 30.6 ms, total: 1.6 s
Wall time: 856 ms
array([  1.1251597 ,   2.05312521,   2.91522497,   4.11895968,
         5.05002117,   5.90494046,   6.97383745,   8.00088367,
         8.86213468,   9.98608331,  10.90529198])

From this we can see that dJ_debug and dJ_math give essentially the same result, so the mathematical derivation behind dJ_math is correct.
Before doing the real machine learning, we can use the dJ_debug approach to verify that the result of dJ_math is correct, and only then train with dJ_math.

dJ_debug is generic: it can be used to debug any gradient computation, so it is a handy tool to keep in our machine learning toolbox.
