1. Basic concepts
- Gradient descent is a first-order optimization algorithm. To find a local minimum of a function with gradient descent, we search iteratively, stepping a prescribed distance from the current point in the direction opposite to the gradient (or an approximate gradient) of the function at that point. If we instead step in the direction of the gradient, we approach a local maximum of the function; that procedure is called gradient ascent. A minimal one-dimensional sketch of the update rule follows.
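The update rule is theta ← theta − eta · ∇J(theta), where eta is the learning rate. Below is a minimal, self-contained sketch of this rule in one dimension (the quadratic function J(x) = (x − 3)² and all constants are illustrative assumptions, not from the original post):

# minimize J(x) = (x - 3)^2, whose derivative is dJ(x) = 2 * (x - 3)
def dJ(x):
    return 2 * (x - 3)

eta = 0.1      # learning rate
x = 0.0        # starting point
for _ in range(1000):
    last_x = x
    x = x - eta * dJ(x)          # step against the gradient
    if abs(x - last_x) < 1e-8:   # stop when the update becomes negligible
        break

print(x)  # converges close to the minimizer x = 3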
2. Several gradient descent methods (for linear regression)
2.1 Batch gradient descent
1. Principle and characteristics of batch gradient descent

- High computational cost: each component of the gradient in batch gradient descent, $\frac{\partial J}{\partial \theta_j} = \frac{2}{m}\sum_{i=1}^{m}\bigl(X_b^{(i)}\theta - y^{(i)}\bigr)X_j^{(i)}$, sums over all $m$ samples; in vectorized form the whole gradient is $\nabla J(\theta) = \frac{2}{m}X_b^T\bigl(X_b\theta - y\bigr)$ (see the numpy sketch after this list).
- The gradient used by batch gradient descent points in the direction in which the loss function decreases fastest: for the same change in theta, the loss changes most along the gradient direction.
- The gradient is the direction of optimization, and the change applied to the loss-function parameter theta = learning rate × current gradient value.
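As a concrete illustration of the vectorized batch gradient (a minimal sketch using the same X_b, theta, y notation as the implementation later in this post; the toy data is an assumption):

import numpy as np

def dJ_batch(theta, X_b, y):
    # full-batch gradient of the MSE loss: every one of the m samples is used
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

# toy data: 5 samples, one feature plus the intercept column of ones
X_b = np.hstack([np.ones((5, 1)), np.arange(5, dtype=float).reshape(-1, 1)])
y = np.array([1., 3., 5., 7., 9.])
theta = np.zeros(2)
print(dJ_batch(theta, X_b, y))   # gradient evaluated over all samples at once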
2.2 Stochastic gradient descent (SGD)
1. Principle and characteristics of stochastic gradient descent

- The new search direction (that is, the direction used for optimization) is computed from one randomly chosen sample $i$: $2\,\bigl(X_b^{(i)}\bigr)^T\bigl(X_b^{(i)}\theta - y^{(i)}\bigr)$.
- This is called a search direction rather than a gradient, because the formula is no longer the gradient of the full loss function; it merely gives a direction along which to move in order to optimize the loss.
- Search path of stochastic gradient descent (figure omitted): the path zigzags somewhat randomly toward the minimum rather than descending along a straight route.
- Characteristics:
each step's search direction is not guaranteed to be a direction in which the loss function decreases;
each step's search direction is certainly not guaranteed to be the direction in which the loss function decreases fastest;
the optimization direction is therefore unpredictable.
- Significance:
Experimental results show that even though the optimization direction of stochastic gradient descent is unpredictable, the method still arrives near the minimum of the loss function. Unlike batch gradient descent, it is not guaranteed to reach the exact minimum, but when the number of samples is very large it can be worthwhile to trade a little model accuracy for the time saved in training the model. A sketch of a single SGD step follows.
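As a minimal sketch of one SGD step (illustrative only; the decay constants t0 and t1 match the defaults used in fit_sgd later in this post, and the toy data is an assumption):

import numpy as np

def dJ_sgd(theta, X_b_i, y_i):
    # search direction estimated from one sample (X_b_i, y_i)
    return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

def learning_rate(t, t0=5, t1=50):
    # decaying ("simulated annealing" style) learning rate
    return t0 / (t + t1)

rng = np.random.default_rng(666)
X_b = np.hstack([np.ones((100, 1)), rng.random((100, 3))])
y = X_b.dot(np.array([1., 2., 3., 4.]))
theta = np.zeros(4)

i = rng.integers(len(X_b))   # pick one sample at random
theta = theta - learning_rate(0) * dJ_sgd(theta, X_b[i], y[i])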
2.3 Mini-batch gradient descent
Mini-batch gradient descent is a compromise between batch gradient descent and stochastic gradient descent: each iteration computes the gradient on a small random batch of k samples rather than on all m samples or on just one. A minimal sketch is shown below.
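Below is a minimal sketch of the mini-batch gradient (the batch size and the helper name dJ_mini_batch are assumptions for illustration; the post itself does not give an implementation):

import numpy as np

def dJ_mini_batch(theta, X_b, y, batch_indexes):
    # gradient of the MSE loss estimated on a small random batch of samples
    X_batch = X_b[batch_indexes]
    y_batch = y[batch_indexes]
    return X_batch.T.dot(X_batch.dot(theta) - y_batch) * 2. / len(batch_indexes)

# illustrative usage: one update step with a batch of k = 10 samples
rng = np.random.default_rng(666)
X_b = np.hstack([np.ones((1000, 1)), rng.random((1000, 5))])
y = X_b.dot(np.arange(6, dtype=float)) + rng.normal(size=1000)
theta = np.zeros(6)
eta = 0.01
batch_indexes = rng.choice(len(X_b), size=10, replace=False)
theta = theta - eta * dJ_mini_batch(theta, X_b, y, batch_indexes)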
3. My own implementation of gradient descent (for linear regression)
3.1 Implementation of batch and stochastic gradient descent
1. Vectorized implementation (wrapped in a class)
import numpy as np
from .metrics import r2_score


class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None        # feature coefficients
        self.intercept_ = None   # intercept term
        self._theta = None       # full parameter vector [intercept, coefficients]

    def fit_normal(self, X_train, y_train):
        """Train the Linear Regression model on X_train, y_train with the normal equation"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def fit_bgd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the Linear Regression model on X_train, y_train with batch gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            # MSE loss; return inf when the value overflows
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')

        def dJ(theta, X_b, y):
            # vectorized gradient of the MSE loss over all samples
            return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                # stop once the loss no longer decreases noticeably
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def fit_sgd(self, X_train, y_train, n_iters=50, t0=5, t1=50):
        """Train the Linear Regression model on X_train, y_train with stochastic gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            # search direction estimated from a single sample
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters=5, t0=5, t1=50):

            def learning_rate(t):
                # decaying learning rate (simulated-annealing style)
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)

            for i_iter in range(n_iters):
                # shuffle so that every pass sees the samples in a new order
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes, :]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(i_iter * m + i) * gradient

            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Given a data set X_predict, return the vector of predictions for X_predict"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the current model on the test set X_test, y_test using the R^2 score"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
<Note> fit_bgd implements batch gradient descent; eta is the learning rate, J returns the value of the loss function, and dJ returns the gradient at a given point. fit_sgd implements stochastic gradient descent, with the learning rate decaying as t0 / (t + t1). A possible usage example is shown below.
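A usage sketch for the class above (the package layout playML/LinearRegression.py with a sibling metrics.py providing r2_score is an assumption, as is the synthetic data):

import numpy as np
# assumes the class above is saved as playML/LinearRegression.py, next to a
# metrics.py that provides r2_score (module names are assumptions)
from playML.LinearRegression import LinearRegression

np.random.seed(666)
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)   # y = 3x + 4 plus noise
X = x.reshape(-1, 1)

lin_reg = LinearRegression()
lin_reg.fit_bgd(X, y)                   # batch gradient descent
print(lin_reg.coef_, lin_reg.intercept_)

lin_reg_sgd = LinearRegression()
lin_reg_sgd.fit_sgd(X, y, n_iters=50)   # stochastic gradient descent, 50 passes over the data
print(lin_reg_sgd.coef_, lin_reg_sgd.intercept_)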
4. Stochastic gradient descent in scikit-learn
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

sgd_reg = SGDRegressor(n_iter_no_change=100)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
5. Testing SGD on real data
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y<50.0]
y = y[y<50.0]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)   # fit the scaler on the training data only
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(n_iter_no_change=100)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
The code above follows the general workflow for applying gradient descent: load the data and extract the features, split the data into training and test sets, standardize the features, and then build and evaluate the regression model. The same workflow can also be expressed with a scikit-learn Pipeline, as sketched below.
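As an aside (not part of the original post), the standardize-then-fit workflow can be wrapped in a scikit-learn Pipeline so the scaler is always fitted on the training data only:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# chain standardization and the SGD regressor into a single estimator
sgd_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd_reg", SGDRegressor(n_iter_no_change=100)),
])
sgd_pipeline.fit(X_train, y_train)          # fits the scaler, then the regressor
print(sgd_pipeline.score(X_test, y_test))   # R^2 on the raw (unscaled) test data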
6. Debugging the gradient (dJ_debug works for the gradient of any loss function)
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(666)
X = np.random.random(size=(1000, 10))
true_theta = np.arange(1, 12, dtype=float)   # 1 intercept plus 10 coefficients
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)

true_theta
X.shape
y.shape

def J(theta, X_b, y):
    # MSE loss; return inf when the value overflows
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except:
        return float('inf')

def dJ_math(theta, X_b, y):
    # analytically derived (vectorized) gradient of the MSE loss
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def dJ_debug(theta, X_b, y, epsilon=0.01):
    # numerical gradient: central difference in every coordinate of theta
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res

def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break
        cur_iter += 1
    return theta

X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

%time theta = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
theta

%time theta = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
theta
dJ_debug can be used to verify whether the gradient derived in dJ_math is correct. dJ_debug approximates the gradient directly from the definition of the derivative (a two-sided finite difference), so it works with any loss function, although it is much slower. Both versions recover the same coefficient vector θ of the linear regression. A short check is sketched below.
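A quick way to compare the two gradients at a single point (a small sketch, not part of the original post) is:

# compare the analytic and the numerical gradient at one random theta
test_theta = np.random.randn(X_b.shape[1])
analytic = dJ_math(test_theta, X_b, y)
numerical = dJ_debug(test_theta, X_b, y)
print(np.allclose(analytic, numerical, atol=1e-4))   # True when dJ_math is derived correctly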