Gradient Descent for Linear Regression
In ordinary least squares (OLS) linear regression we derived the gradient of the linear regression loss function:
$$\nabla J(\boldsymbol{w})=\frac{1}{m} \sum_{i=1}^{m}\left(\boldsymbol{w}^{\mathrm{T}} \boldsymbol{x}_{i}-y_{i}\right) \boldsymbol{x}_{i}=\frac{1}{m} \boldsymbol{X}^{\mathrm{T}}(\boldsymbol{X} \boldsymbol{w}-\boldsymbol{y})$$
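As a quick sanity check, the snippet below (with made-up data) compares this analytic gradient against central finite differences; note that this gradient corresponds to the loss $J(\boldsymbol{w})=\frac{1}{2m}\lVert\boldsymbol{X}\boldsymbol{w}-\boldsymbol{y}\rVert^{2}$:

import numpy as np

# Made-up data just for verifying the gradient formula.
rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))
y = rng.normal(size=5)
w = rng.normal(size=3)
m = X.shape[0]

def J(w):
    # Loss consistent with the gradient above: (1/(2m)) * ||Xw - y||^2
    r = X @ w - y
    return r @ r / (2 * m)

grad_analytic = X.T @ (X @ w - y) / m

# Central finite differences along each coordinate direction.
eps = 1e-6
grad_numeric = np.array([(J(w + eps * e) - J(w - eps * e)) / (2 * eps)
                         for e in np.eye(3)])
print(np.allclose(grad_analytic, grad_numeric))  # True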
Batch Gradient Descent
Batch gradient descent computes the gradient on the entire training set at every step. With learning rate $\eta$, the parameter update rule is:
$$\boldsymbol{w}:=\boldsymbol{w}-\eta \frac{1}{m} \boldsymbol{X}^{\mathrm{T}}(\boldsymbol{X} \boldsymbol{w}-\boldsymbol{y})$$
Because every update uses the entire batch of training samples to compute the gradient, batch gradient descent runs extremely slowly on very large training sets.
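A minimal sketch of this update on toy data (made up for illustration; X already includes the bias column $x_0=1$, so the exact solution is $\boldsymbol{w}=[1, 1]$ since $y = 1 + x$ here):

import numpy as np

X = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0]])   # bias column x0 = 1 included
y = np.array([2.0, 3.0, 4.0])
w = np.zeros(2)
eta = 0.1
m = X.shape[0]

# Repeated batch updates: w := w - eta * (1/m) * X^T (Xw - y)
for _ in range(1000):
    w -= eta * X.T @ (X @ w - y) / m
print(w)  # converges to approximately [1, 1]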
Stochastic Gradient Descent
Stochastic gradient descent instead uses a single sample to compute the gradient at each step. With learning rate $\eta$, the update rule for sample $(\boldsymbol{x}_i, y_i)$ is:
$$\boldsymbol{w}:=\boldsymbol{w}-\eta\left(\boldsymbol{w}^{\mathrm{T}} \boldsymbol{x}_{i}-y_{i}\right) \boldsymbol{x}_{i}$$
Mini-batch Gradient Descent
Mini-batch gradient descent, as the name suggests, computes the gradient on a small batch of samples. For a batch of $N$ samples starting at index $k$, the gradient estimate is:
$$\nabla J(\boldsymbol{w})=\frac{1}{N} \sum_{i=k}^{k+N-1}\left(\boldsymbol{w}^{\mathrm{T}} \boldsymbol{x}_{i}-y_{i}\right) \boldsymbol{x}_{i}$$
With learning rate $\eta$, the parameter update rule is:
$$\boldsymbol{w}:=\boldsymbol{w}-\eta \frac{1}{N} \sum_{i=k}^{k+N-1}\left(\boldsymbol{w}^{\mathrm{T}} \boldsymbol{x}_{i}-y_{i}\right) \boldsymbol{x}_{i}$$
This can likewise be written in matrix form, where $\boldsymbol{X}$ and $\boldsymbol{y}$ now denote only the $N$ samples of the mini-batch:
$$\boldsymbol{w}:=\boldsymbol{w}-\eta \frac{1}{N} \boldsymbol{X}^{\mathrm{T}}(\boldsymbol{X} \boldsymbol{w}-\boldsymbol{y})$$
import numpy as np


class GDLinearRegression:
    def __init__(self, n_iteration, learning_rate=1e-4, gd_type='mbgd', batch_size=1):
        self.n_iteration = n_iteration
        self.learning_rate = learning_rate
        self.w = None  # model parameters
        self.gd_type = gd_type
        self.batch_size = batch_size

    def loss(self, y, y_predict):
        '''Mean squared error.'''
        return np.sum((y_predict - y) ** 2) / y.size

    def _predict(self, X):
        '''Internal prediction; X is assumed to already contain the bias column.'''
        return np.matmul(X, self.w)

    def _bgd(self, X, y):
        '''Batch gradient descent: one update using the whole training set.'''
        y_predict = self._predict(X)
        grad = np.matmul(X.T, y_predict - y) / y.size
        self.w -= self.learning_rate * grad

    def _sgd(self, X, y):
        '''Stochastic gradient descent: one update per (shuffled) sample.'''
        m = X.shape[0]
        m_shuffle = np.arange(m)
        np.random.shuffle(m_shuffle)  # shuffle the dataset
        X = X[m_shuffle, :]
        y = y[m_shuffle]
        for i in range(m):
            y_predict = self._predict(X[i])       # scalar prediction w^T x_i
            grad = (y_predict - y[i]) * X[i]      # (w^T x_i - y_i) x_i
            self.w -= self.learning_rate * grad

    def _mbgd(self, X, y):
        '''Mini-batch gradient descent: one update per mini-batch.'''
        m = X.shape[0]
        m_shuffle = np.arange(m)
        np.random.shuffle(m_shuffle)  # shuffle the dataset
        X = X[m_shuffle, :]
        y = y[m_shuffle]
        k = m // self.batch_size  # number of full batches; any remainder is dropped
        for i in range(k):
            minibatch_X = X[self.batch_size * i: self.batch_size * (i + 1)]
            minibatch_y = y[self.batch_size * i: self.batch_size * (i + 1)]
            y_predict = self._predict(minibatch_X)
            grad = np.matmul(minibatch_X.T, y_predict - minibatch_y) / minibatch_y.size
            self.w -= self.learning_rate * grad

    def _gradient_descent(self, X, y):
        gd_type = self.gd_type.lower()
        if gd_type not in ('bgd', 'sgd', 'mbgd'):
            raise ValueError('No such type: ' + self.gd_type)
        for _ in range(self.n_iteration):
            if gd_type == 'bgd':
                self._bgd(X, y)
            elif gd_type == 'sgd':
                self._sgd(X, y)
            else:
                self._mbgd(X, y)

    def _preprocess_data(self, X):
        '''Prepend a bias column x0 = 1 to X.'''
        m, n = X.shape
        X_processed = np.ones((m, n + 1))
        X_processed[:, 1:] = X
        return X_processed

    def train(self, X, y):
        X_processed = self._preprocess_data(X)
        _, n = X_processed.shape
        self.w = np.random.random(n)  # random initialization of the parameters
        self._gradient_descent(X_processed, y)

    def predict(self, X):
        '''Prediction for external callers; adds the bias column first.'''
        X_processed = self._preprocess_data(X)
        return np.matmul(X_processed, self.w)
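A quick usage sketch on synthetic data generated as $y = 3 + 2x + \varepsilon$ (the data and hyperparameters here are made up for illustration):

# Hypothetical usage example with synthetic data.
rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(200, 1))
y = 3 + 2 * X[:, 0] + rng.normal(0, 0.5, size=200)

model = GDLinearRegression(n_iteration=1000, learning_rate=5e-3,
                           gd_type='mbgd', batch_size=20)
model.train(X, y)
print(model.w)                           # should be close to [3, 2]
print(model.loss(y, model.predict(X)))   # roughly the noise variance, ~0.25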