Chapter 1  Linear Regression
Linear regression models a linear relationship between variables: the target is predicted as a weighted sum of the input features.
1.1 The Linear Regression Model
$$h_{w,b}(x) = \sum_{i=1}^n w_ix_i+b$$

Absorbing the bias $b$ into the weight vector as $w_0$, with a constant feature $x_0 = 1$:

$$w = (b,w_1,w_2,\dots,w_n)^T = (w_0,w_1,w_2,\dots,w_n)^T$$

$$x = (1,x_1,x_2,\dots,x_n)^T$$

$$h_{w}(x) = w^Tx$$
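A small numeric illustration (the values below are made up, not from the text): after absorbing the bias, a single dot product evaluates the model.

import numpy as np

w = np.array([1.0, 3.0])   # w0 = b = 1, w1 = 3
x = np.array([1.0, 2.0])   # constant feature 1 prepended to x1 = 2
print(w @ x)               # h_w(x) = 1*1 + 3*2 = 7.0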
1.2 Ordinary Least Squares
Ordinary least squares (OLS) solves for the parameters w by minimizing the MSE loss. MSE (mean squared error) is the mean of the squared differences between the predicted and actual values. Stacking the m training samples (each with the constant feature prepended) gives the design matrix X, and the parameters form the vector W:
$$X=\begin{pmatrix} 1 & x_{11} & \dots &x_{1n} \\ 1 & x_{21} & \dots &x_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ 1 & x_{m1} & \dots &x_{mn} \\ \end{pmatrix} = \begin{pmatrix} x_1^T \\x_2^T \\ \vdots \\ x_m^T\end{pmatrix}$$
$$W=\begin{pmatrix} w_0 \\ w_1 \\ \vdots \\w_n\\ \end{pmatrix}$$
The MSE loss over the m training samples is

$$J(w) = \frac{1}{2m}\sum_{i=1}^{m}\bigl(h_w(x_i)-y_i\bigr)^2 = \frac{1}{2m}\sum_{i=1}^{m}\bigl(w^Tx_i-y_i\bigr)^2$$
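In vectorized form the loss can be computed directly from the design matrix; a minimal sketch (the function name is illustrative, not from the text):

import numpy as np

def mse_loss(X, w, y):
    # J(w) = (1/2m) * sum((Xw - y)^2); the factor 1/2 only simplifies the gradient
    residual = X @ w - y
    return (residual ** 2).sum() / (2 * len(y))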
Taking the gradient with respect to w:

$$\nabla J(w) = \frac{1}{2m}\sum_{i=1}^{m}\frac{\partial (w^Tx_i-y_i)^2}{\partial w}$$

$$= \frac{1}{2m}\sum_{i=1}^{m}2(w^Tx_i-y_i)\frac{\partial(w^Tx_i-y_i)}{\partial w}$$

$$= \frac{1}{m}\sum_{i=1}^{m}(w^Tx_i-y_i)x_i$$

$$= \frac{1}{m}X^T(Xw-y)$$

Setting the gradient to zero gives the closed-form (normal equation) solution:

$$\hat w= (X^TX)^{-1}X^Ty$$

Alternatively, the parameters can be found iteratively with gradient descent (Section 1.3):

$$w:= w-\eta \frac{1}{m}X^T(Xw-y)$$
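A quick numerical check of the normal equation (a minimal sketch with made-up synthetic data; np.linalg.lstsq is used only as a more stable reference solver and is not part of the text above):

import numpy as np

rng = np.random.default_rng(0)
X_raw = rng.uniform(0, 10, size=(100, 1))
y = 3 * X_raw[:, 0] + 1 + rng.normal(scale=0.01, size=100)

X = np.hstack([np.ones((100, 1)), X_raw])         # design matrix with the constant feature
w_hat = np.linalg.inv(X.T @ X) @ (X.T @ y)        # normal equation
w_ref, *_ = np.linalg.lstsq(X, y, rcond=None)     # numerically more stable reference
print(w_hat, w_ref)                               # both close to [1, 3]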
1.3 Gradient Descent
Batch gradient descent: every training sample participates in each parameter update.
$$w:= w-\eta \frac{1}{m}X^T(Xw-y)$$
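A minimal sketch of this batch update rule on made-up noiseless data (the learning rate and iteration count are arbitrary choices, not from the text):

import numpy as np

rng = np.random.default_rng(0)
X = np.hstack([np.ones((100, 1)), rng.uniform(0, 10, (100, 1))])
y = 3 * X[:, 1] + 1

w, eta, m = np.zeros(2), 0.01, len(y)
for _ in range(5000):
    w -= eta * X.T @ (X @ w - y) / m   # w := w - eta * (1/m) X^T (Xw - y)
print(w)                               # converges toward [1, 3]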
Stochastic gradient descent: each update uses the error of a single sample. It needs little memory and scales to massive datasets. When the loss function is very irregular, it is more likely to jump out of local minima and get closer to the global minimum, but the parameter updates oscillate strongly. The dataset should be reshuffled at every epoch.
$$w:= w-\eta (w^Tx_i-y_i)x_i$$

Mini-batch gradient descent: each update uses a small batch of N samples.

$$w:= w-\eta \frac{1}{N}\sum_{i=k}^{k+N}(w^Tx_i-y_i)x_i$$
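The implementations in Section 1.4 cover the batch and stochastic variants; for completeness, here is a minimal mini-batch sketch (batch size, learning rate, and epoch count are arbitrary choices, not from the text):

import numpy as np

rng = np.random.default_rng(0)
X = np.hstack([np.ones((100, 1)), rng.uniform(0, 10, (100, 1))])
y = 3 * X[:, 1] + 1
w, eta, batch = np.zeros(2), 0.01, 20

for epoch in range(1000):
    idx = rng.permutation(len(y))                   # reshuffle at the start of every epoch
    for k in range(0, len(y), batch):
        b = idx[k:k + batch]
        grad = X[b].T @ (X[b] @ w - y[b]) / len(b)  # gradient over one mini-batch
        w -= eta * grad
print(w)                                            # converges toward [1, 3]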
1.4 Implementation
(1) Computing the parameters with the closed-form matrix solution
import numpy as np

class OLSLinearRegression:
    def _preprocess_data_X(self, X):
        # 1 Prepend a column of ones (the constant feature) to X
        m, n = X.shape
        X_ = np.ones([m, n + 1])
        X_[:, 1:] = X
        return X_

    def _ols(self, X, y):
        # 2 Closed-form solution: w = (X.T @ X)^{-1} (X.T @ y)
        return np.linalg.inv(X.T @ X) @ (X.T @ y)

    def train(self, X_train, y_train):
        # 3 Fit: compute w from the training data
        X_train = self._preprocess_data_X(X_train)
        self.w = self._ols(X_train, y_train)

    def predict(self, X):
        # 4 Predict: X @ w
        X = self._preprocess_data_X(X)
        return np.matmul(X, self.w)
if __name__ == '__main__':
    # Synthetic training data: y = 3x + 1 plus small noise
    X_train = np.linspace(0, 10, 100).reshape(-1, 1)                           # [100, 1]
    y_train = X_train * 3 + 1 + (np.random.randn(100) / 100).reshape(-1, 1)    # [100, 1]
    ols_reg = OLSLinearRegression()
    ols_reg.train(X_train, y_train)
    print(ols_reg.w.shape)
    print(ols_reg.w)
    y_pred = ols_reg.predict(X_train)
    print(y_pred.shape)
'''
(2, 1)
[[1.00133115]
[2.99992476]]
(100, 1)
'''
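As a sanity check (not part of the original code, and assuming scikit-learn is installed), the same fit can be reproduced with sklearn.linear_model.LinearRegression:

import numpy as np
from sklearn.linear_model import LinearRegression

X_train = np.linspace(0, 10, 100).reshape(-1, 1)
y_train = X_train * 3 + 1

sk_reg = LinearRegression()             # fits the intercept by default
sk_reg.fit(X_train, y_train)
print(sk_reg.intercept_, sk_reg.coef_)  # about [1.] and [[3.]], matching ols_reg.w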
(2) Computing the parameters with gradient descent
''' Linear_regression_gd
1. initial parameters
2. loss, gradient, gradient_descent, process_data
3. train, predict
4. application
'''
import numpy as np

class linear_regression_gd():
    def __init__(self, iters=1000, eta=1e-2, tol=None):
        self.iters = iters  # number of training iterations
        self.eta = eta      # learning rate for gradient descent
        self.tol = tol      # early-stopping threshold on the loss decrease
        self.w = None       # parameters

    def _preprocess_data_X(self, X):
        '''1 Prepend the constant feature'''
        m, n = X.shape
        X_ = np.ones([m, n + 1])
        X_[:, 1:] = X
        return X_

    def _predict(self, X, w):
        return np.matmul(X, w)

    def predict(self, X, w):
        '''2 Predict'''
        X = self._preprocess_data_X(X)
        return self._predict(X, w)

    def _loss(self, y_pre, y):
        '''3 MSE loss'''
        return np.mean((y_pre - y) ** 2)

    def _gradient(self, X, y, y_pre):
        return np.matmul(X.T, (y_pre - y)) / y.size  # [100,2].T @ [100,1] -> [2,1]

    def _gradient_descent(self, w, X, y):
        '''4 Gradient descent'''
        if self.tol is not None:
            loss_old = np.inf
        # 4.1 Iterate
        for i in range(self.iters):
            y_pre = self._predict(X, w)
            loss = self._loss(y_pre, y)
            # print('iter %d loss: %f' % (i, loss))
            # 4.2 Early stopping
            if self.tol is not None:
                if loss_old - loss < self.tol:
                    break
                loss_old = loss
            # 4.3 Update the parameters
            grad = self._gradient(X, y, y_pre)
            w -= self.eta * grad
            if i % 100 == 0:
                print(i, w)

    def train(self, X_train, y_train):
        '''5 Train'''
        # 5.1 Preprocess the data
        X_train = self._preprocess_data_X(X_train)
        # 5.2 Initialize the parameters (one weight per column of the design matrix)
        self.w = np.random.random(X_train.shape[-1]).reshape(-1, 1)
        # 5.3 Run gradient descent
        self._gradient_descent(self.w, X_train, y_train)
if __name__ == '__main__':
    X_train = np.linspace(0, 10, 100).reshape(-1, 1)           # [100, 1]
    y_train = X_train * 3 + 1 + np.random.randn(1) / 100       # [100, 1]
    reg_gd = linear_regression_gd(iters=3400, eta=1e-2, tol=1e-10)
    reg_gd.train(X_train, y_train)
    w = reg_gd.w
    y_pre = reg_gd.predict(X_train, w)
    print(y_pre.shape)
'''
0 [[0.20966034]
[1.6335141 ]]
100 [[0.55296788]
[3.06662242]]
200 [[0.65046267]
[3.05196308]]
300 [[0.7265051 ]
[3.04052933]]
400 [[0.78581544]
[3.03161141]]
500 [[0.83207537]
[3.02465576]]
600 [[0.86815645]
[3.01923061]]
700 [[0.89629839]
[3.01499918]]
800 [[0.91824808]
[3.01169882]]
900 [[0.93536805]
[3.00912466]]
1000 [[0.94872101]
[3.0071169 ]]
1100 [[0.95913584]
[3.00555093]]
1200 [[0.96725903]
[3.00432952]]
1300 [[0.97359483]
[3.00337687]]
1400 [[0.97853652]
[3.00263384]]
1500 [[0.98239086]
[3.0020543 ]]
1600 [[0.9853971 ]
[3.00160228]]
1700 [[0.98774187]
[3.00124972]]
1800 [[0.9895707 ]
[3.00097474]]
1900 [[0.99099712]
[3.00076026]]
2000 [[0.99210968]
[3.00059297]]
2100 [[0.99297744]
[3.0004625 ]]
2200 [[0.99365425]
[3.00036073]]
2300 [[0.99418215]
[3.00028136]]
2400 [[0.99459388]
[3.00021945]]
2500 [[0.99491503]
[3.00017116]]
2600 [[0.9951655]
[3.0001335]]
2700 [[0.99536087]
[3.00010413]]
2800 [[0.99551324]
[3.00008121]]
2900 [[0.99563209]
[3.00006334]]
3000 [[0.99572479]
[3.00004941]]
(100, 1)
Process finished with exit code 0
'''
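To relate the two implementations (a hypothetical check, reusing the OLSLinearRegression class from part (1) and the X_train, y_train defined above), the gradient-descent weights should approach the closed-form ones:

ols_check = OLSLinearRegression()
ols_check.train(X_train, y_train)
print(ols_check.w)   # closed-form weights; reg_gd.w converges toward these values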
(3) Stochastic gradient descent
import numpy as np

class Linear_Regression_sgd:
    def __init__(self, n_iter=1000, eta=0.01):
        self.n_iter = n_iter  # number of training epochs
        self.eta = eta        # initial learning rate
        self.w = None         # model parameters (initialized from the feature count at training time)

    def _preprocess_data_X(self, X):
        # Prepend the constant feature
        m, n = X.shape
        X_ = np.ones([m, n + 1])
        X_[:, 1:] = X
        return X_

    def _gradient(self, xi, yi, w):
        return -xi * (yi - np.dot(xi, w))  # gradient of the single-sample squared error

    def _stochastic_gradient_descent(self, X, y, eta, n_iter):
        m, _ = X.shape
        eta0 = eta
        step_i = 0
        for j in range(n_iter):
            idx = np.random.permutation(m)                     # reshuffle X and y together each epoch
            for i in idx:
                grad = self._gradient(X[i], y[i], self.w)      # gradient from a single sample
                step_i += 1
                eta = eta0 / np.power(step_i, 0.25)            # decaying learning rate
                self.w += eta * -grad                          # update the parameters w
            if j % 10 == 0:
                print(self.w)

    def train(self, X, y):
        X = self._preprocess_data_X(X)
        if self.w is None:
            _, n = X.shape
            self.w = np.random.random(n)
        self._stochastic_gradient_descent(X, y, self.eta, self.n_iter)

    def predict(self, X):
        X = self._preprocess_data_X(X)
        return np.dot(X, self.w)
if __name__ == '__main__':
    X_train = np.linspace(0, 10, 100).reshape(-1, 1)            # [100, 1]
    y_train = X_train * 0.8 + 0.5 + np.random.randn(1) / 100    # [100, 1]
    reg_sgd = Linear_Regression_sgd(n_iter=100, eta=1e-5)
    reg_sgd.train(X_train, y_train)
    w = reg_sgd.w
    y_pre = reg_sgd.predict(X_train)
'''
[0.87034832 0.56149583]
[0.87206667 0.56075662]
[0.87340481 0.55991468]
[0.87459365 0.559757 ]
[0.87569099 0.55915361]
[0.87672773 0.55848424]
[0.87771676 0.55795229]
[0.87866814 0.55752828]
[0.87958761 0.55706313]
[0.88048 0.55706118]
Process finished with exit code 0
'''
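For reference (not from the original text, and assuming scikit-learn is installed), sklearn.linear_model.SGDRegressor implements the same idea with an inverse-scaling learning-rate schedule similar to the eta0 / step**0.25 decay used above:

import numpy as np
from sklearn.linear_model import SGDRegressor

X_train = np.linspace(0, 10, 100).reshape(-1, 1)
y_train = (X_train * 0.8 + 0.5).ravel()

sgd = SGDRegressor(learning_rate='invscaling', eta0=0.01, power_t=0.25, max_iter=1000)
sgd.fit(X_train, y_train)
print(sgd.intercept_, sgd.coef_)   # roughly 0.5 and 0.8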