linear regression
生成一些数据
import numpy as np
import matplotlib.pyplot as plt
# Synthetic data for a noisy straight line, y = 4 + 3x + noise.
# np.random.rand(100, 1) returns a (100, 1) array of samples drawn from a
# uniform distribution over [0, 1); doubling it spreads X over [0, 2).
X = 2 * np.random.rand(100, 1)
# np.random.randn adds standard-normal noise to every target.
y = (3 * X + 4) + np.random.randn(100, 1)
X[0:5]  # note the shape: X is a 100x1 column, so this shows five one-element rows
array([[1.18171039],
[1.62333209],
[0.59230184],
[1.03828925],
[1.3602664 ]])
# Peek at the first five targets.
y[0:5]
array([[ 6.45485124],
[11.0588182 ],
[ 5.36127661],
[ 6.61603683],
[ 7.76445136]])
$\hat{\theta} = (X^T X)^{-1} X^T y$
使用 MSE 进行梯度下降
# Closed-form least squares via the normal equation:
#     theta_hat = (X_b^T X_b)^{-1} X_b^T y
# np.c_ concatenates its arguments column-wise. Prepending a column of ones
# (x0 = 1) to X lets the first entry of theta act as the intercept.
# The row count comes from X itself rather than a hard-coded 100, so the
# cell keeps working if the sample size changes.
X_b = np.c_[np.ones((X.shape[0], 1)), X]
# np.linalg.inv computes the matrix inverse.
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
X_b[0:5]
array([[1. , 1.18171039],
[1. , 1.62333209],
[1. , 0.59230184],
[1. , 1.03828925],
[1. , 1.3602664 ]])
theta_best # the data is noisy, but the estimate is expected to land close to y = 4 + 3X
array([[3.9255906 ],
[3.03536388]])
用 $\hat{\theta}$ 做出预测
# Predict at x = 0 and x = 2 with the parameters from the normal equation.
X_new = np.array([[0], [2]])
# np.c_ is indexed with square brackets, not called with parentheses; the
# column of ones supplies the intercept term.
X_new_b = np.c_[np.ones((2, 1)), X_new]
y_predict = X_new_b @ theta_best
y_predict
array([[3.9255906 ],
[9.99631837]])
# Scatter the training data (blue dots) and overlay the fitted line (red).
plt.figure(figsize=(8, 6))
plt.plot(X, y, 'b.')
plt.plot(X_new, y_predict, 'r-')
# plt.axis takes [xmin, xmax, ymin, ymax], fixing both axis ranges at once.
plt.axis([0, 2, 0, 15])
plt.show()
使用sklearn进行线性回归
from sklearn.linear_model import LinearRegression  # author's note: computed via SVD

# Fit on X rather than X_b: the estimator computes the intercept itself, so
# no column of ones has to be added by hand. fit() returns the estimator.
lin_reg = LinearRegression().fit(X, y)
# Print the intercept b and the weight W.
print(lin_reg.intercept_, lin_reg.coef_)
# Predict at the same two points used for the normal-equation fit.
ans = lin_reg.predict(X_new)
ans
[3.9255906] [[3.03536388]]
array([[3.9255906 ],
[9.99631837]])
使用梯度下降
# Batch gradient descent on the MSE cost: every step uses the whole
# training set.
lr = 0.1  # learning rate
n_iterations = 1000
m = len(X_b)  # number of training samples (100 for the data above)
theta = np.random.randn(2, 1)  # random initialisation of [intercept, slope]
for iteration in range(n_iterations):
    # Gradient of the MSE with respect to theta: (2/m) * X_b^T (X_b theta - y)
    gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - lr * gradients
theta
array([[3.9255906 ],
[3.03536388]])
# np.random.randint
# randint(low, high=None, size=None, dtype=int)
# Return random integers from `low` (inclusive) to `high` (exclusive).
# Return random integers from the "discrete uniform" distribution of
# the specified dtype in the "half-open" interval [`low`, `high`). If
# `high` is None (the default), then results are from [0, `low`).
print(np.random.randint(6)) # one integer from [0, 6)
print(np.random.randint(low = 4, high = 11)) # one integer from [4, 11), i.e. 4 through 10 inclusive
4
7
n_epochs = 50
t0, t1 = 5, 50  # learning-schedule hyperparameters


def learning_schedule(t):
    """Return the learning rate for update step ``t``.

    The rate is ``t0 / (t + t1)``: it starts at ``t0 / t1`` for ``t = 0``
    and decays as ``t`` grows. ``t0`` and ``t1`` are read from module scope.
    """
    return t0 / (t + t1)


theta = np.random.randn(2, 1)  # random initialisation of [intercept, slope]
# Example of what X_b looks like (a column of ones followed by the feature):
# array([[1. , 1.82998312],
# [1. , 0.63093605],
# [1. , 0.64826421],
# [1. , 0.19033492],
# [1. , 1.97900686]])
# The prints below compare slicing with plain indexing on a single row.
print(len(X_b))
print(X_b[0:1].shape)
print(X_b[0].shape) # Why slice [random_index:random_index + 1] instead of indexing with random_index? Plain indexing drops the row axis and gives shape (2,)
print(X_b[0].reshape(1, 2).shape)
print(y[0:1].shape)
100
(1, 2)
(2,)
(1, 2)
(1, 1)
# Stochastic gradient descent: each update uses one randomly chosen sample
# (effectively a batch size of 1), with a decaying learning rate.
for epoch in range(n_epochs):  # n_epochs passes in total
    for i in range(m):  # m single-sample updates per epoch
        # Draw an index from [0, m); samples are picked with replacement.
        random_index = np.random.randint(m)
        # Slice instead of indexing so xi and yi stay 2-D (one row each);
        # plain indexing would drop the row axis and break the products.
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        # Gradient of the squared error on this one sample (no 1/m factor).
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        # The step count grows with both epoch and i, so lr keeps shrinking.
        lr = learning_schedule(epoch * m + i)
        theta = theta - lr * gradients
theta
array([[3.9128593 ],
[3.01191738]])
用sklearn实现使用SGD的线性回归
# np.ndarray.ravel??
# Return a flattened array.
y.ravel().shape # ravel() flattens the column vector y into a 1-D array
(100,)
from sklearn.linear_model import SGDRegressor

# Run at most 1000 epochs; tol=1e-3 is the stopping tolerance on the loss
# improvement; penalty=None turns regularisation off; eta0=0.1 is the
# initial learning rate.
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
# Fit on X, not X_b: SGDRegressor fits its own intercept, so the extra
# column of ones in X_b is redundant and splits the bias between
# intercept_ and coef_[0].
# y.ravel() is needed because fit expects a 1-D target; without it sklearn
# warns "A column-vector y was passed when a 1d array was expected."
sgd_reg.fit(X, y.ravel())
SGDRegressor(eta0=0.1, penalty=None)
# Inspect the fitted intercept and coefficient array.
sgd_reg.intercept_, sgd_reg.coef_
(array([1.92913098]), array([1.92913098, 2.94632646]))
多项式回归
m = 100
# NOTE(review): np.random.randn draws from a standard normal, so X is not
# confined to a fixed interval. If a uniform range was intended,
# np.random.rand would be the call to use — confirm.
X = 6 * np.random.randn(m, 1) - 3
# Quadratic in X plus standard-normal noise.
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)
# Taking out a single row drops the row axis and yields a 1-D array.
print(X.shape)
print(X[0].shape, X[0])
print(y[0].shape, y[