Review of the gradient descent procedure
#1 Initialize $\theta$
#2 Compute the gradient $g$
#3 Update: $\theta^{t+1} = \theta^{t} - \alpha \cdot g$
#4 Repeat #2 and #3 until $g$ converges (the gradient approaches zero)
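To make the four steps concrete, here is a minimal sketch of full-batch gradient descent for linear regression. The data setup (y = 4 + 3x plus noise) and the variable names mirror the SGD script later in this section and are used here only for illustration.

import numpy as np

x = 2 * np.random.rand(100, 1)              # uniform random feature x1
y = 4 + 3 * x + np.random.randn(100, 1)     # ground truth plus standard-normal noise
x_b = np.c_[np.ones((100, 1)), x]           # prepend the x0 = 1 column

learning_rate = 0.01
n_iterations = 1000
m = 100

theta = np.random.randn(2, 1)               # #1 initialize theta
for iteration in range(n_iterations):       # #4 fixed iteration budget instead of a threshold
    gradients = 2 / m * x_b.T.dot(x_b.dot(theta) - y)  # #2 full-batch gradient of the MSE
    theta = theta - learning_rate * gradients           # #3 update theta
print(theta)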
Stochastic Gradient Descent (SGD)
"Stochastic" refers to random sampling; "Gradient Descent" is the same update as above.
Each descent step uses one randomly drawn sample instead of the whole dataset.
Advantage: each step is fast.
Disadvantage: precision drops; the single-sample updates are noisy, so the path toward the minimum oscillates.
for i = 1 to m, {
    $\theta_{j} := \theta_{j} - \alpha \left( h_{\theta}(x^{(i)}) - y^{(i)} \right) x_{j}^{(i)}$
}
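A minimal NumPy sketch of this update rule, written as one sequential pass over all m samples (the data setup is the same assumed linear-regression example; the full script further below samples randomly instead of iterating in order):

import numpy as np

x = 2 * np.random.rand(100, 1)
y = 4 + 3 * x + np.random.randn(100, 1)
x_b = np.c_[np.ones((100, 1)), x]

theta = np.random.randn(2, 1)
alpha = 0.01
m = 100

# One pass: for i = 1..m, update theta using only the i-th sample
for i in range(m):
    xi = x_b[i:i + 1]                        # the i-th sample, kept 2-D
    yi = y[i:i + 1]
    gradient = xi.T.dot(xi.dot(theta) - yi)  # (h_theta(x_i) - y_i) * x_i
    theta = theta - alpha * gradient
print(theta)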
# Pick a random index, extract that single sample, and train on it
index = np.random.randint(m)
xi = x_b[index:index+1]                 # the sampled row of x_b (kept 2-D)
yi = y[index:index+1]                   # the corresponding target value
gradients = xi.T.dot(xi.dot(theta)-yi)  # gradient computed from this one sample
import numpy as np
import matplotlib.pyplot as plt
# Random feature x1; np.random.rand draws from a uniform distribution
x = 2 * np.random.rand(100,1)
# Construct the ground-truth y column; np.random.randn(100,1) adds standard-normal noise
y = 4 + 3 * x + np.random.randn(100,1)
# Combine x0 (a column of ones) and x1 into one matrix
x_b = np.c_[np.ones((100,1)),x]
learning_rate = 0.01 # a commonly used default learning rate
n_iterations = 10000 # enough iterations (an exact global optimum is not required)
m = 100 # number of samples (rows)
# #1 Initialize theta (w0...wn)
theta = np.random.randn(2,1) # x_b contains only x0 and x1, so two parameters suffice
# #4 No convergence threshold; fix the number of iterations as a hyperparameter and treat finishing the loop as convergence
for iteration in range(n_iterations):
    # #2 Compute the gradient from one randomly drawn sample
    index = np.random.randint(m) # pick a random index and train on that single sample
    xi = x_b[index:index+1]      # the sampled x row
    yi = y[index:index+1]        # the corresponding y value
    gradients = xi.T.dot(xi.dot(theta)-yi) # single-sample gradient, so no 1/m averaging
    # #3 Adjust theta
    theta = theta - learning_rate * gradients
print(theta)
x_new = np.array([[0],[2]])
x_new_b = np.c_[(np.ones((2,1))),x_new]
y_predict = x_new_b.dot(theta)
# Plot the fitted line and the raw data
plt.plot(x_new,y_predict,'r-')
plt.plot(x,y,'b.')
plt.axis([0,2,0,15])
plt.show()
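Since the data were generated as y = 4 + 3x plus noise, the printed theta should land roughly near [[4], [3]]. As an optional cross-check, here is a minimal sketch fitting the same kind of data with scikit-learn's SGDRegressor (assuming scikit-learn is installed; its defaults, such as the learning-rate schedule and L2 penalty, differ from the hand-written loop above):

import numpy as np
from sklearn.linear_model import SGDRegressor

x = 2 * np.random.rand(100, 1)
y = 4 + 3 * x + np.random.randn(100, 1)

sgd_reg = SGDRegressor(max_iter=1000, eta0=0.01)  # stochastic gradient descent regressor
sgd_reg.fit(x, y.ravel())                         # SGDRegressor expects a 1-D target
print(sgd_reg.intercept_, sgd_reg.coef_)          # expect values roughly near 4 and 3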