Machine Learning Notes 10: Gradient Descent
1. Gradient descent:
- Not a machine learning algorithm in itself
- A search-based optimization method
- Purpose: minimize a loss function
- Gradient ascent: the counterpart that maximizes a utility function
2. How gradient descent works
Search for the parameter value that minimizes the loss function. At each step the parameter is updated as
$\theta_{c} = \theta_{p} - \eta\frac{dJ}{d\theta}$
where $\theta_{p}$ is the previous value and $\theta_{c}$ is the newly computed one (a one-step worked example follows the list below).
- η is called the learning rate
- The value of η affects how quickly the optimum is reached
- If η is chosen badly, the optimum may not be reached at all
- η is a hyperparameter of gradient descent
- The initial starting point is also a hyperparameter!
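As a concrete one-step check, take the function used in the simulation below, $J(\theta)=(\theta-2.5)^2-1$, so $\frac{dJ}{d\theta}=2(\theta-2.5)$. Starting from $\theta_{p}=0$ with $\eta=0.1$:
$\theta_{c} = 0 - 0.1 \times 2 \times (0 - 2.5) = 0.5$
so θ moves from 0 toward the minimum at θ = 2.5, as expected.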
3. Simulating gradient descent
3.1 Implementing gradient descent
import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1,6,141) # 141 points, endpoints included
plot_y = (plot_x-2.5)**2-1
plt.plot(plot_x, plot_y)
plt.show()
def dJ(theta):
    # derivative of the loss function J
    return 2*(theta-2.5)

def J(theta):
    return (theta-2.5)**2-1
theta = 0.0
eta = 0.1
epsilon = 1e-8
theta_history = [theta]
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta*gradient
    theta_history.append(theta)
    # stop on the change in J rather than on the gradient itself,
    # since the derivative may never become exactly zero
    if abs(J(theta) - J(last_theta)) < epsilon:
        break
print(theta)
print(J(theta))
# plot the descent path
plt.plot(plot_x, J(plot_x))
plt.plot(np.array(theta_history), J(np.array(theta_history)),marker="^")
2.499891109642585
-0.99999998814289
This is essentially the analytic minimum: θ = 2.5 with J(θ) = −1.
3.2 Wrapping gradient descent in a function
def gradient_descent(initial_theta, eta, max_steps=1e4, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    i_ter = 0
    while i_ter < max_steps:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta*gradient
        theta_history.append(theta)
        # stop on the change in J, since the derivative may never be exactly zero;
        # max_steps guarantees termination even if the iteration diverges
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
        i_ter += 1

def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), marker="^")
    plt.show()
eta = 0.01 # a smaller eta
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
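With the smaller η the path still converges to the same minimum, just in many more steps. A quick check (hypothetical, not in the original notebook) is to count the recorded updates:

len(theta_history) # expected to be much larger than with eta = 0.1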
eta = 0.8 # a larger eta
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
eta = 1.4 # eta = 1.4 raises an error (a safe eta was 0.01): too large an eta does not converge
theta_history = []
gradient_descent(0.,eta)
plot_theta_history()
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-43-161898807e4f> in <module>
1 eta = 1.4 # eta increased
2 theta_history = []
----> 3 gradient_descent(0.,eta)
4 plot_theta_history()
<ipython-input-37-550e6e359947> in gradient_descent(initial_theta, eta, max_steps, epsilon)
9 theta_history.append(theta)
10 # stop on the change in J, since the derivative may never be exactly zero
---> 11 if ( abs( J(theta)-J(last_theta) )<epsilon ):
12 break
13 i_ter += 1
<ipython-input-25-afa8196718fc> in J(theta)
1 def J(theta):
----> 2 return (theta-2.5)**2-1
OverflowError: (34, 'Result too large')
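Why η = 1.4 blows up: for this loss the update θ_c = θ_p − η·2(θ_p − 2.5) can be rewritten as
$\theta_{c} - 2.5 = (1 - 2\eta)(\theta_{p} - 2.5)$
so the distance to the minimum shrinks only when $|1-2\eta|<1$, i.e. $0<\eta<1$. At η = 1.4 the factor is −1.8: every step multiplies the error by 1.8 (with alternating sign), and J(θ) grows until the float overflows, which is exactly the OverflowError above.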
4. Using gradient descent in a linear regression model
import numpy as np
from matplotlib import pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
x = x.reshape(-1,1)
x.shape
(100, 1)
y.shape
(100,)
plt.scatter(x,y)
plt.show()
4.1 Training with gradient descent
Goal: make the loss function
$\frac{1}{m}\sum_{i=1}^m(y^{(i)}-\hat{y}^{(i)})^2$
as small as possible, i.e. minimize
$J(\theta)=MSE(y,\hat{y})$
Its gradient, component by component (this is what dJ below computes), is
$\nabla J(\theta) = \frac{\partial J}{\partial \theta} = \frac{2}{m}\left( \begin{array}{c} \sum_{i=1}^m\left(X_b^{(i)}\theta-y^{(i)}\right) \\ \sum_{i=1}^m\left(X_b^{(i)}\theta-y^{(i)}\right)X_1^{(i)} \\ \vdots \\ \sum_{i=1}^m\left(X_b^{(i)}\theta-y^{(i)}\right)X_n^{(i)} \end{array}\right)$
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        # if theta diverges the squared error can overflow; treat it as infinite
        return float('inf')

def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)              # intercept component
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:, i]) # component for feature i
    return res * 2 / len(X_b)
def gradient_descent(X_b, y, initial_theta, eta, max_steps=1e5, epsilon=1e-8):
    theta = initial_theta
    i_ter = 0
    while i_ter < max_steps:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta*gradient
        # stop on the change in J, since the gradient may never be exactly zero
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break
        i_ter += 1
    return theta
X_b = np.hstack([np.ones((len(x),1)), x.reshape(-1,1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
theta
array([4.02145786, 3.00706277])
X_b.shape
(100, 2)
$
X_b =
\left( \begin{array}{cc}
1 & x_1 \\
1 & x_2 \\
1 & x_3
\end{array}\right)
$
# Vectorized form of the gradient: one matrix product instead of a loop
def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)
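A quick sanity check (not in the original notes): compare the vectorized gradient against the loop-based version on a random θ. dJ_loop is a hypothetical copy of the earlier implementation, renamed so that both exist at once; the two results should agree to floating-point precision.

def dJ_loop(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
    return res * 2 / len(X_b)

test_theta = np.random.random(X_b.shape[1])
np.allclose(dJ_loop(test_theta, X_b, y), dJ(test_theta, X_b, y)) # expected: True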
4.2 Gradient descent and normalization
Before using gradient descent it is best to normalize the data: features on different scales produce gradient components of very different magnitudes, which forces a tiny learning rate and slows convergence.
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0] # the target was capped at 50.0, so values above 50.0 were recorded as 50
y = y[y < 50.0]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
standard = StandardScaler()
standard.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
X_train_standard = standard.transform(X_train)
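The notes stop after standardizing, so here is a minimal sketch of the missing training step, reusing gradient_descent and the vectorized dJ from above (the eta value and the use of r2_score are my assumptions, not from the original):

X_b_train = np.hstack([np.ones((len(X_train_standard), 1)), X_train_standard])
initial_theta = np.zeros(X_b_train.shape[1])
theta = gradient_descent(X_b_train, y_train, initial_theta, eta=0.01)

# the test set must be transformed with the same scaler
X_test_standard = standard.transform(X_test)
X_b_test = np.hstack([np.ones((len(X_test_standard), 1)), X_test_standard])
y_predict = X_b_test.dot(theta)

from sklearn.metrics import r2_score
r2_score(y_test, y_predict)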
5. The advantage of gradient descent
m = 1000 # m samples
n = 5000 # n features
big_X = np.random.normal(size=(m,n))
true_theta = np.random.uniform(0.0, 100.0, size=n+1)
big_y = big_X.dot(true_theta[1:])+true_theta[0]+np.random.normal(0., 10,size=m)
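The notes break off here; the setup above suggests a speed comparison on a wide dataset (m = 1000 samples, n = 5000 features). A hedged sketch of that comparison, reusing gradient_descent and the vectorized dJ from section 4 (the %timeit calls, eta, and max_steps are my assumptions):

big_X_b = np.hstack([np.ones((len(big_X), 1)), big_X])
initial_theta = np.zeros(big_X_b.shape[1])

# direct least-squares solve (closed form); its cost grows quickly with n
%timeit np.linalg.lstsq(big_X_b, big_y, rcond=None)

# gradient descent: each step is one O(m*n) matrix product,
# capped at 1000 steps here just for the timing
%timeit gradient_descent(big_X_b, big_y, initial_theta, eta=0.001, max_steps=1000)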