1 什么是梯度下降法
说明:
不是一个机器学习算法
是一种基于搜索的最优化方法
作用:最小化一个损失函数
梯度上升法:最大化一个效用函数
●η称为学习率(learning rate)
●η的取值影响获得最优解的速度
●n取值不合适,甚至得不到最优解
●n是梯度下降法的一个超参数
并不是所有函数都有唯一的极值点
解决方案:
●多次运行,随机化初始点
●梯度下降法的初始点也是一个超参数
2 模拟实现梯度下降法
import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1, 6, 141)
plot_y = (plot_x - 2.5) ** 2 - 1
# 可视化
# 横轴是theta,纵轴是损失函数
plt.plot(plot_x, plot_y)
plt.show()
输出:
# 计算损失函数对于theta的导数
def dJ(theta):
return 2 * (theta - 2.5)
# 计算损失函数
def J(theta):
return (theta - 2.5) ** 2 - 1
模拟梯度下降法的过程
eta = 0.1 # 初始化学习率
epsilon = 1e-8 # 初始化精度
theta = 0.0 # 给定起始点
while True:
gradient = dJ(theta) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
if abs(J(theta) - J(last_theta)) < epsilon:
break
print(theta)
print(J(theta))
>>>2.499891109642585
>>>-0.99999998814289
在损失函数图像上画出梯度下降的过程
eta = 0.1 # 初始化学习率
epsilon = 1e-8 # 初始化精度
theta = 0.0 # 给定起始点
theta_history = [theta]
while True:
gradient = dJ(theta) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
theta_history.append(theta)
if abs(J(theta) - J(last_theta)) < epsilon:
break
plt.plot(plot_x, J(plot_x))
plt.plot(theta_history, J(np.array(theta_history)), color = 'r', marker = '+')
plt.show()
输出:
# 查看theta_history的长度
len(theta_history)
>>>46
将计算梯度和绘制梯度下降过程封装成函数:
def gradient_descent(initial_theta, eta, epsilon = 1e-8):
theta = initial_theta # 给定起始点
theta_history.append(theta)
while True:
gradient = dJ(theta) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
theta_history.append(theta)
if abs(J(theta) - J(last_theta)) < epsilon:
break
def plot_theta_history():
plt.plot(plot_x, J(plot_x))
plt.plot(theta_history, J(np.array(theta_history)), color = 'r', marker = '+')
plt.show()
降低学习率eta,重新模拟梯度下降过程:
eta = 0.01
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
plot_theta_history()
输出:
为了防止迭代次数过多陷入死循环,可设置迭代次数:
def gradient_descent(initial_theta, eta, n_iter = 1e3, epsilon = 1e-8):
theta = initial_theta # 给定起始点
theta_history.append(theta)
i_iter = 0
while i_iter < n_iter:
gradient = dJ(theta) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
theta_history.append(theta)
if abs(J(theta) - J(last_theta)) < epsilon:
break
i_iter += 1
设置一个较大的学习率试试:
eta = 1.1
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
len(theta_history)
>>>1001
# 查看theta_history的最后一个值
theta_history[-1]
>>>-3.79477522293131e+79
eta = 1.1
theta_history = []
initial_theta = 0.0
gradient_descent(initial_theta, eta)
plot_theta_history()
输出:
3 线性回归中的梯度下降法
4 实现线性回归中的梯度下降法
在线性回归模型中使用梯度下降法:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size = 100)
y = x * 3 + 4 + np.random.normal(size = 100)
x = x.reshape(-1, 1)
print(x.shape)
print(y.shape)
>>>(100, 1)
>>>(100,)
># 可视化
plt.scatter(x, y)
plt.show()
输出:
使用梯度下降法训练:
封装计算损失函数与梯度的函数:
def J(theta, x_b, y):
try:
return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
except:
return float('inf')
def dJ(theta, x_b, y):
res = np.empty(len(theta))
res[0] = np.sum(x_b.dot(theta) - y)
for i in range(1, len(theta)):
res[i] = (x_b.dot(theta) - y).dot(x_b[:,i])
return res * 2 / len(x_b)
接下来进行梯度下降的过程:
def gradient_descent(x_b, y, initial_theta, eta, epsilon = 1e-8):
theta = initial_theta # 给定起始点
while True:
gradient = dJ(theta, x_b, y) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
break
return theta
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
eta = 0.01
theta = gradient_descent(x_b, y, initial_theta, eta)
theta
>>>array([4.02145786, 3.00706277])
在自己编写的LinearRegression.py文件中添加梯度下降法:
def fit_gd(self, x_train, y_train, eta = 0.01, n_iter = 1e3):
"""根据x_train和y_train,使用梯度下降法训练LinearRegression模型"""
assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'
def J(theta, x_b, y):
try:
return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
except:
return float('inf')
def dJ(theta, x_b, y):
res = np.empty(len(theta))
res[0] = np.sum(x_b.dot(theta) - y)
for i in range(1, len(theta)):
res[i] = (x_b.dot(theta) - y).dot(x_b[:, i])
return res * 2 / len(x_b)
def gradient_descent(x_b, y, initial_theta, eta, epsilon=1e-8):
theta = initial_theta # 给定起始点
while True:
gradient = dJ(theta, x_b, y) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
break
return theta
x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
initial_theta = np.zeros(x_b.shape[1])
self._theta = gradient_descent(x_b, y_train, initial_theta, eta)
self.interception_ = self._theta[0]
self.coef_ = self._theta[1:]
return self
使用自己编写的LinearRegression中的梯度下降法:
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_gd(x, y)
>>>LinearRegression()
# 查看计算出的参数
print(lin_reg.coef_)
print(lin_reg.interception_)
>>>[3.00706277]
>>>4.021457858204859
5 梯度下降的向量化和数据标准化
在编写的LinearRegression.py文件的fit_gd方法中使用向量化的方式求导数:
def fit_gd(self, x_train, y_train, eta = 0.01, n_iter = 1e3):
"""根据x_train和y_train,使用梯度下降法训练LinearRegression模型"""
assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'
def J(theta, x_b, y):
try:
return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
except:
return float('inf')
def dJ(theta, x_b, y):
# 用for循环求导数
# res = np.empty(len(theta))
# res[0] = np.sum(x_b.dot(theta) - y)
# for i in range(1, len(theta)):
# res[i] = (x_b.dot(theta) - y).dot(x_b[:, i])
# return res * 2 / len(x_b)
# 用向量化求导数
return x_b.T.dot(x_b.dot(theta) - y) * 2 / len(x_b)
def gradient_descent(x_b, y, initial_theta, eta, epsilon=1e-8):
theta = initial_theta # 给定起始点
while True:
gradient = dJ(theta, x_b, y) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
break
return theta
x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
initial_theta = np.zeros(x_b.shape[1])
self._theta = gradient_descent(x_b, y_train, initial_theta, eta)
self.interception_ = self._theta[0]
self.coef_ = self._theta[1:]
return self
加载并预处理数据:
import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data
y = boston.target
# 去掉异常数据
x = x[y < 50]
y = y[y < 50]
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed = 666)
from playML.LinearRegression import LinearRegression
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(x_train, y_train)
>>>Wall time: 257 ms
>>>LinearRegression()
lin_reg1.score(x_test, y_test)
>>>0.8129794056212793
使用梯度下降法前进行数据归一化:
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_standard = standardscaler.transform(x_train)
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(x_train_standard, y_train)
>>>Wall time: 444 ms
>>>LinearRegression()
x_test_standard = standardscaler.transform(x_test)
lin_reg3.score(x_test_standard, y_test)
>>>0.8129873310487505
6 随机梯度下降法
使用自己模拟的数据:
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size = m)
y = 4. * x + 3. + np.random.normal(size = m)
def J(theta, x_b, y):
try:
return np.sum((y - x_b.dot(theta)) ** 2) / len(x_b)
except:
return float('inf')
def dJ(theta, x_b, y):
return x_b.T.dot(x_b.dot(theta) - y) * 2 / len(x_b)
# 非默认参数不能放在默认参数之后
def gradient_descent(x_b, y, initial_theta, eta, n_iter = 1e4, epsilon=1e-8):
theta = initial_theta # 给定起始点
cur_iter = 0
while cur_iter < n_iter:
gradient = dJ(theta, x_b, y) # 计算梯度
last_theta = theta
theta = theta - eta * gradient # 更新theta值
if abs(J(theta, x_b, y) - J(last_theta, x_b, y)) < epsilon:
break
cur_iter += 1
return theta
%%time
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
eta = 0.01
theta = gradient_descent(x_b, y, initial_theta, eta)
>>>Wall time: 1.37 s
theta
>>>array([3.00392812, 4.00011491])
随机梯度下降法:
def dJ_sgd(theta, x_b_i, y_i):
return x_b_i.T.dot(x_b_i.dot(theta) - y_i) * 2
def sgd(x_b, y, initial_theta, n_iter):
t0 = 5
t1 = 50
def learning_rate(t):
return t0 / (t + t1)
theta = initial_theta
for cur_iter in range(n_iter):
rand_i = np.random.randint(len(x_b))
gradient = dJ_sgd(theta, x_b[rand_i], y[rand_i])
theta = theta - learning_rate(cur_iter) * gradient
return theta
%%time
x_b = np.hstack([np.ones((len(x), 1)), x])
initial_theta = np.zeros(x_b.shape[1])
theta = sgd(x_b, y, initial_theta, n_iter = len(x_b) // 3)
>>>Wall time: 877 ms
theta
>>>array([3.0107388 , 4.00465842])
7 scikit-learn中的随机梯度下降法
在我们编写的LinearRegression.py文件中添加使用随机梯度下降法计算梯度的算法:
def fit_sgd(self, x_train, y_train, n_iter = 5, t0 = 5, t1 = 50):
"""根据x_train和y_train,使用随机梯度下降法训练LinearRegression模型"""
assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to y_train'
assert n_iter >= 1
def dJ_sgd(theta, x_b_i, y_i):
return x_b_i.T.dot(x_b_i.dot(theta) - y_i) * 2
def sgd(x_b, y, initial_theta, n_iter):
def learning_rate(t):
return t0 / (t + t1)
theta = initial_theta
m = len(x_b)
for cur_iter in range(n_iter * m):
indexes = np.random.permutation(m) # 将m个样本的索引乱序排列
x_b_new = x_b[indexes]
y_new = y[indexes]
for i in range(m):
gradient = dJ_sgd(theta, x_b_new[i], y_new[i])
theta = theta - learning_rate(cur_iter * m + i) * gradient
return theta
x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
initial_theta = np.zeros(x_b.shape[1])
self._theta = sgd(x_b, y_train, initial_theta, n_iter)
self.interception_ = self._theta[0]
self.coef_ = self._theta[1:]
在真实数据下使用我们的SGD:
import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data
y = boston.target
# 去掉异常数据
x = x[y < 50]
y = y[y < 50]
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed = 666)
# 数据归一化
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_standard = standardscaler.transform(x_train)
x_test_standard = standardscaler.transform(x_test)
调用我们自己编写的fit_sgd方法:
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_sgd(x_train_standard, y_train, n_iter = 2)
%time lin_reg.score(x_test_standard, y_test)
>>>Wall time: 266 ms
>>>0.8129906331888908
sklearn中的SGD
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor()
%time sgd_reg.fit(x_train_standard, y_train)
sgd_reg.score(x_test_standard, y_test)
>>>Wall time: 450 ms
>>>0.8130539886776001