梯度下降法
在使用梯度下降法前,最好进行数据归一化
- 不是一个机器学习算法
- 是一种基于搜索的最优化方法
- 作用:最小化损失函数
- 梯度上升法,最大化一个效用函数
- η称为学习率
- η的值影响获得最优解的速度
- η取值不合适甚至得不到最优解
- η是梯度下降法的一个超参数
优势
特征越多的情况下,梯度下降相比于正规方程所耗时间更短,优势就体现出来了。
问题:存在多个极值点的情况?
- 多次运行,随机化初始点
- 梯度下降法的初始点也是一个超参数
tensorflow梯度下降实现多元线性回归
梯度下降的实现过程(精彩):
x = np.linspace(-1,6,140)
y = np.power((x-2.5),2)-1
plt.plot(x,y)
def dJ(theta):
return 2*(theta-2.5)
def J(theta):
try:
return (theta-2.5)**2-1
except:
return float('inf')
theta = 0.0
eta = 0.1
epsilon = 1e-8
while True:
gradient = dJ(theta)
last_theta = theta
theta = theta - eta * gradient
if (abs(J(theta)-J(last_theta))<epsilon):
break
print(theta)
print(J(theta))
>>>2.499891109642585
>>>-0.99999998814289
theta = 0.0
eta = 0.1
epsilon = 1e-8
history = [theta]
while True:
gradient = dJ(theta)
last_theta = theta
theta = theta - eta * gradient
history.append(theta)
if (abs(J(theta)-J(last_theta))<epsilon):
break
plt.plot(x,y)
plt.plot(np.array(history),J(np.array(history)),color='r',marker='+')
plt.show()
print("走的步数:%s"%(len(history)))
theta = 0.0
eta = 0.01
epsilon = 1e-8
history = [theta]
while True:
gradient = dJ(theta)
last_theta = theta
theta = theta - eta * gradient
history.append(theta)
if (abs(J(theta)-J(last_theta))<epsilon):
break
plt.plot(x,y)
plt.plot(np.array(history),J(np.array(history)),color='r',marker='+')
plt.show()
print("走的步数:%s"%(len(history)))
theta = 0.0
eta = 0.8
epsilon = 1e-8
history = [theta]
while True:
gradient = dJ(theta)
last_theta = theta
theta = theta - eta * gradient
history.append(theta)
if (abs(J(theta)-J(last_theta))<epsilon):
break
plt.plot(x,y)
plt.plot(np.array(history),J(np.array(history)),color='r',marker='+')
plt.show()
print("走的步数:%s"%(len(history)))
theta = 0.0
eta = 1.1
epsilon = 1e-8
history = [theta]
iters = 0
while True:
iters += 1
gradient = dJ(theta)
last_theta = theta
theta = theta - eta * gradient
history.append(theta)
if (abs(J(theta)-J(last_theta))<epsilon or iters>=3):
break
plt.plot(x,y)
plt.plot(np.array(history),J(np.array(history)),color='r',marker='+')
plt.show()
print("走的步数:%s"%(len(history)))
在线性回归的模型中使用梯度下降
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(5)
x = 2*np.random.random(size=100)
y = x*3+4+np.random.normal(size=100)
x = x.reshape((-1,1))
y.shape
>>>(100,)
plt.scatter(x,y)
plt.show()
自己实现梯度下降多元线性回归:
class LinearRegressor:
def __init__(self):
self.coef_ = None # θ向量
self.intercept_ = None # 截距
self._theta = None
self._epsilon = 1e-8
def J(self,theta,x,y):
try:
return np.sum(np.power((y - np.dot(x,theta)),2))/len(x)
except:
return float('inf')
def dJ(self,theta,x,y):
# length = len(theta)
# res = np.zeros(length)
# res[0] = np.sum(x.dot(theta)-y)
# for i in range(1,length):
# res[i] = np.sum((x.dot(theta)-y).dot(x[:,i]))
# return res*2/len(x)
return x.T.dot(x.dot(theta)-y)*2/len(y)
def fit(self,theta,x,y,eta):
x = np.hstack((np.ones((x.shape[0],1)),x))
iters = 0
while True:
iters += 1
gradient = self.dJ(theta,x,y)
last_theta = theta
theta = theta - eta * gradient
if (abs(self.J(theta,x,y)-self.J(last_theta,x,y))<self._epsilon or iters>=10000):
break
self._theta = theta
self.intercept_ = self._theta[0] # 截距
self.coef_ = self._theta[1:] # 权重参数weights
return self
def predict(self,testX):
x = np.hstack((np.ones((testX.shape[0],1)),testX))
# y = np.dot(x,self._theta.reshape(-1,1))
y = np.dot(x,self._theta)
return y
def score(self,testX,testY):
return r2_score(testY,self.predict(testX))
lr = LinearRegressor()
theta = np.zeros([x.shape[1]+1])
lr.fit(theta,x,y,0.01)
print(lr.coef_)
print(lr.intercept_)
>>>[2.94805425]
>>>4.090993917959022
随机梯度下降法:
- 跳出局部最优解
- 更快的运行速度
学习率随着循环次数而递减
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size=m)
X = np.reshape(x,(-1,1))
y = 4*x +3 +np.random.normal(0,3,size=m)
# 梯度下降法
lr = LinearRegressor()
theta = np.zeros([X.shape[1]+1])
%time lr.fit(theta,X,y,0.01)
print(lr.coef_)
print(lr.intercept_)
Wall time: 3.76 s
[3.99569098]
2.999661554158197
In [33]:
class StochasticGradientDescent(LinearRegressor):
def dJ(self,theta,x_i,y_i):
return x_i.T.dot(x_i.dot(theta)-y_i)*2
def learn_rate(self,t):
t0 = 5
t1 = 50
return t0/(t+t1)
def fit(self,theta,x,y,eta,n=5):
x = np.hstack((np.ones((x.shape[0],1)),x))
iters = 0
m = len(x)
for i in range(n):
index = np.random.permutation(m)
x_new = x[index]
y_new = y[index]
for j in range(m):
rand_i = np.random.randint(len(x))
gradient = self.dJ(theta,x_new[rand_i],y_new[rand_i])
theta = theta - self.learn_rate(eta) * gradient
self._theta = theta
self.intercept_ = self._theta[0] # 截距
self.coef_ = self._theta[1:] # 权重参数weights
return self
sgd = StochasticGradientDescent()
theta = np.zeros([X.shape[1]+1])
%time sgd.fit(theta,X,y,0.01)
print(sgd.coef_)
print(sgd.intercept_)
>>>Wall time: 3.86 s
>>>[4.21382386]
>>>2.2958068134016685
## scikit learn 的实现
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(max_iter=5)
sgd.fit(X,y)
print(sgd.coef_)
print(sgd.intercept_)
>>>[3.99482988]
>>>[3.01943775]
梯度调试:
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(seed=666)
x = np.random.random(size=(1000,10))
theta_true = np.arange(1,12,dtype=np.float32)
x_b = np.hstack([np.ones((x.shape[0],1)),x])
y = np.dot(x_b,theta_true) + np.random.normal(size=1000)
print(x.shape)
print(y.shape)
print(theta_true)
>>>(1000, 10)
>>>(1000,)
>>>[ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11.]
class Debug_GradientDescent(LinearRegressor):
def dJ(self,theta,x,y):
"""导数的定义"""
epsilon = 0.01
res = np.empty(len(theta))
for i in range(len(theta)):
theta_1 = theta.copy()
theta_1[i] += epsilon
theta_2 = theta.copy()
theta_2[i] -= epsilon
res[i] = (self.J(theta_1,x,y) - self.J(theta_2,x,y))/(2*epsilon)
return res
debug_gd = Debug_GradientDescent()
theta = np.zeros([x.shape[1]+1])
%time debug_gd.fit(theta,x,y,0.01)
debug_gd._theta
>>>Wall time: 14.5 s
>>>array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968, 5.05002117,
5.90494046, 6.97383745, 8.00088367, 8.86213468, 9.98608331,
10.90529198])
lr = LinearRegressor()
theta = np.zeros([x.shape[1]+1])
%time lr.fit(theta,x,y,0.01)
lr._theta
>>>Wall time: 1.58 s
>>>array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968, 5.05002117,
5.90494046, 6.97383745, 8.00088367, 8.86213468, 9.98608331,
10.90529198])