1. Understanding different optimizers
a. Stochastic gradient descent and 2-D optimization
import numpy as np
import matplotlib.pyplot as plt
# Gradient descent
# Define a moderately complex objective function
J = lambda w: 1.5 * w ** 4 - 15 * w ** 3 + 3 * w ** 2
# A random factor is multiplied onto the derivative of J to simulate stochastic gradient
# descent, where each step's gradient comes from a randomly chosen subset of the data
J_prime = lambda w: (6 * w ** 3 - 45 * w ** 2 + 6 * w) * (1 + 10 * np.random.random())
# w = np.linspace(-10, 20, 100)
# print(w)
# plt.plot(w, J(w))
# plt.show()
# Accumulating gradients and averaging them reduces the impact of the noise,
# but a plain average over all past gradients is not the best approach
# J = 0
# for i in range(100):
#     J += J_prime(1)
# J /= 100
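# To make the claim above concrete, here is a minimal runnable sketch (the
# names samples and running_mean are ours, for illustration): single noisy
# samples of J_prime(1) fluctuate wildly, while their running mean settles
samples = np.array([J_prime(1) for _ in range(1000)])
running_mean = np.cumsum(samples) / np.arange(1, len(samples) + 1)
plt.plot(samples, label='single samples')
plt.plot(running_mean, label='running mean')
plt.legend()
plt.show()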
# # Momentum idea: we want the accumulated quantity to weight the current
# # gradient more heavily, so we use an exponentially weighted moving average
# J = 0
# # Record the smoothed gradient values
# JJ = []
#
# for i in range(1000):
#     # Weight the previous average against the current gradient (J_prime)
#     J = 0.9 * J + 0.1 * J_prime(1)
#     JJ.append(J)
# plt.plot(JJ)
# plt.show()
# # Plain gradient descent
# w2 = 1
# epoch = 100
# lr = 0.01
# Loss = []
# W = []
#
# for i in range(epoch):
#     w2 = w2 - lr * J_prime(w2)  # update the parameter with the (noisy) gradient
#     Loss.append(J(w2))
#     W.append(w2)
#
# plt.plot(Loss)
# plt.figure()
# plt.plot(W)
# print(w2)
# # Momentum gradient descent; the velocity must be initialised
# w = 1
# epoch = 100
# lr = 0.001
# beta = 0.5
# v = 0
# Loss = []
# W = []
# for i in range(epoch):
#     # Maintain a velocity v; J_prime is the current gradient, beta the weight
#     v = beta * v + (1 - beta) * J_prime(w)
#     w = w - lr * v
#     Loss.append(J(w))
#     W.append(w)
#
# plt.plot(Loss)
# plt.figure()
# plt.plot(W)
# 2-D problem
# A simple 2-D optimization solved with (stochastic) gradient descent
J = lambda w1, w2: w1 ** 2 + 10 * w2 ** 2
J_prime1 = lambda w1: 2 * w1
J_prime2 = lambda w2: 20 * w2
w1 = 1
w2 = -1
epoch = 200
lr = 0.01
Loss = []
W1 = []
W2 = []
for i in range(epoch):
    # update each coordinate with its own partial derivative
    w1 = w1 - lr * J_prime1(w1)
    w2 = w2 - lr * J_prime2(w2)
    W1.append(w1)
    W2.append(w2)
    Loss.append(J(w1, w2))
plt.plot(Loss)
plt.figure()
plt.plot(W1)
plt.plot(W2)
plt.show()
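# For contrast with the commented 1-D momentum code above, a minimal sketch of
# momentum gradient descent on the same 2-D bowl (beta = 0.9 is an illustrative
# choice); the velocity smooths the oscillation along the steep w2 direction
w1, w2 = 1, -1
v1 = v2 = 0
beta = 0.9
Loss_m = []
for i in range(epoch):
    v1 = beta * v1 + (1 - beta) * J_prime1(w1)  # smoothed gradient along w1
    v2 = beta * v2 + (1 - beta) * J_prime2(w2)  # smoothed gradient along w2
    w1 = w1 - lr * v1
    w2 = w2 - lr * v2
    Loss_m.append(J(w1, w2))
plt.plot(Loss_m)
plt.show()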
b. AdaGrad (adaptive gradient scaling)
import numpy as np
import matplotlib.pyplot as plt
# AdaGrad
J = lambda w1, w2: w1 ** 2 + 10 * w2 ** 2
J_prime1 = lambda w1: 2 * w1
J_prime2 = lambda w2: 20 * w2
w1 = 1
w2 = -1
epoch = 200
lr = 0.1
Loss = []
W1 = []
W2 = []
s1 = s2 = 0
for i in range(epoch):
    # accumulate squared gradients, then scale each step by 1/sqrt(accumulator)
    s1 += J_prime1(w1) ** 2
    w1 = w1 - lr * J_prime1(w1) / np.sqrt(s1)
    s2 += J_prime2(w2) ** 2
    w2 = w2 - lr * J_prime2(w2) / np.sqrt(s2)
    W1.append(w1)
    W2.append(w2)
    Loss.append(J(w1, w2))
plt.plot(Loss)
plt.figure()
plt.plot(W1)
plt.plot(W2)
print(w1, w2)
# Apply the momentum idea to the squared-gradient accumulator:
# an exponentially weighted moving average instead of a plain sum
s = 0
S = []
beta = 0.8
for i in range(100):
    s = beta * s + (1 - beta) * J_prime1(w1) ** 2
    S.append(np.sqrt(s))
plt.plot(S)
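# The motivation: AdaGrad's plain sum of squared gradients grows without bound,
# so its effective step size lr / sqrt(s) keeps shrinking. A minimal sketch of
# that decay, evaluating the gradient at a fixed w1 = 1 for illustration
s_ada = 0
step_sizes = []
for i in range(100):
    s_ada += J_prime1(1) ** 2                 # unbounded accumulation
    step_sizes.append(0.1 / np.sqrt(s_ada))   # effective step with lr = 0.1
plt.plot(step_sizes)
plt.show()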
c. RMSProp
import numpy as np
import matplotlib.pyplot as plt
# RMSProp
# Scale each direction by a moving average of squared gradients, which
# alleviates AdaGrad's problem of steps shrinking over long optimization runs
J = lambda w1, w2: w1 ** 2 + 10 * w2 ** 2
J_prime1 = lambda w1: 2 * w1
J_prime2 = lambda w2: 20 * w2
w1 = 1
w2 = -1
epoch = 200
lr = 0.01
beta2 = 0.5
Loss = []
W1 = []
W2 = []
s1 = s2 = 0
for i in range(epoch):
    # moving average of squared gradients, with bias correction for early steps
    s1 = beta2 * s1 + (1 - beta2) * (J_prime1(w1) ** 2)
    s1_correct = s1 / (1 - beta2 ** (i + 1))
    w1 = w1 - lr * J_prime1(w1) / np.sqrt(s1_correct)
    s2 = beta2 * s2 + (1 - beta2) * (J_prime2(w2) ** 2)
    s2_correct = s2 / (1 - beta2 ** (i + 1))
    w2 = w2 - lr * J_prime2(w2) / np.sqrt(s2_correct)
    W1.append(w1)
    W2.append(w2)
    Loss.append(J(w1, w2))
plt.plot(Loss)
plt.figure()
plt.plot(W1)
plt.plot(W2)
print(w1, w2)
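# Why divide by (1 - beta2 ** (i + 1)): with s initialised to 0, the moving
# average underestimates the squared-gradient scale in the first steps, and the
# correction factor quickly approaches 1. A quick numeric check with beta2 = 0.5:
for t in range(1, 6):
    print(t, 1 / (1 - 0.5 ** t))
# prints 2.0, 1.33..., 1.14..., 1.07..., 1.03... -> rapidly approaches 1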
d. Adam
import numpy as np
import matplotlib.pyplot as plt
# Adam combines the strengths of several optimizers: momentum helps it move
# past saddle points, AdaGrad-style scaling corrects the step size along weak
# directions, and bias correction compensates for the zero-initialised moving
# averages during the first steps
# 1-D optimization with Adam
J = lambda w: 1.5 * w ** 4 - 15 * w ** 3 + 3 * w ** 2
# A random factor is multiplied onto the derivative of J to simulate stochastic gradient
# descent, where each step's gradient comes from a randomly chosen subset of the data
J_prime = lambda w: (6 * w ** 3 - 45 * w ** 2 + 6 * w) * (1 + 10 * np.random.random())
w = 1
epoch = 200
lr = 0.01
beta1 = 0.9
beta2 = 0.99
v = 0
s = 0
Loss = []
W = []
for i in range(epoch):
    g = J_prime(w)  # sample the noisy gradient once so v and s see the same value
    v = beta1 * v + (1 - beta1) * g         # first moment (momentum)
    v_correct = v / (1 - beta1 ** (i + 1))  # bias-corrected first moment
    s = beta2 * s + (1 - beta2) * (g ** 2)  # second moment (scaling)
    s_correct = s / (1 - beta2 ** (i + 1))  # bias-corrected second moment
    w = w - lr * v_correct / (np.sqrt(s_correct) + 1e-8)  # small eps for stability
    W.append(w)
    Loss.append(J(w))
plt.plot(Loss)
plt.figure()
plt.plot(W)
e. Adam for a 2-D problem
import numpy as np
import matplotlib.pyplot as plt
# The same coordinate-wise Adam update, applied to the 2-D bowl from above
J = lambda w1, w2: w1 ** 2 + 10 * w2 ** 2
J_prime1 = lambda w1: 2 * w1
J_prime2 = lambda w2: 20 * w2
w1 = 1
w2 = -1
epoch = 200
lr = 0.01
beta1 = 0.9
beta2 = 0.99
v1 = v2 = 0
s1 = s2 = 0
Loss = []
W1 = []
W2 = []
for i in range(epoch):
    # Adam update for w1
    v1 = beta1 * v1 + (1 - beta1) * J_prime1(w1)
    v1_correct = v1 / (1 - beta1 ** (i + 1))
    s1 = beta2 * s1 + (1 - beta2) * (J_prime1(w1) ** 2)
    s1_correct = s1 / (1 - beta2 ** (i + 1))
    w1 = w1 - lr * v1_correct / (np.sqrt(s1_correct) + 1e-8)
    # Adam update for w2 (note: the squared gradient must use w2, not w1)
    v2 = beta1 * v2 + (1 - beta1) * J_prime2(w2)
    v2_correct = v2 / (1 - beta1 ** (i + 1))
    s2 = beta2 * s2 + (1 - beta2) * (J_prime2(w2) ** 2)
    s2_correct = s2 / (1 - beta2 ** (i + 1))
    w2 = w2 - lr * v2_correct / (np.sqrt(s2_correct) + 1e-8)
    W1.append(w1)
    W2.append(w2)
    Loss.append(J(w1, w2))
plt.plot(Loss)
plt.figure()
plt.plot(W1)
plt.plot(W2)
plt.show()
2. Choosing an optimizer in PyTorch
import torch
from torch import nn
from torch import optim
import numpy as np
import matplotlib.pyplot as plt
xy = np.loadtxt('./data/diabetes.csv.gz', delimiter=',', dtype=np.float32)
# plain tensors; the old Variable wrapper is deprecated since PyTorch 0.4
x_data = torch.from_numpy(xy[:, 0:-1])
y_data = torch.from_numpy(xy[:, [-1]])
print(x_data.shape)
print(y_data.shape)
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.l1 = nn.Linear(8, 6)
        self.l2 = nn.Linear(6, 4)
        self.l3 = nn.Linear(4, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):  # must be named 'forward' for nn.Module to dispatch to it
        out1 = self.sigmoid(self.l1(x))
        out2 = self.sigmoid(self.l2(out1))
        y_pred = self.sigmoid(self.l3(out2))
        return y_pred
model = Model()
criterion = nn.BCELoss(reduction='mean')  # size_average is deprecated
optimizer = optim.Adam(model.parameters(), lr=0.05, betas=(0.9, 0.999), weight_decay=0.001)
Loss = []
for epoch in range(200):
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    if epoch % 20 == 0:
        print("epoch =", epoch, "loss =", loss.item())
    Loss.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
plt.plot(Loss)
hour_var = torch.randn(1, 8)  # a random input just to exercise the model
print("predict", model(hour_var).item() > 0.5)
plt.show()
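# Choosing an optimizer only changes the optim constructor line; the training
# loop above is unchanged. A minimal sketch of the main alternatives
# (hyperparameter values here are illustrative, not tuned):
optimizer = optim.SGD(model.parameters(), lr=0.1)                   # plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)     # SGD + momentum
optimizer = optim.Adagrad(model.parameters(), lr=0.05)              # AdaGrad
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)  # RMSProp
optimizer = optim.Adam(model.parameters(), lr=0.05, betas=(0.9, 0.999))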