optimizer
这里不讨论各优化器的原理，只对几种常见的优化器做简单的效果比较，详细说明见官方文档
torch 版本 1.4.0
jupyter
数据
import torch
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Fix both RNG seeds so the synthetic dataset (and the torch weight
# init later on) is reproducible across runs.
torch.manual_seed(42)
np.random.seed(42)

# 500 points on [-1, 1] as a column vector; targets follow y = x^2
# plus uniform noise drawn from [0, 0.3).
X = np.linspace(-1, 1, 500)[:, np.newaxis]
y = X ** 2 + 0.3 * np.random.random(X.shape)

plt.scatter(X, y)
plt.savefig('./test.jpg')
plt.show()
将其转为torch,并使用DataLoader
import torch
from torch.utils.data import DataLoader, TensorDataset

# NumPy -> torch tensors (still float64 at this point).
X = torch.from_numpy(X)
y = torch.from_numpy(y)

# Cast to float32 for the model and serve shuffled mini-batches of 32.
dataset = TensorDataset(X.to(torch.float32), y.to(torch.float32))
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
网络
设置简单的网络
并为每种优化器赋予相同的网络
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Tiny MLP for 1-D regression: 1 -> 30 -> 1 with one ReLU."""

    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(1, 30)
        self.linear2 = nn.Linear(30, 1)

    def forward(self, x):
        # Single hidden layer with ReLU activation, linear output head.
        hidden = F.relu(self.linear1(x))
        return self.linear2(hidden)
# One freshly initialized network per optimizer, so each optimizer
# trains its own independent copy of the same architecture.
net_SGD = Net()
net_SGD_momentum = Net()
net_RMSprop = Net()
net_Adam = Net()

nets = [net_SGD, net_SGD_momentum, net_RMSprop, net_Adam]
names = 'SGD SGD_Momentum RMSProp Adam'.split()
优化器
from torch import optim

# Plain SGD and SGD+momentum share lr=0.01 for a fair comparison;
# RMSprop and Adam use the library's default hyper-parameters.
optim_SGD = optim.SGD(net_SGD.parameters(), lr=0.01)
optim_SGD_momentum = optim.SGD(
    net_SGD_momentum.parameters(), lr=0.01, momentum=0.9
)
optim_RMSprop = optim.RMSprop(net_RMSprop.parameters())
optim_Adam = optim.Adam(net_Adam.parameters())

# Order matches `nets` and `names` so the three lists zip together.
optims = [optim_SGD, optim_SGD_momentum, optim_RMSprop, optim_Adam]
训练
loss_func = nn.MSELoss()

# One loss history per optimizer; a value is appended per mini-batch.
loss_data = [[] for _ in range(len(optims))]

for epoch in range(10):
    for x_batch, y_batch in dataloader:
        # Loop variable is `optimizer`, NOT `optim`: the original code
        # rebound the module-level name `optim` (the torch.optim module
        # imported above), shadowing it after the first iteration.
        for net, optimizer, history in zip(nets, optims, loss_data):
            optimizer.zero_grad()
            prediction = net(x_batch)
            loss = loss_func(prediction, y_batch)
            loss.backward()
            optimizer.step()
            history.append(loss.item())
Loss 曲线
plt.figure(figsize=(12, 8))

# Plot each optimizer's per-batch loss curve under its label.
# (zip replaces the original `for i in range(len(loss_data))` index loop.)
for curve, label in zip(loss_data, names):
    plt.plot(curve, label=label)

plt.title('Loss')
plt.legend()
plt.savefig('./test1.jpg')
plt.show()
结论
虽然按先进程度排序通常是
Adam > RMSprop > SGD_Momentum > SGD
但在实际应用中并非越先进效果越好：
数据和网络结构不同，各优化器的实际表现也可能不同，
需要针对当前问题选择最合适的优化器