PyTorch generally leaves the training loop to the user, so it is fair to say that 1000 PyTorch users will produce 1000 styles of training code.
From a practical standpoint, a good training loop should have the following qualities.
Clean, readable code [modular, easy to modify, short enough]
Support for common conveniences [progress bar, evaluation metrics, early stopping]
After repeated refinement and testing, I carefully designed a Keras-style training loop for PyTorch that fully satisfies these conditions.
The design has been popular with readers on Zhihu, earning more than 600 upvotes so far.
Full Zhihu answer: 《深度学习里面,请问有写train函数的模板吗?》("Is there a template for writing the train function in deep learning?")
https://www.zhihu.com/question/523869554/answer/2633479163
This training-loop template is also the core of torchkeras, a PyTorch model-training library I open-sourced:
https://github.com/lyhue1991/torchkeras
Ta-da! torchkeras has gained new capabilities.
Recently, by building on HuggingFace's accelerate library, torchkeras gained support for multi-GPU DDP training and for training on TPU devices.
This post walks through both; the experience is powerful and smooth.
Reply with the keyword 训练模版 to the 算法美食屋 WeChat account to get this article's Bilibili video demo and notebook source code.
# Install the latest accelerate from GitHub
!pip install git+https://github.com/huggingface/accelerate
1. A walkthrough of the torchkeras source code
The core of torchkeras lives in this single file:
https://github.com/lyhue1991/torchkeras/blob/master/torchkeras/kerasmodel.py
import sys,datetime
from tqdm import tqdm
from copy import deepcopy
import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
def colorful(obj, color="red", display_type="plain"):
    color_dict = {"black":"30", "red":"31", "green":"32", "yellow":"33",
                  "blue":"34", "purple":"35", "cyan":"36", "white":"37"}
    display_type_dict = {"plain":"0", "highlight":"1", "underline":"4",
                         "shine":"5", "inverse":"7", "invisible":"8"}
    s = str(obj)
    color_code = color_dict.get(color, "")
    display = display_type_dict.get(display_type, "")
    out = '\033[{};{}m'.format(display, color_code) + s + '\033[0m'
    return out
class StepRunner:
    def __init__(self, net, loss_fn, accelerator, stage="train", metrics_dict=None,
                 optimizer=None, lr_scheduler=None):
        self.net, self.loss_fn, self.metrics_dict, self.stage = net, loss_fn, metrics_dict, stage
        self.optimizer, self.lr_scheduler = optimizer, lr_scheduler
        self.accelerator = accelerator

    def __call__(self, batch):
        features, labels = batch

        # loss
        preds = self.net(features)
        loss = self.loss_fn(preds, labels)

        # backward()
        if self.optimizer is not None and self.stage == "train":
            self.accelerator.backward(loss)
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            self.optimizer.zero_grad()

        # gather predictions, labels and loss from all processes
        all_preds = self.accelerator.gather(preds)
        all_labels = self.accelerator.gather(labels)
        all_loss = self.accelerator.gather(loss).sum()

        # metrics
        step_metrics = {self.stage + "_" + name: metric_fn(all_preds, all_labels).item()
                        for name, metric_fn in self.metrics_dict.items()}
        return all_loss.item(), step_metrics
class EpochRunner:
    def __init__(self, steprunner):
        self.steprunner = steprunner
        self.stage = steprunner.stage
        # set train/eval mode according to the stage
        self.steprunner.net.train() if self.stage == "train" else self.steprunner.net.eval()
        self.accelerator = self.steprunner.accelerator

    def __call__(self, dataloader):
        total_loss, step = 0, 0
        loop = tqdm(enumerate(dataloader),
                    total=len(dataloader),
                    file=sys.stdout,
                    disable=not self.accelerator.is_local_main_process,
                    ncols=100)
        for i, batch in loop:
            if self.stage == "train":
                loss, step_metrics = self.steprunner(batch)
            else:
                with torch.no_grad():
                    loss, step_metrics = self.steprunner(batch)
            step_log = dict({self.stage + "_loss": loss}, **step_metrics)
            total_loss += loss
            step += 1
            if i != len(dataloader) - 1:
                loop.set_postfix(**step_log)
            else:
                # on the last batch, report epoch-level loss and metrics, then reset the metrics
                epoch_loss = total_loss / step
                epoch_metrics = {self.stage + "_" + name: metric_fn.compute().item()
                                 for name, metric_fn in self.steprunner.metrics_dict.items()}
                epoch_log = dict({self.stage + "_loss": epoch_loss}, **epoch_metrics)
                loop.set_postfix(**epoch_log)
                for name, metric_fn in self.steprunner.metrics_dict.items():
                    metric_fn.reset()
        return epoch_log
class KerasModel(torch.nn.Module):
    def __init__(self, net, loss_fn, metrics_dict=None, optimizer=None, lr_scheduler=None):
        super().__init__()
        self.net, self.loss_fn = net, loss_fn
        self.metrics_dict = torch.nn.ModuleDict(metrics_dict)
        self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(
            self.net.parameters(), lr=1e-3)
        self.lr_scheduler = lr_scheduler

    def forward(self, x):
        return self.net.forward(x)

    def fit(self, train_data, val_data=None, epochs=10, ckpt_path='checkpoint.pt',
            patience=5, monitor="val_loss", mode="min", mixed_precision='no'):
        accelerator = Accelerator(mixed_precision=mixed_precision)
        device = str(accelerator.device)
        device_type = '🐌' if 'cpu' in device else '⚡️'
        accelerator.print(colorful("<<<<<< " + device_type + " " + device + " is used >>>>>>"))

        net, optimizer, lr_scheduler = accelerator.prepare(
            self.net, self.optimizer, self.lr_scheduler)
        train_dataloader, val_dataloader = accelerator.prepare(train_data, val_data)

        loss_fn = self.loss_fn
        if isinstance(loss_fn, torch.nn.Module):
            loss_fn.to(accelerator.device)
        metrics_dict = self.metrics_dict
        metrics_dict.to(accelerator.device)

        history = {}
        for epoch in range(1, epochs + 1):
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            accelerator.print("\n" + "==========" * 8 + "%s" % nowtime)
            accelerator.print("Epoch {0} / {1}".format(epoch, epochs) + "\n")

            # 1, train -------------------------------------------------
            train_step_runner = StepRunner(
                net=net,
                loss_fn=loss_fn,
                accelerator=accelerator,
                stage="train",
                metrics_dict=deepcopy(metrics_dict),
                optimizer=optimizer,
                lr_scheduler=lr_scheduler)
            train_epoch_runner = EpochRunner(train_step_runner)
            train_metrics = train_epoch_runner(train_dataloader)
            for name, metric in train_metrics.items():
                history[name] = history.get(name, []) + [metric]

            # 2, validate -------------------------------------------------
            if val_dataloader:
                val_step_runner = StepRunner(
                    net=net,
                    loss_fn=loss_fn,
                    accelerator=accelerator,
                    stage="val",
                    metrics_dict=deepcopy(metrics_dict))
                val_epoch_runner = EpochRunner(val_step_runner)
                with torch.no_grad():
                    val_metrics = val_epoch_runner(val_dataloader)
                val_metrics["epoch"] = epoch
                for name, metric in val_metrics.items():
                    history[name] = history.get(name, []) + [metric]

            # 3, early-stopping -------------------------------------------------
            accelerator.wait_for_everyone()
            arr_scores = history[monitor]
            best_score_idx = np.argmax(arr_scores) if mode == "max" else np.argmin(arr_scores)
            if best_score_idx == len(arr_scores) - 1:
                # the current epoch is the best so far: save an unwrapped checkpoint
                unwrapped_net = accelerator.unwrap_model(net)
                accelerator.save(unwrapped_net.state_dict(), ckpt_path)
                accelerator.print(colorful("<<<<<< reach best {0} : {1} >>>>>>".format(
                    monitor, arr_scores[best_score_idx])))
            if len(arr_scores) - best_score_idx > patience:
                accelerator.print(colorful("<<<<<< {} without improvement in {} epochs, early stopping >>>>>>".format(
                    monitor, patience)))
                break

        if accelerator.is_local_main_process:
            self.net.load_state_dict(torch.load(ckpt_path))
        dfhistory = pd.DataFrame(history)
        accelerator.print(dfhistory)
        return dfhistory

    @torch.no_grad()
    def evaluate(self, val_data):
        accelerator = Accelerator()
        self.net = accelerator.prepare(self.net)
        val_data = accelerator.prepare(val_data)
        if isinstance(self.loss_fn, torch.nn.Module):
            self.loss_fn.to(accelerator.device)
        self.metrics_dict.to(accelerator.device)
        val_step_runner = StepRunner(net=self.net, stage="val",
                                     loss_fn=self.loss_fn, metrics_dict=deepcopy(self.metrics_dict),
                                     accelerator=accelerator)
        val_epoch_runner = EpochRunner(val_step_runner)
        val_metrics = val_epoch_runner(val_data)
        return val_metrics
This training loop has every property I set out to achieve.
Modular: organized bottom-up into three levels (StepRunner, EpochRunner, KerasModel), so the structure is clear at a glance.
Easy to modify: if your batches have a different shape (for example, the features are packed into a dict, or there are multiple inputs), only StepRunner needs to change; everything downstream stays the same. See the sketch right after this list.
Short enough: the whole training loop is under 200 lines.
Progress bars: provided via tqdm.
Evaluation metrics: plug in metrics from the torchmetrics library, or define your own.
Early stopping: just pass monitor, mode, and patience to fit.
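For example, suppose each batch arrived as a dict instead of a (features, labels) tuple; the keys "features" and "labels" below are a hypothetical layout, not the MNIST example that follows. Subclassing StepRunner and rewriting __call__ is the only change needed; a minimal sketch:

class DictStepRunner(StepRunner):
    # sketch: identical to StepRunner.__call__ except for how the batch is unpacked
    def __call__(self, batch):
        features, labels = batch["features"], batch["labels"]
        preds = self.net(features)
        loss = self.loss_fn(preds, labels)
        if self.optimizer is not None and self.stage == "train":
            self.accelerator.backward(loss)
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            self.optimizer.zero_grad()
        all_preds = self.accelerator.gather(preds)
        all_labels = self.accelerator.gather(labels)
        all_loss = self.accelerator.gather(loss).sum()
        step_metrics = {self.stage + "_" + name: metric_fn(all_preds, all_labels).item()
                        for name, metric_fn in self.metrics_dict.items()}
        return all_loss.item(), step_metrics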
2. Train your PyTorch model on a CPU or a single GPU
If the system has a GPU, torchkeras automatically uses it to train your PyTorch model; otherwise it trains on the CPU.
In our example, a single-GPU run takes about 18s per epoch.
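Device placement is handled by accelerate's Accelerator. A quick way to confirm which device a run will pick up, using only the public accelerate API (a minimal sketch):

from accelerate import Accelerator

accelerator = Accelerator()
print(accelerator.device)  # e.g. cuda:0 when a GPU is visible, otherwise cpu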
!pip install -U torchkeras
import torch
from torch import nn
import torchvision
from torchvision import transforms
import torchmetrics
from torchkeras import KerasModel
### 1. Prepare the data
def create_dataloaders(batch_size=1024):
    transform = transforms.Compose([transforms.ToTensor()])
    ds_train = torchvision.datasets.MNIST(root="./mnist/", train=True, download=True, transform=transform)
    ds_val = torchvision.datasets.MNIST(root="./mnist/", train=False, download=True, transform=transform)
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True,
                                           num_workers=2, drop_last=True)
    dl_val = torch.utils.data.DataLoader(ds_val, batch_size=batch_size, shuffle=False,
                                         num_workers=2, drop_last=True)
    return dl_train, dl_val
dl_train,dl_val = create_dataloaders(batch_size=1024)
### 2. Define the model
def create_net():
    net = nn.Sequential()
    net.add_module("conv1", nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3))
    net.add_module("pool1", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("conv2", nn.Conv2d(in_channels=512, out_channels=256, kernel_size=5))
    net.add_module("pool2", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("dropout", nn.Dropout2d(p=0.1))
    net.add_module("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1)))
    net.add_module("flatten", nn.Flatten())
    net.add_module("linear1", nn.Linear(256, 128))
    net.add_module("relu", nn.ReLU())
    net.add_module("linear2", nn.Linear(128, 10))
    return net
net = create_net()
### 3. Train the model
loss_fn = nn.CrossEntropyLoss()
metrics_dict = {'acc':torchmetrics.Accuracy(task='multiclass',num_classes=10)}
optimizer = torch.optim.AdamW(params=net.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optimizer, T_0=5)
model = KerasModel(net,loss_fn,metrics_dict,optimizer,lr_scheduler)
dfhistory = model.fit(train_data=dl_train,
                      val_data=dl_val,
                      epochs=5,
                      ckpt_path='checkpoint.pt',
                      patience=2,
                      monitor='val_acc',
                      mode='max',
                      mixed_precision='no')
### 4. Evaluate the model
model.net.load_state_dict(torch.load('checkpoint.pt'))
print(model.evaluate(dl_val))
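metrics_dict is not limited to torchmetrics built-ins: anything callable per step on (preds, targets) that also offers compute() and reset() fits the StepRunner/EpochRunner contract shown earlier. A sketch of a custom metric built on torchmetrics.Metric; the TopKAccuracy class and its parameters are my own illustration, not part of torchkeras:

import torch
import torchmetrics

class TopKAccuracy(torchmetrics.Metric):
    # hypothetical metric: fraction of samples whose true label is among the k largest logits
    def __init__(self, k=3):
        super().__init__()
        self.k = k
        self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds, targets):
        topk = preds.topk(self.k, dim=-1).indices               # (N, k) indices of the k largest logits
        self.correct += (topk == targets.unsqueeze(-1)).any(dim=-1).sum()
        self.total += targets.numel()

    def compute(self):
        return self.correct.float() / self.total

metrics_dict = {'acc': torchmetrics.Accuracy(task='multiclass', num_classes=10),
                'top3_acc': TopKAccuracy(k=3)}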
3. Train your PyTorch model with multi-GPU DDP mode
In Kaggle, open Settings on the right and set ACCELERATOR to GPU T4 x2.
1. Set up the config
import os
from accelerate.utils import write_basic_config
write_basic_config() # Write a config file
os._exit(0) # Restart the notebook to reload info from the latest config file
# %load /root/.cache/huggingface/accelerate/default_config.yaml
{
    "compute_environment": "LOCAL_MACHINE",
    "deepspeed_config": {},
    "distributed_type": "MULTI_GPU",
    "downcast_bf16": false,
    "fsdp_config": {},
    "machine_rank": 0,
    "main_process_ip": null,
    "main_process_port": null,
    "main_training_function": "main",
    "mixed_precision": "no",
    "num_machines": 1,
    "num_processes": 2,
    "use_cpu": false
}
# Alternatively, answer a few questions interactively to create the config:
#!accelerate config
2. Training code
In our example, one epoch in two-GPU DDP mode takes about 12s. accelerate shards the dataloader across processes, so each of the two workers sees roughly half of the 58 MNIST batches per epoch, which is where the speedup comes from.
import torchvision
from torchvision import transforms
from torch import nn
import torch
import torchmetrics
from accelerate import notebook_launcher
from torchkeras import KerasModel
### 1. Prepare the data
def create_dataloaders(batch_size=1024):
    transform = transforms.Compose([transforms.ToTensor()])
    ds_train = torchvision.datasets.MNIST(root="./mnist/", train=True, download=True, transform=transform)
    ds_val = torchvision.datasets.MNIST(root="./mnist/", train=False, download=True, transform=transform)
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True,
                                           num_workers=2, drop_last=True)
    dl_val = torch.utils.data.DataLoader(ds_val, batch_size=batch_size, shuffle=False,
                                         num_workers=2, drop_last=True)
    return dl_train, dl_val
dl_train,dl_val = create_dataloaders(batch_size=1024)
### 2. Define the model
def create_net():
    net = nn.Sequential()
    net.add_module("conv1", nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3))
    net.add_module("pool1", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("conv2", nn.Conv2d(in_channels=512, out_channels=256, kernel_size=5))
    net.add_module("pool2", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("dropout", nn.Dropout2d(p=0.1))
    net.add_module("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1)))
    net.add_module("flatten", nn.Flatten())
    net.add_module("linear1", nn.Linear(256, 128))
    net.add_module("relu", nn.ReLU())
    net.add_module("linear2", nn.Linear(128, 10))
    return net
net = create_net()
### 3. Train the model
loss_fn = nn.CrossEntropyLoss()
metrics_dict = {'acc':torchmetrics.Accuracy(task='multiclass',num_classes=10)}
optimizer = torch.optim.AdamW(params=net.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optimizer, T_0=5)
model = KerasModel(net,loss_fn,metrics_dict,optimizer,lr_scheduler)
ckpt_path = 'checkpoint.pt'
# fit's positional arguments in order; dicts preserve insertion order,
# so .values() lines up with fit's signature
args = dict(train_data=dl_train,
            val_data=dl_val,
            epochs=5,
            ckpt_path=ckpt_path,
            patience=2,
            monitor='val_acc',
            mode='max',
            mixed_precision='no').values()
notebook_launcher(model.fit, args, num_processes=2)
### 4. Evaluate the model
model.net.load_state_dict(torch.load('checkpoint.pt'))
print(model.evaluate(dl_val))
4. Accelerate your PyTorch model with a TPU
In Kaggle, open Settings on the right and set ACCELERATOR to TPU v3-8.
1. Install torch_xla
# Install torch_xla support
!pip uninstall -y torch torch_xla
!pip install torch==1.8.2+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl
# Install the latest accelerate from GitHub
!pip install git+https://github.com/huggingface/accelerate
!pip install -U torchkeras
!pip install -U torchmetrics
# Check that torch_xla was installed successfully
import torch_xla
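If the import succeeds, you can also ask torch_xla for a device handle, a quick sanity check via its xla_model API (output format may vary across torch_xla versions):

import torch_xla.core.xla_model as xm
print(xm.xla_device())  # e.g. xla:1 when a TPU core is reachable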
2. Training code
torchmetrics does not play well with TPUs, so we drop metrics_dict for this run and monitor val_loss instead. If you do want a metric on a TPU, any nn.Module that follows the protocol used by StepRunner and EpochRunner will do; a minimal sketch follows.
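A torchmetrics-free accuracy written as a plain nn.Module, so it still fits KerasModel's ModuleDict and device handling; the class below is my sketch, not part of torchkeras:

import torch
from torch import nn

class Accuracy(nn.Module):
    # sketch: buffers hold running counts; forward() returns batch accuracy,
    # compute() the epoch value, reset() clears state (the protocol EpochRunner expects)
    def __init__(self):
        super().__init__()
        self.register_buffer("correct", torch.tensor(0.0))
        self.register_buffer("total", torch.tensor(0.0))

    def forward(self, preds, targets):
        matches = (preds.argmax(dim=-1) == targets).float()
        self.correct += matches.sum()
        self.total += matches.numel()
        return matches.mean()

    def compute(self):
        return self.correct / self.total

    def reset(self):
        self.correct.zero_()
        self.total.zero_()

# usage: model = KerasModel(net, loss_fn, {'acc': Accuracy()}, optimizer, lr_scheduler)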
import torch
from torch import nn
import torchvision
from torchvision import transforms
from accelerate import notebook_launcher
from torchkeras import KerasModel
### 1. Prepare the data
def create_dataloaders(batch_size=1024):
    transform = transforms.Compose([transforms.ToTensor()])
    ds_train = torchvision.datasets.MNIST(root="./mnist/", train=True, download=True, transform=transform)
    ds_val = torchvision.datasets.MNIST(root="./mnist/", train=False, download=True, transform=transform)
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True,
                                           num_workers=2, drop_last=True)
    dl_val = torch.utils.data.DataLoader(ds_val, batch_size=batch_size, shuffle=False,
                                         num_workers=2, drop_last=True)
    return dl_train, dl_val
dl_train,dl_val = create_dataloaders(batch_size=1024)
### 2. Define the model
def create_net():
    net = nn.Sequential()
    net.add_module("conv1", nn.Conv2d(in_channels=1, out_channels=512, kernel_size=3))
    net.add_module("pool1", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("conv2", nn.Conv2d(in_channels=512, out_channels=256, kernel_size=5))
    net.add_module("pool2", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("dropout", nn.Dropout2d(p=0.1))
    net.add_module("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1)))
    net.add_module("flatten", nn.Flatten())
    net.add_module("linear1", nn.Linear(256, 128))
    net.add_module("relu", nn.ReLU())
    net.add_module("linear2", nn.Linear(128, 10))
    return net
net = create_net()
### 3. Train the model
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=net.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optimizer, T_0=5)
model = KerasModel(net, loss_fn, None, optimizer, lr_scheduler)

ckpt_path = 'checkpoint.pt'
# fit's positional arguments in order, as in the DDP example
args = dict(train_data=dl_train,
            val_data=dl_val,
            epochs=5,
            ckpt_path=ckpt_path,
            patience=2,
            monitor='val_loss',
            mode='min',
            mixed_precision='no').values()
notebook_launcher(model.fit, args, num_processes=8)
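As in the previous sections, once training finishes you can load the best checkpoint and evaluate; this simply mirrors the earlier examples rather than anything TPU-specific:

model.net.load_state_dict(torch.load(ckpt_path))
print(model.evaluate(dl_val))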