Preface
In PyTorch, the common GPU training setups are single-machine single-GPU, single-machine multi-GPU, and multi-machine multi-GPU training.
I. Single-Machine, Single-GPU Training
Single-machine, single-GPU training is the most common setup. Compared with training on the CPU, the only extra work is moving the model and the data onto the CUDA device.
Steps:
1. Use torch.cuda.is_available() to check whether the machine has a CUDA device;
2. Move the model to the CUDA device: net.to(device);
3. Move the training and validation data to the CUDA device: images.to(device).
See my GitHub for the complete code.
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# AlexNet, train_loader, validate_loader and val_num are defined in the full script (see GitHub)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = AlexNet(num_classes=5, init_weights=True)
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.0002)

epochs = 10
save_path = './AlexNet.pth'
best_acc = 0.0
train_steps = len(train_loader)
for epoch in range(epochs):
    # train
    net.train()
    running_loss = 0.0
    train_bar = tqdm(train_loader, file=sys.stdout)
    for step, data in enumerate(train_bar):
        images, labels = data
        optimizer.zero_grad()
        outputs = net(images.to(device))
        loss = loss_function(outputs, labels.to(device))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(epoch + 1, epochs, loss.item())

    # validate
    net.eval()
    acc = 0.0  # accumulate accurate number / epoch
    with torch.no_grad():
        val_bar = tqdm(validate_loader, file=sys.stdout)
        for val_data in val_bar:
            val_images, val_labels = val_data
            outputs = net(val_images.to(device))
            predict_y = torch.max(outputs, dim=1)[1]
            acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

    val_accurate = acc / val_num
    print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
          (epoch + 1, running_loss / train_steps, val_accurate))

    if val_accurate > best_acc:
        best_acc = val_accurate
        torch.save(net.state_dict(), save_path)
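After training, the saved checkpoint can be loaded back for single-GPU inference. A minimal sketch reusing the device and save_path from above (model here is a fresh instance, and num_classes=5 matches the flower example):

# load the best weights for inference (sketch)
model = AlexNet(num_classes=5)
model.load_state_dict(torch.load(save_path, map_location=device))
model.to(device)
model.eval()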
II. Single-Machine, Multi-GPU Training
There are two ways to train on multiple GPUs within a single machine:
(1) torch.nn.DataParallel (DP mode): in short, one process drives all of the GPUs.
(2) torch.nn.parallel.DistributedDataParallel (DDP mode): multiple processes, each driving one GPU.
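The practical difference shows up in the wrapping call itself. A minimal sketch of the two wrappers, where net is an already-built model and local_rank is the GPU index owned by the current process (both are placeholders):

# DP: a single process replicates the model across the listed GPUs on every forward pass
net_dp = nn.DataParallel(net.cuda(), device_ids=[0, 1])

# DDP: one process per GPU; torch.distributed.init_process_group() must be called first
net_ddp = nn.parallel.DistributedDataParallel(net.cuda(local_rank), device_ids=[local_rank])

Both modes are shown with full training loops below.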
1. torch.nn.DataParallel
Because this approach trains in a single process (one main GPU scatters inputs and gathers outputs on every step), it scales poorly and is rarely used nowadays.
Steps:
1. Use torch.cuda.device_count() to check whether the machine has more than one GPU and therefore supports multi-GPU training;
2. Wrap the model with torch.nn.DataParallel(), passing device_ids to select the GPUs to train on;
3. Move the training data to the GPU with .cuda();
4. When saving the model, remember to save the unwrapped weights: torch.save(model.module.state_dict(), ...).
Code example (see my GitHub for the complete code):
net = AlexNet(num_classes=5, init_weights=True)
if torch.cuda.device_count() > 1:  # check that more than one GPU is available
    net = nn.DataParallel(net.cuda(), device_ids=[0, 1])  # wrap the model and select the GPUs

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.0002)

epochs = 10
save_path = './AlexNet.pth'
best_acc = 0.0
train_steps = len(train_loader)
for epoch in range(epochs):
    # train
    net.train()
    running_loss = 0.0
    train_bar = tqdm(train_loader, file=sys.stdout)
    for step, data in enumerate(train_bar):
        images, labels = data
        optimizer.zero_grad()
        outputs = net(images.cuda())  # move images and labels to the GPU
        loss = loss_function(outputs, labels.cuda())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(epoch + 1, epochs, loss.item())

    # validate
    net.eval()
    acc = 0.0  # accumulate accurate number / epoch
    with torch.no_grad():
        val_bar = tqdm(validate_loader, file=sys.stdout)
        for val_data in val_bar:
            val_images, val_labels = val_data
            outputs = net(val_images.cuda())
            predict_y = torch.max(outputs, dim=1)[1]
            acc += torch.eq(predict_y, val_labels.cuda()).sum().item()

    val_accurate = acc / val_num
    print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
          (epoch + 1, running_loss / train_steps, val_accurate))

    if val_accurate > best_acc:
        best_acc = val_accurate
        torch.save(net.module.state_dict(), save_path)  # save the unwrapped weights (step 4)
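The detail behind step 4: nn.DataParallel keeps the original network under its .module attribute, so saving net.state_dict() directly would prefix every weight key with "module.", and the checkpoint could then not be loaded into a plain, unwrapped model. A minimal save/load sketch:

# save the unwrapped weights (keys carry no "module." prefix)
torch.save(net.module.state_dict(), save_path)

# later: load into a plain single-GPU model
model = AlexNet(num_classes=5)
model.load_state_dict(torch.load(save_path, map_location='cpu'))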
2. torch.nn.parallel.DistributedDataParallel
DDP mode: the script is launched from the command line:
python -m torch.distributed.launch --nproc_per_node=n_gpus train.py
torch.distributed.launch spawns one process per GPU on the current node and passes each process its --local_rank argument, which the training script can read with the argparse module;
n_gpus is the number of GPUs to train with.
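Stripped of model and data specifics, the DDP boilerplate comes down to four steps. A minimal skeleton under the same launch command, where model and train_dataset are placeholders for your own objects:

import argparse
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)  # filled in by torch.distributed.launch
args = parser.parse_args()

torch.distributed.init_process_group("nccl")  # 1. join the process group (env:// init)
torch.cuda.set_device(args.local_rank)        # 2. bind this process to one GPU
model = nn.parallel.DistributedDataParallel(  # 3. wrap the model
    model.cuda(args.local_rank), device_ids=[args.local_rank])
train_sampler = DistributedSampler(train_dataset)  # 4. give each process its own data shard
train_loader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)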
The full example fills this skeleton in:
import json
import argparse
import sys
import os
import logging

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import transforms, datasets

logging.basicConfig(
    level=logging.WARN,
    stream=sys.stdout,
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
# model definition
class AlexNet(nn.Module):
    def __init__(self, num_classes=1000, init_weights=False):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=2),  # input[3, 224, 224]  output[48, 55, 55]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),                  # output[48, 27, 27]
            nn.Conv2d(48, 128, kernel_size=5, padding=2),           # output[128, 27, 27]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),                  # output[128, 13, 13]
            nn.Conv2d(128, 192, kernel_size=3, padding=1),          # output[192, 13, 13]
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 192, kernel_size=3, padding=1),          # output[192, 13, 13]
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 128, kernel_size=3, padding=1),          # output[128, 13, 13]
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),                  # output[128, 6, 6]
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(128 * 6 * 6, 2048),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(2048, 2048),
            nn.ReLU(inplace=True),
            nn.Linear(2048, num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, start_dim=1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
# train
def train(local_rank, train_dataset, eval_dataset, model, optimizer, num_epoch, log_step_interval,
          save_step_interval, eval_step_interval, save_path):
    """The data loaders here wrap map-style datasets."""
    batch_size = 32
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    model = nn.parallel.DistributedDataParallel(model.cuda(local_rank),
                                                device_ids=[local_rank])  # move the model to this GPU, then wrap it in DDP
    train_sampler = DistributedSampler(train_dataset)  # each process sees its own shard of the data
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler,
                                   num_workers=nw)
    validate_loader = DataLoader(eval_dataset, batch_size=8)
    val_num = len(eval_dataset)
    train_steps = len(train_data_loader)
    best_acc = 0.0
    for epoch_index in range(num_epoch):
        # train
        model.train()
        train_sampler.set_epoch(epoch_index)  # reshuffle so every GPU gets different data each epoch
        running_loss = 0.0
        for batch_index, (images, labels) in enumerate(train_data_loader):
            optimizer.zero_grad()
            images = images.cuda(local_rank)  # move the batch to this process's GPU
            labels = labels.cuda(local_rank)
            outputs = model(images)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()

        # validate
        model.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = model(val_images.cuda(local_rank))
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.cuda(local_rank)).sum().item()

        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch_index + 1, running_loss / train_steps, val_accurate))

        if val_accurate > best_acc:
            best_acc = val_accurate
            if local_rank == 0:  # save from rank 0 only, and save the unwrapped weights
                torch.save(model.module.state_dict(), save_path)
# entry point
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", help="local device id on current node", type=int)
    args = parser.parse_args()

    if torch.cuda.is_available():
        logging.warning("Cuda is available!")
        if torch.cuda.device_count() > 1:
            logging.warning(f"Found {torch.cuda.device_count()} GPUs!")
        else:
            logging.warning("Too few GPUs!")
    else:
        logging.warning("Cuda is not available! Exit!")
        sys.exit(1)

    n_gpus = 2  # must match --nproc_per_node in the launch command
    torch.distributed.init_process_group("nccl", world_size=n_gpus, rank=args.local_rank)
    torch.cuda.set_device(args.local_rank)  # bind this process to its own GPU

    data_transform = {
        "train": transforms.Compose([transforms.RandomResizedCrop(224),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),
        "val": transforms.Compose([transforms.Resize((224, 224)),
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])}

    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    image_path = os.path.join(data_root, "data_set", "flower_data")  # flower data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])

    model = AlexNet(num_classes=5, init_weights=True)  # 5 classes to match the flower data set
    print("total model parameters:", sum(p.numel() for p in model.parameters()))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    train(args.local_rank, train_dataset, validate_dataset, model, optimizer,
          num_epoch=10, log_step_interval=20, save_step_interval=500, eval_step_interval=300,
          save_path="./AlexNet.pth")  # save_path must be a file path, not a directory
Launch command:
python -m torch.distributed.launch --nproc_per_node=2 train.py
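Note: recent PyTorch releases deprecate torch.distributed.launch in favor of torchrun, which sets the LOCAL_RANK environment variable instead of passing a --local_rank argument. The equivalent launch command would be:
torchrun --nproc_per_node=2 train.py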