可以用“watch -n 0.1 nvidia-smi”来查看gpu状态,我用的是3块12G的GPU进行实验
本实验将使用一个简单的瞎写的网络进行,网络训练一个分类任务,当然这个不重要,我们也不关心效果,这里希望用一个简单的网络来说明如何使用GPU训练,这个网络是可以直接跑起来的,xdm可以动手尝试一下
在第0部分是CPU上训练的代码,第一部分使用了单张GPU,第二部分是单机多卡的任务
目录
0、CPU代码
#样例 准备数据,加载数据,准备模型,设置损失函数,设置优化器,开始训练,最后验证,结果聚合展示
import torch
import torchvision
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn
#搭建神经网络
class MyModule(nn.Module):
def __init__(self):
super(MyModule, self).__init__()
self.model1=Sequential(
nn.Conv2d(3,32,5,1,2),
nn.Conv2d(32,64,5,1,2),
nn.Conv2d(64,512,5,1,2),
nn.MaxPool2d(2),
nn.Conv2d(512, 1024, 5, 1, 2),
nn.MaxPool2d(2),
nn.Conv2d(1024, 2048, 5, 1, 2),
nn.Conv2d(2048, 4096, 5, 1, 2),
nn.MaxPool2d(2)
)
self.model2=Sequential(
nn.Flatten(),
nn.Linear(4096*4*4,8000),
nn.Linear(8000,64),
nn.Linear(64,10)
)
def forward(self, x):
x=self.model1(x)
x=self.model2(x)
return x
if __name__ == "__main__":
    # --- Data: download the CIFAR-10 training split and wrap in a DataLoader ---
    train_data = torchvision.datasets.CIFAR10(
        root="./DataSet",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    train_data_size = len(train_data)
    print("训练数据集的长度为{}".format(train_data_size))  # 50000 samples
    train_dataloader = DataLoader(train_data, batch_size=64)

    # --- Model, loss, optimizer ---
    myModule = MyModule()
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)

    # Global step counters and number of passes over the dataset.
    train_step = 0
    test_step = 0
    epoch = 10

    # --- Training loop ---
    for i in range(epoch):
        print("------第 {} 轮训练开始------".format(i + 1))
        myModule.train()
        for imgs, targets in train_dataloader:
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            # Standard update: clear stale grads, backprop, apply step.
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step += 1
            if train_step % 10 == 0:
                # .item() converts the 0-dim loss tensor into a Python float.
                print("训练次数:{},Loss:{}".format(train_step, loss.item()))
        # Persist the weights after every epoch.
        torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
        print("模型已保存")
1、单机单卡
使用的函数:
#1、判断GPU是否可用:
torch.cuda.is_available()
#2、使用0号GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"
#3、数据拷贝到GPU
model.cuda()#不用赋值
data = data.cuda()
#4、模型保存与加载
torch.save
torch.load(file_name, map_location=torch.device("cuda") 或 torch.device("cpu"))
在任务1的基础上改变的代码均使用注释标记了
#样例 准备数据,加载数据,准备模型,设置损失函数,设置优化器,开始训练,最后验证,结果聚合展示
import torch
import torchvision
import os
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn
class MyModule(nn.Module):
def __init__(self):
super(MyModule, self).__init__()
self.model1=Sequential(
nn.Conv2d(3,32,5,1,2),
nn.Conv2d(32,64,5,1,2),
nn.Conv2d(64,512,5,1,2),
nn.MaxPool2d(2),
nn.Conv2d(512, 1024, 5, 1, 2),
nn.MaxPool2d(2),
nn.Conv2d(1024, 2048, 5, 1, 2),
nn.Conv2d(2048, 4096, 5, 1, 2),
nn.MaxPool2d(2)
)
self.model2=Sequential(
nn.Flatten(),
nn.Linear(4096*4*4,5000),
nn.Linear(5000,64),
nn.Linear(64,10)
)
def forward(self, x):
x=self.model1(x)
x=self.model2(x)
return x
if __name__ == "__main__":
    # FIX: restrict this process to GPU 0 BEFORE the first CUDA API call.
    # torch.cuda.is_available() touches CUDA state, so setting
    # CUDA_VISIBLE_DEVICES after it (as the previous revision did) can
    # silently have no effect.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # Fail fast when no GPU is usable: this script has no CPU fallback.
    if torch.cuda.is_available():
        print("Use one GPU")
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    # CIFAR-10 training split (50000 images, 10 classes).
    train_data = torchvision.datasets.CIFAR10(
        root="./DataSet",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    train_data_size = len(train_data)
    print("训练数据集的长度为{}".format(train_data_size))  # 50000
    train_dataloader = DataLoader(train_data, batch_size=64)

    myModule = MyModule()
    # Move model parameters onto the GPU. nn.Module.cuda() works in place,
    # so no reassignment is needed (unlike tensors below).
    myModule.cuda()
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10
    for i in range(epoch):
        print("------第 {} 轮训练开始------".format(i + 1))
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Tensor.cuda() returns a copy — reassignment is mandatory here.
            imgs = imgs.cuda()
            targets = targets.cuda()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step = train_step + 1
            if train_step % 1 == 0:  # log every step
                print("训练次数:{},Loss:{}".format(train_step, loss.item()))
        # Saved tensors live on the GPU; pass map_location when loading on
        # a CPU-only machine.
        torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
        print("模型已保存")
2、单机多卡
这里有两种方案:一种是使用 torch.nn.DataParallel,改动的代码量很少,但训练速度慢;另一种是 torch.nn.parallel.DistributedDataParallel,改动比较大,但多进程并行的效率会高很多。
注意这边的小坑:后面采取的cuda标号都是按os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"中的顺序
2.1 DataParallel(DP)(不建议用)
锐评:改是真的好改,慢是真的慢
#将模型放到多个GPU上
#只用改一句
model = nn.DataParallel(model.cuda(), device_ids=[0, 1, 2])
代码:
import torch
import torchvision
import os
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn
class MyModule(nn.Module):
def __init__(self):
super(MyModule, self).__init__()
self.model1=Sequential(
nn.Conv2d(3,32,5,1,2),
nn.Conv2d(32,64,5,1,2),
nn.Conv2d(64,512,5,1,2),
nn.MaxPool2d(2),
nn.Conv2d(512, 1024, 5, 1, 2),
nn.MaxPool2d(2),
nn.Conv2d(1024, 2048, 5, 1, 2),
nn.Conv2d(2048, 4096, 5, 1, 2),
nn.MaxPool2d(2)
)
self.model2=Sequential(
nn.Flatten(),
nn.Linear(4096*4*4,5000),
nn.Linear(5000,64),
nn.Linear(64,10)
)
def forward(self, x):
x=self.model1(x)
x=self.model2(x)
return x
if __name__ == "__main__":
    # FIX: expose GPUs 0-2 to this process BEFORE the first CUDA API call.
    # torch.cuda.is_available()/device_count() touch CUDA state, so setting
    # CUDA_VISIBLE_DEVICES after them (as the previous revision did) can
    # silently have no effect.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
    if torch.cuda.is_available():
        print(torch.cuda.device_count())
        print("Use GPU")
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    # CIFAR-10 training split (50000 images, 10 classes).
    train_data = torchvision.datasets.CIFAR10(
        root="./DataSet",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    train_data_size = len(train_data)
    print("训练数据集的长度为{}".format(train_data_size))  # 50000
    train_dataloader = DataLoader(train_data, batch_size=64)

    myModule = MyModule()
    # DataParallel splits each batch across the listed devices; device ids
    # are indices into CUDA_VISIBLE_DEVICES.
    myModule = nn.DataParallel(myModule.cuda(), device_ids=[0, 1, 2])
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10
    for i in range(epoch):
        print("------第 {} 轮训练开始------".format(i + 1))
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Inputs go to the default device; DataParallel scatters them.
            imgs = imgs.cuda()
            targets = targets.cuda()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step = train_step + 1
            if train_step % 1 == 0:  # log every step
                print("训练次数:{},Loss:{}".format(train_step, loss.item()))
        # FIX: save the wrapped module's state_dict, not the DataParallel
        # wrapper's. The wrapper prefixes every key with "module.", which
        # would not load into a plain MyModule (e.g. the single-GPU script).
        torch.save(myModule.module.state_dict(), "myModule_{}.pth".format(i))
        print("模型已保存")
2.2 DistributedDataParallel(DDP)
这个方法的最大不同在于gpu自动分配为args.local_rank
注意模型文件只在args.local_rank == 0时保存就可以
注意启动方式特殊
python -m torch.distributed.launch --nproc_per_node=n_gpus test.py
#执行命令n_gpus 是gpu数目,torch.distributed.launch自动分配从0到n_gpus-1
python -m torch.distributed.launch --nproc_per_node=n_gpus test.py
在写代码时要用parser来接住'--local_rank'
#初始化:
torch.distributed.init_process_group("nccl", world_size = n_gpus, rank = args.local_rank)
#参数分别是:gpu通信方式,gpu数量,一个环境变量
#
torch.cuda.set_device(args.local_rank)
#模型载入
model = nn.parallel.DistributedDataParallel(model.cuda(args.local_rank), device_ids=[args.local_rank])
#数据集操作
#分配数据集
train_sampler = DistributedSampler(train_dataset)
#为增加随机性,注意在每个 epoch 开始训练之前需要调用(传入当前 epoch 编号)
train_sampler.set_epoch(epoch)
#DataLoader中传入sampler,注意sampler和shuffle互斥
train_dataloader = DataLoader(..., sampler=train_sampler)
#数据拷贝到相应的卡上
data = data.cuda(args.local_rank)
代码:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import torch
import torchvision
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler#导入包
from torch import nn
import argparse
import time
class MyModule(nn.Module):
def __init__(self):
super(MyModule, self).__init__()
self.model1=Sequential(
nn.Conv2d(3,32,5,1,2),
nn.Conv2d(32,64,5,1,2),
nn.Conv2d(64,512,5,1,2),
nn.MaxPool2d(2),
nn.Conv2d(512, 1024, 5, 1, 2),
nn.MaxPool2d(2),
nn.Conv2d(1024, 2048, 5, 1, 2),
nn.Conv2d(2048, 4096, 5, 1, 2),
nn.MaxPool2d(2)
)
self.model2=Sequential(
nn.Flatten(),
nn.Linear(4096*4*4,5000),
nn.Linear(5000,64),
nn.Linear(64,10)
)
def forward(self, x):
x=self.model1(x)
x=self.model2(x)
return x
if __name__ == "__main__":
    # --local_rank is injected by `torch.distributed.launch`: one process is
    # spawned per GPU, each receiving its own rank on this node.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", help="local device id on current node", type=int)
    args = parser.parse_args()
    if torch.cuda.is_available():
        print(torch.cuda.device_count())
        print("Use GPU")
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    # Join the process group: NCCL backend, one rank per visible GPU.
    n_gpus = 2
    torch.distributed.init_process_group("nccl", world_size=n_gpus, rank=args.local_rank)
    # Make this rank's GPU the default CUDA device for this process.
    torch.cuda.set_device(args.local_rank)

    train_data = torchvision.datasets.CIFAR10(
        root="./DataSet",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    train_data_size = len(train_data)
    print("训练数据集的长度为{}".format(train_data_size))  # 50000
    # DistributedSampler hands each rank a disjoint shard; batch_size=64 is
    # per process, so the effective global batch is 64 * n_gpus.
    train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=64, sampler=train_sampler)

    myModule = MyModule()
    # Wrap in DDP on this rank's GPU. (FIX: the previous revision also called
    # myModule.cuda() on the line before — a redundant second device move.)
    myModule = nn.parallel.DistributedDataParallel(
        myModule.cuda(args.local_rank), device_ids=[args.local_rank]
    )
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10
    for i in range(epoch):
        print("------第 {} 轮训练开始------".format(i + 1))
        # FIX: pass the CURRENT epoch index, not the constant total.
        # set_epoch(i) reseeds the sampler so every epoch sees a different
        # shuffle; the old set_epoch(epoch) repeated the same order forever.
        train_sampler.set_epoch(i)
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Move the batch onto this rank's GPU (Tensor.cuda returns a copy).
            imgs = imgs.cuda(args.local_rank)
            targets = targets.cuda(args.local_rank)
            starttime = time.time()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()  # DDP all-reduces gradients across ranks here
            optimize.step()
            endtime = time.time()
            train_step = train_step + 1
            if train_step % 1 == 0:  # log every step
                print("训练次数:{},Loss:{},time:{}".format(train_step, loss.item(), endtime - starttime))
        # Only rank 0 writes checkpoints — all ranks hold identical weights.
        if args.local_rank == 0:
            # FIX: save the unwrapped module so keys carry no "module."
            # prefix and the checkpoint loads into a plain MyModule.
            torch.save(myModule.module.state_dict(), "myModule_{}.pth".format(i))
            print("模型已保存")