『Python, Platforms & Frameworks』
Video tutorials are available on bilibili (B站): search for 晓A技术文档.
Research Unit of Machine Learning Application
Goal: use multiple GPUs to speed up model training (code included).
Reason: a deep learning model can sometimes exceed the memory of a single GPU.
Comparing the running time of the different execution modes gives the following results and conclusions:
(1) Data parallelism speeds up ordinary training.
(2) Sometimes the model is too large to run on a single GPU at all; model parallelism then makes training possible. It takes longer, but when the model does not fit there is no alternative.
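One caveat when reproducing such timings: CUDA kernels execute asynchronously, so a plain time.time() difference (as used in the listings below) can under-report the real GPU work. A minimal sketch of a more careful measurement; the helper name timed is our own, not part of the original code:

import time
import torch

def timed(fn, *args):
    # flush pending kernels before and after the call, so the wall-clock
    # difference reflects the actual GPU time of fn
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    result = fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return result, time.time() - start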
Example code for ordinary (single-device) training:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: a deep stack of 3x3 convolutions whose channel
        # count grows from 3 to 1024 and then shrinks back to 3
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 1
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
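Before moving on to the multi-GPU variants, it can help to confirm which cards PyTorch actually sees; the set of visible cards can also be restricted with the CUDA_VISIBLE_DEVICES environment variable. A small check of our own, not part of the original listing:

import torch

if torch.cuda.is_available():
    # list every GPU visible to this process
    for i in range(torch.cuda.device_count()):
        print(i, torch.cuda.get_device_name(i))
else:
    print('no CUDA device visible')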
Example code for data parallelism:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: identical to the single-GPU version
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)
    # key difference: wrap the model in nn.DataParallel, which replicates it
    # on every visible GPU and splits each incoming batch across the replicas
    model = nn.DataParallel(model)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 2  # one sample per GPU when two cards are visible
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
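nn.DataParallel replicates the module on every visible GPU and splits the input along the batch dimension, which is why batch_size is raised to 2 here (one sample per card with two GPUs). A minimal sketch that makes the split visible; the ShapeProbe module is ours, purely for illustration:

import torch
from torch import nn

class ShapeProbe(nn.Module):
    # each GPU replica prints the slice of the batch it receives
    def forward(self, x):
        print('device %s got %d sample(s)' % (x.device, x.shape[0]))
        return x

if torch.cuda.device_count() >= 2:
    probe = nn.DataParallel(ShapeProbe().to('cuda:0'))
    # a batch of 4 should be scattered as 2 samples per replica on 2 GPUs
    probe(torch.randn(4, 3, 8, 8).to('cuda:0'))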
Example code for model parallelism:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # the network is split into two sequential stages so that each half
        # can be placed on its own GPU
        self.network1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        self.network2 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        if torch.cuda.device_count() >= 2:
            # model parallelism: stage 1 lives on the first GPU, stage 2 on the second
            self.network1.to(device=torch.device('cuda:0'))
            self.network2.to(device=torch.device('cuda:1'))

    def forward(self, x):
        if torch.cuda.device_count() >= 2:
            # move the activations between the two GPUs by hand
            x = x.to(device=torch.device('cuda:0'))
            x = self.network1(x)
            x = x.to(device=torch.device('cuda:1'))
            x = self.network2(x)
            x = x.to(device=torch.device('cuda:0'))
            return x
        else:
            x = self.network1(x)
            x = self.network2(x)
            return x


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model; its two stages were already placed on their GPUs in __init__,
    # so only the dtype is set here
    model = GreatNetwork()
    model.type(default_type)
    if torch.cuda.device_count() < 2:
        # fall back to a single device when two GPUs are not available
        model.to(default_device)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 2
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            # forward() returns its result on cuda:0 (= default_device),
            # so the loss can be computed there directly
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
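In the listing above the two GPUs work strictly in sequence: cuda:1 is idle while cuda:0 runs network1, and vice versa, which is part of why model parallelism is slower. A common refinement, shown for instance in the official PyTorch model-parallel tutorial, is to split each batch into micro-batches and pipeline them through the two stages so both GPUs work at once. A sketch on top of the GreatNetwork above; the class name PipelinedGreatNetwork is ours, and it assumes at least two GPUs and batch_size > split_size:

class PipelinedGreatNetwork(GreatNetwork):

    def __init__(self, split_size=1):
        super().__init__()
        self.split_size = split_size

    def forward(self, x):
        # feed micro-batches so that stage 1 (cuda:0) and stage 2 (cuda:1)
        # process different micro-batches at the same time
        splits = iter(x.split(self.split_size, dim=0))
        s_prev = self.network1(next(splits).to('cuda:0')).to('cuda:1')
        outputs = []
        for s_next in splits:
            outputs.append(self.network2(s_prev))                      # stage 2 on cuda:1
            s_prev = self.network1(s_next.to('cuda:0')).to('cuda:1')   # stage 1 on cuda:0
        outputs.append(self.network2(s_prev))
        return torch.cat(outputs).to('cuda:0')

Used as a drop-in replacement for GreatNetwork in the training loop above, this overlaps the two stages without changing the gradients; the best split_size is a trade-off between overlap and per-kernel efficiency and is worth measuring.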