『Python, Platforms & Frameworks』
Video tutorials are available on bilibili (B站): search for 晓A技术文档.
Research Unit of Machine Learning Application
Goal: use multiple GPUs to speed up model training (code included).
Reason: a deep learning model can sometimes exceed the memory of a single GPU.
Comparing the running time of the different execution modes gives the following results and conclusions:
(1) Data parallelism speeds up ordinary training.
(2) Sometimes the model is too large to run on a single GPU at all; model parallelism then makes training possible. It takes longer, but when the model does not fit there is no alternative.
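One caveat when reproducing such timings: CUDA kernels execute asynchronously, so a plain time.time() difference (as used in the listings below) can under-report the real GPU work. A minimal sketch of a more careful measurement; the helper name timed is our own, not part of the original code:

import time
import torch

def timed(fn, *args):
    # flush pending kernels before and after the call, so the wall-clock
    # difference reflects the actual GPU time of fn
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    result = fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return result, time.time() - start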
Example code for ordinary (single-device) training:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: a deep stack of 3x3 convolutions whose channel
        # count grows from 3 to 1024 and then shrinks back to 3
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 1
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
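Before moving on to the multi-GPU variants, it can help to confirm which cards PyTorch actually sees; the set of visible cards can also be restricted with the CUDA_VISIBLE_DEVICES environment variable. A small check of our own, not part of the original listing:

import torch

if torch.cuda.is_available():
    # list every GPU visible to this process
    for i in range(torch.cuda.device_count()):
        print(i, torch.cuda.get_device_name(i))
else:
    print('no CUDA device visible')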
Example code for data parallelism:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # network definition: identical to the single-GPU version
        self.network = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.network(x)


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model
    model = GreatNetwork()
    model.to(default_device).type(default_type)
    # key difference: wrap the model in nn.DataParallel, which replicates it
    # on every visible GPU and splits each incoming batch across the replicas
    model = nn.DataParallel(model)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 2  # one sample per GPU when two cards are visible
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
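nn.DataParallel replicates the module on every visible GPU and splits the input along the batch dimension, which is why batch_size is raised to 2 here (one sample per card with two GPUs). A minimal sketch that makes the split visible; the ShapeProbe module is ours, purely for illustration:

import torch
from torch import nn

class ShapeProbe(nn.Module):
    # each GPU replica prints the slice of the batch it receives
    def forward(self, x):
        print('device %s got %d sample(s)' % (x.device, x.shape[0]))
        return x

if torch.cuda.device_count() >= 2:
    probe = nn.DataParallel(ShapeProbe().to('cuda:0'))
    # a batch of 4 should be scattered as 2 samples per replica on 2 GPUs
    probe(torch.randn(4, 3, 8, 8).to('cuda:0'))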
Example code for model parallelism:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Lou Xiao(louxiao@tea.ac.cn)"
__copyright__ = "Copyright 2018~2020"
__created_time__ = '2020-05-30 21:07:00 CST'

import time

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader


class GreatNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # the network is split into two sequential stages so that each half
        # can be placed on its own GPU
        self.network1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        self.network2 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(1024, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        )
        if torch.cuda.device_count() >= 2:
            # model parallelism: stage 1 lives on the first GPU, stage 2 on the second
            self.network1.to(device=torch.device('cuda:0'))
            self.network2.to(device=torch.device('cuda:1'))

    def forward(self, x):
        if torch.cuda.device_count() >= 2:
            # move the activations between the two GPUs by hand
            x = x.to(device=torch.device('cuda:0'))
            x = self.network1(x)
            x = x.to(device=torch.device('cuda:1'))
            x = self.network2(x)
            x = x.to(device=torch.device('cuda:0'))
            return x
        else:
            x = self.network1(x)
            x = self.network2(x)
            return x


class FakeDataset(Dataset):
    # synthetic dataset of random 3x512x512 images, used only for benchmarking

    def __init__(self):
        super().__init__()
        self.count = 20000

    def __len__(self):
        return self.count

    def __getitem__(self, index):
        image = torch.randn(3, 512, 512)
        return image


def main():
    default_device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("running at device %s" % default_device)
    default_type = torch.float32

    # init model; its two stages were already placed on their GPUs in __init__,
    # so only the dtype is set here
    model = GreatNetwork()
    model.type(default_type)
    if torch.cuda.device_count() < 2:
        # fall back to a single device when two GPUs are not available
        model.to(default_device)

    loss_function = nn.MSELoss()
    loss_function.to(default_device).type(default_type)
    optimizer = Adam(model.parameters(), lr=0.0001)

    batch_size = 2
    ds = DataLoader(FakeDataset(), batch_size=batch_size, shuffle=True, drop_last=True, num_workers=5)

    position = 0
    for epoch in range(20):
        for image in ds:
            position += 1
            timestamp = time.time()
            image = image.to(default_device).type(default_type)
            optimizer.zero_grad()
            image_hat = model(image)
            # forward() returns its result on cuda:0 (= default_device),
            # so the loss can be computed there directly
            loss = loss_function(image_hat, image)  # (prediction, target)
            loss.backward()
            optimizer.step()
            print('TRAIN[%010d] Time: %10.4fs' % (position, time.time() - timestamp))


if __name__ == '__main__':
    main()
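In the listing above the two GPUs work strictly in sequence: cuda:1 is idle while cuda:0 runs network1, and vice versa, which is part of why model parallelism is slower. A common refinement, shown for instance in the official PyTorch model-parallel tutorial, is to split each batch into micro-batches and pipeline them through the two stages so both GPUs work at once. A sketch on top of the GreatNetwork above; the class name PipelinedGreatNetwork is ours, and it assumes at least two GPUs and batch_size > split_size:

class PipelinedGreatNetwork(GreatNetwork):

    def __init__(self, split_size=1):
        super().__init__()
        self.split_size = split_size

    def forward(self, x):
        # feed micro-batches so that stage 1 (cuda:0) and stage 2 (cuda:1)
        # process different micro-batches at the same time
        splits = iter(x.split(self.split_size, dim=0))
        s_prev = self.network1(next(splits).to('cuda:0')).to('cuda:1')
        outputs = []
        for s_next in splits:
            outputs.append(self.network2(s_prev))                      # stage 2 on cuda:1
            s_prev = self.network1(s_next.to('cuda:0')).to('cuda:1')   # stage 1 on cuda:0
        outputs.append(self.network2(s_prev))
        return torch.cat(outputs).to('cuda:0')

Used as a drop-in replacement for GreatNetwork in the training loop above, this overlaps the two stages without changing the gradients; the best split_size is a trade-off between overlap and per-kernel efficiency and is worth measuring.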