1. 多stream下的代码如下:
from timeit import default_timer as timer
import sys
import torch
class ProfilerTest(torch.nn.Module):
    """Two-branch conv network used to compare single-stream vs.
    multi-stream (concurrent) CUDA kernel execution.

    Branch A: conv2 -> conv5 (runs on the current stream).
    Branch B: conv1 -> conv3 -> conv4 (optionally runs on a side stream).
    """

    def __init__(self):
        super(ProfilerTest, self).__init__()
        # Branch A layers.
        self.conv2 = torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=1, stride=1, padding=0)
        self.conv5 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        # Branch B layers.
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=1, stride=1, padding=0)
        self.conv4 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        # Side stream on which branch B runs when multi_stream is truthy.
        self.stream_conv1 = torch.cuda.Stream()

    def forward(self, x, multi_stream):
        """Run both branches and return their elementwise sum.

        x: input tensor; the conv shapes assume (N, 3, H, W) on the CUDA device.
        multi_stream: if truthy, launch branch B on the side stream so its
            kernels can execute concurrently with branch A's.
        """
        # Branch A on the current (default) stream.
        o2 = self.conv2(x)
        o2 = self.conv5(o2)
        if multi_stream:
            # BUGFIX: the side stream must wait for work already queued on
            # the current stream (e.g. whatever produced `x`) before it may
            # safely read `x`; without this the two streams race.
            self.stream_conv1.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(self.stream_conv1):
                o1 = self.conv1(x)
                o1 = self.conv3(o1)
                o1 = self.conv4(o1)
            # Current stream must wait for branch B before the add below
            # consumes o1.
            torch.cuda.current_stream().wait_stream(self.stream_conv1)
        else:
            o1 = self.conv1(x)
            o1 = self.conv3(o1)
            o1 = self.conv4(o1)
        return o1 + o2
# BUGFIX: sys.argv[1] is a string, and any non-empty string (including
# "False") is truthy — the original passed it straight to forward(), so the
# single-stream path could never be selected from the command line.
multi_stream = len(sys.argv) > 1 and sys.argv[1].strip().lower() in ("true", "1")

device = torch.device("cuda:0")
model = ProfilerTest().to(device)
inp = torch.randn(1, 3, 224, 224).to(device)

# Drain any pending work (weight/input transfers) before starting the timer.
torch.cuda.current_stream().synchronize()
start = timer()
for i in range(10):
    output = model(inp, multi_stream)
# BUGFIX: CUDA kernel launches are asynchronous; without synchronizing here
# the timer stops before the GPU has actually finished the work, so the
# reported time would only cover launch overhead.
torch.cuda.synchronize()
end = timer()
print("using time: %.1f ms" % (1000 * (end - start)))
2. 测试结果:
3. nsight下的行为观察:
profiler脚本如下:
nsys profile --trace=cuda,cudnn,cublas --stats=true python test_torch.py True
kernel 的执行行为如下（可以看到 multi-stream 模式下 kernel 的并行执行，即 concurrent kernel execution）: