# Demo: gluon.utils.split_and_load fails when a batch holds fewer samples
# than there are target devices, plus a workaround.
#
# The dataset has 3 rows; with batch_size=2 and last_batch='keep' the loader
# yields one batch of 2 rows followed by one batch of 1 row. Two GPU contexts
# are requested, so the final batch has fewer rows than devices.
import numpy as np
from mxnet import gluon, npx, nd

samples = np.arange(15).reshape(3, 5)
dataloader = gluon.data.DataLoader(
    samples,
    batch_size=2,
    shuffle=False,
    last_batch='keep',
)
devices = [npx.gpu(0), npx.gpu(1)]

# Step 1: reproduce the failure by always splitting across every device.
for batch in dataloader:
    try:
        shards = gluon.utils.split_and_load(batch, devices, even_split=False)
        # Printing stays inside the try so that an exception raised while
        # printing is reported through the same 'error0' path.
        print('split: ', shards)
    except Exception as e:
        print('error0: ', e)

# Step 2: per the original author's note, the problem originates from an
# error raised by nd.slice_axis. Requesting a slice with begin == end
# triggers it directly.
x = nd.array([[1., 2., 3., 4.],
              [5., 6., 7., 8.],
              [9., 10., 11., 12.]])
try:
    sliced = nd.slice_axis(x, axis=0, begin=1, end=1)
    print(sliced)
except Exception as e:
    print('error1: ', e)

# Root cause (original author's note): multi-GPU training splits each batch
# across the devices; when a batch does not hold enough samples for all GPUs,
# use only as many GPUs as there are samples.
#
# Step 3: workaround - trim the device list to the batch size when needed.
for batch in dataloader:
    n_rows = len(batch)
    if n_rows < len(devices):
        # Short batch: hand out one row per device, using only the first
        # n_rows devices.
        shards = gluon.utils.split_and_load(
            batch, devices[:n_rows], even_split=False
        )
        print('split1: ', shards)
    else:
        shards = gluon.utils.split_and_load(
            batch, devices, even_split=False
        )
        print('split0: ', shards)
结果:
split: [
[[0 1 2 3 4]]
<NDArray 1x5 @gpu(0)>,
[[5 6 7 8 9]]
<NDArray 1x5 @gpu(1)>]
error0: Traceback (most recent call last):
File "src/operator/tensor/./matrix_op-inl.h", line 1296
MXNetError: Check failed: (*begin < *end): Invalid begin, end, get begin=1, end=1
error1: Traceback (most recent call last):
File "src/operator/tensor/./matrix_op-inl.h", line 1296
MXNetError: Check failed: (*begin < *end): Invalid begin, end, get begin=1, end=1
split0: [
[[0 1 2 3 4]]
<NDArray 1x5 @gpu(0)>,
[[5 6 7 8 9]]
<NDArray 1x5 @gpu(1)>]
split1: [
[[10 11 12 13 14]]
<NDArray 1x5 @gpu(0)>]