CUDA error: invalid device ordinal错误解决
(***) ****@server-04:~/code/****/codes$ python eval.py
Evaluating: 0%| | 0/611 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/share3/home/****/code/myMANet/codes/eval.py", line 70, in <module>
evaluate(test_loader,model)
File "/share3/home/****/code/myMANet/codes/eval.py", line 57, in evaluate
img_pred = model(torch.cat((blur,event_inT),dim=1))
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 158, in forward
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 175, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter_kwargs
inputs = scatter(inputs, target_gpus, dim) if inputs else []
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 36, in scatter
res = scatter_map(inputs)
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 23, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 19, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/share3/home/****/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/share3/home/****an/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
错误原因及解决办法
由于在训练时使用DataParallel,且torch.save使用如下代码保存
def save_checkpoint(epoch, model, optimizer):
    """
    Save model checkpoint.

    WARNING (this is the buggy version the article is about): it pickles the
    whole model and optimizer objects. When the model is wrapped in
    nn.DataParallel, the wrapper's device_ids (GPU ordinals of the training
    machine) are serialized too, so loading the checkpoint on a machine with
    fewer/different GPUs raises "CUDA error: invalid device ordinal".

    :param epoch: epoch number
    :param model: model
    :param optimizer: optimizer
    """
    import torch
    # Stores the full objects, not state_dicts — see warning above.
    state = {'epoch': epoch,
             'model': model,
             'optimizer': optimizer}
    filename = 'checkpoint_myMANet_blur5.pth.tar'
    torch.save(state, filename)
在这里直接保存了整个模型对象(包括 DataParallel 包装器及其记录的 GPU 设备编号),换一台 GPU 数量或编号不同的机器加载时就会报 "invalid device ordinal"。应改为只保存模型的参数(state_dict)。注意:DataParallel 包装后应使用 model.module.state_dict();若直接用 model.state_dict(),所有键都会带上 "module." 前缀,加载到未包装的模型时键名对不上。相关链接:pytorch加载nn.DataParallel训练的模型出现的问题
更改为:
def save_checkpoint_dict(epoch, model, optimizer):
    """
    Save a checkpoint containing only state dicts (portable across machines).

    Saving state_dicts instead of whole objects avoids pickling the
    DataParallel wrapper and its GPU device ids — the cause of
    "CUDA error: invalid device ordinal" when loading on a different machine.

    Works for both a plain nn.Module and an nn.DataParallel-wrapped model:
    for the latter, the underlying module is unwrapped first so the saved
    keys do not carry the "module." prefix.

    :param epoch: epoch number
    :param model: model (nn.Module, optionally wrapped in nn.DataParallel)
    :param optimizer: optimizer
    """
    import torch
    # Unwrap DataParallel so state_dict keys are not prefixed with "module.";
    # a plain model is used as-is (the original code crashed here with
    # AttributeError because it unconditionally accessed model.module).
    net = model.module if isinstance(model, torch.nn.DataParallel) else model
    state = {'epoch': epoch,
             'model': net.state_dict(),
             'optimizer': optimizer.state_dict()}
    filename = 'checkpoint_dict_myMANet_blur5.pth.tar'
    torch.save(state, filename)
具体使用如下:
test.py
# test.py: evaluate using the state-dict checkpoint.
os.environ['CUDA_VISIBLE_DEVICES']="1" # choose GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = r'checkpoint_dict_myMANet_blur5.pth.tar'
# Load model checkpoint that is to be evaluated.
# map_location='cpu' keeps tensors off the GPU ordinal recorded at save time,
# which is what prevents "invalid device ordinal" on a different machine.
checkpoint = torch.load(checkpoint,map_location=torch.device('cpu'))
# Rebuild the network, then restore only its parameters.
model = MANet_s3(ev_lr_nc=7,in_nc=32,out_nc=1,pca_path=r'./pca_matrix_aniso21_15_x4.pth')
model.load_state_dict(checkpoint['model'])
# Used only when the checkpoint was saved with torch.save(model) (whole object):
#model = checkpoint['model']
model = model.to(device)
# Switch to eval mode
model.eval()
train.py(未验证是否可行)
# train.py: build the model/optimizer first, then optionally resume.
model = MANet_s3(ev_lr_nc=7,in_nc=32,out_nc=1,pca_path=r'/share3/home/****/code/myMANet/codes/pca_matrix_aniso21_15_x4.pth')
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
if checkpoint is None:
    start_epoch = 0
    #model = MANet_s3(ev_lr_nc=7,in_nc=32,out_nc=1,pca_path=r'/share3/home/z***an/code/myMANet/codes/pca_matrix_aniso21_15_x4.pth')
    #optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
else:
    # map_location='cpu' so restored tensors are not pinned to the GPU ids
    # of the machine the checkpoint was saved on.
    checkpoint = torch.load(checkpoint,map_location=torch.device('cpu'))
    start_epoch = checkpoint['epoch'] + 1
    print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
    # Load state dicts into the *unwrapped* model, before DataParallel wrapping,
    # so the key names match (no "module." prefix).
    model.load_state_dict(checkpoint['model'])
    # NOTE(review): optimizer state tensors are restored on CPU while the model
    # is moved to CUDA below; some torch versions need the optimizer state moved
    # as well — confirm optimizer.step() works after resuming.
    optimizer.load_state_dict(checkpoint['optimizer'])
    # Used only for whole-object checkpoints saved via torch.save(model):
    #model = checkpoint['model']
    #optimizer = checkpoint['optimizer']
model = model.cuda()
model = torch.nn.DataParallel(model)