描述:在调试复现 GitHub 上的代码时,训练完成后进行评估(test)阶段报错,以下为报错相关的代码片段:
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )
        # self.LayerNorm is not snake-cased to stick with TensorFlow model
        # variable names and be able to load any TensorFlow checkpoint file.
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        """Sum word, position and token-type embeddings, then apply LayerNorm and dropout.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be provided;
        missing ``position_ids`` / ``token_type_ids`` are filled with defaults
        (0..seq_len-1 positions, all-zero segment ids) on the input's device.
        """
        if input_ids is None:
            shape = inputs_embeds.size()[:-1]
            dev = inputs_embeds.device
        else:
            shape = input_ids.size()
            dev = input_ids.device
        seq_len = shape[1]

        if position_ids is None:
            position_ids = (
                torch.arange(seq_len, dtype=torch.long, device=dev)
                .unsqueeze(0)
                .expand(shape)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(shape, dtype=torch.long, device=dev)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        total = (
            inputs_embeds
            + self.position_embeddings(position_ids)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(total))
错误信息:PyTorch RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
Traceback (most recent call last):
File “run.py”, line 54, in
test(base_config, model, test_dataset)
File “/home//Chinese-Text-Classification/bert/test.py”, line 9, in test
test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_data, test=True)
File “/home/Chinese-Text-Classification/bert/train.py”, line 117, in evaluate
outputs = model(**inputs)
File “/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 722, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/Chinese-Text-Classification/bert/transformers/modeling_bert.py”, line 1206, in forward
output_attentions=output_attentions,
File “/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 722, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/Chinese-Text-Classification/bert/transformers/modeling_bert.py”, line 722, in forward
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
File “/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 722, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/Chinese-Text-Classification/bert/transformers/modeling_bert.py”, line 175, in forward
inputs_embeds = self.word_embeddings(input_ids)
File “/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py”, line 722, in _call_impl
result = self.forward(*input, **kwargs)
File “/opt/conda/lib/python3.7/site-packages/torch/nn/modules/sparse.py”, line 126, in forward
self.norm_type, self.scale_grad_by_freq, self.sparse)
File “/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py”, line 1814, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 ‘self’ in call to _th_index_select
分析原因:只设置 model 的 device 属性(或只把输入张量移到 GPU)并不会把模型内部的子模块一起搬到 GPU。本例中评估阶段输入张量在 cuda:0 上,而 BertEmbeddings 内部的 self.word_embeddings、self.position_embeddings、self.token_type_embeddings、self.LayerNorm 等子模块的权重仍在 CPU 上,embedding 查表(index_select)时两者设备不一致,因此报上面的错误。解决思路就是保证模型参数与输入张量位于同一设备上,例如让这些子模块也在 GPU 上。
修改代码如下:
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.

    Device fix: every parameterized submodule is moved to the GPU at build
    time so it matches CUDA inputs during evaluation. The move is guarded by
    ``torch.cuda.is_available()`` — a bare ``.cuda()`` would raise on a
    CPU-only machine, while ``.to(device)`` degrades gracefully.
    NOTE(review): the cleaner fix is to call ``model.to(device)`` once on the
    whole model before evaluation rather than hard-coding the device here.
    """

    def __init__(self, config):
        super().__init__()
        # Target device: GPU when present, otherwise fall back to CPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        ).to(device)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        ).to(device)
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size
        ).to(device)
        # self.LayerNorm is not snake-cased to stick with TensorFlow model
        # variable names and be able to load any TensorFlow checkpoint file.
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps).to(device)
        # Dropout holds no parameters, so it needs no device move at all.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        """Sum word, position and token-type embeddings, then LayerNorm + dropout.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be provided;
        missing ``position_ids`` / ``token_type_ids`` default to 0..seq_len-1
        positions and all-zero segment ids on the input's device.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]
        seq_length = input_shape[1]
        # Build default ids on the same device as the incoming tensor.
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
解决方案:把代码里各个类 __init__ 中创建的子模块都移动到 GPU 上(如加 .cuda() 或 .to(device))即可。注意:直接写死 .cuda() 会让代码在没有 GPU 的机器上无法运行,更通用的做法是先用 torch.cuda.is_available() 判断设备,或者在评估前对整个模型调用一次 model.to(device)。此方法参考于 https://www.codeleading.com/article/61233946963/