With the dataset preprocessing and the training arguments both verified as correct, fine-tuning still aborts with the following traceback:
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/accelerate/accelerator.py", line 995, in no_sync
yield
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/transformers/trainer.py", line 2481, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/transformers/trainer.py", line 3579, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/transformers/trainer.py", line 3633, in compute_loss
outputs = model(**inputs)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 176, in forward
inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 198, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 78, in scatter_kwargs
scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 64, in scatter
res = scatter_map(inputs)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 55, in scatter_map
return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 51, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 47, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 96, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/home/chenrd/.conda/envs/nlp/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 188, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: chunk expects at least a 1-dimensional tensor
python-BaseException
Tracing the bug shows that when the train() method runs, it produces a num_items_in_batch variable:
batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches)
This variable is a 0-dimensional tensor.
Later, the scatter function in torch/nn/parallel/comm.py tries to split this tensor across GPUs, but the tensor being split must be at least 1-dimensional.
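As a sanity check, the failure can be reproduced outside the Trainer (a hypothetical standalone snippet, not taken from the training code): splitting a 0-dimensional tensor raises exactly this error.

import torch

scalar = torch.tensor(42)  # 0-dim tensor, same shape as num_items_in_batch
print(scalar.dim())        # prints 0
scalar.chunk(2)            # RuntimeError: chunk expects at least a 1-dimensional tensor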
Solution 1.
The bug only shows up because torch's data-parallel path (nn.DataParallel) kicks in when multiple GPUs are visible, so one fix is to restrict training to a single GPU:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
torch.cuda.set_device(0)
# The lines above must run before importing any transformers-related modules
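Equivalently, the variable can be set in the shell before launching (e.g. CUDA_VISIBLE_DEVICES=0 python train.py, where train.py stands in for whatever the actual script is). Either way, once torch.cuda.device_count() reports 1, Trainer no longer wraps the model in nn.DataParallel, and the scatter path that triggers the error is never reached.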
Solution 2.
Running on a single GPU is still unsatisfying, so the better fix is to subclass Trainer and override the relevant method so that multi-GPU parallel training works:
from transformers import Trainer
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES


class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        if self.model_accepts_loss_kwargs:
            loss_kwargs = {}
            if num_items_in_batch is not None:
                # Key change: reshape the 0-dim tensor to at least 1 dim so that
                # DataParallel's scatter can split it without raising the chunk error
                loss_kwargs["num_items_in_batch"] = num_items_in_batch.reshape(1, -1)
            inputs = {**inputs, **loss_kwargs}
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]
        if labels is not None:
            unwrapped_model = self.accelerator.unwrap_model(model)
            if self._is_peft_model(unwrapped_model):
                model_name = unwrapped_model.base_model.model._get_name()
            else:
                model_name = unwrapped_model._get_name()
            # User-defined compute_loss function
            if self.compute_loss_func is not None:
                loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
            elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs:
            loss *= self.accelerator.num_processes
        return (loss, outputs) if return_outputs else loss
The key change is adding an extra dimension to num_items_in_batch so the scatter no longer fails:
loss_kwargs["num_items_in_batch"] = num_items_in_batch.reshape(1, -1)
Problem solved.
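For reference, a minimal sketch of plugging the subclass into the existing fine-tuning script (model, training_args, and train_dataset are assumed to be whatever the original script already builds):

trainer = MyTrainer(
    model=model,                  # the model being fine-tuned
    args=training_args,           # the unchanged TrainingArguments
    train_dataset=train_dataset,  # the preprocessed dataset
)
trainer.train()  # multi-GPU DataParallel training now runs without the chunk error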