This is a snippet from the DeBERTa codebase that makes GPU memory grow steadily during evaluation; on a GPU with limited memory it quickly runs out of memory (OOM).
predicts = []
labels = []
# eval_loss, nb_eval_examples and nb_eval_steps are initialized earlier in the original file
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm):
    batch = batch_to(batch, device)
    with torch.no_grad():
        output = model(**batch)
    logits = output['logits'].detach()
    tmp_eval_loss = output['loss'].detach()
    if 'labels' in output:
        label_ids = output['labels'].detach().to(device)
    else:
        label_ids = batch['labels'].to(device)
    predicts.append(logits)    # GPU tensor kept alive for the whole evaluation
    labels.append(label_ids)   # also kept on the GPU
    eval_loss += tmp_eval_loss.mean().item()
    input_ids = batch['input_ids']
    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
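You can confirm the growth by logging allocated GPU memory once per step inside the loop. A minimal sketch; the helper below is illustrative and not part of the DeBERTa code:

import torch

def log_gpu_memory(step, device=None):
    # memory_allocated() reports the bytes currently occupied by live tensors
    # on the device; with the loop above this number rises on every step.
    if torch.cuda.is_available():
        mib = torch.cuda.memory_allocated(device) / 1024 ** 2
        print('step {}: {:.1f} MiB allocated'.format(step, mib))

Calling log_gpu_memory(nb_eval_steps) at the end of each iteration shows the allocation climbing by roughly one batch of logits per step with the original code, and staying flat after the fix below.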
The cause is that predicts and labels hold on to every batch's tensors for the entire evaluation run, and those tensors live on the GPU, so allocated memory grows with each step. To fix this, move the data to the CPU before storing it; change the loop as follows:
predicts = []
labels = []
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm):
    batch = batch_to(batch, device)
    with torch.no_grad():
        output = model(**batch)
    logits = output['logits'].detach().cpu()  # changed: move to CPU
    tmp_eval_loss = output['loss'].detach()
    if 'labels' in output:
        label_ids = output['labels'].detach().cpu()  # changed: move to CPU
    else:
        label_ids = batch['labels'].cpu()  # changed: move to CPU
    predicts.append(logits)    # now accumulates CPU tensors only
    labels.append(label_ids)
    eval_loss += tmp_eval_loss.mean().item()
    input_ids = batch['input_ids']
    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
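After the fixed loop finishes, the accumulated CPU tensors can be concatenated once for metric computation. A minimal sketch, assuming a standard classification setup; the argmax accuracy line is illustrative, not from the DeBERTa code:

# Everything is already on the CPU, so none of this touches GPU memory.
all_logits = torch.cat(predicts, dim=0)
all_labels = torch.cat(labels, dim=0)
eval_loss = eval_loss / nb_eval_steps  # average loss over evaluation steps
accuracy = (all_logits.argmax(dim=-1) == all_labels).float().mean().item()

Concatenating once at the end is also cheaper than concatenating inside the loop, which would copy the growing tensor on every step.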