How to fix "RuntimeError: Expected all tensors to be on the same device, but found at least two devices"

Background: I wanted to generate results in batches, but the batch_example in the sample files would not run.

Root cause: tracing it down showed that the default model.cuda() places the model on cuda:0, and because cuda:0 was short on memory, part of the model got moved to the CPU. So at heart this is a bug caused by insufficient GPU resources. It was the first time I ran into it and it took a long time to find, because printing the model's device still showed cuda even though some of the weights had actually gone to the CPU; at least that is my understanding.
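
To confirm this kind of silent split, one quick check (a minimal sketch added here, not from the original post; it assumes a transformers model already loaded as model) is to enumerate the devices of every parameter instead of trusting a single device attribute:

# Diagnostic sketch: list every device the loaded model's parameters sit on.
# A single device print can be misleading; this set shows the full picture.
param_devices = {p.device for p in model.parameters()}
print(param_devices)  # more than one entry means the model is split across devices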

The original code that raised the error:

import os
import re
import random
from tqdm import tqdm
import subprocess
import time

from typing import List

import torch
import tokenizers
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
# import pandas as pd
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
VERBOSE = False
model_name = "model-name"  # placeholder for your checkpoint
CUDA = None
tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.'))
if tokenizers_version < (0, 12, 1):
    print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1")
kwargs = {}
print("loading model")
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
print("loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("loading complete")
PAD = "<pad>"
# Left padding keeps the real tokens right-aligned within a batch, which is what
# batched generation with a causal LM expects
tokenizer.padding_side = "left"
tokenizer.pad_token = PAD
# BUGGY: .cuda() with no index defaults to cuda:0, while the inputs below are sent to cuda:1
model = model.half().cuda()
# Several input texts to process as one batch
input_texts = ["This is the first input.", "This is the second input.", "And this is the third input."]

# Encode the batch with the tokenizer (padding=True pads every row to the longest one)
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
inputs.pop('token_type_ids', None)  # some causal-LM forward() signatures do not accept token_type_ids

# Get the padded sequence length; input_ids and attention_mask already share it,
# so the slicing below is effectively a no-op
max_length = max(inputs['input_ids'].shape[1], inputs['attention_mask'].shape[1])

# Trim the inputs to the common length
inputs['input_ids'] = inputs['input_ids'][:, :max_length]
inputs['attention_mask'] = inputs['attention_mask'][:, :max_length]

# Move the inputs to 'cuda:1'; the model, however, is sitting on cuda:0, hence the device error
inputs = {key: value.to('cuda:1') for key, value in inputs.items()}

# Run the model
with torch.no_grad():
    output = model(**inputs)

# Work with the model output
output_logits = output.logits

# Output shape: (batch_size, max_length, vocab_size)
print(output_logits.shape)

Solution: the mismatch is that model.cuda() left the model on cuda:0 while the inputs were sent to cuda:1. Explicitly declare the device of both the model and the tensors as the comparatively idle cuda:1. Which card that should be naturally depends on which GPU on your own server has the most free memory.
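
If you are not sure which card is the idle one, a small helper (my addition, a sketch rather than part of the original fix) can pick the GPU with the most free memory at load time via torch.cuda.mem_get_info:

import torch

def freest_gpu() -> str:
    # torch.cuda.mem_get_info(i) returns (free_bytes, total_bytes) for device i
    free = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
    return f"cuda:{max(free, key=free.get)}"

device = freest_gpu()  # e.g. 'cuda:1' on the server described above
# then: model = model.to(device); inputs = {k: v.to(device) for k, v in inputs.items()}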

The corrected code:

import os
import re
import random
from tqdm import tqdm
import subprocess
import time

from typing import List

import torch
import tokenizers
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
# import pandas as pd
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
VERBOSE = False
model_name = "model-name"  # placeholder for your checkpoint
CUDA = None
tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.'))
if tokenizers_version < (0, 12, 1):
    print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1")
kwargs = {}
print("loading model")
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
print("loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("loading complete")
PAD = "<pad>"
# Left padding keeps the real tokens right-aligned within a batch, which is what
# batched generation with a causal LM expects
tokenizer.padding_side = "left"
tokenizer.pad_token = PAD
# THE FIX: place the model explicitly on cuda:1, the same device the inputs go to below
model = model.to('cuda:1')
# Several input texts to process as one batch
input_texts = ["This is the first input.", "This is the second input.", "And this is the third input."]

# Encode the batch with the tokenizer (padding=True pads every row to the longest one)
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
inputs.pop('token_type_ids', None)  # some causal-LM forward() signatures do not accept token_type_ids

# Get the padded sequence length; input_ids and attention_mask already share it,
# so the slicing below is effectively a no-op
max_length = max(inputs['input_ids'].shape[1], inputs['attention_mask'].shape[1])

# Trim the inputs to the common length
inputs['input_ids'] = inputs['input_ids'][:, :max_length]
inputs['attention_mask'] = inputs['attention_mask'][:, :max_length]

# Move the inputs to the same device as the model ('cuda:1')
inputs = {key: value.to('cuda:1') for key, value in inputs.items()}

# Run the model
with torch.no_grad():
    output = model(**inputs)

# Work with the model output
output_logits = output.logits

# Output shape: (batch_size, max_length, vocab_size)
print(output_logits.shape)
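
Since the original goal was batch generation rather than a single forward pass, here is a minimal continuation sketch (my addition; max_new_tokens is an illustrative value) that decodes actual text from the batch. The left padding configured above via tokenizer.padding_side = "left" is what makes batched generate behave correctly for causal LMs:

with torch.no_grad():
    generated = model.generate(
        **inputs,                            # input_ids + attention_mask, already on cuda:1
        max_new_tokens=32,                   # illustrative cap on generated length
        pad_token_id=tokenizer.pad_token_id,
    )
for text in tokenizer.batch_decode(generated, skip_special_tokens=True):
    print(text)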