Background: I wanted to use batching to generate results in bulk, but the example file batch_example would not run.
Root cause: debugging showed that the default model.cuda() call places the model on cuda:0.
Since cuda:0 did not have enough free memory, part of the model ended up on the CPU (so at heart this was a bug caused by insufficient GPU resources; it was the first time I had run into it and it took me a long time to track down, because printing the model's device showed cuda even though some of the weights had actually been handed off to the CPU, at least as I understand it).
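Before the failing script, here is a small diagnostic sketch (my own addition, not part of the original code) that lists the devices the model's parameters actually live on; checking a single .device attribute can be misleading when weights are split across devices:

from collections import Counter

def parameter_devices(model):
    # Count parameter tensors per device; a healthy single-GPU model reports exactly one device.
    return Counter(str(p.device) for p in model.parameters())

# After loading the model:
# print(parameter_devices(model))  # e.g. Counter({'cuda:0': 148}) or a cpu/cuda mix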
The original failing code:
import os
import re
import random
from tqdm import tqdm
import subprocess
import time
from typing import List
import torch
import tokenizers
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
# import pandas as pd
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
VERBOSE = False
model_name = "your-model-name"  # placeholder: substitute the actual checkpoint
CUDA = None  # the original's trailing comma silently made this a one-element tuple
tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.'))
if tokenizers_version < (0, 12, 1):
print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1")
kwargs = {}
print("loading model")
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
print("loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("loading complete")
PAD="<pad>"
tokenizer.padding_side = "left"
tokenizer.pad_token = PAD
# BUGGY!!!!!!!!!!!!!!!!!!!
model = model.half().cuda()
# A small batch of input texts
input_texts = ["This is the first input.", "This is the second input.", "And this is the third input."]
# Encode the batch with the tokenizer (padding=True already pads to the longest sequence)
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
inputs.pop('token_type_ids', None)
# Get the padded sequence length (input_ids and attention_mask always have the same length here)
max_length = max(inputs['input_ids'].shape[1], inputs['attention_mask'].shape[1])
# Slice the inputs to the same length (effectively a no-op given padding=True above)
inputs['input_ids'] = inputs['input_ids'][:, :max_length]
inputs['attention_mask'] = inputs['attention_mask'][:, :max_length]
# Move the inputs to cuda:1 -- but the model sits on cuda:0, so the devices mismatch
inputs = {key: value.to('cuda:1') for key, value in inputs.items()}
# Run the model
with torch.no_grad():
    output = model(**inputs)
# Inspect the model output
output_logits = output.logits
# Output shape: (batch_size, max_length, vocab_size)
print(output_logits.shape)
Solution: explicitly pin the tensors (and the model) to the more idle device, cuda:1 in my case. Of course, which card actually has the most free memory depends on your own server; restricting visibility with the CUDA_VISIBLE_DEVICES environment variable is another option.
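Rather than hard-coding cuda:1, you can also pick the GPU with the most free memory at runtime. A minimal sketch (my own addition; pick_freest_gpu is a hypothetical helper, and torch.cuda.mem_get_info requires a reasonably recent PyTorch):

import torch

def pick_freest_gpu():
    # torch.cuda.mem_get_info(i) returns (free_bytes, total_bytes) for device i
    free = [torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())]
    return f"cuda:{max(range(len(free)), key=free.__getitem__)}"

# device = pick_freest_gpu()
# model.to(device)
# inputs = {k: v.to(device) for k, v in inputs.items()}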
The fixed code is as follows:
import os
import re
import random
from tqdm import tqdm
import subprocess
import time
from typing import List
import torch
import tokenizers
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
# import pandas as pd
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
VERBOSE = False
model_name = "your-model-name"  # placeholder: substitute the actual checkpoint
CUDA = None  # trailing comma removed, as in the listing above
tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.'))
if tokenizers_version < (0, 12, 1):
print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1")
kwargs = {}
print("loading model")
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
print("loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("loading complete")
PAD = "<pad>"
tokenizer.padding_side = "left"  # decoder-only models need left padding for batched inputs
tokenizer.pad_token = PAD
# THE FIX ~~~~~ with this one change the script runs
model.to('cuda:1')  # (note this version also drops the .half() cast from the failing code)
# A small batch of input texts
input_texts = ["This is the first input.", "This is the second input.", "And this is the third input."]
# Encode the batch with the tokenizer (padding=True already pads to the longest sequence)
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
inputs.pop('token_type_ids', None)
# Get the padded sequence length (input_ids and attention_mask always have the same length here)
max_length = max(inputs['input_ids'].shape[1], inputs['attention_mask'].shape[1])
# Slice the inputs to the same length (effectively a no-op given padding=True above)
inputs['input_ids'] = inputs['input_ids'][:, :max_length]
inputs['attention_mask'] = inputs['attention_mask'][:, :max_length]
# Move the inputs to the same device as the model (cuda:1)
inputs = {key: value.to('cuda:1') for key, value in inputs.items()}
# Run the model
with torch.no_grad():
    output = model(**inputs)
# Inspect the model output
output_logits = output.logits
# Output shape: (batch_size, max_length, vocab_size)
print(output_logits.shape)
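Finally, since the original goal was batch generation rather than a single forward pass, here is a hedged sketch of running batched generation with the same left-padded inputs (my own addition; the generation arguments are illustrative, not from the original example):

# Assumes the model and inputs from the fixed script above, all on cuda:1.
with torch.no_grad():
    generated = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=32,  # illustrative value
        pad_token_id=tokenizer.pad_token_id,
    )
for text in tokenizer.batch_decode(generated, skip_special_tokens=True):
    print(text)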