1. Prerequisites
1. The accelerate library is installed
Run accelerate env on the command line; if it prints the environment report without errors, the accelerate library is set up correctly.
2. Check the server's GPU resources
Run nvidia-smi to list the GPUs; the server used here has 8 cards. A quick in-Python check is also sketched below.
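If you prefer to verify things from Python rather than the command line, here is a minimal check sketch (for verification only, not part of the later scripts) that prints the installed accelerate version and the number of GPUs PyTorch can see:

import accelerate
import torch

# print the installed accelerate version and the number of visible GPUs
print("accelerate version:", accelerate.__version__)
print("visible GPUs:", torch.cuda.device_count())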
2. Using the Accelerate Library
2.1 Multi-GPU message passing
Write the following code and save it as gather_object_demo.py:
from accelerate import Accelerator
from accelerate.utils import gather_object

accelerator = Accelerator()

# each GPU (process) builds a message tagged with its own process index
message = [f"Hello, this is GPU {accelerator.process_index}"]

# collect the messages from all GPUs
messages = gather_object(message)

# print the gathered messages only once, on the main process
accelerator.print(messages)
Run accelerate launch gather_object_demo.py from the command line.
With 8 processes (one per GPU), the gathered list contains 8 messages.
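gather_object works with arbitrary picklable Python objects, and the lists do not have to be the same length on every process: the result is simply the concatenation of each process's list in process order. A minimal sketch illustrating this (the item strings are made up for the example):

from accelerate import Accelerator
from accelerate.utils import gather_object

accelerator = Accelerator()

# each process contributes a different number of items (rank 0 sends 1, rank 1 sends 2, ...)
local_items = [f"rank {accelerator.process_index}, item {i}"
               for i in range(accelerator.process_index + 1)]

gathered = gather_object(local_items)

# on 8 GPUs this prints 36 items (1 + 2 + ... + 8), grouped by rank
accelerator.print(len(gathered), gathered)

Launched the same way with accelerate launch, this shows that the gathered list preserves process order, which the later sections rely on to keep outputs aligned with their prompts.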
2.2 Multi-GPU inference, one sample at a time
Write the following code and save it as muti_gpu_single_data.py:
from accelerate import Accelerator
from accelerate.utils import gather_object
from transformers import AutoTokenizer, AutoModelForCausalLM
from statistics import mean
import torch
import time
import json
accelerator = Accelerator()
prompts_all = [
"真搞笑呀",
"我也很开心",
"你非常可爱",
"么么哒你知道的那么清楚",
"是吗我现在就抱你一下",
"你我不好看",
"我好嗨",
"你说我美不美可爱不可爱",
"你我不想活了",
"发烧了什么故障",
] * 10
# load the model and tokenizer
model_path = 'Qwen/Qwen2-1.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map={"": accelerator.process_index},
torch_dtype=torch.bfloat16,
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# make sure every GPU is ready before starting the timer
accelerator.wait_for_everyone()
start_time = time.time()
# distribute the prompts across the available GPUs
with accelerator.split_between_processes(prompts_all) as prompts:
    # store this process's outputs and generated-token count
    results = dict(outputs=[], num_tokens=0)

    # each GPU generates its answers one prompt at a time
    for prompt in prompts:
        # wrap the prompt in the Qwen chat template
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        chat_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([chat_str], return_tensors="pt", add_special_tokens=False).to("cuda")

        # generate the answer
        generated_ids = model.generate(
            model_inputs.input_ids,
            do_sample=False,  # greedy decoding for deterministic output
        )
        # keep only the newly generated tokens (strip the prompt part)
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        # decode the final answer text
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # record the output and the number of generated tokens
        results["outputs"].append(response)
        results["num_tokens"] += len(generated_ids[0])

    # gather_object() expects a list, so wrap the per-process dict in one
    results = [results]

# collect the results from every GPU
all_result_gathered = gather_object(results)

if accelerator.is_main_process:
    time_diff = time.time() - start_time
    num_tokens = sum([r["num_tokens"] for r in all_result_gathered])
    print(
        f"tokens/sec: {num_tokens // time_diff}, time {time_diff}, total tokens {num_tokens}, total prompts {len(prompts_all)}")
Single GPU:
tokens/sec: 61.0, time 64.24465918540955, total tokens 3950, total prompts 100
8 GPUs:
tokens/sec: 372.0, time 10.601160287857056, total tokens 3950, total prompts 100
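Because split_between_processes hands out contiguous chunks of prompts_all and gather_object returns one entry per process in process order, concatenating the per-process outputs restores the original prompt order. Here is a minimal sketch of what could be appended to the main-process block above to save prompt/response pairs with the already-imported json module (the output file name is made up for the example):

if accelerator.is_main_process:
    # concatenate the per-process outputs; process order matches the prompt order
    all_outputs = []
    for r in all_result_gathered:
        all_outputs.extend(r["outputs"])

    # pair each prompt with its generated response and write them to disk
    records = [{"prompt": p, "response": o} for p, o in zip(prompts_all, all_outputs)]
    with open("outputs.json", "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)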
2.3 Multi-GPU batch inference
Write the following code (a batched version of the previous script):
from accelerate import Accelerator
from accelerate.utils import gather_object
from transformers import AutoTokenizer, AutoModelForCausalLM
from statistics import mean
import torch
import time
import json
accelerator = Accelerator()
prompts_all = [
"真搞笑呀",
"我也很开心",
"你非常可爱",
"么么哒你知道的那么清楚",
"是吗我现在就抱你一下",
"你我不好看",
"我好嗨",
"你说我美不美可爱不可爱",
"你我不想活了",
"发烧了什么故障",
] * 10
# load the model and tokenizer
model_path = '/home/jovyan/lwr/Qwen/Qwen2-1.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map={"": accelerator.process_index},
torch_dtype=torch.bfloat16,
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# for batched inference, prompts must be left-padded
def prepare_prompts(prompts, tokenizer, batch_size=16):
    qwen_prompts = []
    # wrap every prompt in the Qwen chat template
    for prompt in prompts:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        chat_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        qwen_prompts.append(chat_str)

    # split the prompts into batches and tokenize each batch with left padding
    batches = [qwen_prompts[i:i + batch_size] for i in range(0, len(qwen_prompts), batch_size)]
    batches_tok = []
    tokenizer.padding_side = "left"
    for prompt_batch in batches:
        batches_tok.append(
            tokenizer(
                prompt_batch,
                return_tensors="pt",
                padding='longest',
                truncation=False,
                pad_to_multiple_of=8,  # pad sequence lengths to a multiple of 8
                add_special_tokens=False).to("cuda")
        )
    tokenizer.padding_side = "right"
    return batches_tok
# make sure every GPU is ready before starting the timer
accelerator.wait_for_everyone()
start_time = time.time()
# distribute the prompts across the available GPUs
with accelerator.split_between_processes(prompts_all) as prompts:
    # store this process's outputs and generated-token count
    results = dict(outputs=[], num_tokens=0)

    # each GPU runs inference batch by batch
    prompt_batches = prepare_prompts(prompts, tokenizer, batch_size=16)
    for prompts_batch in prompt_batches:
        # generate the answers for the whole batch
        generated_ids = model.generate(
            **prompts_batch,
            do_sample=False,   # greedy decoding for deterministic output
            max_length=1024    # upper bound on prompt + generated length
        )
        # keep only the newly generated tokens (strip the padded prompt part)
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(prompts_batch.input_ids, generated_ids)
        ]
        # decode the answers
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        num_tokens = sum([len(t) for t in generated_ids])

        # record the outputs and the number of generated tokens
        results["outputs"].extend(response)
        results["num_tokens"] += num_tokens

    # gather_object() expects a list, so wrap the per-process dict in one
    results = [results]

# collect the results from every GPU
all_result_gathered = gather_object(results)

if accelerator.is_main_process:
    time_diff = time.time() - start_time
    num_tokens = sum([r["num_tokens"] for r in all_result_gathered])
    print(
        f"tokens/sec: {num_tokens // time_diff}, time {time_diff}, total tokens {num_tokens}, total prompts {len(prompts_all)}")
Inference time: compared with single-sample inference, batched inference cuts the total time by more than 3x.
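One caveat about the tokens/sec figure in batch mode: sequences that finish early are padded up to the batch's generation length, so len(t) also counts padding tokens and slightly inflates the total. Below is a minimal sketch of a pad-aware counter (a hypothetical helper, not part of the script above; since pad_token was set to eos_token, eos tokens are excluded from the count as well):

import torch

def count_generated_tokens(generated_ids, pad_token_id):
    # count only the non-padding positions of each generated sequence
    total = 0
    for ids in generated_ids:  # ids is a 1-D tensor of generated token ids
        total += int((ids != pad_token_id).sum())
    return total

# inside the batch loop it would replace the len(t) sum:
# num_tokens = count_generated_tokens(generated_ids, tokenizer.pad_token_id)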