Modifying ModelLink to run on RTX 3090: pretraining, fine-tuning, inference, and evaluation, plus TRT-LLM conversion, inference, and performance testing
- 1 Reference documents
- 2 Test environment
- 3 Create the container
- 4 Install AscendSpeed and ModelLink
- 5 Download the LLAMA2-7B pretrained weights and tokenizer
- 6 Inference and performance test of the HuggingFace model
- 7.1 Modify torch and deepspeed to work around the missing NPU environment
- 7.2 Modify ModelLink to work around the missing NPU environment
- 8 Convert the weights from HuggingFace format to AscendSpeed format (PTD mode)
- 9 Download the alpaca dataset and inspect the first record
- 10.1 Convert alpaca into the LLM pretraining dataset format
- 10.2 Start pretraining
- 11.1 Convert alpaca into the LLM instruction fine-tuning dataset format
- 11.2 Start full-parameter fine-tuning
- 11.3 Inference test of the instruction-tuned model with ModelLink
- 11.4.1 Prepare the MMLU accuracy test dataset
- 11.4.2 MMLU accuracy test of the instruction-tuned model with ModelLink
- 11.5 Convert the model from Megatron format back to HuggingFace format
- 12 Inference test of the instruction-tuned HuggingFace-format model
- 13 TensorRT-LLM inference test
- 14 Troubleshooting: tensorrt not found
Background: I have no Huawei training cards (NPUs) but still want to run ModelLink and, along the way, get familiar with the complete LLM workflow from training to deployment. These notes are kept for future reference.
1 Reference documents
2 Test environment
- 8x NVIDIA GeForce RTX 3090; Driver Version: 530.30.02; CUDA Version: 12.1
3 Create the container
docker run --gpus all --shm-size=32g -ti -e NVIDIA_VISIBLE_DEVICES=all \
--privileged --net=host -v $PWD:/home \
-w /home --name ModelLink \
nvcr.io/nvidia/pytorch:23.07-py3 /bin/bash
mkdir -p /home/ModelLink
4 Install AscendSpeed and ModelLink
cd /home/ModelLink
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
# Optional: commit the pristine sources so a diff of the later modifications can be generated
git add * -f
git commit -m "add"
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
# Install AscendSpeed
cd /home/ModelLink
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install -e .
cd ..
# Install deepspeed
pip install deepspeed
# Install ModelLink
cd /home/ModelLink/ModelLink
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install -e .
# Misc
pip uninstall transformer-engine -y  # must be removed: it is incompatible with the torch version shipped in the container and causes errors otherwise
5 Download the LLAMA2-7B pretrained weights and tokenizer
cd /home/ModelLink
mkdir -p llama-2-7b-hf
cd llama-2-7b-hf
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/config.json
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/generation_config.json
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/pytorch_model-00001-of-00002.bin
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/pytorch_model-00002-of-00002.bin
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/pytorch_model.bin.index.json
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/special_tokens_map.json
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/tokenizer.json
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/tokenizer.model
wget https://huggingface.co/daryl149/llama-2-7b-hf/resolve/main/tokenizer_config.json
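A quick check that the download is usable (my own addition, not in the original steps): load the tokenizer from /home/ModelLink and print its vocabulary size, which should be 32000 for Llama-2.
cd /home/ModelLink
python3 -c "from transformers import AutoTokenizer; tok = AutoTokenizer.from_pretrained('./llama-2-7b-hf'); print(tok.vocab_size)"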
6 Inference and performance test of the HuggingFace model
cd /home/ModelLink
tee torch_infer.py <<-'EOF'
import sys
import os
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
import torch
import time
import numpy as np
torch.cuda.empty_cache()
gc.collect()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")
model_name = sys.argv[1]
import json
import torch
from torch.utils.data import Dataset, DataLoader
class TextGenerationDataset(Dataset):
    def __init__(self, json_data):
        self.data = json.loads(json_data)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['input']
        expected_output = item['expected_output']
        return input_text, expected_output
# Create the Dataset instance
json_data =r'''
[
{"input": "Give three tips for staying healthy", "expected_output": "TODO"}
]
'''
def get_gpu_mem_usage():
    allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 2)
    max_allocated_memory = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    cached_memory = torch.cuda.memory_reserved(device) / (1024 ** 2)
    max_cached_memory = torch.cuda.max_memory_reserved(device) / (1024 ** 2)
    return np.array([allocated_memory,max_allocated_memory,cached_memory,max_cached_memory])
def load_model_fp16():
    model = AutoModelForCausalLM.from_pretrained(model_name).half().to(device)
    return model
def predict(model,tokenizer,test_dataloader):
    global device
    dataloader_iter = iter(test_dataloader)
    input_text, expected_output=next(dataloader_iter)
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    for _ in range(3):
        torch.manual_seed(42)
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=1)
        first_token_time = time.time() - start_time
        first_token = tokenizer.decode(outputs[0], skip_special_tokens=True)
        torch.manual_seed(42)
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs,max_length=128)
        total_time = time.time() - start_time
        generated_tokens = len(outputs[0]) - len(inputs["input_ids"][0])
        tokens_per_second = generated_tokens / total_time
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("\n\n---------------------------------------- Response -------------------------------------")
        print(f"{response}")
        print("---------------------------------------------------------------------------------------")
        print(f"Time taken for first token: {first_token_time:.4f} seconds")
        print(f"Total time taken: {total_time:.4f} seconds")
        print(f"Number of tokens generated: {generated_tokens}")
        print(f"Tokens per second: {tokens_per_second:.2f}")
test_dataset = TextGenerationDataset(json_data)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model=load_model_fp16()
mem_usage_0=get_gpu_mem_usage()
predict(model,tokenizer,test_dataloader)
mem_usage_1=get_gpu_mem_usage()
print(f"BEFORE MA: {mem_usage_0[0]:.2f} MMA: {mem_usage_0[1]:.2f} CA: {mem_usage_0[2]:.2f} MCA: {mem_usage_0[3]:.2f}")
print(f"AFTER MA: {mem_usage_1[0]:.2f} MMA: {mem_usage_1[1]:.2f} CA: {mem_usage_1[2]:.2f} MCA: {mem_usage_1[3]:.2f}")
diff=mem_usage_1-mem_usage_0
print(f"DIFF MA: {diff[0]:.2f} MMA: {diff[1]:.2f} CA: {diff[2]:.2f} MCA: {diff[3]:.2f}")
EOF
python3 torch_infer.py ./llama-2-7b-hf
Output (40.15 tokens/s):
---------------------------------------- Response -------------------------------------
Give three tips for staying healthy during the holidays.
The holidays are a time of celebration and joy, but they can also be a time of stress and overindulgence. Here are three tips for staying healthy during the holidays:
1. Eat healthy foods.
2. Exercise regularly.
3. Get enough sleep.
What are some of the most common health problems during the holidays?
The most common health problems during the holidays are colds, flu, and stomach problems.
What are some of
---------------------------------------------------------------------------------------
Time taken for first token: 0.0251 seconds
Total time taken: 2.9637 seconds
Number of tokens generated: 119
Tokens per second: 40.15
BEFORE MA: 12884.52 MMA: 12884.52 CA: 12886.00 MCA: 12886.00
AFTER MA: 12892.65 MMA: 13019.47 CA: 13036.00 MCA: 13036.00
DIFF MA: 8.12 MMA: 134.94 CA: 150.00 MCA: 150.00
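As a rough plausibility check on the memory numbers (my own arithmetic, not from the original run): Llama-2-7B has about 6.74 B parameters, so the fp16 weights alone take roughly 6.74e9 x 2 bytes ≈ 12.6 GiB, which lines up with the ~12884 MB allocated before generation.
python3 -c "params = 6.74e9; print(f'{params*2/2**30:.1f} GiB for fp16 weights')"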
7.1 Modify torch and deepspeed to work around the missing NPU environment
tee -a /usr/local/lib/python3.10/dist-packages/torch/__init__.py <<-'EOF'
class FakeDevice(object):
    def __init__(self, name=""):
        self.name = name
    def __getattr__(self, item):
        return FakeDevice(f"{self.name}.{item}")
    def __call__(self, *args, **kwargs):
        return 0
torch.npu = FakeDevice("torch.npu")
fake_torch_npu = FakeDevice("torch_npu")
fake_deepspeed_npu = FakeDevice("deepspeed_npu")
sys.modules.update({
    "torch.npu": torch.npu,
    "torch.npu.contrib": torch.npu.contrib,
    "torch_npu": fake_torch_npu,
    "torch_npu.utils": fake_torch_npu.utils,
    "torch_npu.contrib": fake_torch_npu.contrib,
    "torch_npu.testing": fake_torch_npu.testing,
    "torch_npu.testing.testcase": fake_torch_npu.testing.testcase,
    "deepspeed_npu": fake_deepspeed_npu
})
EOF
sed -i 's/accelerator_name = "npu"/accelerator_name = "cuda"/g' /usr/local/lib/python3.10/dist-packages/deepspeed/accelerator/real_accelerator.py
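A quick sanity check (an extra step of mine, not required): after the patch, importing torch also makes the fake torch_npu and deepspeed_npu modules importable; any attribute access returns another FakeDevice and any call returns 0, so the command below should run without a ModuleNotFoundError.
python3 -c "import torch, torch_npu, deepspeed_npu; print(torch.npu, torch_npu.npu.is_available())"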
7.2 Modify ModelLink to work around the missing NPU environment
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 33744a2..b8e2553 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -1,7 +1,9 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-from apex.optimizers import FusedAdam as Adam
-from apex.optimizers import FusedSGD as SGD
+#from apex.optimizers import FusedAdam as Adam
+#from apex.optimizers import FusedSGD as SGD
+from torch.optim import SGD
+from torch.optim import Adam
from megatron import get_args
diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py
index d58b1b0..ded0ba7 100644
--- a/megatron/optimizer/distrib_optimizer.py
+++ b/megatron/optimizer/distrib_optimizer.py
@@ -3,7 +3,8 @@
"""Megatron distributed optimizer."""
-from apex.optimizers import FusedAdam as Adam
+#from apex.optimizers import FusedAdam as Adam
+from torch.optim import Adam
import math
import torch
diff --git a/modellink/__init__.py b/modellink/__init__.py
index a2cb976..682cf28 100644
--- a/modellink/__init__.py
+++ b/modellink/__init__.py
@@ -14,6 +14,32 @@
# limitations under the License.
import logging
+import torch
+import sys
+class FakeDevice(object):
+    def __init__(self, name=""):
+        self.name = name
+    def __getattr__(self, item):
+        return FakeDevice(f"{self.name}.{item}")
+    def __call__(self, *args, **kwargs):
+        return 0
+
+torch.npu = FakeDevice("torch.npu")
+fake_torch_npu = FakeDevice("torch_npu")
+fake_deepspeed_npu = FakeDevice("deepspeed_npu")
+
+sys.modules.update({
+    "torch.npu": torch.npu,
+    "torch.npu.contrib": torch.npu.contrib,
+    "torch_npu": fake_torch_npu,
+    "torch_npu.npu": fake_torch_npu.npu,
+    "torch_npu.utils": fake_torch_npu.utils,
+    "torch_npu.contrib": fake_torch_npu.contrib,
+    "torch_npu.testing": fake_torch_npu.testing,
+    "torch_npu.testing.testcase": fake_torch_npu.testing.testcase,
+    "deepspeed_npu": fake_deepspeed_npu
+})
+
try:
import torch_npu
diff --git a/modellink/model/transformer.py b/modellink/model/transformer.py
index bd1ef11..9745a4b 100644
--- a/modellink/model/transformer.py
+++ b/modellink/model/transformer.py
@@ -442,9 +442,9 @@ class FlashSelfAttention(torch.nn.Module):
         if not hasattr(self, 'attention_mask') or self.attention_mask.shape[0] != seq_length:
             if use_sliding_windows:
                 self.attention_mask = torch.triu(
-                    torch.ones(self.FA_SPARSE_ATTN_MASK_LEN, self.FA_SPARSE_ATTN_MASK_LEN), 1).bool().npu()
+                    torch.ones(self.FA_SPARSE_ATTN_MASK_LEN, self.FA_SPARSE_ATTN_MASK_LEN), 1).bool().cuda()
             else:
-                self.attention_mask = torch.triu(torch.ones(seq_length, seq_length), 1).bool().npu()
+                self.attention_mask = torch.triu(torch.ones(seq_length, seq_length), 1).bool().cuda()
         q, k, v = [rearrange(x, 's b h d -> s b (h d)') for x in [q, k, v]]
diff --git a/requirements.txt b/requirements.txt
index 3cb83fd..dd9cb61 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,8 +10,8 @@ datasets
pybind11
accelerate
six
-torch==2.1.0
-torchvision==0.16.0
+#torch==2.1.0
+#torchvision==0.16.0
protobuf
peft==0.7.1
-tiktoken
\ No newline at end of file
+tiktoken
diff --git a/tools/checkpoint/convert_ckpt.py b/tools/checkpoint/convert_ckpt.py
index 5c71645..6cc1dfd 100644
--- a/tools/checkpoint/convert_ckpt.py
+++ b/tools/checkpoint/convert_ckpt.py
@@ -80,6 +80,9 @@ def main():
     loader.add_arguments(parser)
     saver.add_arguments(parser)
+    import torch
+    torch.multiprocessing.set_start_method('spawn')
+
     args = parser.parse_args()
     queue = mp.Queue(maxsize=args.max_queue_size)
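With the patches above in place, a minimal smoke test (my addition; run from the repository root) is to import modellink directly; if another NPU-only import still surfaces, it can be stubbed the same way as in 7.1.
cd /home/ModelLink/ModelLink
python3 -c "import modellink; print('modellink imported OK')"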
8 Convert the weights from HuggingFace format to AscendSpeed format (PTD mode)
cd /home/ModelLink/ModelLink
rm ../model_weights -rf
mkdir -p ../model_weights
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--load-dir ../llama-2-7b-hf \
--save-dir ../model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
--tokenizer-model ../llama-2-7b-hf/tokenizer.model
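The converter should write one tensor-parallel shard per rank under the save directory (the exact layout depends on the Megatron saver). A small helper of mine to list what was produced; with TP=8, each fp16 shard should come out to roughly 7B x 2 bytes / 8 ≈ 1.7 GB.
tee check_ckpt.py <<-'EOF'
# Walk the converted checkpoint directory and print every file with its size.
import os, sys
root = sys.argv[1]
for dirpath, _, filenames in os.walk(root):
    for name in sorted(filenames):
        path = os.path.join(dirpath, name)
        print(f"{os.path.getsize(path)/2**30:6.2f} GB  {path}")
EOF
python3 check_ckpt.py ../model_weights/llama-2-7b-hf-v0.1-tp8-pp1/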
9 Download the alpaca dataset and inspect the first record
cd /home/ModelLink
mkdir dataset_llama2
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet -O dataset_llama2/train-00000-of-00001-a09b74b3ef9c3b56.parquet
# Inspect the first record
python -c "import pandas as pd;df = pd.read_parquet('dataset_llama2/train-00000-of-00001-a09b74b3ef9c3b56.parquet');first_row = df.iloc[0];print(first_row)"
Output:
instruction Give three tips for staying healthy.
input
output 1.Eat a balanced diet and make sure to include...
text Below is an instruction that describes a task....
Name: 0, dtype: object
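To also confirm how many records the file contains and which columns it has (an extra check, not in the original notes):
python -c "import pandas as pd;df = pd.read_parquet('dataset_llama2/train-00000-of-00001-a09b74b3ef9c3b56.parquet');print(df.shape);print(list(df.columns))"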
10.1 Convert alpaca into the LLM pretraining dataset format
cd /home/ModelLink/ModelLink
rm -rf ../dataset
mkdir -p ../dataset/llama-2-7b-hf/
python ./tools/preprocess_data.py \
--input ../dataset_llama2/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ../llama-2-7b-hf/ \
--output-prefix ../dataset/llama-2-7b-hf/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
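The preprocessor should emit an indexed dataset, i.e. alpaca_text_document.bin and alpaca_text_document.idx under ../dataset/llama-2-7b-hf/ (exactly the prefix that DATA_PATH points to in the next step). A quick look at the generated files (my addition):
python3 -c "import os; d='../dataset/llama-2-7b-hf/'; [print(f'{os.path.getsize(d+f)/2**20:8.1f} MB', f) for f in sorted(os.listdir(d))]"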
10.2 Start pretraining
Because GPU memory is limited, the sequence length is reduced to 1024 and the optimizer is switched to SGD.
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CKPT_LOAD_DIR="../model_weights/llama-2-7b-hf-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/llama-2-7b-hf/"
TOKENIZER_MODEL="../llama-2-7b-hf/tokenizer.model" # tokenizer (vocab) path
DATA_PATH="../dataset/llama-2-7b-hf/alpaca_text_document" # dataset path
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 11008 \
--num-attention-heads 32 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 1 \
--global-batch-size 32 \
--make-vocab-size-divisible-by 1 \
--lr 1.25e-6 \
--train-iters 5000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--swiglu \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--initial-loss-scale 65536 \
--adam-beta2 0.95 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--optimizer sgd \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 15 \
--eval-interval 15 \
--exit-interval 15
--eval-iters 10 \
"
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--load $CKPT_LOAD_DIR \
--save $CKPT_SAVE_DIR
Output:
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage