1. Create a Development Machine
Once the development machine is created, click "Enter Dev Machine".
2. Install the Runtime Environment
InternStudio already comes with Python and CUDA built in, so all we need to do here is install the XTuner framework and its dependencies.
After entering the dev machine, run the following commands to install XTuner:
# Create a conda environment
conda create --name xtuner-env python=3.10 -y
# Activate the environment
conda activate xtuner-env
cd ~
git clone https://github.com/InternLM/XTuner
cd XTuner
pip install -e '.[all]'
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
After the installation completes, let's verify it:
xtuner version
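Optionally, also confirm that the CUDA build of PyTorch is working. A quick check:
import torch

# Should print the installed version (a +cu121 build) and True if a GPU is visible
print(torch.__version__)
print(torch.cuda.is_available())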
3. Download the Model
Since we are working on InternStudio, we can use the models in its shared folder.
The model directory is /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b, so we can create a symlink to it:
mkdir -p ~/model
cd ~/model
ln -s /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b ~/model/internlm2-chat-1_8b
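To confirm the link resolves correctly, here is a quick check (a minimal sketch; plain ls -l works just as well):
import os

# The symlink should resolve to the shared weights directory
print(os.path.realpath('/root/model/internlm2-chat-1_8b'))
# List a few files to confirm the weights are visible
print(os.listdir('/root/model/internlm2-chat-1_8b')[:5])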
4. Prepare the Self-Cognition Training Dataset
mkdir -p ~/data
cd ~/data
touch assistant.json
We can use the script xtuner_generate_assistant.py to generate the dataset; its contents are as follows:
import json

# Set the user's name
name = '周辉同志'
# Number of times to repeat the seed conversations
n = 4650

# Initialize the seed data
data = [
    {"conversation": [{"input": "请介绍一下你自己", "output": "我是{}的小助手,内在是上海AI实验室书生·浦语的1.8B大模型哦".format(name)}]},
    {"conversation": [{"input": "你在实战营做什么", "output": "我在这里帮助{}完成XTuner微调个人小助手的任务".format(name)}]}
]

# Append the two seed conversations to the data list n times
for i in range(n):
    data.append(data[0])
    data.append(data[1])

# Write the data list to '/root/data/assistant.json'
with open('/root/data/assistant.json', 'w', encoding='utf-8') as f:
    # json.dump writes the data to the file in JSON format
    # ensure_ascii=False keeps the Chinese characters human-readable
    # indent=4 pretty-prints the output for easy reading
    json.dump(data, f, ensure_ascii=False, indent=4)
Then run the script to generate the data file:
python xtuner_generate_assistant.py
After it finishes, let's take a look at the dataset.
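Here is a quick way to check the generated file, as a minimal sketch (any JSON viewer works just as well):
import json

# Load the generated dataset and check its size and a sample entry
with open('/root/data/assistant.json', encoding='utf-8') as f:
    data = json.load(f)
print(len(data))  # 2 seed entries + 2 * 4650 repeats = 9302
print(data[0])    # the first seed conversation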
5. Prepare the XTuner Config File
XTuner ships with many ready-made configs; you can list them all with xtuner list-cfg. Here we copy one to use as a starting point:
# Create a folder to hold the config file
cd ~/
mkdir -p ~/config
# Use XTuner's copy-cfg command to copy the chosen config to that location
xtuner copy-cfg internlm2_1_8b_qlora_alpaca_e3 ~/config
Let's take a look at the generated internlm2_1_8b_qlora_alpaca_e3_copy.py.
Modifying the config file
Modify internlm2_1_8b_qlora_alpaca_e3_copy.py to match the code below:
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.parallel.sequence import SequenceParallelSampler
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
pretrained_model_name_or_path = '/root/model/internlm2-chat-1_8b'
use_varlen_attn = False
# Data
alpaca_en_path = '/root/data/assistant.json'
prompt_template = PROMPT_TEMPLATE.internlm2_chat
max_length = 2048
pack_to_max_length = True
# parallel
sequence_parallel_size = 1
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 16
accumulative_counts *= sequence_parallel_size
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 500
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
evaluation_freq = 500
SYSTEM = SYSTEM_TEMPLATE.alpaca
evaluation_inputs = [
'请介绍一下你自己', 'Please introduce yourself'
]
#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
tokenizer = dict(
type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
padding_side='right')
model = dict(
type=SupervisedFinetune,
use_varlen_attn=use_varlen_attn,
llm=dict(
type=AutoModelForCausalLM.from_pretrained,
pretrained_model_name_or_path=pretrained_model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.float16,
quantization_config=dict(
type=BitsAndBytesConfig,
load_in_4bit=True,
load_in_8bit=False,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4')),
lora=dict(
type=LoraConfig,
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias='none',
task_type='CAUSAL_LM'))
#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
alpaca_en = dict(
type=process_hf_dataset,
dataset=dict(type=load_dataset, path='json', data_files=dict(train=alpaca_en_path)),
tokenizer=tokenizer,
max_length=max_length,
dataset_map_fn=None,
template_map_fn=dict(
type=template_map_fn_factory, template=prompt_template),
remove_unused_columns=True,
shuffle_before_pack=True,
pack_to_max_length=pack_to_max_length,
use_varlen_attn=use_varlen_attn)
sampler = SequenceParallelSampler \
if sequence_parallel_size > 1 else DefaultSampler
train_dataloader = dict(
batch_size=batch_size,
num_workers=dataloader_num_workers,
dataset=alpaca_en,
sampler=dict(type=sampler, shuffle=True),
collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
optim_wrapper = dict(
type=AmpOptimWrapper,
optimizer=dict(
type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
accumulative_counts=accumulative_counts,
loss_scale='dynamic',
dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = [
dict(
type=LinearLR,
start_factor=1e-5,
by_epoch=True,
begin=0,
end=warmup_ratio * max_epochs,
convert_to_iter_based=True),
dict(
type=CosineAnnealingLR,
eta_min=0.0,
by_epoch=True,
begin=warmup_ratio * max_epochs,
end=max_epochs,
convert_to_iter_based=True)
]
# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
#######################################################################
# PART 5 Runtime #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
dict(type=DatasetInfoHook, tokenizer=tokenizer),
dict(
type=EvaluateChatHook,
tokenizer=tokenizer,
every_n_iters=evaluation_freq,
evaluation_inputs=evaluation_inputs,
system=SYSTEM,
prompt_template=prompt_template)
]
if use_varlen_attn:
custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
# configure default hooks
default_hooks = dict(
# record the time of every iteration.
timer=dict(type=IterTimerHook),
# print log every 10 iterations.
logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
# enable the parameter scheduler.
param_scheduler=dict(type=ParamSchedulerHook),
# save checkpoint per `save_steps`.
checkpoint=dict(
type=CheckpointHook,
by_epoch=False,
interval=save_steps,
max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
cudnn_benchmark=False,
# set multi process parameters
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
# set distributed parameters
dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)
# set log processor
log_processor = dict(by_epoch=False)
A brief explanation: the main changes from the stock config are the model path, the dataset path, and the checkpoint save interval:
pretrained_model_name_or_path = '/root/model/internlm2-chat-1_8b'
alpaca_en_path = '/root/data/assistant.json'
save_steps = 500
Because assistant.json is already in XTuner's conversation format, dataset_map_fn is also set to None (the stock config uses alpaca_map_fn).
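Before training, you can optionally sanity-check that the edited config parses and picks up the new paths. A minimal sketch using mmengine's config loader (the same mechanism xtuner train uses):
from mmengine.config import Config

# Load the edited config file and print the fields we changed
cfg = Config.fromfile('/root/config/internlm2_1_8b_qlora_alpaca_e3_copy.py')
print(cfg.pretrained_model_name_or_path)  # expect /root/model/internlm2-chat-1_8b
print(cfg.alpaca_en_path)                 # expect /root/data/assistant.json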
6. Train the Model
Now start training. We use DeepSpeed for acceleration; three DeepSpeed ZeRO stages are available: deepspeed_zero1, deepspeed_zero2, and deepspeed_zero3. Roughly speaking, ZeRO-1 shards the optimizer states across GPUs, ZeRO-2 additionally shards the gradients, and ZeRO-3 also shards the model parameters.
# Activate the environment
conda activate xtuner-env
cd ~
# Standard training
xtuner train /root/config/internlm2_1_8b_qlora_alpaca_e3_copy.py --work-dir /root/train
# Training with DeepSpeed
xtuner train /root/config/internlm2_1_8b_qlora_alpaca_e3_copy.py --work-dir /root/train_deepspeed --deepspeed deepspeed_zero2
Fine-tuning now starts; this will take a while, so be patient.
7. Convert the Model
Convert the trained weights into a HuggingFace-format adapter:
xtuner convert pth_to_hf /root/train/internlm2_1_8b_qlora_alpaca_e3_copy.py \
/root/train/iter_864.pth \
/root/xtuner_hf_adapter
After conversion, the resulting files live in the xtuner_hf_adapter directory (note that the checkpoint name, iter_864.pth above, depends on your run; use the latest iter_*.pth under /root/train).
These are the familiar HuggingFace adapter files, including the .bin weight file.
8. Merge the Model
XTuner provides a one-command merge. Before using it we need three paths ready: the original model, the trained adapter (the part saved after conversion to HuggingFace format), and the final output location.
# Create a folder named final_model to store the merged model files
mkdir -p /root/final_model
# Work around a threading-conflict bug
export MKL_SERVICE_FORCE_INTEL=1
# Merge the model
# xtuner convert merge ${NAME_OR_PATH_TO_LLM} ${NAME_OR_PATH_TO_ADAPTER} ${SAVE_PATH}
xtuner convert merge /root/model/internlm2-chat-1_8b /root/xtuner_hf_adapter /root/final_model
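Conceptually, the merge folds the LoRA adapter weights back into the base model. Below is a minimal sketch of the same idea using peft directly (not XTuner's exact implementation; paths are taken from the steps above):
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model and apply the trained adapter on top of it
base = AutoModelForCausalLM.from_pretrained(
    '/root/model/internlm2-chat-1_8b', trust_remote_code=True)
model = PeftModel.from_pretrained(base, '/root/xtuner_hf_adapter')

# Fold the LoRA weights into the base weights and save a standalone model
merged = model.merge_and_unload()
merged.save_pretrained('/root/final_model')
AutoTokenizer.from_pretrained(
    '/root/model/internlm2-chat-1_8b', trust_remote_code=True
).save_pretrained('/root/final_model')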
9. Verify by Inference
xtuner chat /root/final_model --prompt-template internlm2_chat
We asked it a few questions, and it now gives the fine-tuned answers we were hoping for.
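Besides xtuner chat, you can also load the merged model directly with transformers. A minimal sketch (paths from the steps above; chat() is the helper InternLM2 exposes through trust_remote_code):
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = '/root/final_model'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype='auto', trust_remote_code=True).cuda().eval()

# InternLM2's remote code provides a chat() helper for dialogue
response, history = model.chat(tokenizer, '请介绍一下你自己', history=[])
print(response)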