1. Create a virtual environment
First, create a virtual environment. With Anaconda, create one named xtuner0117 by running the commands below.
# Create the virtual environment
conda create -n xtuner0117 python=3.10 -y
# Activate it
conda activate xtuner0117
# Install the required packages
pip install torch==2.0.1 torchaudio==2.0.2 torchvision==0.15.2 modelscope==1.15.0
2. Install XTuner
Once the virtual environment is ready, you can install XTuner. First, clone the source code from GitHub.
# Create a directory for the source code
mkdir -p /root/InternLM/code
cd /root/InternLM/code
git clone -b v0.1.21 https://github.com/InternLM/XTuner
Then enter the source directory and run the install.
# Enter the source directory
cd /root/InternLM/code/XTuner
# Run the install (editable mode, with the DeepSpeed extras)
pip install -e '.[deepspeed]'
If the download is too slow, use a mirror instead: pip install -e '.[deepspeed]' -i https://mirrors.aliyun.com/pypi/simple/
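To confirm the install worked, a quick sanity check (a minimal sketch, assuming the xtuner0117 environment is still active) is to print the installed version from Python:
# Print the installed XTuner version; importlib.metadata reads the package metadata registered by pip
import importlib.metadata
print(importlib.metadata.version('xtuner'))  # should report the 0.1.21 release cloned above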
3. Model preparation
With the software installed, we can prepare the model to fine-tune. On the InternStudio platform we can use the small 1.8B model released by InternLM (InternLM2-Chat-1.8B) for this fine-tuning run.
# Create a directory for the fine-tuning assets
mkdir -p /root/InternLM/XTuner
cd /root/InternLM/XTuner
mkdir -p Shanghai_AI_Laboratory
ln -s /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b Shanghai_AI_Laboratory/internlm2-chat-1_8b
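The symlink above assumes the shared weights under /root/share exist (as on the InternStudio platform). If they are not available, a rough alternative is to download the same model with ModelScope, which was installed in step 1; the cache_dir below is chosen so the resulting layout matches the structure shown next:
# Download InternLM2-Chat-1.8B from ModelScope instead of symlinking the shared weights
from modelscope import snapshot_download
snapshot_download('Shanghai_AI_Laboratory/internlm2-chat-1_8b',
                  cache_dir='/root/InternLM/XTuner')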
Directory structure:
├── Shanghai_AI_Laboratory
│   └── internlm2-chat-1_8b -> /root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b
│       ├── README.md
│       ├── config.json
│       ├── configuration.json
│       ├── configuration_internlm2.py
│       ├── generation_config.json
│       ├── model-00001-of-00002.safetensors
│       ├── model-00002-of-00002.safetensors
│       ├── model.safetensors.index.json
│       ├── modeling_internlm2.py
│       ├── special_tokens_map.json
│       ├── tokenization_internlm2.py
│       ├── tokenization_internlm2_fast.py
│       ├── tokenizer.model
│       └── tokenizer_config.json
4. Chat with the model before fine-tuning
Before fine-tuning, chat with the base model to see its default behavior:
# Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model loading helper
def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
    model = model.eval()
    return tokenizer, model

# Chat helper; messages is the conversation history passed to stream_chat
messages = []

def chat(input_text):
    length = 0
    for response, _ in model.stream_chat(tokenizer, input_text, messages):
        if response is not None:
            print(response[length:], flush=True, end="")
            length = len(response)

# Load the model
tokenizer, model = load_model("/root/InternLM/XTuner/Shanghai_AI_Laboratory/internlm2-chat-1_8b")

# Chat
chat("请介绍一下你自己")

# Free GPU memory
del tokenizer, model
torch.cuda.empty_cache()
5. Fine-tune the model
5.1 Prepare the fine-tuning data
mkdir -p datas
touch datas/assistant.json
Use the InternLM data script below to generate the fine-tuning data in bulk:
import json

# The user's name
name = '👑彳余👑'
# How many times to duplicate the seed records
n = 10000

# Seed data
data = [
    {"conversation": [{"input": "请介绍一下你自己", "output": "我是{}的小助手,内在是上海AI实验室书生·浦语的1.8B大模型哦".format(name)}]},
    {"conversation": [{"input": "你在实战营做什么", "output": "我在这里帮助{}完成XTuner微调个人小助手的任务".format(name)}]}
]

# Repeat the two seed records n times each
for i in range(n):
    data.append(data[0])
    data.append(data[1])

# Write the data to 'datas/assistant.json'
with open('datas/assistant.json', 'w', encoding='utf-8') as f:
    # json.dump writes the list as JSON
    # ensure_ascii=False keeps the Chinese characters readable
    # indent=4 pretty-prints the file
    json.dump(data, f, ensure_ascii=False, indent=4)
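After running the script, a quick check that datas/assistant.json has the expected size and shape:
# Verify the generated data: 2 seed records plus 2 * n duplicates = 20002 entries
import json
with open('datas/assistant.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(len(data))   # expected: 20002
print(data[0])     # one {"conversation": [{"input": ..., "output": ...}]} record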
5.2 Instruction fine-tuning: the config file
Save the config below as internlm2_chat_1_8b_qlora_alpaca_e3_copy.py under /root/InternLM/XTuner. It is based on XTuner's built-in internlm2_chat_1_8b_qlora_alpaca_e3 config, with the model path, data path, and evaluation inputs pointed at the files prepared above.
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.parallel.sequence import SequenceParallelSampler
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
# PART 1 Settings #
#######################################################################
# Model
pretrained_model_name_or_path = '/root/InternLM/XTuner/Shanghai_AI_Laboratory/internlm2-chat-1_8b'
use_varlen_attn = False
# Data
alpaca_en_path = 'datas/assistant.json'
prompt_template = PROMPT_TEMPLATE.internlm2_chat
max_length = 2048
pack_to_max_length = True
# parallel
sequence_parallel_size = 1
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 16
accumulative_counts *= sequence_parallel_size
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 500
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
evaluation_freq = 500
SYSTEM = SYSTEM_TEMPLATE.alpaca
evaluation_inputs = [
    '请介绍一下你自己', 'Please introduce yourself'
]
#######################################################################
# PART 2 Model & Tokenizer #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')
model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))
#######################################################################
# PART 3 Dataset & Dataloader #
#######################################################################
alpaca_en = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path='json', data_files=dict(train=alpaca_en_path)),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=None,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

sampler = SequenceParallelSampler \
    if sequence_parallel_size > 1 else DefaultSampler

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_en,
    sampler=dict(type=sampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
#######################################################################
# PART 4 Scheduler & Optimizer #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]
# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
#######################################################################
# PART 5 Runtime #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=False,
        interval=save_steps,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)
# set log processor
log_processor = dict(by_epoch=False)
6. Start fine-tuning
Run the command below from /root/InternLM/XTuner (the directory containing the config file and datas/); the training checkpoints (.pth files) are written to ./work_dirs/internlm2_chat_1_8b_qlora_alpaca_e3_copy/.
xtuner train ./internlm2_chat_1_8b_qlora_alpaca_e3_copy.py
6.1 Convert the model format
Model conversion essentially turns the PyTorch weight files produced during training into the widely used HuggingFace format, and it can be done with a single command.
The xtuner convert pth_to_hf command performs this conversion. It takes three arguments: CONFIG, the fine-tuning config file; PATH_TO_PTH_MODEL, the path to the trained weight file to convert; and SAVE_PATH_TO_HF_MODEL, the directory where the converted HuggingFace-format files will be saved.
The command also accepts a few optional flags, which are not needed here. The one-liner below picks up the latest checkpoint and converts it:
pth_file=`ls -t ./work_dirs/internlm2_chat_1_8b_qlora_alpaca_e3_copy/*.pth | head -n 1` && MKL_SERVICE_FORCE_INTEL=1 MKL_THREADING_LAYER=GNU xtuner convert pth_to_hf ./internlm2_chat_1_8b_qlora_alpaca_e3_copy.py ${pth_file} ./hf
6.2 Merge the model
A model fine-tuned with LoRA or QLoRA is not a complete model on its own; what you trained is an extra set of layers (an Adapter) that must be merged into the original model before it can be used normally.
A fully fine-tuned model (full) does not need this step, because full fine-tuning updates the original model's weights directly instead of training a separate Adapter.
XTuner provides a one-step merge command, xtuner convert merge. Before using it, prepare three paths: the original model, the trained Adapter (after the format conversion above), and the output directory. The command takes three arguments: LLM, the path to the original model; ADAPTER, the path to the Adapter; and SAVE_PATH, where the merged model will be saved.
The merge step also supports several optional arguments:
Argument | Description
---|---
--max-shard-size {GB} | Maximum size of each saved weight shard (default: 2GB)
--device {device_name} | Device to run the merge on: cuda, cpu, or auto (default: cuda, i.e. use the GPU)
--is-clip | Add this flag only when the model being merged is a CLIP model; omit it otherwise
CLIP (Contrastive Language-Image Pre-training) is a pre-trained model developed by OpenAI that understands the relationship between images and the text that describes them. By learning image-text correspondences on large-scale data, it can be used for tasks such as zero-shot image classification and image-text retrieval.
MKL_SERVICE_FORCE_INTEL=1 MKL_THREADING_LAYER=GNU xtuner convert merge /root/InternLM/XTuner/Shanghai_AI_Laboratory/internlm2-chat-1_8b ./hf ./merged --max-shard-size 2GB
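Conceptually, merging just folds the LoRA weights back into the base model's weight matrices and saves a standalone model. A minimal PEFT-based sketch of the same idea (the xtuner convert merge command above is the recommended way; the ./merged_peft output directory is only an illustrative name):
# Sketch of what the merge does: attach the converted adapter to the base model,
# bake the LoRA weights into the base weights, and save a standalone model
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = '/root/InternLM/XTuner/Shanghai_AI_Laboratory/internlm2-chat-1_8b'
base = AutoModelForCausalLM.from_pretrained(
    base_path, torch_dtype=torch.float16, trust_remote_code=True)
merged = PeftModel.from_pretrained(base, './hf').merge_and_unload()
merged.save_pretrained('./merged_peft', max_shard_size='2GB')
AutoTokenizer.from_pretrained(base_path, trust_remote_code=True).save_pretrained('./merged_peft')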
7. Chat with the fine-tuned model
Deploy a web demo to verify the fine-tuned model's chat ability. Save the following script as xtuner_streamlit_demo.py:
import copy
import warnings
from dataclasses import asdict, dataclass
from typing import Callable, List, Optional

import streamlit as st
import torch
from torch import nn
from transformers.generation.utils import (LogitsProcessorList,
                                           StoppingCriteriaList)
from transformers.utils import logging

from transformers import AutoTokenizer, AutoModelForCausalLM  # isort: skip

logger = logging.get_logger(__name__)

model_name_or_path = "./merged"


@dataclass
class GenerationConfig:
    # this config is used for chat to provide more diversity
    max_length: int = 2048
    top_p: float = 0.75
    temperature: float = 0.1
    do_sample: bool = True
    repetition_penalty: float = 1.000


@torch.inference_mode()
def generate_interactive(
    model,
    tokenizer,
    prompt,
    generation_config: Optional[GenerationConfig] = None,
    logits_processor: Optional[LogitsProcessorList] = None,
    stopping_criteria: Optional[StoppingCriteriaList] = None,
    prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
                                                List[int]]] = None,
    additional_eos_token_id: Optional[int] = None,
    **kwargs,
):
    inputs = tokenizer([prompt], padding=True, return_tensors='pt')
    input_length = len(inputs['input_ids'][0])
    for k, v in inputs.items():
        inputs[k] = v.cuda()
    input_ids = inputs['input_ids']
    _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
    if generation_config is None:
        generation_config = model.generation_config
    generation_config = copy.deepcopy(generation_config)
    model_kwargs = generation_config.update(**kwargs)
    bos_token_id, eos_token_id = (  # noqa: F841  # pylint: disable=W0612
        generation_config.bos_token_id,
        generation_config.eos_token_id,
    )
    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    if additional_eos_token_id is not None:
        eos_token_id.append(additional_eos_token_id)
    has_default_max_length = kwargs.get(
        'max_length') is None and generation_config.max_length is not None
    if has_default_max_length and generation_config.max_new_tokens is None:
        warnings.warn(
            f"Using 'max_length''s default ({repr(generation_config.max_length)}) \
                to control the generation length. "
            'This behaviour is deprecated and will be removed from the \
                config in v5 of Transformers -- we'
            ' recommend using `max_new_tokens` to control the maximum \
                length of the generation.',
            UserWarning,
        )
    elif generation_config.max_new_tokens is not None:
        generation_config.max_length = generation_config.max_new_tokens + \
            input_ids_seq_length
        if not has_default_max_length:
            logger.warn(  # pylint: disable=W4902
                f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
                f"and 'max_length'(={generation_config.max_length}) seem to "
                "have been set. 'max_new_tokens' will take precedence. "
                'Please refer to the documentation for more information. '
                '(https://huggingface.co/docs/transformers/main/'
                'en/main_classes/text_generation)',
                UserWarning,
            )

    if input_ids_seq_length >= generation_config.max_length:
        input_ids_string = 'input_ids'
        logger.warning(
            f"Input length of {input_ids_string} is {input_ids_seq_length}, "
            f"but 'max_length' is set to {generation_config.max_length}. "
            'This can lead to unexpected behavior. You should consider'
            " increasing 'max_new_tokens'.")

    # 2. Set generation parameters if not already defined
    logits_processor = logits_processor if logits_processor is not None \
        else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None \
        else StoppingCriteriaList()

    logits_processor = model._get_logits_processor(
        generation_config=generation_config,
        input_ids_seq_length=input_ids_seq_length,
        encoder_input_ids=input_ids,
        prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
        logits_processor=logits_processor,
    )

    stopping_criteria = model._get_stopping_criteria(
        generation_config=generation_config,
        stopping_criteria=stopping_criteria)
    logits_warper = model._get_logits_warper(generation_config)

    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
    scores = None
    while True:
        model_inputs = model.prepare_inputs_for_generation(
            input_ids, **model_kwargs)
        # forward pass to get next token
        outputs = model(
            **model_inputs,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
        )

        next_token_logits = outputs.logits[:, -1, :]

        # pre-process distribution
        next_token_scores = logits_processor(input_ids, next_token_logits)
        next_token_scores = logits_warper(input_ids, next_token_scores)

        # sample
        probs = nn.functional.softmax(next_token_scores, dim=-1)
        if generation_config.do_sample:
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        else:
            next_tokens = torch.argmax(probs, dim=-1)

        # update generated ids, model inputs, and length for next step
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        model_kwargs = model._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=False)
        unfinished_sequences = unfinished_sequences.mul(
            (min(next_tokens != i for i in eos_token_id)).long())

        output_token_ids = input_ids[0].cpu().tolist()
        output_token_ids = output_token_ids[input_length:]
        for each_eos_token_id in eos_token_id:
            if output_token_ids[-1] == each_eos_token_id:
                output_token_ids = output_token_ids[:-1]
        response = tokenizer.decode(output_token_ids)

        yield response
        # stop when each sentence is finished
        # or if we exceed the maximum length
        if unfinished_sequences.max() == 0 or stopping_criteria(
                input_ids, scores):
            break


def on_btn_click():
    del st.session_state.messages


@st.cache_resource
def load_model():
    model = (AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                  trust_remote_code=True).to(
                                                      torch.bfloat16).cuda())
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                              trust_remote_code=True)
    return model, tokenizer


def prepare_generation_config():
    with st.sidebar:
        max_length = st.slider('Max Length',
                               min_value=8,
                               max_value=32768,
                               value=2048)
        top_p = st.slider('Top P', 0.0, 1.0, 0.75, step=0.01)
        temperature = st.slider('Temperature', 0.0, 1.0, 0.1, step=0.01)
        st.button('Clear Chat History', on_click=on_btn_click)

    generation_config = GenerationConfig(max_length=max_length,
                                         top_p=top_p,
                                         temperature=temperature)

    return generation_config


user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
    <|im_start|>assistant\n'


def combine_history(prompt):
    messages = st.session_state.messages
    meta_instruction = ('')
    total_prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
    for message in messages:
        cur_content = message['content']
        if message['role'] == 'user':
            cur_prompt = user_prompt.format(user=cur_content)
        elif message['role'] == 'robot':
            cur_prompt = robot_prompt.format(robot=cur_content)
        else:
            raise RuntimeError
        total_prompt += cur_prompt
    total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
    return total_prompt


def main():
    # torch.cuda.empty_cache()
    print('load model begin.')
    model, tokenizer = load_model()
    print('load model end.')

    st.title('InternLM2-Chat-1.8B')

    generation_config = prepare_generation_config()

    # Initialize chat history
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message['role'], avatar=message.get('avatar')):
            st.markdown(message['content'])

    # Accept user input
    if prompt := st.chat_input('What is up?'):
        # Display user message in chat message container
        with st.chat_message('user'):
            st.markdown(prompt)
        real_prompt = combine_history(prompt)
        # Add user message to chat history
        st.session_state.messages.append({
            'role': 'user',
            'content': prompt,
        })

        with st.chat_message('robot'):
            message_placeholder = st.empty()
            for cur_response in generate_interactive(
                    model=model,
                    tokenizer=tokenizer,
                    prompt=real_prompt,
                    additional_eos_token_id=92542,
                    **asdict(generation_config),
            ):
                # Display robot response in chat message container
                message_placeholder.markdown(cur_response + '▌')
            message_placeholder.markdown(cur_response)
        # Add robot response to chat history
        st.session_state.messages.append({
            'role': 'robot',
            'content': cur_response,  # pylint: disable=undefined-loop-variable
        })
        torch.cuda.empty_cache()


if __name__ == '__main__':
    main()
Start the demo app:
streamlit run xtuner_streamlit_demo.py
That completes a quick fine-tuning demo.