1. Prepare the conda environment
git clone https://github.com/yangjianxin1/Firefly.git
conda create -n firefly python=3.10
conda activate firefly
cd ./Firefly
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
Then install a few more dependencies:
pip install git+https://github.com/huggingface/peft.git -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install git+https://github.com/huggingface/accelerate.git -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install git+https://github.com/huggingface/transformers.git -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install git+https://github.com/TimDettmers/bitsandbytes.git -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install einops transformers_stream_generator
pip install tiktoken
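Optionally, a quick import check can confirm the environment is usable before moving on (a minimal sketch; run it inside the firefly env):
import torch, transformers, peft, bitsandbytes

print('torch:', torch.__version__, 'cuda available:', torch.cuda.is_available())
print('transformers:', transformers.__version__, 'peft:', peft.__version__, 'bitsandbytes:', bitsandbytes.__version__)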
2. Inference
I placed the chat.py code inside the component folder, so the script imports utils rather than component.utils.
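If you would rather leave chat.py where it originally lives and import component.utils instead, a minimal sketch is to put the repo root on sys.path first (the path below is just my clone location; adjust it to yours):
import sys
sys.path.append('/home/cxh/Firefly')  # repo root; adjust to your clone path
from component.utils import ModelUtils
The full chat.py used for single-turn inference: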
from transformers import AutoTokenizer
import torch
import sys
from utils import ModelUtils

"""
Single-turn chat; there is no memory of the conversation history.
"""


def main():
    # Run inference with the merged model
    model_name_or_path = '/home/cxh/Qwen-7B'
    adapter_name_or_path = None

    # Or run inference with the base model plus adapter, without merging the weights manually
    # model_name_or_path = 'baichuan-inc/Baichuan-7B'
    # adapter_name_or_path = 'YeungNLP/firefly-baichuan-7b-qlora-sft'

    # Whether to run inference in 4-bit; saves a lot of GPU memory, at a possible small cost in output quality
    load_in_4bit = False

    # Generation hyperparameters
    max_new_tokens = 500
    top_p = 0.9
    temperature = 0.35
    repetition_penalty = 1.0
    device = 'cuda'

    # Load the model
    model = ModelUtils.load_model(
        model_name_or_path,
        load_in_4bit=load_in_4bit,
        adapter_name_or_path=adapter_name_or_path
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        # llama does not support the fast tokenizer
        use_fast=False if model.config.model_type == 'llama' else True,
        padding_side='left'  # pad on the left
    )
    print("tokenizer.__class__.__name__", tokenizer.__class__.__name__)
    # QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None; eod_id maps to the token <|endoftext|>
    if tokenizer.__class__.__name__ == 'QWenTokenizer':
        tokenizer.pad_token_id = tokenizer.eod_id
        tokenizer.bos_token_id = tokenizer.eod_id
        tokenizer.eos_token_id = tokenizer.eod_id

    text = input('User:')
    while True:
        text = text.strip()
        print(123)
        # chatglm uses the official data format
        if model.config.model_type == 'chatglm':
            print("chatglm")
            text = '[Round 1]\n\n问:{}\n\n答:'.format(text)
            input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
        # For compatibility with qwen-7b: its tokenizer tokenizes eos_token, so eos_token_id cannot be obtained directly
        else:
            print("QWEN")
            input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
            bos_token_id = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long).to(device)
            eos_token_id = torch.tensor([[tokenizer.eos_token_id]], dtype=torch.long).to(device)
            input_ids = torch.concat([bos_token_id, input_ids, eos_token_id], dim=1)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids, max_new_tokens=max_new_tokens, do_sample=True,
                top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty,
                eos_token_id=tokenizer.eos_token_id
            )
        outputs = outputs.tolist()[0][len(input_ids[0]):]
        response = tokenizer.decode(outputs)
        response = response.strip().replace(tokenizer.eos_token, "").strip()
        print("123321")
        print("Firefly:{}".format(response))
        text = input('User:')


if __name__ == '__main__':
    main()
3. Fine-tuning
pip install loguru astunparse
torchrun --nproc_per_node=1 train_qlora.py --train_args_file /home/cxh/Firefly/train_args/qlora/qwen-7b-sft-qlora.json
The contents of train_qlora.py are as follows:
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)
import argparse
from loguru import logger
import os
from os.path import join
import torch
import bitsandbytes as bnb
from collections import defaultdict
from collator import SFTDataCollator
from dataset import (
    SFTDataset,
    ChatGLM2SFTDataset,
    ChatGLM3SFTDataset,
    MistralSFTDataset,
    ZephyrSFTDataset,
    QwenSFTDataset
)
from argument import QLoRAArguments
from trainer import LoRATrainer
# from component.loss import TargetLMLoss

torch.cuda.empty_cache()


def verify_model_dtype(model):
    """
    Inspect the dtypes of the model's parameters
    """
    dtype2param_num = defaultdict(int)  # number of parameters per dtype
    dtype2param_name = defaultdict(list)  # parameter names per dtype
    dtype2trainable_param_num = defaultdict(int)  # number of trainable parameters per dtype
    dtype2trainable_param_name = defaultdict(list)  # trainable parameter names per dtype
    for name, p in model.named_parameters():
        dtype = p.dtype
        dtype2param_num[dtype] += p.numel()
        dtype2param_name[dtype].append(name)
        if p.requires_grad:
            dtype2trainable_param_num[dtype] += p.numel()
            dtype2trainable_param_name[dtype].append(name)
    # Distribution of dtypes over all parameters
    total = 0
    print('verify all params of the model')
    for k, v in dtype2param_num.items():
        total += v
    for k, v in dtype2param_num.items():
        print(k, v, v / total)
    for k, v in dtype2trainable_param_name.items():
        print(k, v)
    print()
    # Distribution of dtypes over the trainable parameters
    print('verify trainable params the model')
    total_trainable = 0
    for k, v in dtype2trainable_param_num.items():
        total_trainable += v
    for k, v in dtype2trainable_param_num.items():
        print(k, v, v / total_trainable)
    for k, v in dtype2trainable_param_num.items():
        print(k, v)


def find_all_linear_names(model):
    """
    Find all fully connected (linear) layers so that an adapter can be attached to each of them
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


def setup_everything():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_args_file", type=str, default='train_args/baichuan-sft-qlora.json', help="")
    args = parser.parse_args()
    train_args_file = args.train_args_file
    # Read the training configuration
    parser = HfArgumentParser((QLoRAArguments, TrainingArguments))
    # Parse the custom arguments as well as the built-in TrainingArguments
    args, training_args = parser.parse_json_file(json_file=train_args_file)
    # Create the output directory
    if not os.path.exists(training_args.output_dir):
        os.makedirs(training_args.output_dir)
    # logger.add(join(training_args.output_dir, 'train.log'))
    # logger.info("train_args:{}".format(training_args))
    # Set the random seed
    set_seed(training_args.seed)
    return args, training_args


def init_components(args, training_args):
    logger.info('Initializing components...')
    training_args.ddp_find_unused_parameters = False
    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
    device_map = {'': local_rank}
    # Load the model with a 4-bit quantization config
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    )
    # Pass only `quantization_config`; do not also pass `load_in_4bit`
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        device_map=device_map,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        quantization_config=quantization_config  # 4-bit quantization config
    )
    model.config.use_cache = False
    if 'output_router_logits' in model.config.to_dict():
        logger.info('set output_router_logits as True')
        model.config.output_router_logits = True
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        trust_remote_code=True,
        use_fast=False if model.config.model_type == 'llama' else True
    )
    if tokenizer.__class__.__name__ == 'QWenTokenizer':
        tokenizer.pad_token_id = tokenizer.eod_id
        tokenizer.bos_token_id = tokenizer.eod_id
        tokenizer.eos_token_id = tokenizer.eod_id
    elif tokenizer.__class__.__name__ != 'ChatGLMTokenizer':
        assert tokenizer.eos_token_id is not None
        assert tokenizer.bos_token_id is not None
        tokenizer.pad_token_id = tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id

    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
    print(f'memory footprint of model: {model.get_memory_footprint() / (1024 * 1024 * 1024)} GB')
    target_modules = find_all_linear_names(model)
    config = LoraConfig(
        r=args.lora_rank,
        lora_alpha=args.lora_alpha,
        target_modules=target_modules,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    model.config.torch_dtype = torch.float32
    verify_model_dtype(model)

    if 'chatglm2' in args.model_name_or_path.lower():
        train_dataset = ChatGLM2SFTDataset(args.train_file, tokenizer, args.max_seq_length)
    elif 'chatglm3' in args.model_name_or_path.lower():
        train_dataset = ChatGLM3SFTDataset(args.train_file, tokenizer, args.max_seq_length)
    elif 'mistral' in args.model_name_or_path.lower() or 'mixtral' in args.model_name_or_path.lower():
        train_dataset = MistralSFTDataset(args.train_file, tokenizer, args.max_seq_length)
    elif 'zephyr' in args.model_name_or_path.lower():
        train_dataset = ZephyrSFTDataset(args.train_file, tokenizer, args.max_seq_length)
    elif 'qwen' in args.model_name_or_path.lower():
        train_dataset = QwenSFTDataset(args.train_file, tokenizer, args.max_seq_length)
    else:
        train_dataset = SFTDataset(args.train_file, tokenizer, args.max_seq_length)
    data_collator = SFTDataCollator(tokenizer, args.max_seq_length)

    trainer = LoRATrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )
    return trainer


def main():
    # Configuration and sanity checks
    args, training_args = setup_everything()
    # Load all the components
    trainer = init_components(args, training_args)
    # Start training
    logger.info("*** starting training ***")
    # todo resume from checkpoint
    # https://github.com/huggingface/transformers/issues/24252
    train_result = trainer.train()
    # Save the final checkpoint
    trainer.save_model(training_args.output_dir)  # Saves the tokenizer too
    # Save the training metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()


if __name__ == "__main__":
    main()
The contents of qwen-7b-sft-qlora.json are as follows:
{
    "output_dir": "output/firefly-qwen-7b",
    "model_name_or_path": "/home/cxh/Qwen-7B",
    "train_file": "/home/cxh/Firefly/data/dummy_data.jsonl",
    "num_train_epochs": 20,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 2,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "logging_steps": 300,
    "save_steps": 500,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 3000,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
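For reference, the effective batch size implied by this config is per_device_train_batch_size × gradient_accumulation_steps × number of processes; with the single-GPU torchrun command above that works out as follows (a quick sanity check, not part of the training code):
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
num_processes = 1  # --nproc_per_node=1
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_processes
print(effective_batch_size)  # 8 samples per optimizer step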
4. Merging the weights
During training we only save the adapter weights, not the merged model weights.
The following script merges the adapter weights into the base model:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch

"""
Use this script to merge the LoRA weights into the base model
"""


def merge_lora_to_base_model():
    model_name_or_path = '/home/cxh/Qwen-7B'
    adapter_name_or_path = '/home/cxh/Firefly/component/output/firefly-qwen-7b'
    save_path = '/home/cxh/Firefly/script/firefly-Qwen-7B-qlora-sft-merge'

    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        # llama does not support the fast tokenizer
        use_fast=False if config.model_type == 'llama' else True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        # device_map='auto',
        device_map={'': 'cpu'}
    )
    model = PeftModel.from_pretrained(model, adapter_name_or_path, device_map={'': 'cpu'})
    model = model.merge_and_unload()

    tokenizer.save_pretrained(save_path)
    model.save_pretrained(save_path)


if __name__ == '__main__':
    merge_lora_to_base_model()
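After running this, save_path contains the merged fp16 weights plus the tokenizer files; the inference script in the next section simply points model_name at that directory.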
5. Inference with the merged model
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

"""
Single-turn chat; there is no memory of the conversation history.
"""


def main():
    model_name = '/home/cxh/Firefly/script/firefly-Qwen-7B-qlora-sft-merge'

    max_new_tokens = 500
    top_p = 0.9
    temperature = 0.35
    repetition_penalty = 1.0
    device = 'cuda'
    input_pattern = '<s>{}</s>'
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map='auto'
    ).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        # llama does not support the fast tokenizer
        use_fast=False if model.config.model_type == 'llama' else True
    )

    text = input('User:')
    while True:
        text = text.strip()
        text = input_pattern.format(text)
        input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids, max_new_tokens=max_new_tokens, do_sample=True,
                top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty,
                eos_token_id=tokenizer.eos_token_id
            )
        outputs = outputs.tolist()[0][len(input_ids[0]):]
        response = tokenizer.decode(outputs)
        response = response.strip().replace(text, "").replace('</s>', "").replace('<s>', "").strip()
        print("Firefly:{}".format(response))
        text = input('User:')


if __name__ == '__main__':
    main()
6. Packaging the conda environment with Docker
conda env export --name firefly > environment.yml
The exported environment.yml looks like this:
name: firefly
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2024.7.2=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.14=h5eee18b_0
  - pip=24.0=py310h06a4308_0
  - python=3.10.14=h955ad1f_1
  - readline=8.2=h5eee18b_0
  - setuptools=72.1.0=py310h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - tk=8.6.14=h39e8969_0
  - tzdata=2024a=h04d1e81_0
  - wheel=0.43.0=py310h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zlib=1.2.13=h5eee18b_1
  - pip:
      - absl-py==2.1.0
      - accelerate==0.33.0.dev0
      - astunparse==1.6.3
      - bitsandbytes==0.43.1
      - certifi==2024.7.4
      - charset-normalizer==3.3.2
      - einops==0.8.0
      - filelock==3.15.4
      - fsspec==2024.6.1
      - grpcio==1.65.4
      - huggingface-hub==0.24.5
      - idna==3.7
      - jinja2==3.1.4
      - loguru==0.7.2
      - markdown==3.6
      - markupsafe==2.1.5
      - mpmath==1.3.0
      - networkx==3.3
      - numpy==1.26.4
      - nvidia-cublas-cu12==12.1.3.1
      - nvidia-cuda-cupti-cu12==12.1.105
      - nvidia-cuda-nvrtc-cu12==12.1.105
      - nvidia-cuda-runtime-cu12==12.1.105
      - nvidia-cudnn-cu12==9.1.0.70
      - nvidia-cufft-cu12==11.0.2.54
      - nvidia-curand-cu12==10.3.2.106
      - nvidia-cusolver-cu12==11.4.5.107
      - nvidia-cusparse-cu12==12.1.0.106
      - nvidia-nccl-cu12==2.20.5
      - nvidia-nvjitlink-cu12==12.6.20
      - nvidia-nvtx-cu12==12.1.105
      - packaging==24.1
      - peft==0.12.1.dev0
      - protobuf==4.25.4
      - psutil==6.0.0
      - pyyaml==6.0.2
      - regex==2024.7.24
      - requests==2.32.3
      - safetensors==0.4.4
      - six==1.16.0
      - sympy==1.13.2
      - tensorboard==2.17.0
      - tensorboard-data-server==0.7.2
      - tiktoken==0.7.0
      - tokenizers==0.19.1
      - torch==2.4.0
      - tqdm==4.66.5
      - transformers==4.45.0.dev0
      - transformers-stream-generator==0.0.5
      - triton==3.0.0
      - typing-extensions==4.12.2
      - urllib3==2.2.2
      - werkzeug==3.0.3
prefix: /home/cxh/.conda/envs/firefly
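Note that the .dev0 pins (accelerate, peft, transformers) came from the git installs in step 1 and are not published on PyPI, so conda env create may fail to resolve them; either re-pin them to released versions or reinstall them from GitHub inside the image after the environment is created.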
Create a Dockerfile:
# Use the official Miniconda base image
FROM continuumio/miniconda3

# Set the working directory
WORKDIR /app

# Copy the environment file into the image
COPY environment.yml .

# Create the conda environment and install all dependencies
RUN conda env create -f environment.yml

# Run subsequent shell-form commands inside the firefly environment
SHELL ["conda", "run", "-n", "firefly", "/bin/bash", "-c"]

# Copy your code and any other dependencies
COPY . .

# Default entry point; SHELL does not apply to exec-form CMD, so invoke conda run explicitly
CMD ["conda", "run", "-n", "firefly", "python", "docker-file.py"]
docker build -t firefly_env .
docker run --rm firefly_env
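For GPU inference inside the container, the host also needs the NVIDIA Container Toolkit and the run command needs a --gpus flag, e.g. something like:
docker run --gpus all --rm firefly_env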
7. Moving the conda environment to Google Drive for use with Colab
Alternatively, create the conda environment inside the Drive directory from the start; the CSDN post "conda环境安装指定位置的虚拟环境_conda指定路径创建虚拟环境" covers creating a conda env at a specified path. Use python=3.10.14 so it matches the environment exported above.
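A minimal sketch of reusing such a Drive-hosted environment from a Colab notebook (the path below is an assumption; point it at wherever the environment's site-packages actually lives):
from google.colab import drive
drive.mount('/content/drive')

import sys
# reuse the packages installed into the Drive-hosted env (hypothetical path)
sys.path.insert(0, '/content/drive/MyDrive/envs/firefly/lib/python3.10/site-packages')

import torch, transformers
print(torch.__version__, transformers.__version__)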