LoRA: Low-Rank Adaptation of Large Language Models
Brief summary:
Detailed steps:
A is the down-projection (d → r) and B is the up-projection (r → d), so the update ΔW = BA has rank at most r.
Choosing the rank:
Initialization:
Why can't matrices A and B both be initialized to zero?
(If A and B were both zero, the gradient of each matrix would be proportional to the other, so both gradients would vanish and the adapter could never move off zero. Instead, A gets a random Gaussian init and B is set to zero: at the start of training the untrained LoRA module's output is unreliable, so starting B at zero makes the adapter's initial contribution to the overall model output exactly zero, analogous to ControlNet's zero-convolution setup. A sketch follows below.)
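A minimal sketch of one LoRA-wrapped linear layer in plain PyTorch (the class name and the 0.01 init scale are illustrative, not peft's actual implementation):

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen linear layer plus a trainable low-rank update W + (alpha/r) * B @ A."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False  # base weights stay frozen
        # A: down-projection, random Gaussian init; B: up-projection, zero init
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        # at step 0, B == 0, so the LoRA branch contributes nothing to the output
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scaling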
Aside: why does LoRA speed things up?
The number of trainable parameters typically drops to under 10% of the original, and often far less.
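A quick worked example: a single 4096 × 4096 projection holds 4096² ≈ 16.8M parameters, while a rank-8 adapter for that layer trains only 2 × 4096 × 8 = 65,536 parameters, roughly 0.4% of the layer.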
Model preparation
(1) Freeze the base parameters, since the original weights are not trained.
(2) Cast the output layer to higher precision, which makes the loss better behaved.
(3) Turn on enable_input_require_grads so that gradients can flow into the LoRA layers. (A hand-rolled sketch of all three steps follows this list.)
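These three steps are essentially what peft's prepare_model_for_kbit_training performs; a minimal hand-rolled sketch for intuition (prepare_manually is a hypothetical name):

import torch

def prepare_manually(model):
    # (1) freeze every base parameter
    for param in model.parameters():
        param.requires_grad = False
    # (2) run the output head in float32 so the loss is computed in higher precision
    if hasattr(model, "lm_head"):
        model.lm_head = model.lm_head.to(torch.float32)
    # (3) let gradients flow back through the frozen inputs, so the LoRA
    # layers still receive gradients (important with gradient checkpointing)
    model.enable_input_require_grads()
    return model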
LoRA configuration
(1) Decide which layers get LoRA adapters, chosen either manually or automatically:
1) If the model was loaded quantized, the 4-bit linear layers can be discovered automatically.
2) Or configure manually: pass the layer names in via the target_modules field of the config (see the sketch after this list).
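For the manual route, a sketch of a LoraConfig (the module names assume a LLaMA-style attention block and vary by architecture):

from peft import LoraConfig

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # hypothetical choice
    task_type="CAUSAL_LM",
)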
Training
Two sets of weights coexist during training: the original model weights, which stay frozen, and the LoRA weights, which are the only ones updated.
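peft makes this split easy to inspect; a quick check, assuming model has already been wrapped by get_peft_model:

# only the LoRA A/B matrices require grad; the frozen base does not
model.print_trainable_parameters()
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)  # expect names containing "lora_A" / "lora_B"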
Testing
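A minimal sketch of testing the trained adapter (paths are placeholders; generation settings are illustrative):

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("/path/to/base_model", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "/path/to/output_dir")  # loads only the LoRA weights
tokenizer = AutoTokenizer.from_pretrained("/path/to/base_model")

inputs = tokenizer("Introduce yourself briefly.", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))

For deployment, model.merge_and_unload() folds B·A back into the base weights so inference pays no extra latency.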
Pipeline overview
- Configuration + imports
import os
import torch
import transformers
import bitsandbytes as bnb
from dataclasses import dataclass, field
from transformers import Trainer, TrainingArguments, HfArgumentParser, set_seed
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
# local helpers: model loading and dataset preprocessing
from model import load_model
from dataset import belle_open_source_500k
### Define the fine-tuning configuration
@dataclass
class FinetuneArguments:
    model_name: str = field()
    model_path: str = field()
    data_name: str = field()
    data_path: str = field()
    train_size: int = field(default=-1)      # -1 means use the full training set
    test_size: int = field(default=200)
    max_len: int = field(default=1024)
    lora_rank: int = field(default=8)
    lora_modules: str = field(default=None)  # comma-separated names; None -> auto-detect
    quantization: str = field(default="4bit")
- A helper that finds the linear layers to adapt
def find_all_linear_names(model):
    # cls = bnb.nn.Linear8bitLt  # use this class instead when loading in 8-bit
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # lm_head is kept in 16-bit, so it gets no adapter
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
- main
def main():
    args, training_args = HfArgumentParser(
        (FinetuneArguments, TrainingArguments)
    ).parse_args_into_dataclasses()
    set_seed(training_args.seed)
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    print(f"world size {world_size} local rank {local_rank}")

    ####### prepare model ############
    model, tokenizer = load_model(args.model_name, args.model_path, args.quantization, local_rank)
    # freezes the base weights, upcasts norm/output layers, enables input grads
    model = prepare_model_for_kbit_training(model)
    modules = find_all_linear_names(model)
    target_modules = args.lora_modules.split(",") if args.lora_modules is not None else modules
    config = LoraConfig(
        r=args.lora_rank,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        target_modules=target_modules,
        task_type="CAUSAL_LM",
    )
    print(config)
    model = get_peft_model(model, config)

    ############# prepare data ###########
    # look up the dataset loader by name (belle_open_source_500k is imported above)
    data = eval(args.data_name)(args.data_path, tokenizer, args.max_len)
    if args.train_size > 0:
        data = data.shuffle(seed=training_args.seed).select(range(args.train_size))
    if args.test_size > 0:
        train_val = data.train_test_split(
            test_size=args.test_size, shuffle=True, seed=training_args.seed
        )
        train_data = train_val["train"].shuffle(seed=training_args.seed)
        val_data = train_val["test"].shuffle(seed=training_args.seed)
    else:
        train_data = data.shuffle(seed=training_args.seed)
        val_data = None

    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=training_args,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    trainer.train(resume_from_checkpoint=False)
    # saves only the LoRA adapter weights, not the base model
    model.save_pretrained(training_args.output_dir)

if __name__ == "__main__":
    main()
- Shell script
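A hypothetical launch command matching the arguments above (finetune.py, the paths, and the hyperparameters are all placeholders):

python finetune.py \
    --model_name llama --model_path /path/to/base_model \
    --data_name belle_open_source_500k --data_path /path/to/data.json \
    --train_size -1 --test_size 200 --max_len 1024 \
    --lora_rank 8 --quantization 4bit \
    --output_dir ./lora_out \
    --per_device_train_batch_size 4 --gradient_accumulation_steps 4 \
    --learning_rate 2e-4 --num_train_epochs 1 \
    --logging_steps 10 --save_strategy epoch --seed 42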