LLM Pretraining

Most of the pretraining code you find online is already wrapped in Trainer + DeepSpeed, so reading it tells you little about where the optimizations of the underlying techniques actually come from. This post starts from the most basic training loop and adds the pieces layer by layer, comparing along the way.


Basic version

1. Code
import torch
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load model and tokenizer
model_path = "xxx/glm2"  # your model path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
                                  torch_dtype=torch.float32, device_map="auto")

# For testing, train only half of the parameters: freeze every even-numbered layer
no_grad_layer = [str(i) for i in range(0, 27, 2)]
for k, v in model.named_parameters():
    name_list = k.split(".")
    if len(name_list) >= 4 and name_list[3] in no_grad_layer:
        v.requires_grad = False

# dataset
example = ["text1", "text2", "text3"]
class GenerateDataset(Dataset):
    def __init__(self, example):
        super(GenerateDataset, self).__init__()
        self.example = example

    def __getitem__(self, item):
        return self.example[item]

    def __len__(self):
        return len(self.example)

ds = GenerateDataset(example)

# dataloader + collate_fn
def pretrain_fn(context):
    input_ids = []
    label_ids = []
    max_len = 2000
    # For plain causal-LM pretraining, inputs and labels are the same token sequence
    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    target_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    for c_ids, t_ids in zip(context_ids, target_ids):
        length = len(c_ids)
        if length >= max_len:
            # truncate and end with EOS (token id 2 for this tokenizer)
            c_ids = c_ids[:max_len - 1] + [2]
            t_ids = t_ids[:max_len - 1] + [2]
        else:
            # pad inputs with EOS; mask padded label positions with -100 so they are ignored in the loss
            c_ids = c_ids + [2] * (max_len - length)
            t_ids = t_ids + [-100] * (max_len - length)
        input_ids.append(c_ids)
        label_ids.append(t_ids)
    return {"input_ids": torch.LongTensor(input_ids), "labels": torch.LongTensor(label_ids)}

dl = DataLoader(ds, batch_size=1, collate_fn=pretrain_fn)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# training loop
for batch in tqdm(dl):
    optimizer.zero_grad()
    out = model(input_ids=batch["input_ids"], labels=batch["labels"])
    loss = out.loss
    print(loss)
    loss.backward()
    optimizer.step()
    # print(loss.item())
2. A few things to note:

1. torch_dtype=torch.float32: if the model is loaded in fp16 instead, the parameters and the loss turn into NaN right away and training breaks; with Accelerate or Trainer you get ValueError: Attempting to unscale FP16 gradients. (a manual mixed-precision sketch follows the log below).

  0%|                                                  | 0/1615 [00:00<?, ?it/s]tensor(11.4844, dtype=torch.float16, grad_fn=<ToCopyBackward0>)
  0%|                                        | 1/1615 [00:04<1:54:59,  4.27s/it]tensor(nan, dtype=torch.float16, grad_fn=<ToCopyBackward0>)
  0%|                                          | 2/1615 [00:04<50:53,  1.89s/it]tensor(nan, dtype=torch.float16, grad_fn=<ToCopyBackward0>)
ValueError: Attempting to unscale FP16 gradients.
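
If you want fp16 speed without Accelerate or Trainer, the usual pattern is PyTorch AMP: keep the master weights in fp32 and only autocast the forward pass, so the GradScaler unscales fp32 gradients and the error above does not occur. A minimal sketch, assuming the model (loaded in fp32 as above) and the dataloader from the basic version:

import torch
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for batch in dl:
    optimizer.zero_grad()
    with autocast(dtype=torch.float16):      # fp16 forward and loss computation
        out = model(input_ids=batch["input_ids"], labels=batch["labels"])
    scaler.scale(out.loss).backward()        # backward on the scaled loss
    scaler.step(optimizer)                   # unscales grads, skips the step on inf/nan
    scaler.update()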

2. Optimizer SGD vs. AdamW: for full-parameter training of a 6B model, AdamW runs out of GPU memory (even with batch_size=1 and text length 20) while SGD does not. In practice LLMs are rarely trained with SGD; it is mentioned here only for the memory comparison, with a rough estimate of the optimizer-state footprint below.
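
The gap comes from optimizer state: Adam/AdamW keep two extra fp32 buffers (momentum and variance) per parameter, roughly 8 extra bytes per parameter on top of the weights and gradients, while vanilla SGD keeps none. A back-of-the-envelope sketch (illustrative numbers only):

# Rough optimizer-state footprint for a ~6B-parameter model (fp32 states assumed)
n_params = 6e9
adamw_state_gib = n_params * 8 / 1024**3   # exp_avg + exp_avg_sq, 4 bytes each
sgd_state_gib = 0                          # vanilla SGD (no momentum) keeps no per-param state
print(f"AdamW extra state ~= {adamw_state_gib:.0f} GiB, SGD extra state = {sgd_state_gib} GiB")

# Swapping the optimizer in the basic loop is a one-liner:
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)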

Accelerate version

# After loading the model, optimizer, and dataloader, add the following
from accelerate import Accelerator
accelerator = Accelerator(mixed_precision="fp16")
device = accelerator.device
model, optimizer, dl = accelerator.prepare(model, optimizer, dl)
# Replace the original loss.backward() with accelerator.backward(loss)
# loss.backward()
accelerator.backward(loss)
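
Put together, the loop itself barely changes. A minimal end-to-end sketch, assuming the model, optimizer, dataset ds and collator pretrain_fn from the basic version (the model should still be loaded in fp32; Accelerate handles the fp16 casting and loss scaling):

from accelerate import Accelerator
from torch.utils.data import DataLoader
from tqdm import tqdm

accelerator = Accelerator(mixed_precision="fp16")
dl = DataLoader(ds, batch_size=1, collate_fn=pretrain_fn)
model, optimizer, dl = accelerator.prepare(model, optimizer, dl)

model.train()
for batch in tqdm(dl):
    optimizer.zero_grad()
    out = model(input_ids=batch["input_ids"], labels=batch["labels"])
    accelerator.backward(out.loss)   # replaces loss.backward(); handles grad scaling
    optimizer.step()

Launching the script with accelerate launch also gives multi-GPU data parallelism without further code changes.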

Trainer version

The original dataloader, optimizer, and training loop are no longer needed:

train_args = TrainingArguments(save_strategy="epoch",
                               log_level="debug",
                               output_dir="./saved_checkpoint/",
                               per_device_train_batch_size=4,  
                               gradient_accumulation_steps=1,  # 梯度累积步数
                               num_train_epochs=1,
                               learning_rate=1e-5,
                               fp16=True, # 是否使用fp16
                               logging_steps=1,
                               warmup_steps=50,
                               optim="adamw_hf",  # 指定优化器
                               gradient_checkpointing=True, # 梯度检查点
                               )
trainer = Trainer(model=model,train_dataset=ds,args=train_args, data_collator=pretrain_fn)
trainer.train()

Note: if gradient_checkpointing=True, you also need to add the code below; otherwise, with the embeddings frozen (as in the half-parameter test above), the checkpointed blocks receive inputs that do not require grad and backward produces no gradients.

model.enable_input_require_grads()
# or directly set requires_grad on the model's embedding weights
for k, v in model.named_parameters():
    if k == "transformer.embedding.word_embeddings.weight":
        v.requires_grad = True
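
Either way, the underlying issue is that gradient checkpointing only recomputes a block's forward during backward if that block's input requires grad. A quick, purely illustrative sanity check for the second approach:

# Illustrative check: the embedding weight should now appear among the trainable parameters
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable), "trainable tensors; embeddings trainable:",
      any("word_embeddings" in n for n in trainable))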

Trainer + DeepSpeed

Modify train_args; everything else stays the same:

train_args = TrainingArguments(save_strategy="steps",
                               save_steps=40,
                               log_level="debug",
                               output_dir="./test_lora/",
                               per_device_train_batch_size=2,
                               gradient_accumulation_steps=2,
                               num_train_epochs=10,
                               learning_rate=1e-6,
                               fp16=True,
                               logging_steps=1,
                               warmup_steps=10,
                               optim="adamw_hf",
                               gradient_checkpointing=True,
                               deepspeed="./notebooks/deepspeed-stage2.json" # 加入ds路径
                               )

Stage-2 config

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 20,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

Stage-3 config

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 20,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
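
Before deciding between stage 2 (optimizer state offload only) and stage 3 (parameters offloaded as well), DeepSpeed ships memory estimators that print how the model states would be split across GPU and CPU. A sketch, assuming the glm2 model loaded earlier and a single-GPU node (module paths as in recent DeepSpeed releases; check your installed version):

from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

# Prints per-GPU / per-CPU memory needs for the model states under each ZeRO stage
estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)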

Launch command

deepspeed pretrain.py --deepspeed ./notebooks/deepspeed-stage2.json
