1--参考文档
accelerate+deepspeed多机多卡训练-适用集群环境
2--安装过程
# 安装accelerate
pip install accelerate
pip install importlib-metadata
# 获取默认配置文件
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"
# 默认保存地址
# /home/liujinfu/.cache/huggingface/accelerate/default_config.yaml
# 查看配好的环境
accelerate env
# 查看环境是否配好
accelerate test
3--测试代码
# 加载库
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from accelerate import Accelerator, DeepSpeedPlugin
# 定义测试网络
class TestNet(nn.Module):
    """Minimal two-layer MLP used to exercise the accelerate/deepspeed setup.

    Architecture: Linear(input_dim -> output_dim) -> ReLU -> Linear(output_dim -> output_dim).
    """

    def __init__(self, input_dim: int, output_dim: int):
        super(TestNet, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.fc2 = nn.Linear(in_features=output_dim, out_features=output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run a forward pass; x has shape (batch, input_dim), returns (batch, output_dim)."""
        x = torch.relu(self.fc1(x))
        # BUG FIX: original called torch.fc2(x), which raises AttributeError at
        # runtime — the second layer is an attribute of the module, not of torch.
        x = self.fc2(x)
        return x
if __name__ == "__main__":
    input_dim = 8
    output_dim = 64
    batch_size = 8
    dataset_size = 1000

    # Generate random training data (inputs and regression targets).
    input_data = torch.randn(dataset_size, input_dim)
    labels = torch.randn(dataset_size, output_dim)

    # Wrap the tensors into a dataset / dataloader.
    dataset = TensorDataset(input_data, labels)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size)

    # Build the model.
    model = TestNet(input_dim=input_dim, output_dim=output_dim)

    # DeepSpeed configuration: ZeRO stage 2 with gradient clipping.
    deepspeed = DeepSpeedPlugin(zero_stage=2, gradient_clipping=1.0)
    accelerator = Accelerator(deepspeed_plugin=deepspeed)

    # Optimizer and loss.
    optimizator = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.MSELoss()

    # Let accelerate wrap model/optimizer/dataloader for distributed training.
    model, optimizator, dataloader = accelerator.prepare(model, optimizator, dataloader)

    # Training loop.
    for epoch in range(10):
        model.train()
        for batch in dataloader:
            inputs, targets = batch
            # Clear gradients before the backward pass.
            optimizator.zero_grad()
            outputs = model(inputs)
            loss = loss_func(outputs, targets)
            # Core change vs plain PyTorch: accelerator handles the backward
            # pass so DeepSpeed/mixed-precision scaling is applied correctly.
            accelerator.backward(loss)
            optimizator.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

    # Save the model: wait for all processes, then unwrap the prepared model.
    # BUG FIX: after accelerator.prepare() the model is wrapped (DeepSpeed
    # engine / DDP), so saving model.state_dict() directly would persist the
    # wrapper's state dict with prefixed keys; unwrap_model recovers the
    # original TestNet so the checkpoint loads cleanly in plain PyTorch.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    accelerator.save(unwrapped_model.state_dict(), "test_model.pth")
4--代码运行
未完待续!