【torch: testing that, in data-parallel training across multiple GPUs, the per-GPU minibatch gradients add up to the gradient of the whole batch】
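
Before the full test script, here is a minimal, self-contained sketch (added for illustration; the tensor shapes and names below are made up) of the claim in the title: when the loss is a sum over examples, gradients accumulated from disjoint shards of a batch add up to the gradient of the whole batch.

import torch

w = torch.randn(5, 1, requires_grad=True)
X_full = torch.randn(8, 5)
y_full = torch.randn(8, 1)

# Gradient of the summed loss over the whole batch
((X_full @ w - y_full) ** 2).sum().backward()
full_grad = w.grad.clone()
w.grad.zero_()

# backward() accumulates, so after two half-batches w.grad holds the sum of the shard gradients
for X_shard, y_shard in zip(X_full.split(4), y_full.split(4)):
    ((X_shard @ w - y_shard) ** 2).sum().backward()

print(torch.allclose(full_grad, w.grad))  # True, up to floating-point error

The full multi-GPU test script follows.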

import torch
from torch import nn
from torch.nn import functional as F

# Initialize the model parameters
scale = 0.01
W1 = torch.randn(size=(20, 1, 3, 3)) * scale
b1 = torch.zeros(20)
W2 = torch.randn(size=(50, 20, 5, 5)) * scale
b2 = torch.zeros(50)
W3 = torch.randn(size=(800, 128)) * scale
b3 = torch.zeros(128)
W4 = torch.randn(size=(128, 10)) * scale
b4 = torch.zeros(10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]

# Define the model
def lenet(X, params):
    h1_conv = F.conv2d(input=X, weight=params[0], bias=params[1])
    h1_activation = F.relu(h1_conv)
    h1 = F.avg_pool2d(input=h1_activation, kernel_size=(2, 2), stride=(2, 2))
    h2_conv = F.conv2d(input=h1, weight=params[2], bias=params[3])
    h2_activation = F.relu(h2_conv)
    h2 = F.avg_pool2d(input=h2_activation, kernel_size=(2, 2), stride=(2, 2))
    h2 = h2.reshape(h2.shape[0], -1)
    h3_linear = torch.mm(h2, params[4]) + params[5]
    h3 = F.relu(h3_linear)
    y_hat = torch.mm(h3, params[6]) + params[7]
    return y_hat


# Cross-entropy loss function
loss = nn.CrossEntropyLoss(reduction="none")


def get_params(params, device):
    new_params = [p.to(device) for p in params]
    for p in new_params:
        p.requires_grad_()
    return new_params


def try_gpu(i=0):
    """如果存在,则返回gpu(i),否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f"cuda:{i}")
    return torch.device("cpu")


# new_params = get_params(params, try_gpu(0))
# params_cpu = get_params(params.copy(), try_gpu(10))
# print('b1 weight:', new_params[1])
# print('b1 gradient:', new_params[1].grad)


def allreduce(data):
    """Sum the tensors in data into data[0], then broadcast the sum back to every tensor"""
    for i in range(1, len(data)):
        data[0][:] += data[i].to(data[0].device)
    for i in range(1, len(data)):
        data[i][:] = data[0].to(data[i].device)
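
# A quick sanity check of allreduce, added for illustration (the toy tensors below
# are not part of the original post): the first loop sums every tensor into data[0],
# the second copies that sum back, so afterwards every entry holds the same
# elementwise sum.
_allreduce_demo = [torch.ones(3), torch.full((3,), 2.0)]
allreduce(_allreduce_demo)
assert all(torch.allclose(t, torch.full((3,), 3.0)) for t in _allreduce_demo)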


def split_batch(X, y, devices):
    """将X和y拆分到多个设备上"""
    assert X.shape[0] == y.shape[0]
    return (nn.parallel.scatter(X, devices), nn.parallel.scatter(y, devices))


def sgd(params, lr, batch_size):
    """小批量随机梯度下降"""
    # 在该模块下,所有计算得出的tensor的requires_grad都自动设置为False。
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

import torchvision
from torchvision import transforms
from torch.utils import data
import os

script_dir = os.path.dirname(os.path.abspath(__file__))


def load_data_fashion_mnist(batch_size, resize=None):
    """下载Fashion-MNIST数据集,然后将其加载到内存中"""
    # transforms.ToTensor()函数的作用是将原始的PILImage格式或者numpy.array格式的数据格式化为可被pytorch快速处理的张量类型。
    # https://blog.csdn.net/qq_38410428/article/details/94719553
    trans = [transforms.ToTensor()]  # 实例化
    if resize:
        trans.insert(0, transforms.Resize(resize))
    # For example, to first resize an image and then convert it to a tensor, we could apply the steps one at a time, but that quickly becomes tedious.
    # Instead we use Compose: at construction time we pass in a list of instantiated transforms.
    # Note that the output of each step must be a valid input for the next one; here Resize still returns a PIL image, so ToTensor can be applied afterwards.
    trans = transforms.Compose(trans)
    # script_dir (defined above) is the absolute path of this script's directory

    # Build the path to the data directory (two levels above this script)
    data_dir = os.path.join(script_dir, "..", "..", "data")

    mnist_train = torchvision.datasets.FashionMNIST(
        root=data_dir, train=True, transform=trans, download=True
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root=data_dir, train=False, transform=trans, download=True
    )
    return (
        data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=2),
        data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=2),
    )

import matplotlib
matplotlib.use("Agg")  # 这一句一定要放在下面这句的前面
from matplotlib import pyplot as plt

def use_svg_display():
    """使用svg格式在Jupyter中显示绘图"""
    #可以试试加上这个代码,%config InlineBackend.figure_format = 'svg'
    # backend_inline.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):
    """设置matplotlib的图表大小"""
    use_svg_display()
    plt.rcParams['figure.figsize'] = figsize

def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """设置matplotlib的轴"""
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()

# Using the three figure-configuration functions above, define a plot function to draw multiple curves concisely, since we need to visualize many curves.
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """绘制数据点"""
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else plt.gca()

    # Return True if X has only one axis
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)
                and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

class Animator:
    """在动画中绘制数据"""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(7, 5)):
        # Incrementally draw multiple lines
        if legend is None:
            legend = []
        use_svg_display()
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Use a lambda to capture the axis-configuration arguments
        self.config_axes = lambda: set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        plt.show()

class Accumulator:
    """在n个变量上累加"""

    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def accuracy(y_hat, y):
    """计算预测正确的数量"""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)  # 获得每行中最大元素的索引来获得预测类别
    cmp = y_hat.type(y.dtype) == y  #
    return float(cmp.type(y.dtype).sum())  # 返回预测正确的个数

import time
import numpy as np

class Timer:
    """记录多次运行时间"""

    def __init__(self):
        self.times = []
        self.lastTimeSum = 0
        self.start()

    def start(self):
        """启动计时器"""
        self.tik = time.time()

    def stop(self):
        """停止计时器并将时间记录在列表中"""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """返回平均时间"""
        return sum(self.times) / len(self.times)
    
    def sum(self):
        """返回时间总和"""
        self.lastTimeSum = sum(self.times)
        return self.lastTimeSum

    def cumsum(self):
        """返回累计时间"""
        return np.array(self.times).cumsum().tolist()

def evaluate_accuracy_gpu(net, data_iter, device=None):
    """使用GPU计算模型在数据集上的精度"""
    if isinstance(net, torch.nn.Module):
        net.eval()  # 设置为评估模式,关闭Dropout和直接结算所有batch的均值和方差
        if not device:
            # 使用参数来构建一个虚拟的计算图,然后从计算图中获取一个参数张量,然后通过 .device 属性获取这个参数张量所在的设备。这个参数张量位于模型的第一个参数(通常是一个权重矩阵)。
            device = next(iter(net.parameters())).device
    # 正确预测的数量,总预测的数量
    metric = Accumulator(2)
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                # needed for BERT fine-tuning
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]


def train_batch(X, y, device_params, devices, lr):
    X_shards, y_shards = split_batch(X, y, devices)
    # Compute the loss separately on each GPU
    ls = [
        loss(lenet(X_shard, device_W), y_shard).sum()
        for X_shard, y_shard, device_W in zip(X_shards, y_shards, device_params)
    ]
    # l_cpu = loss(lenet(X, params_cpu), y).sum()
    # l_cpu.backward()
    # print(f"whole-batch gradients ", [param.grad for param in params_cpu])
    # Backpropagation is performed separately on each GPU
    for l in ls:
        print(l)
        l.backward()
    # Sum the gradients from every GPU and broadcast the result to all GPUs
    with torch.no_grad():
        for i in range(len(device_params[0])):
            allreduce([device_params[c][i].grad for c in range(len(devices))])
    # Update the model parameters on each GPU separately
    for params in device_params:
        # Here we use the full-size minibatch
        sgd(params, lr, X.shape[0])
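
# Sketch, added for illustration (not in the original post), of one way to check
# the claim in the title against this model: rebuild the parameters of GPU 0 as
# fresh CPU leaf tensors, compute the whole-batch gradient on that copy, and
# compare it with the per-GPU gradients that allreduce summed onto device 0.
# The helper name verify_full_batch_grad is hypothetical; it would have to be
# called from inside train_batch, after allreduce but before sgd, so that the
# parameter values and the summed gradients still correspond to each other.
def verify_full_batch_grad(X, y, device_params):
    params_cpu = [p.detach().cpu().clone().requires_grad_() for p in device_params[0]]
    loss(lenet(X, params_cpu), y).sum().backward()
    for p_cpu, p_gpu in zip(params_cpu, device_params[0]):
        # Loose tolerances absorb CPU/GPU floating-point differences
        print(torch.allclose(p_cpu.grad, p_gpu.grad.cpu(), rtol=1e-3, atol=1e-5))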


def train(num_gpus, batch_size, lr):
    train_iter, test_iter = load_data_fashion_mnist(batch_size)
    devices = [try_gpu(i) for i in range(num_gpus)]
    # Copy the model parameters to num_gpus GPUs
    device_params = [get_params(params, d) for d in devices]
    num_epochs = 10
    animator = Animator("epoch", "test acc", xlim=[1, num_epochs])
    timer = Timer()
    for epoch in range(num_epochs):
        timer.start()
        for X, y in train_iter:
            # Perform multi-GPU training for a single minibatch
            train_batch(X, y, device_params, devices, lr)
            # Synchronize so that all queued CUDA work finishes before moving on to the next step
            torch.cuda.synchronize()
            break
        timer.stop()
        # Evaluate the model on GPU 0
        animator.add(
            epoch + 1,
            (
                evaluate_accuracy_gpu(
                    lambda x: lenet(x, device_params[0]), test_iter, devices[0]
                ),
            ),
        )
        break
    print(f"测试精度: {animator.Y[0][-1]:.2f}, {timer.avg():.1f}秒/轮, 在{str(devices)}")


# train(num_gpus=1, batch_size=256, lr=0.2)
# plt.savefig(script_dir + f"/TrainChart1.png")

train(num_gpus=2, batch_size=256, lr=0.2)
plt.savefig(script_dir + f"/TrainChart1.png")

After uncommenting params_cpu = get_params(params.copy(), try_gpu(10))

the following error is reported:

/home/qlf/anaconda3/envs/d2l/lib/python3.9/site-packages/torch/_tensor.py:1083: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at  aten/src/ATen/core/TensorBody.h:482.)
  return self._grad
Traceback (most recent call last):
  File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 342, in <module>
    train(num_gpus=2, batch_size=256, lr=0.2)
  File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 321, in train
    train_batch(X, y, device_params, devices, lr)
  File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 302, in train_batch
    allreduce([device_params[c][i].grad for c in range(len(devices))])
  File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 58, in allreduce
    data[0][:] += data[i].to(data[0].device)
TypeError: 'NoneType' object is not subscriptable
