import torch
from torch import nn
from torch.nn import functional as F
# Initialize the model parameters.
scale = 0.01  # factor applied to every randomly drawn weight tensor

# Convolution kernels: (out_channels, in_channels, kernel_h, kernel_w),
# as consumed by F.conv2d in lenet().
W1 = scale * torch.randn(20, 1, 3, 3)
b1 = torch.zeros(20)
W2 = scale * torch.randn(50, 20, 5, 5)
b2 = torch.zeros(50)

# Dense weights: (in_features, out_features), because lenet() computes
# torch.mm(activations, W) + b.
W3 = scale * torch.randn(800, 128)
b3 = torch.zeros(128)
W4 = scale * torch.randn(128, 10)
b4 = torch.zeros(10)

# Flat parameter list; lenet() indexes it positionally.
params = [W1, b1, W2, b2, W3, b3, W4, b4]
# Model definition.
def lenet(X, params):
    """Run a LeNet-style forward pass using explicitly supplied parameters.

    Args:
        X: Input batch passed to ``F.conv2d`` together with ``params[0]``.
        params: Sequence indexed as ``[W1, b1, W2, b2, W3, b3, W4, b4]``:
            two convolution kernel/bias pairs followed by two dense
            weight/bias pairs.

    Returns:
        The output of the last dense layer (no softmax is applied).
    """
    features = X
    # Two identical stages: convolution -> ReLU -> 2x2 average pooling.
    for w_idx in (0, 2):
        conv_out = F.conv2d(features, params[w_idx], params[w_idx + 1])
        features = F.avg_pool2d(F.relu(conv_out), kernel_size=2, stride=2)

    # Flatten everything except the batch dimension for the dense layers.
    flat = features.reshape(features.shape[0], -1)
    hidden = F.relu(torch.mm(flat, params[4]) + params[5])
    return torch.mm(hidden, params[6]) + params[7]
# Cross-entropy loss. reduction="none" keeps one loss value per sample
# instead of reducing over the batch.
loss = nn.CrossEntropyLoss(reduction="none")
def get_params(params, device):
    """Return independent, trainable copies of ``params`` placed on ``device``.

    Each returned tensor is a fresh leaf tensor with ``requires_grad=True``,
    so ``.grad`` is populated by ``backward()``. The tensors in ``params``
    are never modified.

    Args:
        params: Iterable of tensors to replicate.
        device: Target device for the copies.

    Returns:
        A new list with one copied tensor per entry of ``params``.
    """
    # Tensor.to() returns the *same* tensor when it already lives on
    # ``device``; calling requires_grad_() on that would flip the flag on the
    # caller's tensors, and any later copy of them to another device would be
    # a non-leaf tensor whose .grad stays None. detach() drops any autograd
    # history and copy=True forces a real copy, so every replica is an
    # independent leaf.
    return [
        p.detach().to(device, copy=True).requires_grad_() for p in params
    ]
def try_gpu(i=0):
    """Return device ``cuda:i`` if it exists, otherwise the CPU device."""
    gpu_exists = i + 1 <= torch.cuda.device_count()
    return torch.device(f"cuda:{i}" if gpu_exists else "cpu")
# new_params = get_params(params, try_gpu(0))
# params_cpu = get_params(params.copy(), try_gpu(10))
# print('b1 权重:', new_params[1])
# print('b1 梯度:', new_params[1].grad)
def allreduce(data):
    """Sum all tensors in ``data`` and write the total back into each, in place.

    Args:
        data: Sequence of tensors, possibly on different devices. Each one
            is overwritten with the element-wise sum of all of them.
    """
    # Reduce: accumulate every other tensor into the first one, moving each
    # to the first tensor's device beforehand.
    for shard in data[1:]:
        data[0][:] += shard.to(data[0].device)
    # Broadcast: copy the total back into every other tensor on its own device.
    for shard in data[1:]:
        shard[:] = data[0].to(shard.device)
def split_batch(X, y, devices):
    """Scatter ``X`` and ``y`` across ``devices``.

    Args:
        X: Feature tensor; its first dimension must match that of ``y``.
        y: Label tensor.
        devices: Target devices handed to ``nn.parallel.scatter``.

    Returns:
        A ``(X_shards, y_shards)`` tuple as produced by
        ``nn.parallel.scatter`` for each input.
    """
    assert X.shape[0] == y.shape[0]
    X_shards = nn.parallel.scatter(X, devices)
    y_shards = nn.parallel.scatter(y, devices)
    return (X_shards, y_shards)
def sgd(params, lr, batch_size):
    """Apply one minibatch stochastic gradient descent step in place.

    Args:
        params: Iterable of tensors whose ``.grad`` has been populated.
        lr: Learning rate.
        batch_size: Divisor applied to each gradient before the update.
    """
    # The update itself must not be tracked by autograd.
    with torch.no_grad():
        for weight in params:
            grad = weight.grad
            weight.sub_(lr * grad / batch_size)
            # Clear the gradient so the next backward pass starts from zero.
            grad.zero_()
import torchvision
from torchvision import transforms
from torch.utils import data
import os
# Absolute path of the directory that contains this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
def load_data_fashion_mnist(batch_size, resize=None):
    """Download Fashion-MNIST if necessary and return train/test data loaders.

    Args:
        batch_size: Batch size used by both loaders.
        resize: Optional size passed to ``transforms.Resize``; when falsy,
            images are not resized.

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles; both use two worker processes.
    """
    # ToTensor converts PIL images / numpy arrays into tensors. When a resize
    # is requested it is placed in front, so it is applied before conversion.
    # Background: https://blog.csdn.net/qq_38410428/article/details/94719553
    steps = [transforms.ToTensor()]
    if resize:
        steps = [transforms.Resize(resize)] + steps
    # Compose chains the steps; each step's output must be a valid input for
    # the next one.
    pipeline = transforms.Compose(steps)

    # The dataset is stored in a "data" directory two levels above this script.
    data_dir = os.path.join(script_dir, "..", "..", "data")

    mnist_train, mnist_test = (
        torchvision.datasets.FashionMNIST(
            root=data_dir, train=is_train, transform=pipeline, download=True
        )
        for is_train in (True, False)
    )
    train_loader = data.DataLoader(
        mnist_train, batch_size, shuffle=True, num_workers=2
    )
    test_loader = data.DataLoader(
        mnist_test, batch_size, shuffle=False, num_workers=2
    )
    return (train_loader, test_loader)
import matplotlib
matplotlib.use("Agg")  # select the Agg backend; must run before pyplot is imported below
from matplotlib import pyplot as plt
def use_svg_display():
    """Intended to make Jupyter render plots as SVG; currently a no-op.

    The backend call below is commented out, so calling this function
    changes nothing.
    """
    # In a notebook you can try: %config InlineBackend.figure_format = 'svg'
    # backend_inline.set_matplotlib_formats('svg')
def set_figsize(figsize=(3.5, 2.5)):
    """Set matplotlib's default figure size.

    Args:
        figsize: Value stored in ``plt.rcParams['figure.figsize']``.
    """
    use_svg_display()
    plt.rcParams['figure.figsize'] = figsize
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Configure labels, scales, limits, legend and grid of a matplotlib axes.

    Args:
        axes: The axes object to configure.
        xlabel, ylabel: Axis labels.
        xlim, ylim: Axis limits.
        xscale, yscale: Axis scales.
        legend: Legend entries; no legend is drawn when this is falsy.
    """
    # Applied in this exact order: labels, then scales, then limits.
    settings = (
        ("set_xlabel", xlabel),
        ("set_ylabel", ylabel),
        ("set_xscale", xscale),
        ("set_yscale", yscale),
        ("set_xlim", xlim),
        ("set_ylim", ylim),
    )
    for setter_name, value in settings:
        getattr(axes, setter_name)(value)
    if legend:
        axes.legend(legend)
    axes.grid()
# Built on the three figure-configuration helpers above, plot() draws several
# curves with a single call.
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot one or more curves on a matplotlib axes.

    Args:
        X: x values, or the y values when ``Y`` is None. Either a single
            one-dimensional sequence or a collection of them.
        Y: Optional y values, a single sequence or a collection of them.
        xlabel, ylabel, legend, xlim, ylim, xscale, yscale: Forwarded to
            ``set_axes``.
        fmts: Format strings, one per curve.
        figsize: Forwarded to ``set_figsize``.
        axes: Axes to draw on; ``plt.gca()`` is used when this is falsy.
    """
    def has_one_axis(seq):
        # True for 1-D array-likes and for lists whose first item has no
        # length (i.e. a flat list of scalars).
        if hasattr(seq, "ndim") and seq.ndim == 1:
            return True
        return isinstance(seq, list) and not hasattr(seq[0], "__len__")

    legend = [] if legend is None else legend
    set_figsize(figsize)
    if not axes:
        axes = plt.gca()

    # Normalize X and Y into equally long collections of curves.
    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only one argument was given: treat it as the y values.
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)

    axes.cla()
    for xs, ys, fmt in zip(X, Y, fmts):
        # Without x values, let matplotlib plot against the index.
        plot_args = (xs, ys, fmt) if len(xs) else (ys, fmt)
        axes.plot(*plot_args)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
class Animator:
    """Incrementally plot data points, redrawing the figure on every update."""

    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(7, 5)):
        """Create the figure and remember how its first axes is configured.

        Args:
            xlabel, ylabel, legend, xlim, ylim, xscale, yscale: Forwarded to
                ``set_axes`` each time the plot is redrawn.
            fmts: Format strings, one per curve.
            nrows, ncols, figsize: Forwarded to ``plt.subplots``.
        """
        legend = [] if legend is None else legend
        use_svg_display()
        fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
        self.fig = fig
        # With a single subplot, wrap the axes so self.axes[0] always works.
        self.axes = [axes] if nrows * ncols == 1 else axes

        def configure():
            # Closure capturing the axis settings given to the constructor.
            return set_axes(
                self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale,
                legend)

        self.config_axes = configure
        self.X = None
        self.Y = None
        self.fmts = fmts

    def add(self, x, y):
        """Append one data point per curve and redraw all curves.

        Args:
            x: A single x value shared by all curves, or one value per curve.
            y: A single y value, or one value per curve.
        """
        if not hasattr(y, "__len__"):
            y = [y]
        num_curves = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * num_curves
        # Lazily create one history list per curve.
        if not self.X:
            self.X = [[] for _ in range(num_curves)]
        if not self.Y:
            self.Y = [[] for _ in range(num_curves)]
        for idx, (x_val, y_val) in enumerate(zip(x, y)):
            if x_val is None or y_val is None:
                continue
            self.X[idx].append(x_val)
            self.Y[idx].append(y_val)
        canvas = self.axes[0]
        canvas.cla()
        for xs, ys, fmt in zip(self.X, self.Y, self.fmts):
            canvas.plot(xs, ys, fmt)
        self.config_axes()
        plt.show()
class Accumulator:
    """Keep running sums over ``n`` variables."""

    def __init__(self, n):
        """Start with ``n`` sums, all equal to 0.0."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument, converted with ``float``, to the matching sum."""
        updated = []
        for running_total, increment in zip(self.data, args):
            updated.append(running_total + float(increment))
        self.data = updated

    def reset(self):
        """Set every sum back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the sum stored at position ``idx``."""
        return self.data[idx]
def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    Args:
        y_hat: Either predicted labels, or a matrix with one row of class
            scores per sample (more than one column).
        y: Ground-truth labels.
    """
    is_score_matrix = y_hat.ndim > 1 and y_hat.shape[1] > 1
    # For score matrices the predicted class is the arg-max of each row.
    predictions = y_hat.argmax(dim=1) if is_score_matrix else y_hat
    matches = predictions.to(y.dtype) == y
    return float(matches.to(y.dtype).sum())
import time
import numpy as np
class Timer:
    """Record the durations of multiple timed runs."""

    def __init__(self):
        """Create an empty record and start timing immediately."""
        self.times = []
        self.lastTimeSum = 0
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer, record the elapsed time and return it."""
        elapsed = time.time() - self.tik
        self.times.append(elapsed)
        return elapsed

    def avg(self):
        """Return the average of the recorded times."""
        total = sum(self.times)
        return total / len(self.times)

    def sum(self):
        """Return the total recorded time, also cached in ``lastTimeSum``."""
        total = sum(self.times)
        self.lastTimeSum = total
        return total

    def cumsum(self):
        """Return the running totals of the recorded times as a list."""
        return np.cumsum(self.times).tolist()
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of ``net`` over a dataset on the given device.

    Args:
        net: A ``torch.nn.Module`` or any callable mapping a batch to
            predictions.
        data_iter: Iterable yielding ``(X, y)`` batches.
        device: Device the batches are moved to. When falsy and ``net`` is a
            module, the device of the module's first parameter is used.

    Returns:
        The fraction of correctly predicted samples.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()  # switch the module to evaluation mode
        if not device:
            # Use the device holding the module's first parameter.
            device = next(net.parameters()).device

    # metric[0]: number of correct predictions, metric[1]: number of predictions.
    metric = Accumulator(2)
    with torch.no_grad():
        for inputs, labels in data_iter:
            if isinstance(inputs, list):
                # List-valued inputs (needed e.g. for BERT fine-tuning).
                inputs = [part.to(device) for part in inputs]
            else:
                inputs = inputs.to(device)
            labels = labels.to(device)
            metric.add(accuracy(net(inputs), labels), labels.numel())
    num_correct, num_total = metric[0], metric[1]
    return num_correct / num_total
def train_batch(X, y, device_params, devices, lr):
    """Run one data-parallel training step on a single minibatch.

    Args:
        X: Features of the full minibatch.
        y: Labels of the full minibatch.
        device_params: One parameter list per device.
        devices: Devices the minibatch is split across.
        lr: Learning rate.
    """
    X_shards, y_shards = split_batch(X, y, devices)

    # Compute the summed loss of every shard with that device's parameters.
    shard_losses = []
    for X_shard, y_shard, shard_params in zip(X_shards, y_shards, device_params):
        shard_losses.append(loss(lenet(X_shard, shard_params), y_shard).sum())

    # Debug aid: compare against the gradient of the whole batch on the CPU.
    # l_cpu = loss(lenet(X, params_cpu), y).sum()
    # l_cpu.backward()
    # print(f"大批量的梯度 ", [param.grad for param in params_cpu])

    # Back-propagate each shard's loss separately.
    for shard_loss in shard_losses:
        print(shard_loss)
        shard_loss.backward()

    # Sum the gradients of every parameter across devices and broadcast the
    # result back to all of them.
    with torch.no_grad():
        num_devices = len(devices)
        for i in range(len(device_params[0])):
            allreduce([device_params[c][i].grad for c in range(num_devices)])

    # Update each device's parameters; gradients were summed over the whole
    # minibatch, so the full batch size is used here.
    for replica in device_params:
        sgd(replica, lr, X.shape[0])
def train(num_gpus, batch_size, lr):
    """Train LeNet on Fashion-MNIST with data parallelism and report accuracy.

    Args:
        num_gpus: Number of devices requested via ``try_gpu``.
        batch_size: Minibatch size.
        lr: Learning rate.

    NOTE(review): both loops below end with ``break``, so only the first
    minibatch of the first epoch is actually processed.
    """
    train_iter, test_iter = load_data_fashion_mnist(batch_size)
    devices = list(map(try_gpu, range(num_gpus)))
    # Give every device its own copy of the model parameters.
    device_params = [get_params(params, device) for device in devices]
    num_epochs = 10
    animator = Animator("epoch", "test acc", xlim=[1, num_epochs])
    timer = Timer()

    def net_on_first_device(x):
        # Evaluation uses the parameter copy held by the first device.
        return lenet(x, device_params[0])

    for epoch in range(num_epochs):
        timer.start()
        for X, y in train_iter:
            # Multi-device training step for a single minibatch.
            train_batch(X, y, device_params, devices, lr)
            # Wait until all queued CUDA work has finished.
            torch.cuda.synchronize()
            break
        timer.stop()
        # Evaluate the model on the first device.
        test_acc = evaluate_accuracy_gpu(
            net_on_first_device, test_iter, devices[0]
        )
        animator.add(epoch + 1, (test_acc,))
        break
    print(f"测试精度: {animator.Y[0][-1]:.2f}, {timer.avg():.1f}秒/轮, 在{str(devices)}")
# Run the experiment only when this file is executed as a script. The data
# loaders use worker processes (num_workers=2); on platforms that start
# workers with "spawn" the main module is re-imported in every worker, and
# without this guard each worker would launch training again.
if __name__ == "__main__":
    # train(num_gpus=1, batch_size=256, lr=0.2)
    # plt.savefig(script_dir + f"/TrainChart1.png")
    train(num_gpus=2, batch_size=256, lr=0.2)
    # Save the accuracy chart next to this script.
    plt.savefig(script_dir + f"/TrainChart1.png")
# Note: uncommenting `params_cpu = get_params(params.copy(), try_gpu(10))`
# makes the script fail with the error shown below.
# Cause: try_gpu(10) falls back to the CPU, where `Tensor.to` returns the very
# same tensor objects, so `requires_grad_()` is applied to the shared global
# `params`; later copies of them to a GPU are then non-leaf tensors whose
# `.grad` stays None.
#
# /home/qlf/anaconda3/envs/d2l/lib/python3.9/site-packages/torch/_tensor.py:1083: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:482.)
#   return self._grad
# Traceback (most recent call last):
#   File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 342, in <module>
#     train(num_gpus=2, batch_size=256, lr=0.2)
#   File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 321, in train
#     train_batch(X, y, device_params, devices, lr)
#   File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 302, in train_batch
#     allreduce([device_params[c][i].grad for c in range(len(devices))])
#   File "/home/qlf/d2l/chapter12/12_5_多GPU训练/test不同GPU的小批量的梯度.py", line 58, in allreduce
#     data[0][:] += data[i].to(data[0].device)
# TypeError: 'NoneType' object is not subscriptable