10.3.4 PyTorch训练
(1)加载了训练配置(train_cfg),构建了栅格化器 rasterizer,并打开了训练数据集 train_zarr。然后,使用这些对象创建了一个代理数据集 train_dataset,并将其传递给 PyTorch 的 DataLoader,以便进行批量训练。最后,通过 print(len(train_dataloader)) 打印了数据加载器中的批次数量。
from torch.utils.data import DataLoader

# Build the training dataset and its batched loader.
# NOTE(review): `cfg`, `dm`, `build_rasterizer`, `ChunkedDataset` and
# `AgentDataset` must be defined earlier in the file (l5kit objects).
train_cfg = cfg["train_data_loader"]
rasterizer = build_rasterizer(cfg, dm)
train_zarr = ChunkedDataset(dm.require(train_cfg["key"])).open()
train_dataset = AgentDataset(cfg, train_zarr, rasterizer)
# Fix: reuse `train_cfg` instead of re-indexing cfg["train_data_loader"]
# three separate times for shuffle/batch_size/num_workers.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=train_cfg["shuffle"],
    batch_size=train_cfg["batch_size"],
    num_workers=train_cfg["num_workers"],
)
print(len(train_dataloader))  # number of batches per pass over the dataset
这个过程准备了训练神经网络所需的数据集和数据加载器,以便进行自动驾驶车辆运动预测模型的训练。执行后会输出:
703023
(2)创建一个名为 LyftModel 的PyTorch模型类,基于预训练的ResNet架构,用于自动驾驶车辆的运动预测。该模型包含了定制的输入通道、线性层序列和输出层,用于预测未来轨迹的坐标以及概率分布。在前向传播中,模型通过骨干网络提取特征,经过线性层序列生成最终的预测输出。
class LyftModel(nn.Module):
    """Multi-mode trajectory predictor built on a torchvision ResNet backbone.

    The backbone's first conv layer is replaced so it accepts the rasterized
    input (3 map channels + 2 channels per history frame, current included),
    and a linear head outputs, for each of ``num_modes`` hypotheses, (x, y)
    coordinates for every future frame plus one confidence logit per mode.
    """

    def __init__(self, cfg: Dict, num_modes: int = 3):
        super().__init__()
        architecture = cfg["model_params"]["model_architecture"]
        # NOTE(review): eval() on a config string is unsafe for untrusted
        # input; prefer getattr(torchvision.models, architecture). Kept
        # because the surrounding code resolves the constructor (e.g.
        # "resnet18") from the current global namespace.
        backbone = eval(architecture)(pretrained=True, progress=True)
        self.backbone = backbone
        # 2 channels per frame (history + current) on top of 3 map channels.
        num_history_channels = (cfg["model_params"]["history_num_frames"] + 1) * 2
        num_in_channels = 3 + num_history_channels
        self.backbone.conv1 = nn.Conv2d(
            num_in_channels,
            self.backbone.conv1.out_channels,
            kernel_size=self.backbone.conv1.kernel_size,
            stride=self.backbone.conv1.stride,
            padding=self.backbone.conv1.padding,
            bias=False,
        )
        # Fix/generalization: read the feature width from the backbone's fc
        # layer instead of hard-coding 2048 only for "resnet50" (the original
        # silently used 512 for resnet101/152/wide variants, which is wrong).
        backbone_out_features = self.backbone.fc.in_features
        self.future_len = cfg["model_params"]["future_num_frames"]
        num_targets = 2 * self.future_len  # (x, y) per future frame
        self.head = nn.Sequential(
            nn.Linear(in_features=backbone_out_features, out_features=4096),
        )
        self.num_preds = num_targets * num_modes
        self.num_modes = num_modes
        # One output per predicted coordinate plus one confidence logit/mode.
        self.logit = nn.Linear(4096, out_features=self.num_preds + num_modes)

    def forward(self, x):
        """Return a flat (batch, num_preds + num_modes) prediction tensor."""
        # Run the ResNet stem and stages manually so the stock fc layer
        # (replaced by self.head/self.logit) is bypassed.
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        x = self.backbone.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.head(x)
        x = self.logit(x)
        return x
(3)创建函数 forward,定义了一个前向传播过程,用于在训练过程中计算模型的损失。函数接受输入数据 data、神经网络模型 model、计算设备 device 和损失函数 criterion 作为参数。
def forward(data, model, device, criterion):
    """Run one training forward pass and compute the multi-modal loss.

    data: batch dict from AgentDataset with keys "image",
        "target_availabilities", "target_positions", "world_to_image",
        "centroid".
    model: network whose flat output packs 3 mode-confidence logits first,
        then 3 * future_len * 2 coordinate predictions.
    device: torch device to run on.
    criterion: loss fn(targets, pred, confidences, target_availabilities).

    Returns (loss, pred, confidences); pred is (batch, 3, future_len, 2) in
    the same coordinate frame as the (possibly transformed) targets.
    """
    inputs = data["image"].to(device)
    target_availabilities = data["target_availabilities"].to(device)
    targets = data["target_positions"].to(device)
    matrix = data["world_to_image"].to(device)
    # (batch, 1, 2) agent centroid, broadcastable over the time axis.
    centroid = data["centroid"].to(device)[:,None,:].to(torch.float)
    # Forward pass
    outputs = model(inputs)
    bs,tl,_ = targets.shape
    assert tl == cfg["model_params"]["future_num_frames"]
    if cfg['train_params']['image_coords']:
        # Convert agent-relative world offsets into raster (pixel) space:
        # re-add the centroid, apply the homogeneous world->image transform,
        # then subtract the raster ego-center offset so the targets are
        # relative to the agent's pixel position.
        targets = targets + centroid
        targets = torch.cat([targets,torch.ones((bs,tl,1)).to(device)], dim=2)
        targets = torch.matmul(matrix.to(torch.float), targets.transpose(1,2))
        targets = targets.transpose(1,2)[:,:,:2]
        rs = cfg["raster_params"]["raster_size"]
        ec = cfg["raster_params"]["ego_center"]
        bias = torch.tensor([rs[0] * ec[0], rs[1] * ec[1]])[None, None, :].to(device)
        targets = targets - bias
    # First 3 outputs are confidence logits, the rest are coordinates.
    # NOTE(review): the mode count is hard-coded to 3 here — keep in sync
    # with LyftModel's num_modes default.
    confidences, pred = outputs[:,:3], outputs[:,3:]
    pred = pred.view(bs, 3, tl, 2)
    assert confidences.shape == (bs, 3)
    confidences = torch.softmax(confidences, dim=1)
    loss = criterion(targets, pred, confidences, target_availabilities)
    loss = torch.mean(loss)
    if cfg['train_params']['image_coords']:
        # Map predictions back from pixel space to agent-relative world
        # coordinates (inverse of the target transform above).
        matrix_inv = torch.inverse(matrix)
        pred = pred + bias[:,None,:,:]
        pred = torch.cat([pred,torch.ones((bs,3,tl,1)).to(device)], dim=3)
        pred = torch.stack([torch.matmul(matrix_inv.to(torch.float), pred[:,i].transpose(1,2))
                            for i in range(3)], dim=1)
        pred = pred.transpose(2,3)[:,:,:,:2]
        pred = pred - centroid[:,None,:,:]
    return loss, pred, confidences
在前向传播中,首先将输入数据转移到指定的设备上,并提取目标位置、目标可用性、世界到图像的转换矩阵等必要信息。然后进行模型的前向计算,得到模型的输出。接下来,根据配置文件中的参数进行一系列处理:
- 若 image_coords 为真,则将目标位置转换为图像坐标,并进行相关的调整。
- 计算损失,其中包括目标位置、预测位置、置信度、目标可用性等信息。
- 对于 image_coords 为真的情况,进行逆向坐标转换。
最终,返回计算得到的损失值、预测的位置信息以及置信度信息。这个函数的主要目的是在训练过程中计算模型的损失,以便进行梯度反向传播和参数更新。
(4)在下面的代码中创建了两个函数,用于计算多模态车辆轨迹预测任务中的负对数似然损失。
def pytorch_neg_multi_log_likelihood_batch(
    gt: Tensor, pred: Tensor, confidences: Tensor, avails: Tensor
) -> Tensor:
    """Negative log-likelihood of the ground truth under a mixture of
    unit-variance Gaussian trajectory hypotheses (one per mode).

    gt:          (batch, future_len, 2) ground-truth coordinates.
    pred:        (batch, num_modes, future_len, 2) predicted coordinates.
    confidences: (batch, num_modes), each row summing to 1.
    avails:      (batch, future_len) 0/1 mask of valid timesteps.

    Returns a scalar tensor: the mean NLL over the batch.
    """
    # Shape checks. (Fixed: the original assert messages described the
    # wrong ranks, e.g. "3D (MxTxC)" while asserting a 4D tensor.)
    assert len(pred.shape) == 4, f"expected 4D (Batch x Modes x Time x Coords) array for pred, got {pred.shape}"
    batch_size, num_modes, future_len, num_coords = pred.shape
    assert gt.shape == (batch_size, future_len, num_coords), f"expected 3D (Batch x Time x Coords) array for gt, got {gt.shape}"
    assert confidences.shape == (batch_size, num_modes), f"expected 2D (Batch x Modes) array for confidences, got {confidences.shape}"
    assert torch.allclose(torch.sum(confidences, dim=1), confidences.new_ones((batch_size,))), "confidences should sum to 1"
    assert avails.shape == (batch_size, future_len), f"expected 2D (Batch x Time) array for avails, got {avails.shape}"
    assert torch.isfinite(pred).all(), "invalid value found in pred"
    assert torch.isfinite(gt).all(), "invalid value found in gt"
    assert torch.isfinite(confidences).all(), "invalid value found in confidences"
    assert torch.isfinite(avails).all(), "invalid value found in avails"
    # Broadcast gt and avails against the modes axis.
    gt = torch.unsqueeze(gt, 1)        # add modes: (batch, 1, time, coords)
    avails = avails[:, None, :, None]  # add modes and coords
    # Squared error with coords reduced -> (batch_size, num_modes, future_len)
    error = torch.sum(((gt - pred) * avails) ** 2, dim=-1)
    if cfg['train_params']['image_coords']:
        # Pixel-space errors are larger; rescale to keep the loss comparable.
        error = error / 4
    # NOTE(review): np.errstate only affects NumPy ops, not torch.log — kept
    # from the original. log(0) still yields -inf, which the max trick
    # below tolerates.
    with np.errstate(divide="ignore"):
        # Per-mode log-likelihood -> (batch_size, num_modes)
        error = torch.log(confidences) - 0.5 * torch.sum(error, dim=-1)  # reduce time
    # Log-sum-exp over modes with the max trick for numerical stability.
    max_value, _ = error.max(dim=1, keepdim=True)  # values are negative here
    error = -torch.log(torch.sum(torch.exp(error - max_value), dim=-1, keepdim=True)) - max_value  # reduce modes
    return torch.mean(error)
def pytorch_neg_multi_log_likelihood_single(
    gt: Tensor, pred: Tensor, avails: Tensor
) -> Tensor:
    """Single-mode NLL: wrap the multi-modal loss with one mode at weight 1.

    gt/pred: (batch, time, 2) coordinates; avails: (batch, time) mask.
    Returns a scalar tensor (mean NLL over the batch).
    """
    # pred (bs)x(time)x(2D coords) --> (bs)x(mode=1)x(time)x(2D coords);
    # build a matching all-ones confidence vector of shape (bs, 1).
    # (Fixed: dropped the unused future_len/num_coords unpacking.)
    batch_size = pred.shape[0]
    confidences = pred.new_ones((batch_size, 1))
    return pytorch_neg_multi_log_likelihood_batch(gt, pred.unsqueeze(1), confidences, avails)
对上述代码的具体说明如下所示:
- 函数pytorch_neg_multi_log_likelihood_batch:接受真实轨迹 gt、模型预测轨迹 pred、置信度分布 confidences 和轨迹可用性 avails 作为输入,计算多模态负对数似然损失。函数首先根据输入的数据维度进行断言和校验,然后通过加权欧几里得距离计算每个模态的误差,最终通过对时间维度的减少和置信度的处理,得到损失值。
- 函数pytorch_neg_multi_log_likelihood_single:将单模态情况下的损失计算抽象为调用 pytorch_neg_multi_log_likelihood_batch 的过程。在这个函数中,将模态数设置为1,即单模态,以获取相应的损失值。这两个函数在多模态预测任务中提供了有效的损失计算工具,用于评估模型对车辆未来轨迹的预测性能。
(5)创建一个 torch.device 对象,将计算设置在GPU上进行加速,并将 LyftModel 模型移到相应的设备上。同时,定义了使用 Adam 优化器(学习率为1e-3)和之前定义的损失函数 pytorch_neg_multi_log_likelihood_batch 进行模型的训练。
# Pick the compute device (first GPU when available, otherwise CPU), then
# instantiate the model there and set up the optimizer and loss function.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = LyftModel(cfg).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = pytorch_neg_multi_log_likelihood_batch
执行后会输出:
Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/checkpoints/resnet18-5c106cde.pth
100%|██████████|44.7M/44.7M [05:58<00:00, 130kB/s]
(6)检查设备类型,如果是CPU则打印“Training on CPU”,如果是GPU则打印“Training on GPU”。接着,通过迭代训练数据加载器,对模型进行训练。在每个迭代中,获取一个batch的数据,通过前向传播计算损失,然后进行反向传播和优化器的更新。训练过程中记录每个迭代的损失值,并根据指定的步骤保存模型的状态。最后,使用 tqdm 进行训练进度的可视化显示,展示当前迭代的损失以及平均损失。这个循环会在训练过程中执行多次,直到达到指定的最大训练步数。
# Training loop: run a fixed number of optimizer steps, restarting the
# DataLoader iterator whenever the dataset is exhausted.
if device.type == 'cpu': print('Training on CPU')
if device.type == 'cuda': print('Training on GPU')
tr_it = iter(train_dataloader)
progress_bar = tqdm(range(cfg["train_params"]["max_num_steps"]))
losses_train = []
for itr in progress_bar:
    try:
        data = next(tr_it)
    except StopIteration:
        # Iterator exhausted: start a fresh pass over the data.
        tr_it = iter(train_dataloader)
        data = next(tr_it)
    model.train()
    torch.set_grad_enabled(True)  # make sure autograd is on for this step
    # `forward` here is the module-level helper defined above, not model.forward.
    loss, pred, confidences = forward(data, model, device, criterion)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses_train.append(loss.item())
    # Periodically checkpoint the weights (skipped when DEBUG is set).
    if (itr+1) % cfg['train_params']['checkpoint_every_n_steps'] == 0 and not DEBUG:
        torch.save(model.state_dict(), f'model_state_{itr}.pth')
    # display training progress: current loss and running average
    progress_bar.set_description(f"loss: {loss.item()} loss(avg): {np.mean(losses_train)}")
执行后会输出:
loss: 519.0257568359375 loss(avg): 1455.1976208496094: 100%|██████████| 25/25 [05:45<00:00, 13.80s/it]