pytorch lightning执行trainer.fit()后卡住不动;
pytorch lightning get stuck at fit
只打印出了模型信息,然后就不输出任何东西了
解决方案:
dataset或者dataloader写的有问题,可能混杂了除了torch.tensor之外的数据;
根据以下运行通过的代码,自己重新写一下dataloader就好了。
原问题及讨论参见原帖(原文链接在此处缺失)。
import os
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
# Cap per-process GPU memory usage so multiple jobs can share a card.
def set_memory_limit(fraction: float):
    """Restrict every visible CUDA device to *fraction* of its memory.

    No-op on machines without CUDA.
    """
    if not torch.cuda.is_available():
        return
    for device_index in range(torch.cuda.device_count()):
        torch.cuda.set_per_process_memory_fraction(fraction, device_index)
# GRU + MLP model: per-timestep regression from an input sequence to an
# output point sequence.
class GRU_MLP_Model(pl.LightningModule):
    """GRU encoder followed by a timestep-wise MLP head.

    Input:  (batch, seq_len, input_size)
    Output: (batch, seq_len, output_size)

    NOTE(review): the original forward kept only the last timestep
    (``gru_out[:, -1, :]``), yielding (batch, output_size), while the
    targets built in create_dataloaders are (batch, seq_len, output_size).
    The shape assert inside point_cloud_distance_loss then fired and
    training crashed. The MLP is now applied to every timestep so
    predictions and targets align.
    """

    def __init__(self, input_size, hidden_size, num_layers, mlp_hidden_size, output_size):
        super(GRU_MLP_Model, self).__init__()
        # batch_first=True -> tensors are (batch, seq, feature)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # nn.Linear acts on the last dim, so this MLP maps each timestep
        # independently: hidden_size -> mlp_hidden_size -> output_size.
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, output_size),
        )

    def forward(self, x):
        # gru_out: (batch, seq_len, hidden_size)
        gru_out, _ = self.gru(x)
        # Bug fix: keep the full sequence (was gru_out[:, -1, :]) so the
        # output matches the (batch, seq_len, output_size) targets.
        return self.mlp(gru_out)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = point_cloud_distance_loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        val_loss = point_cloud_distance_loss(y_hat, y)
        self.log('val_loss', val_loss)
        return val_loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
# Custom point-cloud distance loss: mean Euclidean distance per point.
def point_cloud_distance_loss(point_cloud1, point_cloud2):
    """Return the mean per-point Euclidean distance between two clouds.

    Args:
        point_cloud1: tensor whose last axis holds point coordinates,
            e.g. (batch, num_points, coord_dim) or (num_points, coord_dim).
        point_cloud2: tensor with the same shape as ``point_cloud1``.

    Returns:
        Scalar tensor: L2 distance averaged over all points.

    Raises:
        AssertionError: if the two inputs differ in shape.
    """
    assert point_cloud1.shape == point_cloud2.shape, "Point clouds must have the same shape"
    # Generalized dim=2 -> dim=-1: identical for the 3-D inputs used in
    # this file, and also works for 2-D (points, coords) input.
    distances = torch.norm(point_cloud1 - point_cloud2, dim=-1)
    loss = distances.mean()
    return loss
# Build synthetic train/val dataloaders (80/20 split, batch size 32).
def create_dataloaders():
    """Return ``(train_loader, val_loader)`` over random tensor data.

    Inputs are (1000, 10, 8) = (samples, timesteps, features); targets
    are (1000, 10, 3) = (samples, timesteps, output dims). Everything
    stays a plain torch tensor, which avoids the Lightning hang
    described in the notes at the top of this file.
    """
    num_samples, seq_len = 1000, 10
    inputs = torch.randn(num_samples, seq_len, 8)
    targets = torch.randn(num_samples, seq_len, 3)
    full_dataset = TensorDataset(inputs, targets)
    n_train = int(0.8 * len(full_dataset))
    train_set, val_set = random_split(
        full_dataset, [n_train, len(full_dataset) - n_train]
    )
    return (
        DataLoader(train_set, batch_size=32, shuffle=True),
        DataLoader(val_set, batch_size=32),
    )
# Assemble the model and data, then launch training.
def train_model():
    """Train GRU_MLP_Model for 10 epochs on synthetic data, 1 GPU."""
    input_size = 8
    hidden_size = 16
    num_layers = 2
    mlp_hidden_size = 32
    output_size = 3
    model = GRU_MLP_Model(input_size, hidden_size, num_layers, mlp_hidden_size, output_size)
    train_loader, val_loader = create_dataloaders()
    # Cap each process at 50% of GPU memory so jobs can share a card.
    set_memory_limit(0.5)
    # Bug fix: the `gpus=` Trainer argument was removed in
    # pytorch-lightning 2.x; accelerator/devices is the supported
    # spelling (also accepted since 1.5).
    trainer = pl.Trainer(max_epochs=10, accelerator="gpu", devices=1)
    trainer.fit(model, train_loader, val_loader)
# Script entry point: run training only when executed directly.
if __name__ == "__main__":
    train_model()