pytorch lightning执行trainer.fit()后卡住不动;pytorch lightning gets stuck at fit

pytorch lightning执行trainer.fit()后卡住不动;
pytorch lightning gets stuck at fit
只打印出了模型信息,然后就不输出任何东西了
解决方案:
dataset或者dataloader写的有问题,可能混杂了除了torch.tensor之外的数据;
根据以下运行通过的代码,自己重新写一下dataloader就好了。
原问题及讨论参见

import os
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# 限制每个进程使用的最大显存比例
def set_memory_limit(fraction: float):
    """Apply a per-process GPU memory fraction to every visible CUDA device.

    Does nothing when CUDA is not available.

    Args:
        fraction: Value forwarded unchanged to
            ``torch.cuda.set_per_process_memory_fraction`` for each device.
    """
    if not torch.cuda.is_available():
        return

    device_total = torch.cuda.device_count()
    for device_index in range(device_total):
        torch.cuda.set_per_process_memory_fraction(fraction, device_index)

# 定义GRU + MLP模型
class GRU_MLP_Model(pl.LightningModule):
    """GRU encoder followed by an MLP head, trained with ``point_cloud_distance_loss``.

    ``forward`` consumes batch-first sequences ``(batch, time, input_size)`` and
    returns one prediction per sequence, ``(batch, output_size)``, computed from
    the GRU output at the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: GRU hidden state width.
        num_layers: Number of stacked GRU layers.
        mlp_hidden_size: Width of the MLP's hidden layer.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, mlp_hidden_size, output_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, output_size),
        )

    def forward(self, x):
        """Return the MLP prediction for the last time step of each sequence."""
        gru_out, _ = self.gru(x)
        last_step = gru_out[:, -1, :]  # keep only the final time step
        return self.mlp(last_step)

    def _shared_step(self, batch):
        """Compute the loss for one ``(x, y)`` batch; used by both step hooks.

        The model emits a single prediction per sequence, so a target that
        still carries a time axis is reduced to its last time step before the
        comparison. Previously the raw ``(batch, time, out)`` target was
        compared against the ``(batch, out)`` prediction, which failed the
        loss's shape assertion on the very first batch.
        """
        x, y = batch
        y_hat = self(x)
        if y.dim() == y_hat.dim() + 1:
            y = y[:, -1, :]
        # point_cloud_distance_loss takes the norm over dim=2, so both tensors
        # need a (singleton) point axis in position 1.
        return point_cloud_distance_loss(y_hat.unsqueeze(1), y.unsqueeze(1))

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        loss = self._shared_step(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss for one batch."""
        val_loss = self._shared_step(batch)
        self.log('val_loss', val_loss)
        return val_loss

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return optim.Adam(self.parameters(), lr=1e-3)

# 自定义点云距离损失函数
def point_cloud_distance_loss(point_cloud1, point_cloud2):
    """Return the mean of the per-entry norms of the difference between two tensors.

    The norm is taken along dimension 2 of ``point_cloud1 - point_cloud2`` and
    the resulting values are averaged into a scalar tensor.

    Args:
        point_cloud1: First tensor.
        point_cloud2: Second tensor; must have the same shape as the first.

    Raises:
        AssertionError: If the two shapes differ.
    """
    assert point_cloud1.shape == point_cloud2.shape, "Point clouds must have the same shape"
    offsets = point_cloud1 - point_cloud2
    return torch.norm(offsets, dim=2).mean()

# 数据准备
def create_dataloaders(
    num_samples: int = 1000,
    seq_len: int = 10,
    input_size: int = 8,
    output_size: int = 3,
    batch_size: int = 32,
    train_fraction: float = 0.8,
):
    """Build train/validation loaders over randomly generated tensors.

    Inputs are drawn with ``torch.randn`` as ``(num_samples, seq_len, input_size)``
    and targets as ``(num_samples, seq_len, output_size)``. The dataset is then
    split at random into a training part and a validation part.

    Args:
        num_samples: Total number of generated samples.
        seq_len: Number of time steps per sample.
        input_size: Number of input features per time step.
        output_size: Number of target values per time step.
        batch_size: Batch size used by both loaders.
        train_fraction: Share of samples assigned to the training split; the
            remainder goes to validation.

    Returns:
        A ``(train_loader, val_loader)`` tuple. Only the training loader shuffles.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction}")

    X = torch.randn(num_samples, seq_len, input_size)   # (samples, time steps, features)
    y = torch.randn(num_samples, seq_len, output_size)  # (samples, time steps, outputs)

    dataset = TensorDataset(X, y)
    train_size = int(train_fraction * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_loader, val_loader

# 训练模型
def train_model():
    """Build the model and data loaders, then train for 10 epochs.

    Training runs on one GPU when CUDA is available and falls back to the CPU
    otherwise.
    """
    input_size = 8
    hidden_size = 16
    num_layers = 2
    mlp_hidden_size = 32
    output_size = 3

    model = GRU_MLP_Model(input_size, hidden_size, num_layers, mlp_hidden_size, output_size)

    train_loader, val_loader = create_dataloaders()

    # Let each process use at most 50% of every GPU's memory.
    set_memory_limit(0.5)

    # The ``gpus=`` Trainer argument was removed in Lightning 2.0 and fails on
    # hosts without a GPU; ``accelerator``/``devices`` is the supported form.
    accelerator = "gpu" if torch.cuda.is_available() else "cpu"
    trainer = pl.Trainer(max_epochs=10, accelerator=accelerator, devices=1)
    trainer.fit(model, train_loader, val_loader)

# Run the training demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值