深度学习jacquard数据集运行一半卡死但不报错,且一运行cpu跑满cuda不动

最近学习GGCNN,自己想照着原代码敲一遍来学习jacquard数据集,代码照着以往的规范写,但一运行到特定位置就卡住了,并且CPU跑满、CUDA不动,明明已经全部.to(device)了,求大佬指点

def train_one_epoch(epoch_num, net, train_data, optimizer, batches_per_epoch, device='cuda'):
    """Run one training epoch and return the averaged losses.

    Args:
        epoch_num: epoch index, used only for logging.
        net: model exposing ``compute_loss(x, y)`` that returns a dict
            ``{'loss': Tensor, 'losses': {name: Tensor}}``.
        train_data: iterable of ``(x, y, _, _, _)`` batches (e.g. a DataLoader),
            where ``y`` is a list of target tensors.
        optimizer: torch optimizer over ``net``'s parameters.
        batches_per_epoch: maximum number of batches consumed this epoch.
        device: device the batches are moved to.

    Returns:
        dict with 'loss' (mean total loss over processed batches) and
        'losses' (mean of each per-component loss).
    """
    results = {'loss': 0.0, 'losses': {}}
    print(f"start training on {device},epoch:{epoch_num}")

    net.train()

    # NOTE(review): if iteration over `train_data` hangs before yielding the
    # first batch (CPU busy, GPU idle), the usual cause is DataLoader worker
    # processes (num_workers > 0) launched from a script that lacks an
    # `if __name__ == "__main__":` guard — mandatory with the spawn start
    # method on Windows/macOS. Confirm in the calling script.
    batch_idx = 0
    for x, y, _, _, _ in train_data:
        # BUG FIX: the original incremented first and then broke, so with
        # batches_per_epoch = N only N-1 batches were trained while the
        # averages below divided by N. Check the limit before counting.
        if batch_idx >= batches_per_epoch:
            break
        batch_idx += 1

        x = x.to(device)
        y = [target.to(device) for target in y]

        lossed = net.compute_loss(x, y)
        loss = lossed['loss']

        # .item() detaches scalars from the autograd graph before accumulation.
        results['loss'] += loss.item()
        for loss_name, loss_val in lossed['losses'].items():
            results['losses'][loss_name] = (
                results['losses'].get(loss_name, 0.0) + loss_val.item()
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Guard against an empty loader so the averaging cannot divide by zero.
    denom = max(batch_idx, 1)
    results['loss'] /= denom
    # BUG FIX: the original wrote `for loss_name, _ in results['losses']:`,
    # which iterates the dict's *keys* and tries to unpack each string into
    # two names, raising ValueError. Iterate the keys directly.
    for loss_name in results['losses']:
        results['losses'][loss_name] /= denom

    return results

关于train_data和其它的定义如下:

    # Select the compute device; assumes a CUDA-capable GPU is available —
    # the .to(device) calls elsewhere will fail otherwise.
    device = torch.device('cuda')

    # `get_dataset` is a project helper mapping a dataset name to its loader
    # class (presumably the Jacquard dataset here — not visible in this file).
    Dataset = get_dataset(args.dataset)
    dataset = Dataset(args.dataset_path,
                      output_size=args.input_size,
                      ds_rotate=args.ds_rotate,
                      random_rotate=True,
                      random_zoom=True,
                      include_depth=args.use_depth,
                      include_rgb=args.use_rgb)

    # Build a train/test split over sample indices; the optional shuffle is
    # seeded so the split is reproducible across runs.
    indices = list(range(dataset.length))
    split_point = int(np.floor(args.train_test_ratio * dataset.length))
    print(dataset.length)
    print(split_point)
    if args.shuffle:
        np.random.seed(args.seed)
        np.random.shuffle(indices)
    train_indices,test_indices = indices[:split_point],indices[split_point:]


    # Both loaders share the same underlying dataset; the samplers restrict
    # each loader to its own index subset.
    # NOTE(review): with num_workers > 0 the DataLoader spawns worker
    # processes. If this code runs at module top level without an
    # `if __name__ == '__main__':` guard (required on Windows/macOS spawn
    # start method), iterating the loader can deadlock — CPU pegged, GPU
    # idle, no error — which matches the reported symptom. Try
    # num_workers=0 to confirm, then add the guard.
    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
    train_data = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.num_workers,
                                             sampler=train_sampler)
    test_data = torch.utils.data.DataLoader(dataset,
                                            batch_size=args.batch_size,
                                            num_workers=args.num_workers,
                                            sampler=test_sampler)


    # Input channels follow the modality flags: 1 for depth, 3 for RGB.
    input_channels = 1 * args.use_depth + 3 * args.use_rgb
    # `get_network` is a project helper returning the model class by name.
    network = get_network('grconvnet')
    net = network(
        input_channels=input_channels,
        dropout=args.use_dropout,
        prob=args.dropout_prob,
        channel_size=args.channel_size)
    net = net.to(device)


    # Optimizer selection; Adam uses library defaults, SGD a fixed lr/momentum.
    if args.optim.lower() == 'adam':
        optimizer = optim.Adam(net.parameters())
    elif args.optim.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    else:
        raise NotImplementedError('Optimizer {} is not implemented'.format(args.optim))


    # Kick off the training loop (defined elsewhere in the project).
    start_train(net=net,
                num_epochs=args.epochs,
                train_data=train_data,
                valid_data=test_data,
                optimizer=optimizer,
                batches_per_epoch=args.batches_per_epoch,
                device=device)

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值