最近学习GGCNN,自己想照着原代码敲一遍学习,学习jacquard数据集,代码照着以往的规范写,但一运行到特定位置就卡住了,并且cpu跑满cuda不动,明明已经全部.to(device)了,求大佬指点
def train_one_epoch(epoch_num, net, train_data, optimizer, batches_per_epoch, device='cuda'):
    """Run one training epoch and return averaged loss statistics.

    Args:
        epoch_num: index of the current epoch (used for logging only).
        net: network exposing ``compute_loss(x, y)`` returning
            ``{'loss': Tensor, 'losses': {name: Tensor}}`` — assumed from
            usage here; confirm against the network implementation.
        train_data: iterable DataLoader yielding ``(x, y, _, _, _)`` batches.
        optimizer: torch optimizer over ``net``'s parameters.
        batches_per_epoch: hard cap on the number of batches consumed.
        device: device the inputs are moved to ('cuda' by default).

    Returns:
        dict with 'loss' (mean total loss) and 'losses' (mean value of each
        named loss component) over the batches actually processed.
    """
    results = {'loss': 0.0, 'losses': {}}

    print(f"start training on {device},epoch:{epoch_num}")
    net.train()

    batch_idx = 0
    for x, y, _, _, _ in train_data:
        # Check the budget BEFORE incrementing: exactly `batches_per_epoch`
        # batches get processed. (The original incremented first and broke
        # early, training one batch fewer than requested while still
        # dividing the sums by `batches_per_epoch`, skewing the averages.)
        if batch_idx >= batches_per_epoch:
            break
        batch_idx += 1

        x = x.to(device)
        y = [yi.to(device) for yi in y]

        lossed = net.compute_loss(x, y)
        loss = lossed['loss']
        results['loss'] += loss.item()
        # .get() folds the "first time seen" and "accumulate" cases into one.
        for loss_name, loss_val in lossed['losses'].items():
            results['losses'][loss_name] = results['losses'].get(loss_name, 0.0) + loss_val.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average over the batches actually seen; guard an empty loader so we
    # never divide by zero.
    n = max(batch_idx, 1)
    results['loss'] /= n
    # BUG FIX: the original wrote `for loss_name, _ in results['losses']:`,
    # which iterates the dict's KEYS and tries to unpack each key string
    # into two values — a ValueError for any loss name whose length != 2.
    for loss_name in results['losses']:
        results['losses'][loss_name] /= n
    return results
关于train_data和其它的定义如下:
def main():
    """Build the Jacquard dataset, network and optimizer, then train.

    NOTE(review): the reported hang — the first iteration over `train_data`
    never returns, CPU pegged, GPU idle — is the classic symptom of
    DataLoader workers (`num_workers > 0`) being created from module top
    level. Under the 'spawn' start method (default on Windows and macOS)
    each worker process re-imports the main module; without an
    ``if __name__ == '__main__'`` guard that re-executes this whole setup
    recursively and deadlocks before the first batch is yielded. Moving the
    script body into main() behind the guard is the fix. If the hang
    persists, set num_workers=0 to rule out worker-side failures.
    """
    device = torch.device('cuda')

    Dataset = get_dataset(args.dataset)
    dataset = Dataset(args.dataset_path,
                      output_size=args.input_size,
                      ds_rotate=args.ds_rotate,
                      random_rotate=True,
                      random_zoom=True,
                      include_depth=args.use_depth,
                      include_rgb=args.use_rgb)

    # Index-based train/test split with an optional deterministic shuffle.
    indices = list(range(dataset.length))
    split_point = int(np.floor(args.train_test_ratio * dataset.length))
    print(dataset.length)
    print(split_point)
    if args.shuffle:
        np.random.seed(args.seed)
        np.random.shuffle(indices)
    train_indices, test_indices = indices[:split_point], indices[split_point:]

    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    test_sampler = torch.utils.data.SubsetRandomSampler(test_indices)
    train_data = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.num_workers,
                                             sampler=train_sampler)
    test_data = torch.utils.data.DataLoader(dataset,
                                            batch_size=args.batch_size,
                                            num_workers=args.num_workers,
                                            sampler=test_sampler)

    # 1 channel if depth is used plus 3 if RGB is used.
    input_channels = 1 * args.use_depth + 3 * args.use_rgb
    network = get_network('grconvnet')
    net = network(input_channels=input_channels,
                  dropout=args.use_dropout,
                  prob=args.dropout_prob,
                  channel_size=args.channel_size)
    net = net.to(device)

    if args.optim.lower() == 'adam':
        optimizer = optim.Adam(net.parameters())
    elif args.optim.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    else:
        raise NotImplementedError('Optimizer {} is not implemented'.format(args.optim))

    start_train(net=net,
                num_epochs=args.epochs,
                train_data=train_data,
                valid_data=test_data,
                optimizer=optimizer,
                batches_per_epoch=args.batches_per_epoch,
                device=device)


if __name__ == '__main__':
    main()