TURN-TAP
https://www.cnblogs.com/demian/p/9720597.html
1. Video tutorials:
Bilibili (B站), NetEase Cloud Classroom, Tencent Classroom
2. Code:
Gitee
Github
3. Downloads:
Google Drive
Baidu Cloud:
Extraction code:
《TURN TAP: Temporal Unit Regression Network for Temporal Action Proposals》
(To be written)
Authors: Jiyang Gao, Zhenheng Yang, Chen Sun, Kan Chen, Ram Nevatia
Affiliation:
Venue and year: ICCV 2017
Submission history
[v1] Fri, 17 Mar 2017 20:24:32 UTC (1,636 KB)
[v2] Fri, 4 Aug 2017 19:21:31 UTC (3,279 KB)
- Abstract
Temporal Action Proposal (TAP) generation is an important problem, as fast and accurate extraction of semantically important (e.g. human actions) segments from untrimmed videos is an important step for large-scale video analysis. We propose a novel Temporal Unit Regression Network (TURN) model. There are two salient aspects of TURN: (1) TURN jointly predicts action proposals and refines the temporal boundaries by temporal coordinate regression; (2) Fast computation is enabled by unit feature reuse: a long untrimmed video is decomposed into video units, which are reused as basic building blocks of temporal proposals. TURN outperforms the state-of-the-art methods under average recall (AR) by a large margin on THUMOS-14 and ActivityNet datasets, and runs at over 880 frames per second (FPS) on a TITAN X GPU. We further apply TURN as a proposal generation stage in existing temporal action localization pipelines, where it outperforms the state of the art on THUMOS-14 and ActivityNet.
I. Paper Overview
Motivation
Fast and accurate extraction of semantically meaningful segments from untrimmed videos.
Proposed Method
- Proposes the TURN model, which jointly predicts action proposals and refines their temporal boundaries via temporal coordinate regression
- Achieves fast computation by reusing unit features (a minimal sketch of this caching idea follows the list)
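The speed claim (over 880 FPS) rests on computing each unit feature exactly once and reusing it for every clip that overlaps the unit. Below is a minimal sketch of that caching idea; `UnitFeatureCache` and `extract_unit_feature` are illustrative names, not from the released code:

import torch

class UnitFeatureCache:
    """Compute each unit feature once; every clip covering the unit reuses it."""
    def __init__(self, extract_unit_feature):
        self.extract = extract_unit_feature  # maps unit index -> 1-D feature tensor
        self.cache = {}

    def get(self, unit_idx):
        if unit_idx not in self.cache:
            self.cache[unit_idx] = self.extract(unit_idx)
        return self.cache[unit_idx]

    def mean_pool(self, start_unit, end_unit):
        # feature of a span of units = mean of the cached unit features
        feats = [self.get(u) for u in range(start_unit, end_unit)]
        return torch.stack(feats).mean(dim=0)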
II. Paper Close Reading
III. Code Implementation
1. Video Unit Processing: decompose the untrimmed video into fixed-length units (e.g. 16 frames) and extract one feature vector per unit; these unit features are the reusable building blocks.
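The post leaves this step empty, so here is a minimal sketch. The paper encodes each unit with C3D or two-stream CNN features; assuming per-frame features have already been extracted, a unit feature can be approximated by mean-pooling the frames of each unit. `video_to_unit_features` is a hypothetical helper:

import torch

def video_to_unit_features(frame_feats, unit_size=16):
    """frame_feats: (num_frames, feat_dim) tensor of pre-extracted frame features.
    Returns (num_units, feat_dim): one mean-pooled feature per unit of
    unit_size consecutive frames (a tail shorter than one unit is dropped)."""
    num_units = frame_feats.size(0) // unit_size
    units = frame_feats[:num_units * unit_size]
    units = units.view(num_units, unit_size, -1)
    return units.mean(dim=1)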
2. Clip Pyramid Modeling: anchored at each unit, build candidate clips at multiple temporal scales; a clip's feature concatenates its pooled internal unit features with left and right context features.
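A sketch of clip feature construction consistent with `input_feature_dim = unit_feature_dim * 3` in `turn_model.py` below: internal units are mean-pooled, a few context units on each side are pooled separately, and the three parts are concatenated. `build_clip_feature` is a hypothetical helper and `ctx_units=4` is an assumed default:

import torch

def build_clip_feature(unit_feats, start_unit, end_unit, ctx_units=4):
    """unit_feats: (num_units, feat_dim); the clip covers [start_unit, end_unit).
    Returns a (3 * feat_dim,) clip feature: [left context; internal; right context]."""
    num_units = unit_feats.size(0)
    internal = unit_feats[start_unit:end_unit].mean(dim=0)
    left = unit_feats[max(0, start_unit - ctx_units):start_unit]
    right = unit_feats[end_unit:min(num_units, end_unit + ctx_units)]
    # fall back to the internal feature when a context side is empty
    # (clip at the video border)
    left = left.mean(dim=0) if left.size(0) > 0 else internal
    right = right.mean(dim=0) if right.size(0) > 0 else internal
    return torch.cat([left, internal, right], dim=0)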
3. Unit-level Temporal Coordinate Regression: predict start and end offsets at unit granularity to refine each clip's temporal boundaries.
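TURN regresses boundaries in unit coordinates rather than frame coordinates. A sketch of how training targets and the inference-time refinement relate; the sign convention here (offset = clip boundary minus ground-truth boundary) is an assumption chosen for internal consistency, so check it against the dataset code:

def regression_targets(clip_start, clip_end, gt_start, gt_end):
    """All arguments are unit indices. Targets follow offset = clip - gt."""
    return clip_start - gt_start, clip_end - gt_end

def refine_boundaries(clip_start, clip_end, pred_start_off, pred_end_off):
    """Apply predicted unit-level offsets; with the convention above,
    boundary - predicted_offset recovers the ground-truth boundary."""
    return clip_start - pred_start_off, clip_end - pred_end_off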
4. Loss function: the total objective is L = L_cls + λ · L_reg, where L_cls is a two-way (action vs. background) cross-entropy and L_reg penalizes the predicted offsets on positive samples only. A training-loop excerpt:
criterion = [torch.nn.CrossEntropyLoss(), RegressionLoss().cuda()]

# data_time, cls_losses, res_losses and losses are AverageMeter instances
# defined elsewhere in the training script;
# clip_grad_norm_ is torch.nn.utils.clip_grad_norm_
end = time.time()
optimizer.zero_grad()
for i, (feats, labels, start_offsets, end_offsets) in enumerate(train_loader):
    data_time.update(time.time() - end)

    input_feats = feats.cuda()
    input_labels = labels.cuda()
    start_offsets = start_offsets.cuda().float()
    end_offsets = end_offsets.cuda().float()

    # the first two outputs are classification logits,
    # the last two are the predicted start/end offsets
    pred_labels = model(input_feats)
    cls_loss = criterion[0](pred_labels[:, :2], input_labels)
    res_loss = criterion[1](pred_labels[:, 2:], input_labels.float(),
                            start_offsets, end_offsets)
    cls_losses.update(cls_loss.cpu().item(), feats.size(0))
    res_losses.update(res_loss.cpu().item(), torch.sum(labels).item())

    # total loss: classification plus lambda-weighted boundary regression
    loss = cls_loss + args.lambda_reg * res_loss
    losses.update(loss.cpu().item(), feats.size(0))

    # compute gradient and do SGD step
    loss.backward()
    if args.clip_gradient is not None:
        total_norm = clip_grad_norm_(model.parameters(), args.clip_gradient)
        if total_norm > args.clip_gradient:
            print('Clipping gradient: {} with coef {}'.format(
                total_norm, args.clip_gradient / total_norm))
    optimizer.step()
    optimizer.zero_grad()
    end = time.time()
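`RegressionLoss` is not defined in this excerpt. In the paper, the regression term is an L1 loss over the start/end offsets, counted only for positive (action) clips; a minimal sketch matching the call signature used above:

import torch
from torch import nn

class RegressionLoss(nn.Module):
    """L1 loss on boundary offsets, averaged over positive samples only."""
    def forward(self, pred_offsets, labels, start_offsets, end_offsets):
        # pred_offsets: (batch, 2) predicted [start, end] offsets
        # labels: (batch,) 1.0 for positive (action) clips, 0.0 for background
        target = torch.stack([start_offsets, end_offsets], dim=1)
        per_sample = torch.abs(pred_offsets - target).sum(dim=1)
        num_pos = labels.sum().clamp(min=1.0)  # avoid division by zero
        return (labels * per_sample).sum() / num_pos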
turn_model.py
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
import numpy as np


class GroupNormalize(object):
    """Lp-normalize a single 1-D unit feature vector."""

    def __init__(self, p=2):
        self.p = p

    def __call__(self, feat_group):
        if isinstance(feat_group, np.ndarray):
            feat_group = torch.from_numpy(feat_group).contiguous()
        assert len(feat_group.size()) == 1, \
            'the size of feats is {}, but expected 1'.format(len(feat_group.size()))
        return F.normalize(feat_group, p=self.p, dim=0)


class TURN(torch.nn.Module):
    def __init__(self, tr_batch_size, ts_batch_size, lambda_reg,
                 unit_feature_dim, middle_layer_dim=1000,
                 dropout=0.5, num_class=4, norm_p=2):
        super(TURN, self).__init__()
        self.tr_batch_size = tr_batch_size
        self.ts_batch_size = ts_batch_size
        self.lambda_reg = lambda_reg
        self.unit_feature_dim = unit_feature_dim
        # clip feature = [left context; internal units; right context]
        self.input_feature_dim = unit_feature_dim * 3
        self.middle_layer_dim = middle_layer_dim
        self.dropout = dropout
        self.num_class = num_class
        self.norm_p = norm_p

        print("""
Initializing TURN ...
Configurations of TURN:
    training batch size:   {}
    testing batch size:    {}
    lambda for regression: {}
    unit feature size:     {}
    input feature size:    {}
    middle_layer_dim:      {}
    dropout_ratio:         {}
""".format(tr_batch_size, ts_batch_size, lambda_reg, unit_feature_dim,
           self.input_feature_dim, middle_layer_dim, dropout))

        self._prepare_turn_model()

    def _prepare_turn_model(self):
        self.fc_layer = nn.Linear(self.input_feature_dim, self.middle_layer_dim)
        if self.dropout:
            self.dropout_layer = nn.Dropout(p=self.dropout)
        # num_class = 4: two classification logits plus start/end offsets
        self.output_layer = nn.Linear(self.middle_layer_dim, self.num_class)

        nn.init.normal_(self.fc_layer.weight, 0, 0.001)
        nn.init.constant_(self.fc_layer.bias, 0)
        nn.init.normal_(self.output_layer.weight, 0, 0.001)
        nn.init.constant_(self.output_layer.bias, 0)

    def forward(self, inputdata):
        out = self.fc_layer(inputdata)
        out = F.relu(out, inplace=True)
        if self.dropout:
            # nn.Dropout is already a no-op in eval mode, so no explicit
            # self.training check is needed (and the layer only exists
            # when dropout > 0)
            out = self.dropout_layer(out)
        out = self.output_layer(out)
        return out

    def get_optim_policies(self):
        # separate parameter groups: biases get 2x learning rate and no weight decay
        weights = list()
        bias = list()
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                ps = list(m.parameters())
                weights.append(ps[0])
                if len(ps) == 2:
                    bias.append(ps[1])
            elif len(m._modules) == 0:
                if len(list(m.parameters())) > 0:
                    raise ValueError("New atomic module type: {}. "
                                     "Need to give it a learning policy".format(type(m)))
        return [
            {'params': weights, 'lr_mult': 1, 'weight_decay_mult': 1,
             'name': "weight"},
            {'params': bias, 'lr_mult': 2, 'weight_decay_mult': 0,
             'name': "bias"},
        ]

    def data_preparation(self):
        return torchvision.transforms.Compose([GroupNormalize(self.norm_p), ])
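A quick smoke test of the model as defined above; the feature dimension (2048) and lambda value are example settings, not the paper's exact configuration:

if __name__ == '__main__':
    model = TURN(tr_batch_size=128, ts_batch_size=128, lambda_reg=2.0,
                 unit_feature_dim=2048)
    normalize = model.data_preparation()

    clip_feat = torch.rand(2048 * 3)        # [left ctx; internal; right ctx]
    clip_feat = normalize(clip_feat)        # GroupNormalize expects a 1-D feature
    out = model(clip_feat.unsqueeze(0))     # add a batch dimension

    cls_logits, offsets = out[:, :2], out[:, 2:]
    print(cls_logits.shape, offsets.shape)  # torch.Size([1, 2]) torch.Size([1, 2])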