wtalc-pytorch source code walkthrough
Paper: W-TALC: Weakly-supervised Temporal Activity Localization and Classification
Code: https://github.com/sujoyp/wtalc-pytorch
The code is organized as follows:
Python file | Function |
---|---|
main.py | entry point |
options.py | argument configuration |
video_dataset.py | dataset splitting and loading |
model.py | the weakly-supervised module |
train.py | training code |
test.py | testing code |
detectionMAP.py | detection mAP |
classificationMAP.py | classification mAP |
1. options.py holds the argument configuration.
```python
parser = argparse.ArgumentParser(description='WTALC')
parser.add_argument('--lr', type=float, default=0.00001, help='learning rate (default: 0.0001)')
parser.add_argument('--batch-size', type=int, default=10, help='number of instances in a batch of data (default: 10)')
parser.add_argument('--model-name', default='weakloc', help='name to save model')
parser.add_argument('--pretrained-ckpt', default=None, help='ckpt for pretrained model')
parser.add_argument('--feature-size', default=2048, help='size of feature (default: 2048)')
parser.add_argument('--num-class', default=20, help='number of classes (default: )')
parser.add_argument('--dataset-name', default='Thumos14reduced', help='dataset to train on (default: )')
parser.add_argument('--max-seqlen', type=int, default=1200, help='maximum sequence length during training (default: 750)')
parser.add_argument('--Lambda', type=float, default=0.5, help='weight on Co-Activity Loss (default: 0.5)')
parser.add_argument('--num-similar', default=3, help='number of similar pairs in a batch of data (default: 3)')
parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
parser.add_argument('--max-iter', type=int, default=100000, help='maximum iteration to train (default: 50000)')
parser.add_argument('--feature-type', type=str, default='I3D', help='type of feature to be used I3D or UNT (default: I3D)')
```
--lr: learning rate
--batch-size: number of videos per batch
--model-name: name under which the model checkpoint is saved
--pretrained-ckpt: checkpoint of a pretrained model
--feature-size: feature dimensionality
--num-class: number of classes
--dataset-name: name of the dataset
--max-seqlen: maximum sequence length during training
--Lambda: weight of the Co-Activity Loss in the total loss
--num-similar: number of similar video pairs per batch
--seed: random seed
--max-iter: number of training iterations
--feature-type: feature extractor used (I3D or UNT)
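All options have defaults, so a quick way to sanity-check them is to parse an explicit argument list in a Python session (the override values below are purely illustrative):

```python
import options

# Override two defaults explicitly; everything else keeps its default value.
args = options.parser.parse_args(['--lr', '1e-5', '--max-seqlen', '1200'])
print(args.lr, args.batch_size, args.num_class)  # 1e-05 10 20
```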
2. video_dataset.py handles dataset splitting and loading.
2.1 __init__()
__init__() first reads the dataset's configuration and loads the pre-extracted features and annotations, then calls train_test_idx() and classwise_feature_mapping().
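A condensed sketch of what __init__ does; the exact .npy file names are assumptions based on the repo's dataset layout and are not quoted from the source:

```python
import numpy as np

class Dataset():
    def __init__(self, args):
        self.dataset_name = args.dataset_name
        self.num_class = args.num_class
        self.feature_size = args.feature_size
        self.batch_size = args.batch_size
        self.t_max = args.max_seqlen
        # Pre-extracted features and annotations, one entry per video
        # (file names assumed, following the repo's layout).
        self.features = np.load(self.dataset_name + '-I3D-JOINTFeatures.npy',
                                encoding='bytes')
        self.labels = np.load(self.dataset_name + '-Annotations/labels_all.npy')
        self.classlist = np.load(self.dataset_name + '-Annotations/classlist.npy')
        self.subset = np.load(self.dataset_name + '-Annotations/subset.npy')
        # (multi-hot video-level labels are also precomputed from self.labels)
        self.trainidx, self.testidx, self.classwiseidx = [], [], []
        self.currenttestidx = 0
        self.train_test_idx()             # split into train / test indices
        self.classwise_feature_mapping()  # group train indices by class
```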
2.2 train_test_idx()
train_test_idx() splits the dataset into training and test sets, stored as index lists. On Thumos14 the 'validation' subset is the one used for training, since those are the untrimmed videos suitable for this task; everything else becomes the test set.

```python
def train_test_idx(self):
    for i, s in enumerate(self.subset):
        if s.decode('utf-8') == 'validation':  # Specific to Thumos14
            self.trainidx.append(i)  # training-set indices
        else:
            self.testidx.append(i)   # test-set indices
```
2.3 classwise_feature_mapping()
classwise_feature_mapping() groups the training videos by class:

```python
def classwise_feature_mapping(self):
    for category in self.classlist:
        # collect the indices of all training videos containing this class
        idx = []
        for i in self.trainidx:
            for label in self.labels[i]:
                if label == category.decode('utf-8'):
                    idx.append(i)
                    break
        self.classwiseidx.append(idx)
```
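For intuition, here is what classwiseidx ends up holding on a hypothetical toy split (labels invented for illustration):

```python
# Hypothetical toy example:
#   trainidx  = [0, 1, 2, 3]
#   labels[0] = ['Diving'];   labels[1] = ['Diving', 'HighJump']
#   labels[2] = ['HighJump']; labels[3] = ['Diving']
#   classlist = ['Diving', 'HighJump']
# After classwise_feature_mapping():
#   classwiseidx == [[0, 1, 3],  # training videos containing 'Diving'
#                    [1, 2]]     # training videos containing 'HighJump'
```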
2.4 load_data()
load_data() builds a training batch of batch_size videos: n_similar pairs of videos that share at least one action class, plus batch_size - 2*n_similar randomly drawn videos (with the defaults, 3 similar pairs + 4 random videos = 10 videos). It returns the stacked feature matrices and multi-hot labels; at test time it instead returns one test video at a time.

```python
def load_data(self, n_similar=3, is_training=True):
    if is_training == True:
        features = []
        labels = []
        idx = []
        # Load n_similar pairs of similar videos: for each randomly chosen
        # class, draw two videos from that class's bucket.
        rand_classid = np.random.choice(len(self.classwiseidx), size=n_similar)
        for rid in rand_classid:
            rand_sampleid = np.random.choice(len(self.classwiseidx[rid]), size=2)
            idx.append(self.classwiseidx[rid][rand_sampleid[0]])
            idx.append(self.classwiseidx[rid][rand_sampleid[1]])
        # idx now holds 2*n_similar = 6 indices.
        # Fill the rest of the batch with random training videos. These need
        # not be similar to anything: only the first 2*n_similar videos feed
        # the Co-Activity loss, while the whole batch feeds the MIL loss.
        rand_sampleid = np.random.choice(len(self.trainidx), size=self.batch_size - 2*n_similar)
        for r in rand_sampleid:
            idx.append(self.trainidx[r])
        # Return the batch's feature matrices (cropped/padded to t_max) and
        # multi-hot labels.
        return np.array([utils.process_feat(self.features[i], self.t_max) for i in idx]), np.array([self.labels_multihot[i] for i in idx])
    else:
        # Test mode: return one test video at a time, plus a `done` flag that
        # signals when the whole test set has been traversed.
        labs = self.labels_multihot[self.testidx[self.currenttestidx]]
        feat = self.features[self.testidx[self.currenttestidx]]
        if self.currenttestidx == len(self.testidx) - 1:
            done = True; self.currenttestidx = 0
        else:
            done = False; self.currenttestidx += 1
        return np.array(feat), np.array(labs), done
```
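utils.process_feat is what forces every training sample to exactly t_max snippets. A plausible sketch of its behavior, consistent with the zero-padding that the losses later skip via seq_len (details are assumptions, not quoted from the repo):

```python
import numpy as np

def process_feat(feat, length):
    """Sketch: force a (T, D) feature matrix to T == length.
    Long videos get a random temporal crop; short ones get zero padding."""
    if len(feat) > length:
        r = np.random.randint(len(feat) - length)  # random crop start
        return feat[r:r + length]
    # pad with all-zero rows at the end; these rows are ignored later
    # because seq_len counts only the non-zero snippets
    return np.pad(feat, ((0, length - len(feat)), (0, 0)),
                  mode='constant', constant_values=0)
```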
3. model.py is the model.
model.py implements the weakly-supervised module. It is a small network; the code maps directly onto the weakly-supervised formulas in the paper.
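In equation form (my paraphrase of the paper's weakly-supervised module, matching the code below): each snippet feature $x_t \in \mathbb{R}^{2048}$ is passed through a fully-connected layer and a classifier,

$$
z_t = \mathrm{ReLU}(W x_t + b), \qquad a_t = W_c z_t + b_c \in \mathbb{R}^{C},
$$

where $z_t$ is the task-adapted feature (returned as `x`) and the stacked $a_t$ over time form the per-snippet class activations (`element_logits`) consumed by both losses.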
```python
class Model(torch.nn.Module):
    def __init__(self, n_feature, n_class):
        super(Model, self).__init__()
        self.fc = nn.Linear(n_feature, n_feature)
        self.fc1 = nn.Linear(n_feature, n_feature)
        self.classifier = nn.Linear(n_feature, n_class)
        self.dropout = nn.Dropout(0.7)
        self.apply(weights_init)
        #self.train()

    def forward(self, inputs, is_training=True):
        x = F.relu(self.fc(inputs))
        if is_training:
            x = self.dropout(x)
        #x = F.relu(self.fc1(x))
        #if is_training:
        #    x = self.dropout(x)
        return x, self.classifier(x)
```
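A quick shape check (illustrative only): the linear layers act on the last dimension, so the temporal axis passes through untouched and every snippet gets its own class logits:

```python
import torch

# Batch of 10 videos, 1200 snippets each, 2048-d I3D features, 20 classes,
# using the Model class defined above.
model = Model(n_feature=2048, n_class=20)
feats = torch.randn(10, 1200, 2048)
x, element_logits = model(feats, is_training=False)
print(x.shape)               # torch.Size([10, 1200, 2048])
print(element_logits.shape)  # torch.Size([10, 1200, 20])
```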
4. train.py is the training module.
The core of train.py is computing the Multiple Instance Learning loss (MILL) and the Co-Activity Similarity loss (CASL); a sketch of how train() combines them follows, then each loss in turn.
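A condensed sketch of one training step. The exact weighting of the two losses is an assumption based on the --Lambda help text ("weight on Co-Activity Loss"), not a quote of the repo:

```python
import numpy as np
import torch

def train(itr, dataset, args, model, optimizer, logger, device):
    # Sketch only; logging is omitted.
    features, labels = dataset.load_data(n_similar=args.num_similar)
    # Count each video's real (non-padded) snippets: padding rows are all zero.
    seq_len = np.sum(np.max(np.abs(features), axis=2) > 0, axis=1)
    features = torch.from_numpy(features).float().to(device)
    labels = torch.from_numpy(labels).float().to(device)

    final_features, element_logits = model(features)
    milloss = MILL(element_logits, seq_len, args.batch_size, labels, device)
    casloss = CASL(final_features, element_logits, seq_len,
                   args.num_similar, labels, device)
    # Assumed combination: Lambda trades off the Co-Activity term.
    total_loss = (1 - args.Lambda) * milloss + args.Lambda * casloss

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
```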
4.1 MILL(), the multiple-instance learning loss:
```python
def MILL(element_logits, seq_len, batch_size, labels, device):
    ''' element_logits should be torch tensor of dimension (B, n_element, n_class),
        k should be numpy array of dimension (B,) indicating the top k locations to average over,
        labels should be a numpy array of dimension (B, n_class) of 1 or 0
        returns a scalar torch tensor (the MIL loss) '''
    # k per video: one eighth of its unpadded length, e.g. [18, 68, 20, 43, ...]
    k = np.ceil(seq_len/8).astype('int32')
    # normalize the multi-hot labels so each video's labels sum to 1
    labels = labels / torch.sum(labels, dim=1, keepdim=True)
    instance_logits = torch.zeros(0).to(device)
    for i in range(batch_size):
        # for video i, keep only its seq_len[i] real snippets, then take the
        # k[i] highest activations per class along the temporal axis
        tmp, _ = torch.topk(element_logits[i][:seq_len[i]], k=int(k[i]), dim=0)  # [k[i], n_class]
        instance_logits = torch.cat([instance_logits, torch.mean(tmp, 0, keepdim=True)], dim=0)  # appends a [1, n_class] row
    # cross-entropy between the normalized labels and the softmaxed
    # video-level logits, following the paper's MIL formula
    milloss = -torch.mean(torch.sum(Variable(labels) * F.log_softmax(instance_logits, dim=1), dim=1), dim=0)
    return milloss
```
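In formula form, the loop computes, for each video $j$ with $T_j$ valid snippets:

$$
k_j = \left\lceil \tfrac{T_j}{8} \right\rceil, \qquad
s_{jc} = \frac{1}{k_j}\sum_{t \,\in\, \text{top-}k_j(c)} a^{(j)}_{tc}, \qquad
L_{\mathrm{MIL}} = -\frac{1}{B}\sum_{j=1}^{B}\sum_{c=1}^{C} y_{jc}\,\log\big[\mathrm{softmax}(s_j)\big]_c,
$$

where $\text{top-}k_j(c)$ are the $k_j$ snippets with the highest activation for class $c$, and $y_j$ is the label vector normalized to sum to 1.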
4.2 CASL(), the Co-Activity Similarity loss:

```python
def CASL(x, element_logits, seq_len, n_similar, labels, device):
    ''' x is the torch tensor of feature from the last layer of model of dimension (B, n_element, n_feature),
        element_logits should be torch tensor of dimension (B, n_element, n_class)
        seq_len should be numpy array of dimension (B,)
        labels should be a numpy array of dimension (B, n_class) of 1 or 0 '''
    sim_loss = 0.
    n_tmp = 0.
    # videos 2i and 2i+1 form a similar pair
    for i in range(0, n_similar*2, 2):
        # normalize each class's activation scores along the temporal axis with softmax
        atn1 = F.softmax(element_logits[i][:seq_len[i]], dim=0)
        atn2 = F.softmax(element_logits[i+1][:seq_len[i+1]], dim=0)
        n1 = torch.FloatTensor([np.maximum(seq_len[i]-1, 1)]).to(device)
        n2 = torch.FloatTensor([np.maximum(seq_len[i+1]-1, 1)]).to(device)
        # class-wise feature vectors of the high- and low-attention regions
        Hf1 = torch.mm(torch.transpose(x[i][:seq_len[i]], 1, 0), atn1)
        Hf2 = torch.mm(torch.transpose(x[i+1][:seq_len[i+1]], 1, 0), atn2)
        Lf1 = torch.mm(torch.transpose(x[i][:seq_len[i]], 1, 0), (1 - atn1)/n1)
        Lf2 = torch.mm(torch.transpose(x[i+1][:seq_len[i+1]], 1, 0), (1 - atn2)/n2)
        # cosine distances between the feature vectors
        d1 = 1 - torch.sum(Hf1*Hf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Hf2, 2, dim=0))
        d2 = 1 - torch.sum(Hf1*Lf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Lf2, 2, dim=0))
        d3 = 1 - torch.sum(Hf2*Lf1, dim=0) / (torch.norm(Hf2, 2, dim=0) * torch.norm(Lf1, 2, dim=0))
        # ranking hinge loss (margin 0.5), restricted to the classes the pair shares
        sim_loss = sim_loss + 0.5*torch.sum(torch.max(d1-d2+0.5, torch.FloatTensor([0.]).to(device))*Variable(labels[i,:])*Variable(labels[i+1,:]))
        sim_loss = sim_loss + 0.5*torch.sum(torch.max(d1-d3+0.5, torch.FloatTensor([0.]).to(device))*Variable(labels[i,:])*Variable(labels[i+1,:]))
        n_tmp = n_tmp + torch.sum(Variable(labels[i,:])*Variable(labels[i+1,:]))
    # average over all (pair, shared class) combinations in the batch
    sim_loss = sim_loss / n_tmp
    return sim_loss
```
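Per similar pair $(m, n)$ and per shared class, with $d(\cdot,\cdot)$ the cosine distance computed above ($d_1, d_2, d_3$), the two hinge terms enforce that the pair's high-attention features are closer to each other than to either video's low-attention features:

$$
L_{mn} = \tfrac{1}{2}\max\big(0,\ d(\mathcal{H}_m,\mathcal{H}_n) - d(\mathcal{H}_m,\mathcal{L}_n) + \delta\big)
 + \tfrac{1}{2}\max\big(0,\ d(\mathcal{H}_m,\mathcal{H}_n) - d(\mathcal{L}_m,\mathcal{H}_n) + \delta\big),
$$

with margin $\delta = 0.5$ in the code. The final `sim_loss` sums this over all pairs and shared classes and divides by `n_tmp`, the number of such (pair, class) combinations.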
5. test.py is the testing module.
test.py mainly calls dmAP() and cmAP() to compute the detection mAP and the classification mAP, respectively.
For background on mAP, see: https://blog.csdn.net/better_boy/article/details/109334234
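The loop structure is roughly the following sketch (condensed; the variable names and the exact aggregation are assumptions, modeled on the MILL top-k scheme):

```python
import numpy as np
import torch
import torch.nn.functional as F

def test(itr, dataset, args, model, device):
    # Sketch only: iterate over the test videos one at a time and
    # aggregate per-snippet logits into a video-level prediction.
    predictions, gt_labels = [], []
    done = False
    while not done:
        features, labels, done = dataset.load_data(is_training=False)
        features = torch.from_numpy(features).float().to(device)
        with torch.no_grad():
            _, element_logits = model(features.unsqueeze(0), is_training=False)
        element_logits = element_logits.squeeze(0)
        # video-level class scores: mean of the top 1/8 snippets, as in MILL
        k = max(1, int(np.ceil(element_logits.shape[0] / 8)))
        topk, _ = torch.topk(element_logits, k=k, dim=0)
        predictions.append(F.softmax(torch.mean(topk, dim=0), dim=0).cpu().numpy())
        gt_labels.append(labels)
    # classification mAP over the whole test set; detection mAP additionally
    # thresholds element_logits over time to produce temporal segments
    cmap = cmAP(np.array(predictions), np.array(gt_labels))
```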
6. main.py
Finally, the main function, which ties the classes and functions above together.
6.1 Parse the arguments:

```python
args = options.parser.parse_args()
```

6.2 Load the dataset:

```python
dataset = Dataset(args)
```
6.3 Instantiate the model and the optimizer:

```python
model = Model(dataset.feature_size, dataset.num_class).to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005)
```
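Between these two steps the script also restores saved weights when --pretrained-ckpt is given, along the lines of:

```python
# Resume from a checkpoint if one was passed on the command line
if args.pretrained_ckpt is not None:
    model.load_state_dict(torch.load(args.pretrained_ckpt))
```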
6.4 Then run the training loop for max-iter iterations, calling train() every iteration and, every 500 iterations, saving a checkpoint and running test():

```python
for itr in range(args.max_iter):
    train(itr, dataset, args, model, optimizer, logger, device)
    if itr % 500 == 0 and not itr == 0:
        torch.save(model.state_dict(), './ckpt/' + args.model_name + '.pkl')
        test(itr, dataset, args, model, logger, device)
```