GaitSet源代码解读(二)

书接上文,看完DataSet,回到initialization.py文件 

# -*- coding: utf-8 -*-
# @Author  : admin
# @Time    : 2018/11/15
import os
from copy import deepcopy #深拷贝

import numpy as np

from .utils import load_data #加载数据集
from .model import Model # 模型


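# Initialise the data sources from the `data` section of the config.
# Returns (train_source, test_source) dataset objects, not ndarrays.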
def initialize_data(config, train=False, test=False):
    """Build the train/test data sources and optionally preload them.

    Args:
        config: configuration mapping; ``config['data']`` is forwarded to
            ``load_data`` as keyword arguments.
        train: when truthy, call ``load_all_data()`` on the training source.
        test: when truthy, call ``load_all_data()`` on the test source.

    Returns:
        The ``(train_source, test_source)`` pair produced by ``load_data``.
    """
    print("Initializing data source...")
    # Caching is requested from load_data as soon as either split is preloaded.
    train_source, test_source = load_data(**config['data'], cache=(train or test))
    preload_plan = (
        (train, train_source, "Loading training data..."),
        (test, test_source, "Loading test data..."),
    )
    for wanted, source, message in preload_plan:
        if wanted:
            print(message)
            source.load_all_data()
    print("Data initialization complete.")
    return train_source, test_source

# 模型参数初始化,加载配置文件参数
def initialize_model(config, train_source, test_source):
    """Create the ``Model`` from the configuration and the two data sources.

    The keyword arguments passed to ``Model`` are a deep copy of
    ``config['model']`` extended with the data sources, the number of
    training identities and a generated ``save_name``.

    Args:
        config: configuration mapping with ``'data'`` and ``'model'`` sections.
        train_source: training data source handed to the model.
        test_source: test data source handed to the model.

    Returns:
        ``(model, save_name)`` where ``save_name`` joins the key
        hyper-parameters with underscores.
    """
    print("Initializing model...")
    data_cfg = config['data']
    model_cfg = config['model']

    # Work on a copy so the caller's config is not mutated.
    model_kwargs = deepcopy(model_cfg)
    model_kwargs['train_source'] = train_source
    model_kwargs['test_source'] = test_source
    model_kwargs['train_pid_num'] = data_cfg['pid_num']

    # batch_size is a sequence in the config; its product is the total
    # number of samples per batch and is what goes into the name.
    total_batch = int(np.prod(model_cfg['batch_size']))
    name_parts = [
        model_cfg['model_name'],
        data_cfg['dataset'],
        data_cfg['pid_num'],
        data_cfg['pid_shuffle'],
        model_cfg['hidden_dim'],
        model_cfg['margin'],
        total_batch,
        model_cfg['hard_or_full_trip'],
        model_cfg['frame_num'],
    ]
    model_kwargs['save_name'] = '_'.join(str(part) for part in name_parts)

    model = Model(**model_kwargs)
    print("Model initialization complete.")
    return model, model_kwargs['save_name']


def initialization(config, train=False, test=False):
    """Prepare the process environment, then build the data sources and model.

    Changes the working directory to ``config['WORK_PATH']``, exports
    ``CUDA_VISIBLE_DEVICES`` from the config, initialises the data sources
    and finally the model.

    Args:
        config: configuration mapping with ``'WORK_PATH'``,
            ``'CUDA_VISIBLE_DEVICES'``, ``'data'`` and ``'model'`` entries.
        train: forwarded to ``initialize_data`` (preload the training data).
        test: forwarded to ``initialize_data`` (preload the test data).

    Returns:
        Whatever ``initialize_model`` returns, i.e. ``(model, save_name)``.
    """
    print("Initialzing...")
    WORK_PATH = config['WORK_PATH']
    # Relative paths used later (e.g. the checkpoint directory) resolve
    # against this directory.
    os.chdir(WORK_PATH)
    # os.environ only accepts strings; coerce so a config that gives a bare
    # device id such as 0 does not raise TypeError.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(config["CUDA_VISIBLE_DEVICES"])
    train_source, test_source = initialize_data(config, train, test)
    print('train:',len(train_source))
    print("test",len(test_source))
    return initialize_model(config, train_source, test_source)

数据初始化完毕后,开始模型初始化,也就是initialize_model函数,这个函数加载了config.py文件的参数,然后送入Model,接下来,让我们进入model.py

import math
import os
import os.path as osp
import random
import sys
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.utils.data as tordata
from tensorboardX import SummaryWriter

from .network import TripletLoss, SetNet # 三元组损失,SetNet
from .utils import TripletSampler# 三元组采样器


class Model:
    def __init__(self,
                 hidden_dim,
                 lr,
                 hard_or_full_trip,
                 margin,
                 num_workers,
                 batch_size,
                 restore_iter,  # iteration to resume from (0 = start fresh)
                 total_iter,  # iteration at which training stops
                 save_name,  # prefix of the checkpoint file names
                 train_pid_num,  # number of identities in the training split
                 frame_num,  # frames drawn per sequence when sampling randomly
                 model_name,  # name of the checkpoint sub-directory
                 train_source,  # training data source
                 test_source,  # test data source
                 img_size=64):
        """Store the hyper-parameters and build encoder, loss and optimizer.

        Args:
            hidden_dim: output feature size passed to ``SetNet``.
            lr: learning rate of the Adam optimizer.
            hard_or_full_trip: ``'hard'`` or ``'full'``; selects which triplet
                loss ``fit`` optimises.
            margin: triplet-loss margin.
            num_workers: worker count for the data loaders.
            batch_size: pair ``(P, M)``; the loss is built for ``P * M``
                samples (e.g. ``(8, 16)`` in the walkthrough).
            restore_iter: iteration counter; ``fit`` loads this checkpoint
                when it is non-zero.
            total_iter: iteration at which ``fit`` stops.
            save_name: prefix of the checkpoint file names.
            train_pid_num: number of training identities (stored only).
            frame_num: frames per sequence for random frame sampling.
            model_name: sub-directory of ``checkpoint/`` for this model.
            train_source: training data source.
            test_source: test data source.
            img_size: stored only; not used elsewhere in this class.

        Both networks are moved to CUDA, so a CUDA device is required.
        """
        # Identity of this run and its data.
        self.save_name = save_name
        self.model_name = model_name
        self.train_pid_num = train_pid_num
        self.train_source = train_source
        self.test_source = test_source

        # Optimisation hyper-parameters.
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.hard_or_full_trip = hard_or_full_trip
        self.margin = margin
        self.frame_num = frame_num
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.P, self.M = batch_size

        # Training progress.
        self.restore_iter = restore_iter
        self.total_iter = total_iter

        self.img_size = img_size

        # Encoder and loss are both wrapped in DataParallel and moved to GPU.
        encoder = SetNet(self.hidden_dim).float()
        self.encoder = nn.DataParallel(encoder)
        loss_module = TripletLoss(
            self.P * self.M, self.hard_or_full_trip, self.margin).float()
        self.triplet_loss = nn.DataParallel(loss_module)
        self.encoder.cuda()
        self.triplet_loss.cuda()

        self.optimizer = optim.Adam(
            [{'params': self.encoder.parameters()}],
            lr=self.lr)

        # Running statistics, flushed each time ``fit`` prints a report.
        self.hard_loss_metric = []
        self.full_loss_metric = []
        self.full_loss_num = []
        self.dist_list = []
        self.mean_dist = 0.01

        # 'all' keeps every frame; ``fit`` switches this to 'random'.
        self.sample_type = 'all'

    def collate_fn(self, batch):
        # batch是一个list 大小是128,每一个list有5维 (frame*64*44,数字 0-frame,角度,bg-02,id),应该是调用for in trainloder的时候才会执行这个地方,生成规定的格式
        """
        其实这个函数就是自定义DataLoader如何取样本的
        改变的也是只有data,本来data是一个样本(这个样本包含许多轮廓图),然后经过select_frame有放回的取30帧,然后再做成batch
        :param batch:[30帧张量的data,view, seq_type, label, None]都是index索引对应的
        :return:
        """
        # print(len(batch))
        batch_size = len(batch)
        """
                data = [self.__loader__(_path) for _path in self.seq_dir[index]]
                feature_num代表的是data数据所包含的集合的个数,这里一直为1,因为读取的是
                  _seq_dir = osp.join(seq_type_path, _view)
                        seqs = os.listdir(_seq_dir)  # 遍历出所有的轮廓剪影
        """
        feature_num = len(batch[0][0])
        # print(batch[0][0])
        # print(batch[0][1])
        # print(batch[0][2])
        # print(batch[0][3])
        # print(batch[0][4])
        seqs = [batch[i][0] for i in range(batch_size)]  # 对应于data
        # print(len(seqs))
        frame_sets = [batch[i][1] for i in range(batch_size)]  # 对应于 frame_set
        view = [batch[i][2] for i in range(batch_size)]  # 对应于self.view[index]
        seq_type = [batch[i][3] for i in range(batch_size)]  # 对应于self.seq_type[index]
        label = [batch[i][4] for i in range(batch_size)]  # 对应于self.label[index]    # 这几段代码就是从batch中分别取batch_size个对应的seqs、frame_sets、view、seq_type、label
        batch = [seqs, view, seq_type, label, None]# batch重新组织了一下,不同于刚开始调入时候的batch格式了
        '''
                 这里的一个样本由 data, frame_set, self.view[index], self.seq_type[index], self.label[index]组成
        '''

        def select_frame(index):
            sample = seqs[index]
            frame_set = frame_sets[index]
            if self.sample_type == 'random':
                # 这里的random.choices是有放回的抽取样本,k是选取次数,这里的frame_num=30
                frame_id_list = random.choices(frame_set, k=self.frame_num)  # 从所有frame数量的帧中 选取30帧,组成一个list
                _ = [feature.loc[frame_id_list].values for feature in sample]  # _:(30帧,64,44)  .loc是使用标签进行索引、.iloc是使用行号进行索引
            else:
                _ = [feature.values for feature in sample]
            return _

        seqs = list(map(select_frame, range(len(seqs))))#选取的30帧样本的ndarray与len(seqs)=128做一个键值对,然后转成一个list   # seqs:128长度的list,每个list:(30,64,44)。 map函数意为将第二个参数(一般是数组)中的每一个项,处理为第一个参数的类型。

        if self.sample_type == 'random':
            seqs = [np.asarray([seqs[i][j] for i in range(batch_size)]) for j in range(feature_num)] #选取的是一个样本中的30帧,所以一个样本是一个集合,feature_num=1    # asarry和.array的作用都是转为ndarray, feature_num=1
        else:# 全采样的话,数据就不都是30帧了,所以需要补充。batch_frames应该是只有在全采样和多个显卡的时候才会用到,否则基本用不到,先不用管
            gpu_num = min(torch.cuda.device_count(), batch_size)
            batch_per_gpu = math.ceil(batch_size / gpu_num)
            batch_frames = [[
                                len(frame_sets[i])
                                for i in range(batch_per_gpu * _, batch_per_gpu * (_ + 1))
                                if i < batch_size
                                ] for _ in range(gpu_num)]
            if len(batch_frames[-1]) != batch_per_gpu:
                for _ in range(batch_per_gpu - len(batch_frames[-1])):
                    batch_frames[-1].append(0)
            max_sum_frame = np.max([np.sum(batch_frames[_]) for _ in range(gpu_num)])
            seqs = [[
                        np.concatenate([
                                           seqs[i][j]
                                           for i in range(batch_per_gpu * _, batch_per_gpu * (_ + 1))
                                           if i < batch_size
                                           ], 0) for _ in range(gpu_num)]
                    for j in range(feature_num)]
            seqs = [np.asarray([
                                   np.pad(seqs[j][_],
                                          ((0, max_sum_frame - seqs[j][_].shape[0]), (0, 0), (0, 0)),
                                          'constant',
                                          constant_values=0)
                                   for _ in range(gpu_num)])
                    for j in range(feature_num)]
            batch[4] = np.asarray(batch_frames)

        batch[0] = seqs
        return batch

    def fit(self):
        """Train the encoder with the triplet loss up to ``self.total_iter``.

        Resumes from the checkpoint at ``self.restore_iter`` when that is
        non-zero. Each iteration draws one batch from ``TripletSampler``,
        computes per-strip triplet losses and takes an Adam step. Every 10
        iterations a checkpoint is written and the averaged statistics are
        printed.

        Raises:
            ValueError: if ``self.hard_or_full_trip`` is neither ``'hard'``
                nor ``'full'``.
        """
        # Fail fast: an unknown mode used to surface only inside the loop as
        # an unbound ``loss`` variable.
        if self.hard_or_full_trip not in ('hard', 'full'):
            raise ValueError(
                "hard_or_full_trip must be 'hard' or 'full', got %r"
                % (self.hard_or_full_trip,))

        # Resume from a previous checkpoint when not starting at iteration 0.
        if self.restore_iter != 0:
            self.load(self.restore_iter)

        self.encoder.train()  # training mode (matters for dropout / batch norm)
        self.sample_type = 'random'
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr

        # Nothing left to do when resuming at or past the final iteration. The
        # sampler below never stops on its own, so without this guard the
        # loop would run forever.
        if self.restore_iter >= self.total_iter:
            return

        triplet_sampler = TripletSampler(self.train_source, self.batch_size)
        train_loader = tordata.DataLoader(
            dataset=self.train_source,
            batch_sampler=triplet_sampler,  # yields the index list of one whole batch
            collate_fn=self.collate_fn,  # turns the sample list into a mini-batch
            num_workers=self.num_workers)  # loader processes; 0 loads in the main process

        # Sorted label list; a label's position in it is its class index.
        train_label_set = list(self.train_source.label_set)
        train_label_set.sort()

        _time1 = datetime.now()
        for seq, view, seq_type, label, batch_frame in train_loader:
            # batch_frame is None under random sampling. It is only filled
            # when all frames are kept (see collate_fn), which is the test path.
            self.restore_iter += 1
            self.optimizer.zero_grad()

            for i in range(len(seq)):
                seq[i] = self.np2ts(seq[i]).float()
            if batch_frame is not None:
                batch_frame = self.np2ts(batch_frame).int()

            # feature: (batch, strips, hidden_dim); the second output is None.
            feature, label_prob = self.encoder(*seq, batch_frame)

            # Map each label to its index in the sorted label list.
            target_label = [train_label_set.index(l) for l in label]
            target_label = self.np2ts(np.array(target_label)).long()

            # The loss works strip by strip: (strips, batch, hidden_dim).
            # permute leaves the tensor non-contiguous, and the loss calls
            # view(), hence contiguous().
            triplet_feature = feature.permute(1, 0, 2).contiguous()
            # Repeat the labels once per strip: (strips, batch).
            triplet_label = target_label.unsqueeze(0).repeat(triplet_feature.size(0), 1)
            (full_loss_metric, hard_loss_metric, mean_dist, full_loss_num) = self.triplet_loss(triplet_feature, triplet_label)
            if self.hard_or_full_trip == 'hard':
                loss = hard_loss_metric.mean()
            elif self.hard_or_full_trip == 'full':
                loss = full_loss_metric.mean()  # average over the strips

            self.hard_loss_metric.append(hard_loss_metric.mean().data.cpu().numpy())
            self.full_loss_metric.append(full_loss_metric.mean().data.cpu().numpy())
            self.full_loss_num.append(full_loss_num.mean().data.cpu().numpy())  # count of non-zero losses
            self.dist_list.append(mean_dist.mean().data.cpu().numpy())

            # Skip the update when the loss is effectively zero.
            if loss > 1e-9:
                loss.backward()
                self.optimizer.step()

            # Report the wall time of each block of 100 iterations.
            if self.restore_iter % 100 == 0:
                print("100次训练时间",datetime.now() - _time1)
                _time1 = datetime.now()

            # Every 10 iterations: checkpoint, print the averages, reset them.
            if self.restore_iter % 10 == 0:
                self.save()
                print('iter {}:'.format(self.restore_iter), end='')
                print(', hard_loss_metric={0:.8f}'.format(np.mean(self.hard_loss_metric)), end='')
                print(', full_loss_metric={0:.8f}'.format(np.mean(self.full_loss_metric)), end='')
                print(', full_loss_num={0:.8f}'.format(np.mean(self.full_loss_num)), end='')
                self.mean_dist = np.mean(self.dist_list)
                print(', mean_dist={0:.8f}'.format(self.mean_dist), end='')
                print(', lr=%f' % self.optimizer.param_groups[0]['lr'], end='')
                print(', hard or full=%r' % self.hard_or_full_trip)
                sys.stdout.flush()
                self.hard_loss_metric = []
                self.full_loss_metric = []
                self.full_loss_num = []
                self.dist_list = []

            # Visualization using t-SNE
            # if self.restore_iter % 500 == 0:
            #     pca = TSNE(2)
            #     pca_feature = pca.fit_transform(feature.view(feature.size(0), -1).data.cpu().numpy())
            #     for i in range(self.P):
            #         plt.scatter(pca_feature[self.M * i:self.M * (i + 1), 0],
            #                     pca_feature[self.M * i:self.M * (i + 1), 1], label=label[self.M * i])
            #
            #     plt.show()

            # ">=" rather than "==" so the loop always terminates.
            if self.restore_iter >= self.total_iter:
                break

    # def ts2var(self, x):
    #     return autograd.Variable(x).cuda()

    def np2ts(self, x):
        return torch.from_numpy(x).cuda()

    def transform(self, flag, batch_size=1):
        """Extract features for every sequence of a data source.

        Puts the encoder in evaluation mode, switches frame sampling to
        ``'all'`` and walks the source in sequential order.

        Args:
            flag: ``'test'`` selects ``self.test_source``; any other value
                selects ``self.train_source``.
            batch_size: number of sequences collated per loader step.

        Returns:
            ``(features, views, seq_types, labels)`` where ``features`` is an
            ndarray with one flattened feature row per sequence and the three
            lists are in the same order.
        """
        self.encoder.eval()
        source = self.test_source if flag == 'test' else self.train_source
        self.sample_type = 'all'
        data_loader = tordata.DataLoader(
            dataset=source,
            batch_size=batch_size,
            sampler=tordata.sampler.SequentialSampler(source),
            collate_fn=self.collate_fn,
            num_workers=self.num_workers)

        feature_list = list()
        view_list = list()
        seq_type_list = list()
        label_list = list()

        # Inference only: no_grad avoids building autograd graphs, which saves
        # memory and leaves the extracted values unchanged.
        with torch.no_grad():
            for seq, view, seq_type, label, batch_frame in data_loader:
                for j in range(len(seq)):
                    seq[j] = self.np2ts(seq[j]).float()
                if batch_frame is not None:
                    batch_frame = self.np2ts(batch_frame).int()

                feature, _ = self.encoder(*seq, batch_frame)
                n, _, _ = feature.size()
                # Flatten all strips of a sequence into one feature vector.
                feature_list.append(feature.view(n, -1).data.cpu().numpy())
                view_list += view
                seq_type_list += seq_type
                label_list += label

        return np.concatenate(feature_list, 0), view_list, seq_type_list, label_list

    def save(self):
        os.makedirs(osp.join('checkpoint', self.model_name), exist_ok=True)
        torch.save(self.encoder.state_dict(),
                   osp.join('checkpoint', self.model_name,
                            '{}-{:0>5}-encoder.ptm'.format(
                                self.save_name, self.restore_iter)))
        torch.save(self.optimizer.state_dict(),
                   osp.join('checkpoint', self.model_name,
                            '{}-{:0>5}-optimizer.ptm'.format(
                                self.save_name, self.restore_iter)))

    # restore_iter: iteration index of the checkpoint to load
    def load(self, restore_iter):
        self.encoder.load_state_dict(torch.load(osp.join(
            'checkpoint', self.model_name,
            '{}-{:0>5}-encoder.ptm'.format(self.save_name, restore_iter))))
        self.optimizer.load_state_dict(torch.load(osp.join(
            'checkpoint', self.model_name,
            '{}-{:0>5}-optimizer.ptm'.format(self.save_name, restore_iter))))

从以上代码可以看到,fit函数就是训练函数:它先用TripletSampler采样组成batch,再把batch送入SetNet提取特征,最后用TripletLoss计算损失并反向传播。

然后进入sampler.py

import torch.utils.data as tordata
import random


class TripletSampler(tordata.sampler.Sampler):
    """Endless batch sampler that yields P identities x M sequences per batch.

    ``batch_size`` is the pair ``(P, M)``. Each yielded batch is a flat list
    of ``P * M`` dataset indices: ``P`` distinct identities drawn without
    replacement, then ``M`` sequence indices per identity drawn with
    replacement.

    The dataset must provide ``label_set`` (the identities), ``index_dict``
    (a label-indexed table of sample indices in which missing entries are
    marked with -1) and ``data_size``.
    """

    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        # Never terminates on its own; the training loop decides when to stop.
        while True:
            sample_indices = []
            # P distinct identities for this batch.
            pid_list = random.sample(list(self.dataset.label_set), self.batch_size[0])
            for pid in pid_list:
                # Every sample index recorded for this identity.
                _index = self.dataset.index_dict.loc[pid, :, :].values
                # Missing entries are -1, so 0 is a real sample index and must
                # be kept. The previous "> 0" test silently dropped sample 0.
                _index = _index[_index >= 0].flatten().tolist()
                # M sequences per identity, sampled with replacement.
                _index = random.choices(_index, k=self.batch_size[1])
                sample_indices += _index
            yield sample_indices

    def __len__(self):
        return self.dataset.data_size

紧接着进入gaitset.py文件

import torch
import torch.nn as nn
import numpy as np

from .basic_blocks import SetBlock, BasicConv2d


class SetNet(nn.Module):
    def __init__(self, hidden_dim):
        """Build the set-level trunk, the global branch and the per-strip projection.

        Args:
            hidden_dim: output size of the per-strip projection (256 in the
                walkthrough's configuration).
        """
        super(SetNet, self).__init__()
        self.hidden_dim = hidden_dim
        # Cumulative frame boundaries; set by forward() when frame counts are given.
        self.batch_frame = None

        # Set-level trunk: six convolutions applied through SetBlock.
        # Silhouettes are single-channel. Layers 2 and 4 pass True as the
        # second SetBlock argument, which enables pooling.
        _set_in_channels = 1
        _set_channels = [32, 64, 128]
        self.set_layer1 = SetBlock(BasicConv2d(_set_in_channels, _set_channels[0], 5, padding=2))
        self.set_layer2 = SetBlock(BasicConv2d(_set_channels[0], _set_channels[0], 3, padding=1), True)
        self.set_layer3 = SetBlock(BasicConv2d(_set_channels[0], _set_channels[1], 3, padding=1))
        self.set_layer4 = SetBlock(BasicConv2d(_set_channels[1], _set_channels[1], 3, padding=1), True)
        self.set_layer5 = SetBlock(BasicConv2d(_set_channels[1], _set_channels[2], 3, padding=1))
        self.set_layer6 = SetBlock(BasicConv2d(_set_channels[2], _set_channels[2], 3, padding=1))

        # Global branch (the MGP part): plain convolutions on the set-pooled maps.
        _gl_in_channels = 32
        _gl_channels = [64, 128]
        self.gl_layer1 = BasicConv2d(_gl_in_channels, _gl_channels[0], 3, padding=1)
        self.gl_layer2 = BasicConv2d(_gl_channels[0], _gl_channels[0], 3, padding=1)
        self.gl_layer3 = BasicConv2d(_gl_channels[0], _gl_channels[1], 3, padding=1)
        self.gl_layer4 = BasicConv2d(_gl_channels[1], _gl_channels[1], 3, padding=1)
        self.gl_pooling = nn.MaxPool2d(2)

        # Five horizontal-strip scales; both branches are split at every
        # scale, giving sum(bin_num) * 2 strips in total.
        self.bin_num = [1, 2, 4, 8, 16]
        # One independent 128 -> hidden_dim projection per strip, registered
        # as a parameter so the optimizer trains it.
        self.fc_bin = nn.ParameterList([nn.Parameter(nn.init.xavier_uniform_(torch.zeros(sum(self.bin_num) * 2, 128, hidden_dim)))])
        # self.fc_bin = nn.Parameter(
        #     nn.init.xavier_uniform_(
        #         torch.zeros(sum(self.bin_num) * 2, 128, hidden_dim)))
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Conv1d)):
                nn.init.xavier_uniform_(m.weight.data)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight.data)
                # constant_ / normal_ are the in-place initialisers; the
                # un-suffixed names are deprecated aliases.
                nn.init.constant_(m.bias.data, 0.0)
            elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
                nn.init.normal_(m.weight.data, 1.0, 0.02)
                nn.init.constant_(m.bias.data, 0.0)

    def frame_max(self, x):
        if self.batch_frame is None:
            return torch.max(x, 1)#第二维度选最大
        else:
            _tmp = [
                torch.max(x[:, self.batch_frame[i]:self.batch_frame[i + 1], :, :, :], 1)
                for i in range(len(self.batch_frame) - 1)
                ]
            max_list = torch.cat([_tmp[i][0] for i in range(len(_tmp))], 0)
            arg_max_list = torch.cat([_tmp[i][1] for i in range(len(_tmp))], 0)
            return max_list, arg_max_list

    def frame_median(self, x):
        if self.batch_frame is None:
            return torch.median(x, 1)#第二维度求平均
        else:
            _tmp = [
                torch.median(x[:, self.batch_frame[i]:self.batch_frame[i + 1], :, :, :], 1)
                for i in range(len(self.batch_frame) - 1)
                ]
            median_list = torch.cat([_tmp[i][0] for i in range(len(_tmp))], 0)
            arg_median_list = torch.cat([_tmp[i][1] for i in range(len(_tmp))], 0)
            return median_list, arg_median_list

    def forward(self, silho, batch_frame=None):#silho torch.Size([128, 30, 64, 44])128个样本 一个样本30帧  帧大小为64*44
        # n: batch_size, s: frame_num, k: keypoints_num, c: channel
        if batch_frame is not None:
            batch_frame = batch_frame[0].data.cpu().numpy().tolist()
            _ = len(batch_frame)
            for i in range(len(batch_frame)):
                if batch_frame[-(i + 1)] != 0:
                    break
                else:
                    _ -= 1
            batch_frame = batch_frame[:_]
            frame_sum = np.sum(batch_frame)
            if frame_sum < silho.size(1):
                silho = silho[:, :frame_sum, :, :]
            self.batch_frame = [0] + np.cumsum(batch_frame).tolist()
        n = silho.size(0)#n=128
        # print(silho.size())
        x = silho.unsqueeze(2)#在第三维上增加一个维度也就是torch.Size([128, 30, 1, 64, 44])增加的应该是channel的维度
        del silho

        x = self.set_layer1(x)#torch.Size([128, 30, 32, 64, 44])
        # print(x.size())
        x = self.set_layer2(x)#torch.Size([128, 30, 32, 32, 22])
        # print(x.size())
        # w = self.frame_max(x)[0]  torch.max()[0], 只返回最大值的每个数
        # print(w.size())
        # self.frame_max(x)[0]的返回值为 torch.Size([128, 32, 32, 22])  将一个样本的30帧的CNN提取出的特征,合成一帧的特征这就是SetPooling   torch.Size([128, 32, 32, 22])对应是batch_size,channel数,feature map大小(32*22)
        # 这里的self.frame_max相当于set pooling 采用了max统计函数
        gl = self.gl_layer1(self.frame_max(x)[0])#torch.Size([128, 64, 32, 22])
        # print(gl.size())
        gl = self.gl_layer2(gl)#torch.Size([128, 64, 32, 22])
        # print(gl.size())
        gl = self.gl_pooling(gl)#torch.Size([128, 64, 16, 11])
        # print(gl.size())

        x = self.set_layer3(x)#torch.Size([128, 30, 64, 32, 22])
        x = self.set_layer4(x)#torch.Size([128, 30, 64, 16, 11])
        gl = self.gl_layer3(gl + self.frame_max(x)[0])#图中的圆圈+真的就是+,维度一致都是torch.Size([128, 64, 16, 11]),加完还是torch.Size([128, 64, 16, 11]),送入gl_layer3之后为torch.Size([128, 128, 16, 11])
        gl = self.gl_layer4(gl)#torch.Size([128, 128, 16, 11])

        x = self.set_layer5(x)#torch.Size([128, 30, 128, 16, 11])
        x = self.set_layer6(x)#torch.Size([128, 30, 128, 16, 11])
        x = self.frame_max(x)[0]#torch.Size([128, 128, 16, 11])
        gl = gl + x#图中第二个圆圈+,加完维度是torch.Size([128, 128, 16, 11])

        feature = list()
        n, c, h, w = gl.size()#n,c,h,w对应torch.Size([128, 128, 16, 11])
        for num_bin in self.bin_num:# 这里的循环相当于对feature map运用HPP
            # z = x.view(n, c, num_bin, -1)
            # torch.Size([128, 128, 1, 176])
            # torch.Size([128, 128, 2, 88])
            # torch.Size([128, 128, 4, 44])
            # torch.Size([128, 128, 8, 22])
            # torch.Size([128, 128, 16, 11])
            z = x.view(n, c, num_bin, -1)# 按高度进行划分成strips
            # print(z.size())
            # print(z.max(3)[0].size())
            z = z.mean(3) + z.max(3)[0]#对最后一个维度求均值和求最大值,然后对应维度相加   # 应用maxpool和avgpool
            # torch.Size([128, 128, 1])
            # torch.Size([128, 128, 2])
            # torch.Size([128, 128, 4])
            # torch.Size([128, 128, 8])
            # torch.Size([128, 128, 16])
            feature.append(z)# z的形状为 n,c,num_bin  # 将主流水线中的feature map加入到feature中
            z = gl.view(n, c, num_bin, -1)# 对gl也运用HPP
            z = z.mean(3) + z.max(3)[0]
            feature.append(z)# 将gl中的feature map加入到feature中
        feature = torch.cat(feature, 2).permute(2, 0, 1).contiguous()#torch.Size([62, 128, 128])
        # print(feature.size())

        # 由于不同比例尺度上的条带描绘了不同的感受野特征,并且每个比例尺度上的不同条带描绘了不同的空间位置的特征,因此使用独立的FC很自然的
        # feature:62*128*128,self.fc_bin:62*128*256
        # 相当于62个条带,每个条带128维,那么对每个条带分别进行FC的映射
        feature = feature.matmul(self.fc_bin[0])
        # 这样经过全连接层计算之后就变成了 62*128*256
        feature = feature.permute(1, 0, 2).contiguous()#torch.Size([128, 62, 256])
        # print(feature.size())

        return feature, None

之后返回

feature, label_prob = self.encoder(*seq, batch_frame)#feature的维度是torch.Size([128, 62, 256]),label_prob=None  # 62的由来,两个31维度的特征concat成62维度

然后再看triplet.py

import torch
import torch.nn as nn
import torch.nn.functional as F


class TripletLoss(nn.Module):
    """Triplet loss evaluated independently for every feature strip.

    Both the batch-hard and the batch-all ("full") variants are computed in
    one pass; the caller picks the one to optimise.
    """

    def __init__(self, batch_size, hard_or_full, margin):
        """Store the batch size and margin; ``hard_or_full`` is accepted but not used here."""
        super(TripletLoss, self).__init__()
        self.batch_size = batch_size
        self.margin = margin

    def forward(self, feature, label):
        """Compute the per-strip triplet losses.

        Args:
            feature: tensor of shape (n, m, d): n strips, m samples, d dims.
            label: tensor of shape (n, m) with the identity of each sample.
                The reshapes below require every identity to occur the same
                number of times within a strip.

        Returns:
            ``(full_loss_mean, hard_loss_mean, mean_dist, full_loss_num)``,
            each of shape (n,).
        """
        n, m, _ = feature.size()
        # Flattened (n * m * m) masks over all ordered sample pairs.
        same_id = (label.unsqueeze(1) == label.unsqueeze(2)).bool().view(-1)
        diff_id = (label.unsqueeze(1) != label.unsqueeze(2)).bool().view(-1)

        pairwise = self.batch_dist(feature)  # (n, m, m)
        mean_dist = pairwise.mean(1).mean(1)  # (n,)
        flat_dist = pairwise.view(-1)

        # masked_select returns a 1-D tensor; the views below regroup it per anchor.
        pos_dist = torch.masked_select(flat_dist, same_id)
        neg_dist = torch.masked_select(flat_dist, diff_id)

        # Batch-hard: farthest positive and nearest negative of each anchor.
        hardest_pos = pos_dist.view(n, m, -1).max(2)[0]
        hardest_neg = neg_dist.view(n, m, -1).min(2)[0]
        hard_loss = F.relu(self.margin + hardest_pos - hardest_neg).view(n, -1)
        hard_loss_mean = hard_loss.mean(1)

        # Batch-all: every (positive, negative) combination of each anchor,
        # formed by broadcasting (n, m, P, 1) against (n, m, 1, N).
        all_pos = pos_dist.view(n, m, -1, 1)
        all_neg = neg_dist.view(n, m, 1, -1)
        full_loss = F.relu(self.margin + all_pos - all_neg).view(n, -1)

        full_loss_sum = full_loss.sum(1)
        # Only triplets with a non-zero loss take part in the average.
        full_loss_num = (full_loss != 0).sum(1).float()
        full_loss_mean = full_loss_sum / full_loss_num
        # 0 / 0 above yields NaN for strips without active triplets.
        full_loss_mean[full_loss_num == 0] = 0

        return full_loss_mean, hard_loss_mean, mean_dist, full_loss_num

    def batch_dist(self, x):
        """Return the pairwise Euclidean distances within each strip.

        Args:
            x: tensor of shape (n, m, d).

        Returns:
            Tensor of shape (n, m, m).
        """
        sq_norm = (x ** 2).sum(2)  # (n, m)
        gram = torch.matmul(x, x.transpose(1, 2))  # (n, m, m)
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
        sq_dist = sq_norm.unsqueeze(2) + sq_norm.unsqueeze(1) - 2 * gram
        # relu clamps small negative values caused by rounding before the sqrt.
        return torch.sqrt(F.relu(sq_dist))

最后返回fit函数,开始训练

至此,训练部分代码讲解完毕

下一篇文章给出测试部分的代码解析

​​​​​​​如果文章对大家有帮助,麻烦大家一键三连,谢谢大家!!!

  • 10
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 6
    评论
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值