Hi everyone! I've been reading through the DiMP code lately, and I keep finding that by the time I reach the later parts I've already forgotten the earlier ones. As my mom likes to say, "even the best memory is no match for the dullest pen", so today I decided to write down my notes on the code. There are bound to be plenty of mistakes, so please bear with me and feel free to discuss; comments and private messages are very welcome!
Last updated: 2020-12-14
This post focuses on the target-state determination code in DiMP. I recently re-downloaded the code and noticed it differs slightly from the version I had before.
Target state determination and parameter updates:
In the file pytracking/pytracking/tracker/dimp/dimp.py:
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    ...... (several lines of code omitted here) ......

    # ------- LOCALIZATION ------- #

    # Extract backbone features: sample image patches and extract their backbone features (see the function below)
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(im, self.get_centered_sample_pos(),
                                                                              self.target_scale * self.params.scale_factors,
                                                                              self.img_sample_sz)
    # Extract classification features: the features fed to the classifier, produced by a conv layer followed by an L2 normalization
    test_x = self.get_classification_features(backbone_feat)

    # Location of sample: sample_pos is the center position of each sample; also returns the sampling scales (details below)
    sample_pos, sample_scales = self.get_sample_location(sample_coords)

    # Compute classification scores: the classifier confidence scores
    scores_raw = self.classify_target(test_x)

    # Localize the target: enter the localize_target function (see below)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw, sample_pos, sample_scales)
    new_pos = sample_pos[scale_ind,:] + translation_vec    # new target center position

    # Update position and scale: if the target is not lost, update the state
    if flag != 'not_found':
        if self.params.get('use_iou_net', True):
            # this branch is taken with the default settings
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                # update the bounding-box center self.pos
                self.update_state(new_pos)
            # refine the box implied by the classifier output and update the related state; see self.refine_target_box below (the theory is in the ATOM paper)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind,:], sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    # update_flag is True unless the flag is 'not_found' or 'uncertain'
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.get('hard_negative_learning_rate', None) if hard_negative else None

    # update the classifier only when the state is neither 'not_found' nor 'uncertain'
    if update_flag and self.params.get('update_classifier', False):
        # Get train sample: add the current sample to the training set
        train_x = test_x[scale_ind:scale_ind+1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos[scale_ind,:], sample_scales[scale_ind])

        # Update the classifier model (and optionally plot the loss curve); see self.update_classifier below
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind,...])

    # Set the pos of the tracker to iounet pos: self.pos is updated again here, even though it was already updated inside self.refine_target_box
    if self.params.get('use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    score_map = s[scale_ind, ...]               # confidence score map
    max_score = torch.max(score_map).item()     # maximum confidence score

    # Visualize and set debug info
    self.search_area_box = torch.cat((sample_coords[scale_ind,[1,0]], sample_coords[scale_ind,[3,2]] - sample_coords[scale_ind,[1,0]] - 1))  # position and size of the search area
    self.debug_info['flag' + self.id_str] = flag              # target state flag for the current frame
    self.debug_info['max_score' + self.id_str] = max_score    # maximum confidence score

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # Compute output bounding box for the current frame
    new_state = torch.cat((self.pos[[1,0]] - (self.target_sz[[1,0]]-1)/2, self.target_sz[[1,0]]))

    if self.params.get('output_not_found_box', False) and flag == 'not_found':
        output_state = [-1, -1, -1, -1]
    else:
        output_state = new_state.tolist()

    out = {'target_bbox': output_state}
    return out
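As a quick sanity check of the coordinate convention used in the last few lines above: self.pos stores the target center as (row, col) and self.target_sz stores (height, width), while the output box is (x, y, w, h) with (x, y) the top-left corner. A tiny sketch with made-up numbers (not from a real run):

import torch

pos = torch.tensor([100., 200.])        # center: row 100, col 200
target_sz = torch.tensor([50., 80.])    # height 50, width 80
new_state = torch.cat((pos[[1, 0]] - (target_sz[[1, 0]] - 1) / 2, target_sz[[1, 0]]))
print(new_state)                        # tensor([160.5000,  75.5000,  80.0000,  50.0000]) = (x, y, w, h)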
Purpose of this function: extract backbone features from the sampled image patches.
def extract_backbone_features(self, im: torch.Tensor, pos: torch.Tensor, scales, sz: torch.Tensor):
    """ input
        im: frame t
        pos: center position of the sample
        scales: the sampling scale(s)
        sz: the sample size, e.g. 288
    output
        im_patches: image patches sampled from frame t
        patch_coords: top-left and bottom-right coordinates of the image patches
    """
    # sample_patch_multiscale is defined in preprocessing.py (see below)
    im_patches, patch_coords = sample_patch_multiscale(im, pos, scales, sz,
                                                       mode=self.params.get('border_mode', 'replicate'),
                                                       max_scale_change=self.params.get('patch_max_scale_change', None))
    with torch.no_grad():
        backbone_feat = self.net.extract_backbone(im_patches)
    return backbone_feat, patch_coords, im_patches
Purpose of this function: sample image patches of a given size, at one or more scales, around a given center position.
def sample_patch_multiscale(im, pos, scales, image_sz, mode: str='replicate', max_scale_change=None):
    """Extract image patches at multiple scales.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract image patches from.
        image_sz: Size to resize the image samples to.
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches: see the sample_patch function below; returns the sampled patches and their coordinates
    patch_iter, coord_iter = zip(*(sample_patch(im, pos, s*image_sz, image_sz, mode=mode,
                                                max_scale_change=max_scale_change) for s in scales))
    im_patches = torch.cat(list(patch_iter))
    patch_coords = torch.cat(list(coord_iter))

    return im_patches, patch_coords  # returned to extract_backbone_features
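A rough usage sketch with toy values (the import path follows the pytracking repo layout as I understand it; adjust if your copy differs):

import torch
from pytracking.features.preprocessing import sample_patch_multiscale

im = torch.rand(1, 3, 480, 640)            # one frame as a (1, C, H, W) tensor
pos = torch.tensor([240., 320.])           # sample center, (row, col)
scales = [2.0]                             # e.g. target_scale * scale_factors
image_sz = torch.Tensor([288., 288.])      # self.img_sample_sz
im_patches, patch_coords = sample_patch_multiscale(im, pos, scales, image_sz)
print(im_patches.shape)                    # torch.Size([1, 3, 288, 288]), one patch per scale
print(patch_coords.shape)                  # torch.Size([1, 4]), (y_tl, x_tl, y_br, x_br) per patch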
def sample_patch(im: torch.Tensor, pos: torch.Tensor, sample_sz: torch.Tensor, output_sz: torch.Tensor = None,
                 mode: str = 'replicate', max_scale_change=None, is_mask=False):
    """Sample an image patch.
    args:
        im: Image
        pos: center position of crop
        sample_sz: size to crop
        output_sz: size to resize to
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """

    # if mode not in ['replicate', 'inside']:
    #     raise ValueError('Unknown border mode \'{}\'.'.format(mode))

    # copy and convert
    posl = pos.long().clone()

    pad_mode = mode

    # Get new sample size if forced inside the image
    if mode == 'inside' or mode == 'inside_major':
        pad_mode = 'replicate'
        im_sz = torch.Tensor([im.shape[2], im.shape[3]])
        shrink_factor = (sample_sz.float() / im_sz)
        if mode == 'inside':
            shrink_factor = shrink_factor.max()
        elif mode == 'inside_major':
            shrink_factor = shrink_factor.min()
        shrink_factor.clamp_(min=1, max=max_scale_change)
        sample_sz = (sample_sz.float() / shrink_factor).long()

    # Compute pre-downsampling factor
    if output_sz is not None:
        # ratio of the sampled size to the output size
        resize_factor = torch.min(sample_sz.float() / output_sz.float()).item()
        # rounded down to an integer stride
        df = int(max(int(resize_factor - 0.1), 1))
    else:
        df = int(1)

    sz = sample_sz.float() / df     # new crop size after downsampling

    # Do downsampling
    if df > 1:
        os = posl % df              # offset
        posl = (posl - os) // df    # new position
        im2 = im[..., os[0].item()::df, os[1].item()::df]   # downsample
    else:
        im2 = im

    # compute size to crop (rounded to the nearest integer)
    szl = torch.max(sz.round(), torch.Tensor([2])).long()

    # Extract top and bottom coordinates: top-left and bottom-right corners from the center and size
    tl = posl - (szl - 1) // 2
    br = posl + szl//2 + 1

    # Shift the crop to inside
    if mode == 'inside' or mode == 'inside_major':
        im2_sz = torch.LongTensor([im2.shape[2], im2.shape[3]])
        shift = (-tl).clamp(0) - (br - im2_sz).clamp(0)
        tl += shift
        br += shift

        outside = ((-tl).clamp(0) + (br - im2_sz).clamp(0)) // 2
        shift = (-tl - outside) * (outside > 0).long()
        tl += shift
        br += shift

        # Get image patch
        # im_patch = im2[...,tl[0].item():br[0].item(),tl[1].item():br[1].item()]

    # Get image patch. F.pad crops wherever the padding value is negative (the crop corner lies
    # inside the image) and pads with pad_mode wherever it is positive (the corner lies outside)
    if not is_mask:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]), pad_mode)
    else:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]))

    # Get image coordinates of the patch in the original frame
    patch_coord = df * torch.cat((tl, br)).view(1,4)

    if output_sz is None or (im_patch.shape[-2] == output_sz[0] and im_patch.shape[-1] == output_sz[1]):
        return im_patch.clone(), patch_coord

    # Resample: resize the image patch to output_sz
    if not is_mask:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='bilinear')
    else:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='nearest')

    return im_patch, patch_coord  # return the sampled patch and its top-left / bottom-right coordinates
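The F.pad call above does double duty: negative padding values crop the tensor, positive ones pad it with the chosen mode. A small standalone demo with made-up corner values (mine, not from the tracker):

import torch
import torch.nn.functional as F

im2 = torch.arange(36, dtype=torch.float32).view(1, 1, 6, 6)   # toy 6x6 "image"
tl = torch.tensor([-1, 2])   # crop starts 1 px above the image and 2 px in from the left
br = torch.tensor([4, 8])    # crop ends inside the image vertically, 2 px past the right edge
patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3],
                    -tl[0].item(), br[0].item() - im2.shape[2]), 'replicate')
print(patch.shape)           # torch.Size([1, 1, 5, 6]) = (br - tl) in each spatial dimension

Also note the pre-downsampling factor df: if the crop is, say, 2.4x larger than output_sz, then int(2.4 - 0.1) = 2, so the image is first decimated by a stride of 2 before the final bilinear resize.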
Purpose of this function: get the sample center coordinates and the sampling scale (the ratio of the sampled region size to [288, 288]).
def get_sample_location(self, sample_coord):
    """Get the location of the extracted sample."""
    sample_coord = sample_coord.float()
    sample_pos = 0.5*(sample_coord[:,:2] + sample_coord[:,2:] - 1)   # sample center: sum the top-left and bottom-right corners (minus 1) and multiply by 0.5
    sample_scales = ((sample_coord[:,2:] - sample_coord[:,:2]) / self.img_sample_sz).prod(dim=1).sqrt()
    return sample_pos, sample_scales
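A tiny worked example of the two formulas above, assuming img_sample_sz = [288, 288] (the coordinates are made up):

import torch

img_sample_sz = torch.Tensor([288., 288.])
sample_coord = torch.tensor([[100., 150., 676., 726.]])     # (y_tl, x_tl, y_br, x_br)
sample_pos = 0.5 * (sample_coord[:, :2] + sample_coord[:, 2:] - 1)
sample_scales = ((sample_coord[:, 2:] - sample_coord[:, :2]) / img_sample_sz).prod(dim=1).sqrt()
print(sample_pos)     # tensor([[387.5000, 437.5000]]) - the crop center
print(sample_scales)  # tensor([2.]) - the 576x576 crop is twice the 288x288 sample size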
def localize_target(self, scores, sample_pos, sample_scales):
    """Run the target localization.
    inputs:
        scores: confidence scores output by the classifier
        sample_pos: sample center coordinates
        sample_scales: sampling scales
    """
    # score dimensions: (images_in_sequence, sequences, yH, yW) or (images_in_sequence, sequences, filters, yH, yW)
    scores = scores.squeeze(1)   # remove the sequence dimension

    preprocess_method = self.params.get('score_preprocess', 'none')   # default: 'none'
    if preprocess_method == 'none':
        pass
    elif preprocess_method == 'exp':
        scores = scores.exp()
    elif preprocess_method == 'softmax':
        reg_val = getattr(self.net.classifier.filter_optimizer, 'softmax_reg', None)
        scores_view = scores.view(scores.shape[0], -1)
        scores_softmax = activation.softmax_reg(scores_view, dim=-1, reg=reg_val)
        scores = scores_softmax.view(scores.shape)
    else:
        raise Exception('Unknown score_preprocess in params.')

    score_filter_ksz = self.params.get('score_filter_ksz', 1)   # default: 1
    if score_filter_ksz > 1:
        assert score_filter_ksz % 2 == 1
        kernel = scores.new_ones(1,1,score_filter_ksz,score_filter_ksz)
        scores = F.conv2d(scores.view(-1,1,*scores.shape[-2:]), kernel, padding=score_filter_ksz//2).view(scores.shape)

    if self.params.get('advanced_localization', False):
        # enter the localize_advanced function
        return self.localize_advanced(scores, sample_pos, sample_scales)

    # Get maximum
    score_sz = torch.Tensor(list(scores.shape[-2:]))
    score_center = (score_sz - 1)/2
    max_score, max_disp = dcf.max2d(scores)
    _, scale_ind = torch.max(max_score, dim=0)
    max_disp = max_disp[scale_ind,...].float().cpu().view(-1)
    target_disp = max_disp - score_center

    # Compute translation vector and scale change factor
    output_sz = score_sz - (self.kernel_size + 1) % 2
    translation_vec = target_disp * (self.img_support_sz