Hi everyone! I've been reading through the DiMP code lately, and I keep finding that by the time I reach the later parts I've already forgotten the earlier ones. As my mom likes to say, "even the best memory is no match for the dullest pen", so today I decided to write down my notes on the code. There are bound to be plenty of mistakes, so please bear with me and feel free to discuss; comments and private messages are very welcome!
Last updated: 2020-12-14
This post focuses on the target-state determination code in DiMP. I recently re-downloaded the code and noticed it differs slightly from the version I had before.
Target state determination and parameter updates:
In the file pytracking/pytracking/tracker/dimp/dimp.py:
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    ...... (several lines of code omitted here) ......

    # ------- LOCALIZATION ------- #

    # Extract backbone features: sample image patches and extract their backbone features (see the function below)
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(im, self.get_centered_sample_pos(),
                                                                              self.target_scale * self.params.scale_factors,
                                                                              self.img_sample_sz)
    # Extract classification features: the features fed to the classifier, produced by a conv layer followed by an L2 normalization
    test_x = self.get_classification_features(backbone_feat)

    # Location of sample: sample_pos is the center position of each sample; also returns the sampling scales (details below)
    sample_pos, sample_scales = self.get_sample_location(sample_coords)

    # Compute classification scores: the classifier confidence scores
    scores_raw = self.classify_target(test_x)

    # Localize the target: enter the localize_target function (see below)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw, sample_pos, sample_scales)
    new_pos = sample_pos[scale_ind,:] + translation_vec    # new target center position

    # Update position and scale: if the target is not lost, update the state
    if flag != 'not_found':
        if self.params.get('use_iou_net', True):
            # this branch is taken with the default settings
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                # update the bounding-box center self.pos
                self.update_state(new_pos)
            # refine the box implied by the classifier output and update the related state; see self.refine_target_box below (the theory is in the ATOM paper)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind,:], sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    # update_flag is True unless the flag is 'not_found' or 'uncertain'
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.get('hard_negative_learning_rate', None) if hard_negative else None

    # update the classifier only when the state is neither 'not_found' nor 'uncertain'
    if update_flag and self.params.get('update_classifier', False):
        # Get train sample: add the current sample to the training set
        train_x = test_x[scale_ind:scale_ind+1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos[scale_ind,:], sample_scales[scale_ind])

        # Update the classifier model (and optionally plot the loss curve); see self.update_classifier below
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind,...])

    # Set the pos of the tracker to iounet pos: self.pos is updated again here, even though it was already updated inside self.refine_target_box
    if self.params.get('use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    score_map = s[scale_ind, ...]               # confidence score map
    max_score = torch.max(score_map).item()     # maximum confidence score

    # Visualize and set debug info
    self.search_area_box = torch.cat((sample_coords[scale_ind,[1,0]], sample_coords[scale_ind,[3,2]] - sample_coords[scale_ind,[1,0]] - 1))  # position and size of the search area
    self.debug_info['flag' + self.id_str] = flag              # target state flag for the current frame
    self.debug_info['max_score' + self.id_str] = max_score    # maximum confidence score

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # Compute output bounding box for the current frame
    new_state = torch.cat((self.pos[[1,0]] - (self.target_sz[[1,0]]-1)/2, self.target_sz[[1,0]]))

    if self.params.get('output_not_found_box', False) and flag == 'not_found':
        output_state = [-1, -1, -1, -1]
    else:
        output_state = new_state.tolist()

    out = {'target_bbox': output_state}
    return out
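As a quick sanity check of the coordinate convention used in the last few lines above: self.pos stores the target center as (row, col) and self.target_sz stores (height, width), while the output box is (x, y, w, h) with (x, y) the top-left corner. A tiny sketch with made-up numbers (not from a real run):

import torch

pos = torch.tensor([100., 200.])        # center: row 100, col 200
target_sz = torch.tensor([50., 80.])    # height 50, width 80
new_state = torch.cat((pos[[1, 0]] - (target_sz[[1, 0]] - 1) / 2, target_sz[[1, 0]]))
print(new_state)                        # tensor([160.5000,  75.5000,  80.0000,  50.0000]) = (x, y, w, h)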
Purpose of this function: extract backbone features from the sampled image patches.
def extract_backbone_features(self, im: torch.Tensor, pos: torch.Tensor, scales, sz: torch.Tensor):
    """ input
        im: frame t
        pos: center position of the sample
        scales: the sampling scale(s)
        sz: the sample size, e.g. 288
    output
        im_patches: image patches sampled from frame t
        patch_coords: top-left and bottom-right coordinates of the image patches
    """
    # sample_patch_multiscale is defined in preprocessing.py (see below)
    im_patches, patch_coords = sample_patch_multiscale(im, pos, scales, sz,
                                                       mode=self.params.get('border_mode', 'replicate'),
                                                       max_scale_change=self.params.get('patch_max_scale_change', None))
    with torch.no_grad():
        backbone_feat = self.net.extract_backbone(im_patches)
    return backbone_feat, patch_coords, im_patches
Purpose of this function: sample image patches of a given size, at one or more scales, around a given center position.
def sample_patch_multiscale(im, pos, scales, image_sz, mode: str='replicate', max_scale_change=None):
    """Extract image patches at multiple scales.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract image patches from.
        image_sz: Size to resize the image samples to.
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches: see the sample_patch function below; returns the sampled patches and their coordinates
    patch_iter, coord_iter = zip(*(sample_patch(im, pos, s*image_sz, image_sz, mode=mode,
                                                max_scale_change=max_scale_change) for s in scales))
    im_patches = torch.cat(list(patch_iter))
    patch_coords = torch.cat(list(coord_iter))

    return im_patches, patch_coords  # returned to extract_backbone_features
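A rough usage sketch with toy values (the import path follows the pytracking repo layout as I understand it; adjust if your copy differs):

import torch
from pytracking.features.preprocessing import sample_patch_multiscale

im = torch.rand(1, 3, 480, 640)            # one frame as a (1, C, H, W) tensor
pos = torch.tensor([240., 320.])           # sample center, (row, col)
scales = [2.0]                             # e.g. target_scale * scale_factors
image_sz = torch.Tensor([288., 288.])      # self.img_sample_sz
im_patches, patch_coords = sample_patch_multiscale(im, pos, scales, image_sz)
print(im_patches.shape)                    # torch.Size([1, 3, 288, 288]), one patch per scale
print(patch_coords.shape)                  # torch.Size([1, 4]), (y_tl, x_tl, y_br, x_br) per patch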
def sample_patch(im: torch.Tensor, pos: torch.Tensor, sample_sz: torch.Tensor, output_sz: torch.Tensor = None,
                 mode: str = 'replicate', max_scale_change=None, is_mask=False):
    """Sample an image patch.
    args:
        im: Image
        pos: center position of crop
        sample_sz: size to crop
        output_sz: size to resize to
        mode: how to treat image borders: 'replicate' (default), 'inside' or 'inside_major'
        max_scale_change: maximum allowed scale change when using 'inside' and 'inside_major' mode
    """

    # if mode not in ['replicate', 'inside']:
    #     raise ValueError('Unknown border mode \'{}\'.'.format(mode))

    # copy and convert
    posl = pos.long().clone()

    pad_mode = mode

    # Get new sample size if forced inside the image
    if mode == 'inside' or mode == 'inside_major':
        pad_mode = 'replicate'
        im_sz = torch.Tensor([im.shape[2], im.shape[3]])
        shrink_factor = (sample_sz.float() / im_sz)
        if mode == 'inside':
            shrink_factor = shrink_factor.max()
        elif mode == 'inside_major':
            shrink_factor = shrink_factor.min()
        shrink_factor.clamp_(min=1, max=max_scale_change)
        sample_sz = (sample_sz.float() / shrink_factor).long()

    # Compute pre-downsampling factor
    if output_sz is not None:
        # ratio of the sampled size to the output size
        resize_factor = torch.min(sample_sz.float() / output_sz.float()).item()
        # rounded down to an integer stride
        df = int(max(int(resize_factor - 0.1), 1))
    else:
        df = int(1)

    sz = sample_sz.float() / df     # new crop size after downsampling

    # Do downsampling
    if df > 1:
        os = posl % df              # offset
        posl = (posl - os) // df    # new position
        im2 = im[..., os[0].item()::df, os[1].item()::df]   # downsample
    else:
        im2 = im

    # compute size to crop (rounded to the nearest integer)
    szl = torch.max(sz.round(), torch.Tensor([2])).long()

    # Extract top and bottom coordinates: top-left and bottom-right corners from the center and size
    tl = posl - (szl - 1) // 2
    br = posl + szl//2 + 1

    # Shift the crop to inside
    if mode == 'inside' or mode == 'inside_major':
        im2_sz = torch.LongTensor([im2.shape[2], im2.shape[3]])
        shift = (-tl).clamp(0) - (br - im2_sz).clamp(0)
        tl += shift
        br += shift

        outside = ((-tl).clamp(0) + (br - im2_sz).clamp(0)) // 2
        shift = (-tl - outside) * (outside > 0).long()
        tl += shift
        br += shift

        # Get image patch
        # im_patch = im2[...,tl[0].item():br[0].item(),tl[1].item():br[1].item()]

    # Get image patch. F.pad crops wherever the padding value is negative (the crop corner lies
    # inside the image) and pads with pad_mode wherever it is positive (the corner lies outside)
    if not is_mask:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]), pad_mode)
    else:
        im_patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3], -tl[0].item(), br[0].item() - im2.shape[2]))

    # Get image coordinates of the patch in the original frame
    patch_coord = df * torch.cat((tl, br)).view(1,4)

    if output_sz is None or (im_patch.shape[-2] == output_sz[0] and im_patch.shape[-1] == output_sz[1]):
        return im_patch.clone(), patch_coord

    # Resample: resize the image patch to output_sz
    if not is_mask:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='bilinear')
    else:
        im_patch = F.interpolate(im_patch, output_sz.long().tolist(), mode='nearest')

    return im_patch, patch_coord  # return the sampled patch and its top-left / bottom-right coordinates
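The F.pad call above does double duty: negative padding values crop the tensor, positive ones pad it with the chosen mode. A small standalone demo with made-up corner values (mine, not from the tracker):

import torch
import torch.nn.functional as F

im2 = torch.arange(36, dtype=torch.float32).view(1, 1, 6, 6)   # toy 6x6 "image"
tl = torch.tensor([-1, 2])   # crop starts 1 px above the image and 2 px in from the left
br = torch.tensor([4, 8])    # crop ends inside the image vertically, 2 px past the right edge
patch = F.pad(im2, (-tl[1].item(), br[1].item() - im2.shape[3],
                    -tl[0].item(), br[0].item() - im2.shape[2]), 'replicate')
print(patch.shape)           # torch.Size([1, 1, 5, 6]) = (br - tl) in each spatial dimension

Also note the pre-downsampling factor df: if the crop is, say, 2.4x larger than output_sz, then int(2.4 - 0.1) = 2, so the image is first decimated by a stride of 2 before the final bilinear resize.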
Purpose of this function: get the sample center coordinates and the sampling scale (the ratio of the sampled region size to [288, 288]).
def get_sample_location(self, sample_coord):
    """Get the location of the extracted sample."""
    sample_coord = sample_coord.float()
    sample_pos = 0.5*(sample_coord[:,:2] + sample_coord[:,2:] - 1)   # sample center: sum the top-left and bottom-right corners (minus 1) and multiply by 0.5
    sample_scales = ((sample_coord[:,2:] - sample_coord[:,:2]) / self.img_sample_sz).prod(dim=1).sqrt()
    return sample_pos, sample_scales
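A tiny worked example of the two formulas above, assuming img_sample_sz = [288, 288] (the coordinates are made up):

import torch

img_sample_sz = torch.Tensor([288., 288.])
sample_coord = torch.tensor([[100., 150., 676., 726.]])     # (y_tl, x_tl, y_br, x_br)
sample_pos = 0.5 * (sample_coord[:, :2] + sample_coord[:, 2:] - 1)
sample_scales = ((sample_coord[:, 2:] - sample_coord[:, :2]) / img_sample_sz).prod(dim=1).sqrt()
print(sample_pos)     # tensor([[387.5000, 437.5000]]) - the crop center
print(sample_scales)  # tensor([2.]) - the 576x576 crop is twice the 288x288 sample size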
def localize_target(self, scores, sample_pos, sample_scales):
    """Run the target localization.
    inputs:
        scores: confidence scores output by the classifier
        sample_pos: sample center coordinates
        sample_scales: sampling scales
    """
    # score dimensions: (images_in_sequence, sequences, yH, yW) or (images_in_sequence, sequences, filters, yH, yW)
    scores = scores.squeeze(1)   # remove the sequence dimension

    preprocess_method = self.params.get('score_preprocess', 'none')   # default: 'none'
    if preprocess_method == 'none':
        pass
    elif preprocess_method == 'exp':
        scores = scores.exp()
    elif preprocess_method == 'softmax':
        reg_val = getattr(self.net.classifier.filter_optimizer, 'softmax_reg', None)
        scores_view = scores.view(scores.shape[0], -1)
        scores_softmax = activation.softmax_reg(scores_view, dim=-1, reg=reg_val)
        scores = scores_softmax.view(scores.shape)
    else:
        raise Exception('Unknown score_preprocess in params.')

    score_filter_ksz = self.params.get('score_filter_ksz', 1)   # default: 1
    if score_filter_ksz > 1:
        assert score_filter_ksz % 2 == 1
        kernel = scores.new_ones(1,1,score_filter_ksz,score_filter_ksz)
        scores = F.conv2d(scores.view(-1,1,*scores.shape[-2:]), kernel, padding=score_filter_ksz//2).view(scores.shape)

    if self.params.get('advanced_localization', False):
        # enter the localize_advanced function
        return self.localize_advanced(scores, sample_pos, sample_scales)

    # Get maximum
    score_sz = torch.Tensor(list(scores.shape[-2:]))
    score_center = (score_sz - 1)/2
    max_score, max_disp = dcf.max2d(scores)
    _, scale_ind = torch.max(max_score, dim=0)
    max_disp = max_disp[scale_ind,...].float().cpu().view(-1)
    target_disp = max_disp - score_center

    # Compute translation vector and scale change factor
    output_sz = score_sz - (self.kernel_size + 1) % 2
    translation_vec = target_disp * (self.img_support_sz