tracker/mixformer_online.py
H, W, _ = image.shape 400 720
self.frame_id += 1
x_patch_arr, resize_factor, x_amask_arr = sample_target(image, self.state, self.params.search_factor,
output_sz=self.params.search_size) # (x1, y1, w, h)
x_patch_arr 320 320 3 将image按目标框进行缩放和裁剪,形成一个320 x 320的正方形区域
search = self.preprocessor.process(x_patch_arr) 变为 1 3 320 320 适配后面的网络
out_dict, _ = self.network.forward_test(search, run_score_head=True)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
template, search = self.backbone.forward_test(search)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
search = getattr(self, f'stage{i}').forward_test(search) 放入一个三阶段的ConvolutionalVisionTransformer中
search = self.patch_embed(search) 1 64 320 320 -> 1 64 80 80
s_B, s_C, s_H, s_W = search.size() 1 64 80 80,C为经过卷积之后的通道数
search = rearrange(search, 'b c h w -> b (h w) c').contiguous() 1 6400 64
x = self.pos_drop(x) 实现dropout,1 6400 64
--for i, blk in enumerate(self.blocks):
x = blk.forward_test(x, s_H, s_W)//实现attention模块
res = x 1 6400 64
x = self.norm1(x) 1 6400 64
attn = self.attn.forward_test(x, s_h, s_w)
q_s, k, v = self.forward_conv_test(x, s_h, s_w) q_s 1 6400 64 k 1 2112 64 v 1 2112 64 通过一个网络
k = torch.cat([self.t_k, self.ot_k, k], dim=1) 1 256 64 1 256 64 1 1600 64 ------> 1 2112 64
q_s = rearrange(self.proj_q(q_s), 'b t (h d) -> b h t d', h=self.num_heads).contiguous() 1 1 6400 64 num_heads=1
k = rearrange(self.proj_k(k), 'b t (h d) -> b h t d', h=self.num_heads).contiguous() 1 1 2112 64
v = rearrange(self.proj_v(v), 'b t (h d) -> b h t d', h=self.num_heads).contiguous() 1 1 2112 64
attn_score = torch.einsum('bhlk,bhtk->bhlt', [q_s, k]) * self.scale 1 1 6400 2112 转置,对应元素相乘,求和 *0.125
attn = F.softmax(attn_score, dim=-1) 1 1 6400 2112
attn = self.attn_drop(attn)
x_s = torch.einsum('bhlt,bhtv->bhlv', [attn, v]) 转置,对应元素相乘,求和 1 1 6400 64
x_s = rearrange(x_s, 'b h t d -> b t (h d)').contiguous() 1 6400 64
x = x_s
x = self.proj(x) Linear(in_features=64, out_features=64, bias=True)
x = self.proj_drop(x)
x = res + self.drop_path(attn) 1 6400 64
x = x + self.drop_path(self.mlp(self.norm2(x))) 1 6400 64
mlp (fc1): Linear(in_features=64, out_features=256, bias=True) (act): QuickGELU() (fc2): Linear(in_features=256, out_features=64, bias=True) (drop): Dropout(p=0.0, inplace=False)
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x) 1 6400 64
--search = x 1 6400 64
search = rearrange(search, 'b (h w) c -> b c h w', h=s_H, w=s_W) 1 64 80 80
完成第0个stage,进行下一个
--------------------------------