解答 Q是怎么计算出来的
看model的forward部分
def forward(self, input_color_data, input_depth_data, is_volatile=False, specific_rotation=-1, goal_condition=None):
    """Compute per-rotation push/grasp(/place) prediction maps for inference.

    For each of ``self.num_rotations`` evenly spaced angles, the inputs are
    rotated, passed through the color and depth trunks and the action heads,
    and the head outputs are rotated back and upsampled.

    Args:
        input_color_data: Color input tensor; its ``size()`` is used to build
            the sampling grid for the input rotation.
        input_depth_data: Depth input tensor, resampled with the same grid.
        is_volatile: When true, run the inference path under ``torch.no_grad()``.
        specific_rotation: Accepted but not read anywhere in this excerpt.
        goal_condition: Accepted but not read anywhere in this excerpt.

    Returns:
        ``(output_prob, interm_feat)`` when ``is_volatile`` is true. Both are
        lists with one entry per rotation; each entry is itself a list ordered
        push, grasp and, if ``self.place`` is set, place. This excerpt has no
        branch for ``is_volatile=False`` and returns ``None`` in that case.
    """
    if is_volatile:
        with torch.no_grad():
            output_prob = []
            interm_feat = []

            # `Variable(..., volatile=True)` has been a warning-only no-op since
            # PyTorch 0.4; the enclosing no_grad() already disables autograd.
            # The device transfer does not depend on the rotation, so do it once
            # instead of once per rotation.
            color_data = input_color_data.cuda() if self.use_cuda else input_color_data
            depth_data = input_depth_data.cuda() if self.use_cuda else input_depth_data

            # The upsampling layer has no parameters, so one instance serves every head.
            upsample = nn.Upsample(scale_factor=self.upsample_scale, mode='bilinear', align_corners=self.align_corners)

            # Heads in the same order as the feature lists built below.
            heads = [self.pushnet, self.graspnet]
            if self.place:
                heads.append(self.placenet)

            # Apply rotations to images
            for rotate_idx in range(self.num_rotations):
                rotate_theta = np.radians(rotate_idx*(360/self.num_rotations))

                # Compute sample grid for rotation BEFORE neural network
                affine_mat_before = Variable(rot_to_affine_mat(-rotate_theta), requires_grad=False)
                if self.use_cuda:
                    affine_mat_before = affine_mat_before.cuda()
                flow_grid_before = F.affine_grid(affine_mat_before, input_color_data.size())

                # Rotate images clockwise
                # NOTE(review): this grid_sample and both affine_grid calls rely on the
                # library default for align_corners, while the output-side grid_sample
                # below passes self.align_corners. Confirm the mismatch is intended.
                rotate_color = F.grid_sample(color_data, flow_grid_before, mode='nearest')
                rotate_depth = F.grid_sample(depth_data, flow_grid_before, mode='nearest')

                # Compute intermediate features
                interm_push_color_feat = self.push_color_trunk.features(rotate_color)
                interm_push_depth_feat = self.push_depth_trunk.features(rotate_depth)
                interm_push_feat = torch.cat((interm_push_color_feat, interm_push_depth_feat), dim=1)
                interm_grasp_color_feat = self.grasp_color_trunk.features(rotate_color)
                interm_grasp_depth_feat = self.grasp_depth_trunk.features(rotate_depth)
                interm_grasp_feat = torch.cat((interm_grasp_color_feat, interm_grasp_depth_feat), dim=1)
                part_interm_feat = [interm_push_feat, interm_grasp_feat]
                if self.place:
                    interm_place_color_feat = self.place_color_trunk.features(rotate_color)
                    interm_place_depth_feat = self.place_depth_trunk.features(rotate_depth)
                    interm_place_feat = torch.cat((interm_place_color_feat, interm_place_depth_feat), dim=1)
                    part_interm_feat.append(interm_place_feat)
                interm_feat.append(part_interm_feat)

                # Compute sample grid for rotation AFTER branches
                affine_mat_after = Variable(rot_to_affine_mat(rotate_theta), requires_grad=False)
                if self.use_cuda:
                    affine_mat_after = affine_mat_after.cuda()
                flow_grid_after = F.affine_grid(affine_mat_after, interm_push_feat.size())

                # Forward pass through branches, undo rotation on output predictions, upsample results
                part_output_prob = [
                    upsample(F.grid_sample(head(feat), flow_grid_after, mode='nearest', align_corners=self.align_corners))
                    for head, feat in zip(heads, part_interm_feat)
                ]
                output_prob.append(part_output_prob)

        return output_prob, interm_feat
每次循环中,rotate_theta 是当前 rotate_idx 对应的旋转角度(弧度),循环共遍历 self.num_rotations 个角度(这里为16个)。先用 affine_grid 由旋转矩阵生成采样网格,再用 grid_sample 按该网格对 input_color_data 和 input_depth_data 进行采样,从而完成这16个角度的旋转
grid_sample和affine_grid
简单来说就是,提供一个input的Tensor以及一个对应的flow-field网格(比如光流,体素流等),然后根据grid中每个位置提供的坐标信息(这里指input中pixel的坐标),将input中对应位置的像素值填充到grid指定的位置,得到最终的输出。
torch.nn.functional.grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corners=None)
这里的input和output可以是图片,也可以是网络中的feature map。关键的处理过程在于grid:grid的最后一维的大小为2,表示input中pixel的位置信息 (x,y)。这里一般会将x和y的取值范围归一化到 [−1,1] 之间,(−1,−1) 表示input左上角的像素的坐标,(1,1) 表示input右下角的像素的坐标;对于超出这个范围的坐标 (x,y),函数将会根据参数padding_mode的设定进行不同的处理。
padding_mode='zeros':对于越界的位置在网格中采用pixel value=0进行填充。padding_mode='border':对于越界的位置在网格中采用边界的pixel value进行填充。padding_mode='reflection':对于越界的位置在网格中采用关于边界的对称值进行填充。
mode常用的有nearest和bilinear两种模式(较新版本的PyTorch还支持bicubic)。nearest就是直接采用与 (x,y) 距离最近处的像素值来填充;bilinear则是用 (x,y) 周围4个像素的值做双线性插值来填充
开始提取特征
interm_push_color_feat = self.push_color_trunk.features(rotate_color)
形式是这样的,其中self.push_color_trunk等主干网络(trunk)的定义如下,.features(rotate_color)表示只用其卷积特征提取部分
self.push_color_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.push_depth_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.grasp_color_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.grasp_depth_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.push_color_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.push_depth_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.grasp_color_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
self.grasp_depth_trunk = torchvision.models.densenet.densenet121(pretrained=pretrained)
vpg是用densenet121进行特征提取
再用中间特征通过pushnet或者graspnet,定义如下:
return nn.Sequential(OrderedDict([
# 进行数据的归一化处理,这使得数据在进行Relu之前不会因为数据过大而导致网络性能的不稳定
(name + '-norm0', nn.BatchNorm2d(first_fc)), # 批归一化2d 2048个输入特征图?
(name + '-relu0', nn.ReLU(inplace=True)),
(name + '-conv0', nn.Conv2d(first_fc, second_fc, kernel_size=1, stride=1, bias=False)),
(name + '-norm1', nn.BatchNorm2d(second_fc)), # 64个特征图
(name + '-relu1', nn.ReLU(inplace=True)),
(name + '-conv1', nn.Conv2d(second_fc, channels_out, kernel_size=1, stride=1, bias=False))
# ('push-upsample2', nn.Upsample(scale_factor=4, mode='bilinear'))
]))
中间特征先经过pushnet/graspnet做前向传递,再把结果旋转回原来的方向并上采样后返回。得到的其实就是每个像素对应的值,保存在output_prob中;移除额外的填充之后,就是每个像素对应的Q值了