MLP-based BEV视图变换流程和代码
pipeline:
1.backbone提取32倍,64倍下采样特征:
img_s32 = self.bb(img) # backbone: resnet34
img_s64 = self.down(img_s32)
2.RV feature -> BEV feature
也就是,使用MLP将32倍,64倍下采样的featmap尺寸的图像特征映射到相同的BEV空间尺寸的BEV特征:
bev_32 = self.s32transformer(img_s32)
bev_64 = self.s64transformer(img_s64)
3.将两个BEV特征concat起来
bev = torch.cat([bev_64, bev_32], dim=1)
用代码解释就是:
import torch
import torch.nn as nn
import torchvision
def naive_init_module(mod):
    """Initialize all conv / norm layers inside ``mod`` in place.

    Conv2d weights get Kaiming-normal (fan-out, ReLU) init; BatchNorm2d and
    GroupNorm get weight=1, bias=0. Returns ``mod`` so the call can be chained
    around a module constructor.
    """
    for layer in mod.modules():
        if isinstance(layer, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(layer.weight, 1)
            nn.init.constant_(layer.bias, 0)
        elif isinstance(layer, nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
    return mod
class Residual(nn.Module):
    """Residual wrapper: ``y = relu(module(x) + shortcut(x))``.

    ``downsample`` (optional) projects the input onto the output shape of
    ``module``; when it is None the input itself is used as the shortcut.
    """

    def __init__(self, module, downsample=None):
        super(Residual, self).__init__()
        self.module = module
        self.downsample = downsample
        self.relu = nn.ReLU()

    def forward(self, x):
        # Pick the skip path, add it to the transformed branch, then activate.
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(self.module(x) + shortcut)
class FCTransform_(nn.Module):
    """MLP-based view transformation (RV feature map -> BEV feature map).

    Maps an image-view feature map of shape ``(ic, ih, iw)`` to a BEV feature
    map of shape ``(sc, sh, sw)``: the spatial grid is flattened and remapped
    with two fully-connected layers, channels are adapted with a 1x1 conv, and
    the result is refined by a residual 3x3 conv block.
    """

    def __init__(self, image_featmap_size, space_featmap_size):
        super(FCTransform_, self).__init__()
        ic, ih, iw = image_featmap_size  # e.g. (256, 16, 16)
        sc, sh, sw = space_featmap_size  # e.g. (128, 16, 32)
        self.image_featmap_size = image_featmap_size
        self.space_featmap_size = space_featmap_size
        # Two FC layers remap the flattened spatial grid ih*iw -> sh*sw,
        # applied independently per channel.
        self.fc_transform = nn.Sequential(
            nn.Linear(ih * iw, sh * sw),
            nn.ReLU(),
            nn.Linear(sh * sw, sh * sw),
            nn.ReLU()
        )
        # 1x1 conv adapts the channel count ic -> sc in BEV space.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=ic, out_channels=sc, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(sc),
            nn.ReLU(), )
        self.residual = Residual(
            module=nn.Sequential(
                nn.Conv2d(in_channels=sc, out_channels=sc, kernel_size=3, padding=1, stride=1, bias=False),
                nn.BatchNorm2d(sc),
            ))

    def forward(self, x):
        # Flatten spatial dims: (B, C, H, W) -> (B, C, H*W).
        # ROBUSTNESS FIX: use reshape() instead of view() — view() raises
        # RuntimeError on non-contiguous inputs (e.g. after permute/slice);
        # reshape() is identical for contiguous tensors.
        x = x.reshape(list(x.size()[:2]) + [self.image_featmap_size[1] * self.image_featmap_size[2]])
        bev_view = self.fc_transform(x)  # remap the spatial layout per channel
        bev_view = bev_view.reshape(list(bev_view.size()[:2]) + [self.space_featmap_size[1], self.space_featmap_size[2]])
        bev_view = self.conv1(bev_view)
        bev_view = self.residual(bev_view)
        return bev_view
# model
# ResNet34 骨干网络 (self.bb),在 ImageNet 上进行预训练。
# 一个下采样层 (self.down),用于减小特征图的空间维度。
# 两个全连接变换层 (self.s32transformer 和 self.s64transformer),将 ResNet 骨干网络的特征图转换为 BEV 表示。
class VPN(nn.Module):  # BEV-LaneDet
    """MLP-based RV->BEV view transformation network.

    Components:
      - ``self.bb``: ResNet-34 backbone (ImageNet pretrained) without its
        avgpool/fc head, producing stride-32 features.
      - ``self.down``: residual stage halving the spatial size again
        (stride-32 -> stride-64 features).
      - ``self.s32transformer`` / ``self.s64transformer``: FC view transforms
        mapping each feature map to a (256, 25, 5) BEV grid.
    """

    def __init__(self):
        super(VPN, self).__init__()
        # backbone: resnet34, conv stages only (drop avgpool and fc)
        self.bb = nn.Sequential(*list(torchvision.models.resnet34(pretrained=True).children())[:-2])
        self.down = naive_init_module(
            Residual(
                module=nn.Sequential(
                    nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1),  # stride-64 features
                    nn.BatchNorm2d(1024),
                    nn.ReLU(),
                    nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1),
                    nn.BatchNorm2d(1024)
                ),
                downsample=nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1),
            )
        )
        # NOTE(review): the declared feature-map sizes assume a 640x640 input
        # (20x20 at stride 32, 10x10 at stride 64) — confirm against callers.
        self.s32transformer = FCTransform_((512, 20, 20), (256, 25, 5))
        self.s64transformer = FCTransform_((1024, 10, 10), (256, 25, 5))

    def forward(self, img):
        """Return the fused BEV feature map of shape (B, 512, 25, 5)."""
        # backbone: resnet34
        img_s32 = self.bb(img)        # (B, 512, H/32, W/32)
        img_s64 = self.down(img_s32)  # (B, 1024, H/64, W/64)
        bev_32 = self.s32transformer(img_s32)
        bev_64 = self.s64transformer(img_s64)
        bev = torch.cat([bev_64, bev_32], dim=1)
        # BUG FIX: the original forward computed `bev` but never returned it,
        # so callers received None.
        return bev
if __name__ == "__main__":
    # Smoke test: push one random 640x640 RGB image through the network.
    model = VPN()
    dummy_img = torch.rand(1, 3, 640, 640)
    output = model(dummy_img)
以32倍下采样feature map为例:
哪些model使用到了MLP-based的BEV视图变换方法
1 VPN(View Parsing Network)
Cross-view Semantic Segmentation for Sensing Surroundings
几乎上第一个探索BEV语义分割的任务
架构图
2 BEV-LaneDet
3 HDMapNet