BEVFusion 是在 MMDetection3D 代码框架上的二次开发,但所基于的版本比较早,坐标系的约定比较混乱。下面具体分析训练过程中的坐标变换,包括:点云数据增强、图像 3D 数据增强,以及 LSS 过程中对上述两种变换的使用。
1. 点云的数据增强
class GlobalRotScaleTrans:
    """Random global rotation, scaling and translation of a LiDAR sample.

    The same random transform is applied to the point cloud (``data["points"]``,
    when present) and to the ground-truth boxes (``data["gt_bboxes_3d"]``), and
    the equivalent 4x4 matrix is stored in ``data["lidar_aug_matrix"]`` so that
    later stages can undo or re-apply it.

    Args:
        resize_lim: ``(low, high)`` bounds passed to ``random.uniform`` for the scale.
        rot_lim: ``(low, high)`` bounds passed to ``random.uniform`` for the angle.
        trans_lim: Standard deviation passed to ``random.normal`` for each of
            the three translation components.
        is_train: When false no augmentation is sampled and the stored matrix
            is the identity.
    """

    def __init__(self, resize_lim, rot_lim, trans_lim, is_train):
        self.resize_lim = resize_lim
        self.rot_lim = rot_lim
        self.trans_lim = trans_lim
        self.is_train = is_train

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Augment ``data`` in place and return it.

        Returns:
            The same ``data`` dict, with ``"lidar_aug_matrix"`` set (and, when
            training, ``"points"`` / ``"gt_bboxes_3d"`` transformed).
        """
        transform = np.eye(4).astype(np.float32)

        if self.is_train:
            scale = random.uniform(*self.resize_lim)
            theta = random.uniform(*self.rot_lim)
            translation = np.array(
                [random.normal(0, self.trans_lim) for _ in range(3)]
            )
            rotation = np.eye(3)

            # Transform the point cloud with the points class' own
            # rotate / translate / scale methods.
            # NOTE: the points are rotated by -theta. Per the rotate()
            # implementations analysed in this note, the points class rotates
            # counter-clockwise, so -theta is a clockwise rotation by theta.
            if "points" in data:
                data["points"].rotate(-theta)
                data["points"].translate(translation)
                data["points"].scale(scale)

            # Transform the boxes with the box class' rotate / translate / scale.
            # NOTE: the boxes are rotated by +theta. Per this note's analysis
            # the box class rotates clockwise, so both calls turn the scene
            # the same way.
            gt_boxes = data["gt_bboxes_3d"]
            # rotate() is expected to return the rotation matrix it applied.
            rotation = rotation @ gt_boxes.rotate(theta).numpy()
            gt_boxes.translate(translation)
            gt_boxes.scale(scale)
            data["gt_bboxes_3d"] = gt_boxes

            # Record the transform. The matrix returned by rotate() is
            # transposed here so that, used as a left-multiplying matrix on
            # column vectors, it matches the rotation applied above.
            transform[:3, :3] = rotation.T * scale
            # Translation happens before scaling, so the effective offset is
            # translation * scale.
            transform[:3, 3] = translation * scale

        data["lidar_aug_matrix"] = transform
        # Return the sample: a bare ``return`` would hand ``None`` to the next
        # pipeline stage and contradict the declared return type.
        return data
base_points中的rotate
逆时针旋转theta角
# 逆时针旋转theta角
elif axis == 2 or axis == -1:
rot_mat_T = rotation.new_tensor(
[[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]
)
# 转置,顺时针旋转theta角
rot_mat_T = rot_mat_T.T
# 右乘,逆时针旋转theta角度
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
lidar_boxes中的rotate
顺时针旋转theta角
# Excerpt from the LiDAR boxes rotate().
# The matrix built below is the standard counter-clockwise rotation matrix
# R(theta) when used to left-multiply column vectors.
if angle.numel() == 1:
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat_T = self.tensor.new_tensor(
[[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]
)
# Right-multiplying row vectors by R(theta) equals applying R(theta)^T = R(-theta),
# i.e. the box centers are rotated clockwise by theta.
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
# Column 6 is increased by angle. Per the original note this is the yaw and the
# update matches a clockwise rotation under the lidar box yaw definition
# -- TODO confirm against the box class' yaw convention.
self.tensor[:, 6] += angle
针对 inno 数据的简单代码改动:
点坐标仍按顺时针旋转 theta 角;为了匹配 inno 真值框 yaw 的定义,yaw 的更新由 += angle 改为 -= angle
# Excerpt from the modified ("inno") boxes rotate().
# The matrix built below is the standard counter-clockwise rotation matrix
# R(theta) when used to left-multiply column vectors.
if angle.numel() == 1:
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat_T = self.tensor.new_tensor(
[[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]
)
# Right-multiplying row vectors by R(theta) equals applying R(theta)^T = R(-theta),
# i.e. the box centers are rotated clockwise by theta.
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
# A clockwise rotation by theta is a counter-clockwise rotation by -theta. Per the
# original note, the inno ground-truth yaw is defined so that column 6 must
# therefore decrease by angle -- TODO confirm against the inno box definition.
# Upstream original line: self.tensor[:, 6] += angle
self.tensor[:, 6] -= angle
2. 图像的3D增强
对应的图像进行顺序变换,resize => crop => flip => rotate,用的是PIL.img的函数
对应的矩阵也顺序的进行相乘,并进行保留,用于后续的变换对齐
图像的默认的坐标,X朝右,Y朝下,右手系。
先resize
然后处理crop,也就是平移
然后处理flip,
然后处理rotate,
PIL 的 Image.rotate 默认按逆时针旋转 theta 角;而图像坐标系的 Y 轴朝下,标准旋转矩阵 [[cos, -sin], [sin, cos]] 在该坐标系下视觉上表现为顺时针旋转。因此代码中的旋转矩阵取 -theta,即 [[cos, sin], [-sin, cos]],这样才与 PIL 的逆时针旋转保持一致。
还有一个要处理的是,旋转的中心不是坐标原点,而是图像中心点,因此也要特殊处理:
先平移 -b(把旋转中心,即图像中心,移到原点),再旋转 theta,最后平移 +b 移回原位,即 x' = A(x - b) + b = Ax + (b - Ab),对应代码中的 b = A.matmul(-b) + b;各步变换矩阵按左乘方式依次累积。
def img_transform(
    self, img, rotation, translation, resize, resize_dims, crop, flip, rotate
):
    """Augment one image and accumulate the matching 2D affine transform.

    The image goes through resize -> crop -> optional horizontal flip ->
    rotate. The same operations are folded into ``rotation`` (2x2) and
    ``translation`` (2,), so that ``rotation @ p + translation`` maps a pixel
    of the input image to the augmented image.

    Args:
        img: Image object exposing ``resize``, ``crop``, ``transpose`` and
            ``rotate``.
        rotation: 2x2 tensor; scaled in place by ``resize``.
        translation: length-2 tensor; shifted in place by the crop origin.
        resize: Scalar scale factor.
        resize_dims: Target size passed to ``img.resize``.
        crop: ``(left, upper, right, lower)`` crop box.
        flip: Whether to mirror the image left-right.
        rotate: Rotation angle in degrees, passed to ``img.rotate``.

    Returns:
        Tuple ``(img, rotation, translation)`` after augmentation.
    """
    # --- pixel-level operations on the image itself ---
    img = img.resize(resize_dims)
    img = img.crop(crop)
    if flip:
        img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
    img = img.rotate(rotate)

    # --- the same operations expressed as an affine map ---
    # Resize and crop: scale, then shift by the crop's top-left corner
    # (both performed in place on the caller's tensors).
    rotation *= resize
    translation -= torch.Tensor(crop[:2])

    if flip:
        # Mirror x and shift by the cropped width so x stays non-negative.
        mirror = torch.Tensor([[-1, 0], [0, 1]])
        shift = torch.Tensor([crop[2] - crop[0], 0])
        rotation = mirror.matmul(rotation)
        translation = mirror.matmul(translation) + shift

    # Rotation about the centre of the cropped image: x' = R (x - c) + c.
    angle = rotate / 180 * np.pi
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    rot2d = torch.Tensor(
        [
            [cos_a, sin_a],
            [-sin_a, cos_a],
        ]
    )
    center = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
    offset = rot2d.matmul(-center) + center
    rotation = rot2d.matmul(rotation)
    translation = rot2d.matmul(translation) + offset

    return img, rotation, translation
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
    """Augment every image in ``data["img"]`` and record the transforms.

    For each image a fresh augmentation is sampled with
    ``self.sample_augmentation``, applied through ``self.img_transform``
    starting from an identity 2D transform, and embedded into a 4x4 matrix.

    Returns:
        ``data`` with ``"img"`` replaced by the augmented images and
        ``"img_aug_matrix"`` set to the list of 4x4 NumPy matrices.
    """
    augmented_imgs = []
    aug_matrices = []

    for img in data["img"]:
        resize, resize_dims, crop, flip, rotate = self.sample_augmentation(data)
        new_img, rotation, translation = self.img_transform(
            img,
            torch.eye(2),
            torch.zeros(2),
            resize=resize,
            resize_dims=resize_dims,
            crop=crop,
            flip=flip,
            rotate=rotate,
        )

        # Embed the 2D affine map: 2x2 block top-left, offset in the last column.
        matrix = torch.eye(4)
        matrix[:2, :2] = rotation
        matrix[:2, 3] = translation

        augmented_imgs.append(new_img)
        aug_matrices.append(matrix.numpy())

    data["img"] = augmented_imgs
    # update the calibration matrices
    data["img_aug_matrix"] = aug_matrices
    return data
3. LSS的坐标变化的使用
3.1 lidar投影到image,获取image的depth
此时的点云是经过点云的数据增强的,首先进行lidar_aug的逆变换,恢复原始点云;
然后利用lidar2camera以及intrinsic,进行lidar to image的投影;
然后按照 image_aug 进行变换,这样就能得到点云各点在最终(增强后)图像上的像素坐标 u、v 以及深度 d,也就是 uvd;
# Project the points of sample b into the augmented images.
# Take the first three columns of this sample's points.
# NOTE(review): if this slice is a view of points[b], the in-place "-=" below
# also modifies points[b] -- confirm this is intended.
cur_coords = points[b][:, :3]
cur_img_aug_matrix = img_aug_matrix[b]
cur_lidar_aug_matrix = lidar_aug_matrix[b]
cur_lidar2image = lidar2image[b]
# inverse aug
# Undo the lidar augmentation: subtract its translation column, then apply the
# inverse of its upper-left 3x3 block. The transpose puts the coordinates on
# the first axis so the matrix can left-multiply them.
cur_coords -= cur_lidar_aug_matrix[:3, 3]
cur_coords = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
cur_coords.transpose(1, 0)
)
# lidar2image
# Apply the 3x3 block and the translation column of each lidar2image matrix
# (presumably one matrix per camera along the leading axis -- TODO confirm).
cur_coords = cur_lidar2image[:, :3, :3].matmul(cur_coords)
cur_coords += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
# get 2d coords
# NOTE(review): dist is taken before the clamp, but if this indexing returns a
# view the in-place assignment on the next line changes dist as well -- confirm.
dist = cur_coords[:, 2, :]
# Clamp the third component to [1e-5, 1e5], then divide the first two by it.
cur_coords[:, 2, :] = torch.clamp(cur_coords[:, 2, :], 1e-5, 1e5)
cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]
# imgaug
# Apply the image augmentation (3x3 block plus translation column).
cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords)
cur_coords += cur_img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
# Keep the first two components and move them to the last axis.
cur_coords = cur_coords[:, :2, :].transpose(1, 2)
# normalize coords for grid sample
# NOTE(review): this line only swaps the two components (index 1 first, then
# index 0); no normalisation is visible in this excerpt.
cur_coords = cur_coords[..., [1, 0]]
3.2 结合image的depth,以及前序的image的feature,预估出深度
通过dtransform,一系列的卷积,进行深度的特征提取,同时resize到特征图的尺度;
与前序的图像特征在通道维上拼接(concat)到一起;
通过depthnet,预估出新的特征,以及深度对应的权重;
def get_cam_feats(self, x, d):
    """Lift per-camera image features along a predicted depth distribution.

    Args:
        x: Image features of shape ``(B, N, C, fH, fW)``.
        d: Depth input whose first two dimensions are ``(B, N)``.

    Returns:
        Tensor of shape ``(B, N, self.D, fH, fW, self.C)``: each context
        feature weighted by the softmax depth probability of every depth bin.
    """
    batch, num_cams, feat_ch, feat_h, feat_w = x.shape

    # Merge the batch and camera axes so the 2D networks see plain images.
    depth_in = d.view(batch * num_cams, *d.shape[2:])
    img_feats = x.view(batch * num_cams, feat_ch, feat_h, feat_w)

    # Encode the depth input, stack it in front of the image features along
    # the channel axis, and run the joint depth/context network.
    depth_feats = self.dtransform(depth_in)
    fused = self.depthnet(torch.cat([depth_feats, img_feats], dim=1))

    # The first self.D channels are depth logits, the next self.C the context.
    depth_prob = fused[:, : self.D].softmax(dim=1)
    context = fused[:, self.D : (self.D + self.C)]

    # Outer product over (context channel, depth bin).
    lifted = depth_prob.unsqueeze(1) * context.unsqueeze(2)

    lifted = lifted.view(batch, num_cams, self.C, self.D, feat_h, feat_w)
    return lifted.permute(0, 1, 3, 4, 5, 2)
3.3 生成视锥,以及视锥点云到bev坐标的投影关系
视锥生成:在 image 下,每个 pixel 对应一系列离散的 depth,相当于每个像素沿视线方向生成一串点,所有像素合起来构成视锥点云;
def create_frustum(self):
    """Build the fixed grid of (x, y, depth) frustum points.

    Pixel coordinates are sampled evenly over the full image
    (``self.image_size``) at feature-map resolution (``self.feature_size``),
    and every pixel is paired with each depth produced by
    ``torch.arange(*self.dbound)``.

    Returns:
        A non-trainable ``nn.Parameter`` of shape ``(D, fH, fW, 3)`` whose
        last axis is ``(x, y, depth)``.
    """
    img_h, img_w = self.image_size
    feat_h, feat_w = self.feature_size

    depth_bins = torch.arange(*self.dbound, dtype=torch.float)
    num_bins = depth_bins.shape[0]
    grid_shape = (num_bins, feat_h, feat_w)

    # Depth varies only along the first axis.
    ds = depth_bins.view(-1, 1, 1).expand(*grid_shape)

    # Horizontal pixel coordinate varies only along the last axis.
    x_coords = torch.linspace(0, img_w - 1, feat_w, dtype=torch.float)
    xs = x_coords.view(1, 1, feat_w).expand(*grid_shape)

    # Vertical pixel coordinate varies only along the middle axis.
    y_coords = torch.linspace(0, img_h - 1, feat_h, dtype=torch.float)
    ys = y_coords.view(1, feat_h, 1).expand(*grid_shape)

    frustum = torch.stack((xs, ys, ds), -1)
    return nn.Parameter(frustum, requires_grad=False)
首先对image_aug进行逆变换;
将图像坐标 (u, v, d) 还原为 (u·d, v·d, d),即乘回深度、得到未做透视除法的坐标;
利用intrinsic以及外参,变化到lidar坐标系下;
然后进行lidar的数据增强操作;
def get_geometry(
    self,
    camera2lidar_rots,
    camera2lidar_trans,
    intrins,
    post_rots,
    post_trans,
    **kwargs,
):
    """Map the frustum points of every camera into the (augmented) lidar frame.

    Steps: undo the image augmentation, multiply the pixel coordinates back by
    depth, apply the inverse intrinsics and the camera-to-lidar transform, and
    finally apply the optional extra rotation / translation.

    Args:
        camera2lidar_rots: ``(B, N, 3, 3)`` rotations.
        camera2lidar_trans: ``(B, N, 3)`` translations.
        intrins: ``(B, N, 3, 3)`` camera intrinsics.
        post_rots: ``(B, N, 3, 3)`` image-augmentation matrices.
        post_trans: ``(B, N, 3)`` image-augmentation offsets.
        **kwargs: Optional ``extra_rots`` ``(B, 3, 3)`` and ``extra_trans``
            ``(B, 3)``, shared by all cameras of a sample.

    Returns:
        Tensor of shape ``(B, N, D, H, W, 3)`` with the point coordinates.
    """
    B, N, _ = camera2lidar_trans.shape
    per_cam = (B, N, 1, 1, 1)

    # undo post-transformation
    # B x N x D x H x W x 3
    pts = self.frustum - post_trans.view(*per_cam, 3)
    inv_post_rots = torch.inverse(post_rots).view(*per_cam, 3, 3)
    pts = inv_post_rots.matmul(pts.unsqueeze(-1))

    # cam_to_lidar
    # (u, v, d) -> (u * d, v * d, d) before applying the inverse intrinsics.
    depth = pts[..., 2:3, :]
    pts = torch.cat((pts[..., :2, :] * depth, depth), 5)

    cam2lidar = camera2lidar_rots.float().matmul(torch.inverse(intrins))
    pts = cam2lidar.view(*per_cam, 3, 3).matmul(pts).squeeze(-1)
    pts += camera2lidar_trans.view(*per_cam, 3)

    # Optional extra transform, repeated across the camera axis.
    if "extra_rots" in kwargs:
        extra_rots = kwargs["extra_rots"]
        tiled_rots = extra_rots.view(B, 1, 1, 1, 1, 3, 3).repeat(
            1, N, 1, 1, 1, 1, 1
        )
        pts = tiled_rots.matmul(pts.unsqueeze(-1)).squeeze(-1)
    if "extra_trans" in kwargs:
        extra_trans = kwargs["extra_trans"]
        pts += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(1, N, 1, 1, 1, 1)

    return pts
3.4 提取feature到bev pillar中
此处调用了封装的bev_pool函数。
def bev_pool(self, geom_feats, x):
    """Scatter frustum features into the voxel grid and flatten the Z axis.

    Args:
        geom_feats: Point coordinates matching ``x`` with a trailing axis of
            size 3.
        x: Features of shape ``(B, N, D, H, W, C)``.

    Returns:
        The pooled features with the slices along dim 2 of the pooled tensor
        concatenated on dim 1.

    Note:
        The ``bev_pool(...)`` call in the body refers to the pooling op
        defined outside this method, not to this method itself.
    """
    B, N, D, H, W, C = x.shape
    num_points = B * N * D * H * W

    # flatten x
    feats = x.reshape(num_points, C)

    # flatten indices: convert coordinates to integer voxel indices measured
    # from the lower edge of the first voxel.
    lower_edge = self.bx - self.dx / 2.0
    voxel_idx = ((geom_feats - lower_edge) / self.dx).long()
    voxel_idx = voxel_idx.view(num_points, 3)

    # Append the batch index of every point as a fourth column.
    per_sample = [
        torch.full([num_points // B, 1], ix, device=feats.device, dtype=torch.long)
        for ix in range(B)
    ]
    batch_idx = torch.cat(per_sample)
    voxel_idx = torch.cat((voxel_idx, batch_idx), 1)

    # filter out points that are outside box
    in_x = (voxel_idx[:, 0] >= 0) & (voxel_idx[:, 0] < self.nx[0])
    in_y = (voxel_idx[:, 1] >= 0) & (voxel_idx[:, 1] < self.nx[1])
    in_z = (voxel_idx[:, 2] >= 0) & (voxel_idx[:, 2] < self.nx[2])
    kept = in_x & in_y & in_z
    feats = feats[kept]
    voxel_idx = voxel_idx[kept]

    pooled = bev_pool(feats, voxel_idx, B, self.nx[2], self.nx[0], self.nx[1])

    # collapse Z
    return torch.cat(pooled.unbind(dim=2), 1)