# The KITTI dataset only provides 2D bboxes for the left image; to estimate
# 3D boxes from stereo, the 3D box must first be projected onto the right
# image to obtain its ground truth. The following code is adapted from
# Stereo RCNN:
#!/usr/bin/env python3
import numpy as np
import csv
import math as m
import os
def NormalizeVector(P):
    """Return *P* in homogeneous coordinates: flatten it and append a 1."""
    return np.concatenate((np.ravel(P), [1]))
def E2R(Ry, Rx, Rz):
    '''Combine Euler angles into a right-handed rotation matrix.

    Inputs:
        Ry, Rx, Rz : rotation angles along the y, x, z axes.
                     The KITTI dataset only provides Ry; Rz (roll) is
                     accepted for interface compatibility but ignored.
    Returns:
        3 x 3 rotation matrix, composed as R_pitch @ R_yaw.
    '''
    R_yaw = np.array([[ m.cos(Ry), 0, m.sin(Ry)],
                      [         0, 1,         0],
                      [-m.sin(Ry), 0, m.cos(Ry)]])
    R_pitch = np.array([[1,         0,          0],
                        [0, m.cos(Rx), -m.sin(Rx)],
                        [0, m.sin(Rx),  m.cos(Rx)]])
    # NOTE: roll (Rz) is intentionally not applied — KITTI labels carry yaw only.
    return (R_pitch.dot(R_yaw))
class KittiObject:
    """One labeled KITTI object plus its derived 2D/3D geometry."""
    def __init__(self):
        self.cls = ''          # category label, e.g. Car, Van, Truck
        self.truncate = 0      # truncation in [0, 1]: 0 = fully visible, 1 = fully truncated
        self.occlusion = 0     # occlusion state: integer 0, 1, 2 or 3
        self.alpha = 0         # observation (viewpoint) angle in [-pi, pi]
        # 2D boxes, in order: left image, right image, merged (union) box
        self.boxes = (Box2d(), Box2d(), Box2d())
        self.pos = []          # object position x, y, z in the cam2 frame
        self.dim = []          # size: width(x), height(y), length(z)
        self.orientation = 0   # yaw angle in [-pi, pi]
        self.R = []            # 3x3 rotation matrix in the cam2 frame
class Box2d:
    """A 2D bounding box in image coordinates plus projected keypoints."""
    def __init__(self):
        self.box = []            # [left, top, right, bottom] in the 2D image
        self.keypoints = []      # u coordinates of the 4 keypoints; -1 marks an invisible one
        self.visible_left = 0    # set when the object's left side is not occluded by another object
        self.visible_right = 0   # set when the object's right side is not occluded by another object
class FrameCalibrationData:
    '''Frame calibration holder.

    p0-p3                Camera P matrices (3x4). Contain extrinsic
                         and intrinsic parameters.
    p2_2, p2_3           Left/right camera P matrices re-based on camera 2.
    r0_rect              Rectification matrix (3x3), required to transform points
                         from the velodyne to the camera coordinate frame.
    t_cam2_cam0          Translation from the cam0 frame to the cam2 frame.
    tr_velodyne_to_cam0  Transform from velodyne to cam0 (3x4)
                         coordinate frame according to:
                             Point_Camera = P_cam * R0_rect *
                                            Tr_velo_to_cam *
                                            Point_Velodyne.
    '''
    def __init__(self):
        self.p0 = []
        self.p1 = []
        self.p2 = []
        self.p3 = []
        self.p2_2 = []  # added for consistency: read_obj_calibration assigns both p2_2 and p2_3
        self.p2_3 = []
        self.r0_rect = []
        self.t_cam2_cam0 = []
        self.tr_velodyne_to_cam0 = []
def Space2Image(P0, pts3):
    """Project a homogeneous 3D point onto the image plane.

    Inputs:
        P0   : 3 x 4 projection matrix.
        pts3 : homogeneous 3D point (4-vector).
    Returns:
        np.array([u, v]) pixel coordinates, truncated to int.
    """
    proj = P0.dot(pts3)
    u = int(proj[0] / proj[2])
    v = int(proj[1] / proj[2])
    return np.array([u, v])
def read_obj_calibration(CALIB_PATH):
    ''' Reads a calibration file from the KITTI dataset.

    Inputs:
        CALIB_PATH : str, path of the calibration file.
    Returns:
        frame_calibration_info : FrameCalibrationData
            Contains a frame's full calibration data.
      ^ z        ^ z          ^ z          ^ z
      | cam2     | cam0       | cam3       | cam1
      |-----> x  |-----> x    |-----> x    |-----> x
    '''
    frame_calibration_info = FrameCalibrationData()
    # 'with' closes the file even if parsing raises (original closed manually)
    with open(CALIB_PATH, 'r') as data_file:
        data = list(csv.reader(data_file, delimiter=' '))

    def _row_floats(row):
        # Drop the leading tag (e.g. 'P2:') and any empty tokens produced by
        # consecutive spaces, then parse the remaining fields as floats.
        return [float(tok) for tok in row[1:] if tok]

    # projection matrices P0..P3, based on camera 0
    p_all = [np.reshape(_row_floats(data[i]), (3, 4)) for i in range(4)]
    frame_calibration_info.p0 = p_all[0]
    frame_calibration_info.p1 = p_all[1]
    frame_calibration_info.p2 = p_all[2]
    frame_calibration_info.p3 = p_all[3]

    # left/right projection matrices re-based on camera 2
    frame_calibration_info.p2_2 = np.copy(p_all[2])
    frame_calibration_info.p2_2[0, 3] = frame_calibration_info.p2_2[0, 3] - frame_calibration_info.p2[0, 3]
    frame_calibration_info.p2_3 = np.copy(p_all[3])
    frame_calibration_info.p2_3[0, 3] = frame_calibration_info.p2_3[0, 3] - frame_calibration_info.p2[0, 3]

    # x-translation from cam0 to cam2, recovered from the P matrices
    frame_calibration_info.t_cam2_cam0 = np.zeros(3)
    frame_calibration_info.t_cam2_cam0[0] = (frame_calibration_info.p2[0, 3] - frame_calibration_info.p0[0, 3]) / frame_calibration_info.p2[0, 0]

    # rectification matrix
    frame_calibration_info.r0_rect = np.reshape(_row_floats(data[4]), (3, 3))

    # velodyne-to-cam0 transform
    frame_calibration_info.tr_velodyne_to_cam0 = np.reshape(_row_floats(data[5]), (3, 4))
    return frame_calibration_info
def read_obj_data(LABEL_PATH, calib=None, im_shape=None):
    '''Reads an object label file from the KITTI object dataset.

    Inputs:
        LABEL_PATH : str, path of the label file.
        calib : FrameCalibrationData for the same frame; used to shift the
                position to the cam2 frame and to project 3D box corners.
        im_shape : optional (height, width); when given, 2D boxes are
                   clipped to the image bounds.
    Returns:
        List of KittiObject : contains all the labeled data.
    '''
    objects = []
    # Read all label lines; 'with' guarantees the handle is closed
    # (the original leaked the open file object).
    with open(LABEL_PATH, 'r') as detection_data:
        detections = detection_data.readlines()
    for data_str in detections:
        data_list = data_str.split()
        # NOTE: objects of every class are kept (the class filter was disabled).
        object_it = KittiObject()
        object_it.cls = data_list[0]
        object_it.truncate = float(data_list[1])
        object_it.occlusion = int(data_list[2])
        object_it.alpha = float(data_list[3])
        # stored as width(x), height(y), length(z)
        object_it.dim = np.array([data_list[9], data_list[8], data_list[10]]).astype(float)
        # The KITTI GT 3D box is in the cam0 frame, while we work in the cam2
        # frame, so shift by the cam0->cam2 translation (~0.06 m).
        object_it.pos = np.array(data_list[11:14]).astype(float) + calib.t_cam2_cam0
        # The orientation definition is inconsistent with right-hand coordinates in KITTI.
        object_it.orientation = float(data_list[14]) + m.pi/2
        object_it.R = E2R(object_it.orientation, 0, 0)

        # 3D locations of the 8 corners of the 3D bounding box
        # (indices 0-3: bottom face, 4-7: top face).
        pts3_c_o = []
        pts3_c_o.append(object_it.pos + object_it.R.dot([-object_it.dim[0], 0, -object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([-object_it.dim[0], 0, object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([object_it.dim[0], 0, object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([object_it.dim[0], 0, -object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([-object_it.dim[0], -2.0*object_it.dim[1], -object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([-object_it.dim[0], -2.0*object_it.dim[1], object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([object_it.dim[0], -2.0*object_it.dim[1], object_it.dim[2]])/2.0)
        pts3_c_o.append(object_it.pos + object_it.R.dot([object_it.dim[0], -2.0*object_it.dim[1], -object_it.dim[2]])/2.0)

        # Initialize boxes so the min/max accumulation below works;
        # keypoints start as -1 (invisible).
        object_it.boxes[0].box = np.array([10000, 10000, 0, 0]).astype(float)
        object_it.boxes[1].box = np.array([10000, 10000, 0, 0]).astype(float)
        object_it.boxes[2].box = np.array([0.0, 0.0, 0.0, 0.0]).astype(float)
        object_it.boxes[0].keypoints = np.array([-1.0, -1.0, -1.0, -1.0]).astype(float)
        object_it.boxes[1].keypoints = np.array([-1.0, -1.0, -1.0, -1.0]).astype(float)

        for j in range(2):  # j == 0: left image box, j == 1: right image box
            for i in range(8):
                if pts3_c_o[i][2] < 0:  # corner behind the camera: skip
                    continue
                if j == 0:    # project 3D corner to the left image
                    pt2 = Space2Image(calib.p2_2, NormalizeVector(pts3_c_o[i]))
                elif j == 1:  # project 3D corner to the right image
                    pt2 = Space2Image(calib.p2_3, NormalizeVector(pts3_c_o[i]))
                if i < 4:
                    object_it.boxes[j].keypoints[i] = pt2[0]
                # grow the 2D box to cover every projected corner
                object_it.boxes[j].box[0] = min(object_it.boxes[j].box[0], pt2[0])
                object_it.boxes[j].box[1] = min(object_it.boxes[j].box[1], pt2[1])
                object_it.boxes[j].box[2] = max(object_it.boxes[j].box[2], pt2[0])
                object_it.boxes[j].box[3] = max(object_it.boxes[j].box[3], pt2[1])
            # clip the box to the image bounds
            object_it.boxes[j].box[0] = max(object_it.boxes[j].box[0], 0)
            object_it.boxes[j].box[1] = max(object_it.boxes[j].box[1], 0)
            if im_shape is not None:
                object_it.boxes[j].box[2] = min(object_it.boxes[j].box[2], im_shape[1]-1)
                object_it.boxes[j].box[3] = min(object_it.boxes[j].box[3], im_shape[0]-1)
            # deal with invisible keypoints
            left_keypoint, right_keypoint = 5000, 0
            left_inx, right_inx = -1, -1
            # 1. Select keypoints that lie on the left and right side of the 2D box.
            for i in range(4):
                if object_it.boxes[j].keypoints[i] < left_keypoint:
                    left_keypoint = object_it.boxes[j].keypoints[i]
                    left_inx = i
                if object_it.boxes[j].keypoints[i] > right_keypoint:
                    right_keypoint = object_it.boxes[j].keypoints[i]
                    right_inx = i
            # 2. For keypoints between the left and right side, keep only the
            #    visible (nearer-than-center) one; farther corners become -1.
            for i in range(4):
                if i == left_inx or i == right_inx:
                    continue
                if pts3_c_o[i][2] > object_it.pos[2]:
                    object_it.boxes[j].keypoints[i] = -1

        # the merged box is the union of the left and right boxes
        object_it.boxes[2].box[0] = min(object_it.boxes[1].box[0], object_it.boxes[0].box[0])
        object_it.boxes[2].box[1] = min(object_it.boxes[1].box[1], object_it.boxes[0].box[1])
        object_it.boxes[2].box[2] = max(object_it.boxes[1].box[2], object_it.boxes[0].box[2])
        object_it.boxes[2].box[3] = max(object_it.boxes[1].box[3], object_it.boxes[0].box[3])
        objects.append(object_it)
    return objects
## Re-save in the KITTI label format. Note that the left-image ground truth
## here is also obtained by projecting the 3D box, so it differs from the
## original labels.
def save_label(file, objects, calib, cam=3):
    '''Writes objects back to disk in the KITTI label format.

    Inputs:
        file : str, output label file path.
        objects : list of KittiObject to write.
        calib : FrameCalibrationData; its cam0->cam2 translation is removed
                again so the saved position is back in the cam0 frame.
        cam : 2 to save the left-image 2D boxes, 3 for the right-image boxes.
    '''
    # 'with' guarantees the file is flushed and closed (original leaked the handle)
    with open(file, "w") as f:
        for obj in objects:  # renamed from 'object' to avoid shadowing the builtin
            label = [obj.cls, obj.truncate, obj.occlusion, obj.alpha]
            if cam == 2:
                label.extend(obj.boxes[0].box)   # left-image 2D box
            elif cam == 3:
                label.extend(obj.boxes[1].box)   # right-image 2D box
            # KITTI stores dimensions as height, width, length
            label.extend([obj.dim[1], obj.dim[0], obj.dim[2]])
            # shift the position back from the cam2 frame to the cam0 frame
            label.extend(obj.pos - calib.t_cam2_cam0)
            # undo the +pi/2 applied when the orientation was read
            label.append(obj.orientation - m.pi/2)
            f.write(" ".join(str(i) for i in label) + '\n')
if __name__ == "__main__":
label2_files = os.listdir('label_2/')
calib_files = os.listdir('calib/')
path = os.getcwd()
for calib_file,label2_file in zip(calib_files,label2_files):
assert(calib_file==label2_file)
calib = read_obj_calibration(path+'/calib/'+calib_file)
objects = read_obj_data(path+'/label_2/'+label2_file,calib = calib)
label_name = calib_file
save_label("new_label/label_2/"+label_name, objects, calib=calib,cam=2)
save_label("new_label/label_3/"+label_name, objects, calib=calib,cam=3)
print("saving:",label_name)