通过物体的位置进行物体位姿掩码的自动标注
利用物体的CAD和物体的位置值进行掩码标注
我们参考 DeepMind 提供的标注工具,通过 SAM 构建了半自动化的标注工具,但对我们的数据集来说分割效果并不理想。不过我们场景中的物体具有 CAD 模型这一先验条件;如果没有 CAD 模型,也可以使用 3D 重建方法(例如 COLMAP)获取物体的 3D 模型和相机位姿,这同样是可行的方案。我们的做法是:通过 HALCON 获取相机的位姿值,将 CAD 模型转换为 mesh,再用 trimesh 渲染得到掩码。我们的代码来自物体六自由度位姿估计方法的数据集构造方法库
ObjectDatasetTools. 这是一个构建位姿估计数据集的通用库,在此记录一下具体的代码:
#通过物体的位姿值和 CAD 模型计算物体的掩码
# copy from https://github.com/BraveBoBo/ObjectDatasetTools
# 使用halcon的库进行计算 物体的实际位姿只能通过halcon的库进行计算
# 已知物体的位姿值
"""
create_label_files.py
---------------
This script produces:
1. Reorient the processed registered_scene mesh in a mesh with an AABB centered at the
origin and the same dimensions as the OBB, saved under the name foldername.ply
2. Create label files with class labels and projections of 3D BBs in the format
singleshotpose requires, saved under labels
3. Create pixel-wise masks, saved under mask
4. Save the homogeneous transform of object in regards to the foldername.ply in each
frame
"""
import os
import cv2
import glob
import trimesh
import numpy as np
import sys
from tqdm import trange
from scipy.optimize import minimize
import json
# Hard-coded HALCON camera intrinsic matrix K (fx, fy, principal point in
# pixels). NOTE(review): this constant is not referenced anywhere in the
# visible code — get_camera_intrinsic() reads intrinsics.json instead;
# confirm whether it is still needed.
cam_intrinsic = np.array([
[5446.400, 0.0, 925.576],
[0.0, 5447.326, 553.982],
[0.0, 0.0, 1.0]])
def get_camera_intrinsic(folder='./data/'):
    """Read camera intrinsics from ``<folder>intrinsics.json``.

    The JSON file must provide ``fx``, ``fy``, ``ppx`` and ``ppy``.
    Returns a tuple ``(camera_intrinsics, K)``: the raw JSON dict and the
    3x3 pinhole intrinsic matrix assembled from it.
    """
    with open(folder + 'intrinsics.json', 'r') as f:
        intrinsics = json.load(f)
    K = np.array([
        [float(intrinsics['fx']), 0.0, float(intrinsics['ppx'])],
        [0.0, float(intrinsics['fy']), float(intrinsics['ppy'])],
        [0.0, 0.0, 1.0],
    ], dtype='float64')
    return (intrinsics, K)
def compute_projection(points_3D, internal_calibration):
    """Project 3D points (camera frame) onto the image plane.

    points_3D: (N, 3) array of points.
    internal_calibration: 3x3 intrinsic matrix K.
    Returns a (2, N) float32 array of pixel coordinates obtained by the
    standard perspective division u = (K p) / z.
    """
    homogeneous = internal_calibration.dot(points_3D.T)
    projections_2d = np.empty((2, homogeneous.shape[1]), dtype='float32')
    projections_2d[0, :] = homogeneous[0, :] / homogeneous[2, :]
    projections_2d[1, :] = homogeneous[1, :] / homogeneous[2, :]
    return projections_2d
def print_usage():
    """Print command-line usage instructions for this script."""
    for line in (
        "Usage: create_label_files.py <path>",
        "path: all or name of the folder",
        "e.g., create_label_files.py all, create_label_files.py LINEMOD/Cheezit",
    ):
        print(line)
## 将txt转为npy文件 储存位姿值
def get_length(folder):
    """Return the number of directory entries inside *folder*."""
    entries = os.listdir(folder)
    return len(entries)
def load_txt(folder, id):
    """Load a 3x4 pose from ``<folder>/<id:06d>.txt`` as a 4x4 transform.

    folder: directory containing the per-frame pose text files.
    id: integer frame index; the file name is the zero-padded index.
    Returns the pose as a 4x4 homogeneous matrix (bottom row [0, 0, 0, 1]).
    """
    # Fix: the original built the path as folder + f"\{id:06d}" + ".txt" —
    # "\{" is an invalid escape sequence in an f-string and hard-codes the
    # Windows path separator. os.path.join is portable.
    pose = np.loadtxt(os.path.join(folder, f"{id:06d}.txt"))
    pose = pose.reshape(3, 4)
    return np.vstack((pose, [0.0, 0.0, 0.0, 1.0]))
if __name__ == "__main__":
classlabel ='target'
folder = './data/'
camera_intrinsics, K = get_camera_intrinsic(folder)
path_label = folder + "labels"
if not os.path.exists(path_label):
os.makedirs(path_label)
path_mask = folder + "mask"
if not os.path.exists(path_mask):
os.makedirs(path_mask)
path_transforms = folder + "transforms"
if not os.path.exists(path_transforms):
os.makedirs(path_transforms)
transforms_file = folder + "transforms.npy"
try:
transforms = np.load(transforms_file)
except:
print("transforms not computed, run compute_gt_poses.py first")
mesh = trimesh.load(os.path.join(folder,"model","color_target.ply"))
mesh.apply_scale(10)
Tform = mesh.apply_obb()
mesh.export(file_obj = "./data/model/target.ply")
points = mesh.bounding_box.vertices
center = mesh.centroid
min_x = np.min(points[:,0])
min_y = np.min(points[:,1])
min_z = np.min(points[:,2])
max_x = np.max(points[:,0])
max_y = np.max(points[:,1])
max_z = np.max(points[:,2])
points = np.array([[min_x, min_y, min_z], [min_x, min_y, max_z], [min_x, max_y, min_z],
[min_x, max_y, max_z], [max_x, min_y, min_z], [max_x, min_y, max_z],
[max_x, max_y, min_z], [max_x, max_y, max_z]])
points_original = np.concatenate((np.array([[center[0],center[1],center[2]]]), points))
points_original = trimesh.transformations.transform_points(points_original,
np.linalg.inv(Tform))
projections = [[],[]]
for i in trange(len(transforms)):
mesh_copy = mesh.copy()
img = cv2.imread(folder+"rgb/"+f"{i:06d}" + ".png")
transform = transforms[i]
# transform = np.linalg.inv(transforms[i]) # 根据实验这个halcon标注的位姿值是反的 不需要进行取反的操作
transformed = trimesh.transformations.transform_points(points_original, transform)
corners = compute_projection(transformed,K)
corners = corners.T
corners[:,0] = corners[:,0]/int(camera_intrinsics['width'])
corners[:,1] = corners[:,1]/int(camera_intrinsics['height'])
T = np.dot(transform, np.linalg.inv(Tform))
mesh_copy.apply_transform(T)
filename = path_transforms + "/"+ str(i)+".npy"
np.save(filename, T)
sample_points = mesh_copy.sample(10000)
masks = compute_projection(sample_points,K)
masks = masks.T
min_x = np.min(masks[:,0])
min_y = np.min(masks[:,1])
max_x = np.max(masks[:,0])
max_y = np.max(masks[:,1])
image_mask = np.zeros(img.shape[:2],dtype = np.uint8)
for pixel in masks:
cv2.circle(image_mask,(int(pixel[0]),int(pixel[1])), 5, 255, -1)
thresh = cv2.threshold(image_mask, 30, 255, cv2.THRESH_BINARY)[1]
contours, _ = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
cnt = max(contours, key=cv2.contourArea)
image_mask = np.zeros(img.shape[:2],dtype = np.uint8)
cv2.drawContours(image_mask, [cnt], -1, 255, -1)
mask_path = path_mask+"/"+ str(i)+".png"
cv2.imwrite(mask_path, image_mask)
file = open(path_label+"/"+ str(i)+".txt","w")
message = str(classlabel)[:8] + " "
file.write(message)
for pixel in corners:
for digit in pixel:
message = str(digit)[:8] + " "
file.write(message)
message = str((max_x-min_x)/float(camera_intrinsics['width']))[:8] + " "
file.write(message)
message = str((max_y-min_y)/float(camera_intrinsics['height']))[:8]
file.write(message)
file.close()