windows
colmap
下载解压后运行
数据准备
准备工作文件夹,images存放文件,sparse存放colmap结果文件
导出模型在sparse下的0文件夹
ubuntu
官方安装方法:
apt-get update
apt-get install colmap
官方包网址:https://debian.pkgs.org/11/debian-main-amd64/colmap_3.6+really3.6-1_amd64.deb.html
手动安装
sudo apt-get install \
git \
cmake \
build-essential \
libboost-program-options-dev \
libboost-filesystem-dev \
libboost-graph-dev \
libboost-system-dev \
libboost-test-dev \
libeigen3-dev \
libsuitesparse-dev \
libfreeimage-dev \
libmetis-dev \
libgoogle-glog-dev \
libgflags-dev \
libglew-dev \
qtbase5-dev \
libqt5opengl5-dev \
libcgal-dev
ubuntu16.04/18.04执行下面语句
sudo apt-get install libcgal-qt5-dev
下载Ceres Solver
注意如果有anaconda,将anaconda改名(名字加个1什么的),避免干扰Ceres编译
sudo apt-get install libatlas-base-dev libsuitesparse-dev
git clone https://ceres-solver.googlesource.com/ceres-solver
cd ceres-solver
git checkout $(git describe --tags) # Checkout the latest release
mkdir build
cd build
cmake .. -DBUILD_TESTING=OFF -DBUILD_EXAMPLES=OFF
make -j
sudo make install
如果访问不了google官网可以用下面替换,注意要切换到2.0.0或2.1.0分支,不能拉取master
git clone https://github.com/ceres-solver/ceres-solver.git
colmap克隆
git clone https://github.com/colmap/colmap
cd colmap
git checkout dev
mkdir build
cd build
cmake ..
make -j
sudo make install
验证
colmap -h
colmap gui
colmap处理数据和windows一样,准备好工作目录,新建images文件夹存放照片,新建sparse文件夹
图像特征提取
相机内参已知的话通过ImageReader.camera_params传入
colmap feature_extractor --database_path database.db --image_path images --ImageReader.camera_model SIMPLE_RADIAL --ImageReader.camera_params "1813.3334,1920,1080"
相机内参未知
colmap feature_extractor --database_path database.db --image_path images --ImageReader.camera_model SIMPLE_RADIAL
图像特征匹配
colmap exhaustive_matcher --database_path database.db
- exhaustive_matcher:针对少量图像(几百张量级),可以获得足够快且最好的重建结果。它将每张图像与其余所有图像进行匹配,不过
block size 可能限制同时加载到内存中的图像数量。 - sequential_matcher:针对顺序采集的视频图像,由于相邻帧存在视觉上的重叠且没有必要进行完全匹配,它只匹配视频流中的相邻帧。同时,这种匹配方式能够基于
vocabulary tree 进行回环检测。最后,帧之间的前后关系由图像文件名给定,与数据集中的存储顺序无关。 - vocab_tree_matcher:针对大量图像(几千帧量级),可以通过提供 vocabulary tree
从而快速检索视觉上最相近的图像进行匹配。 - spatial_matcher:针对能够提供准确定位信息的图像,可以通过对应图像采集时的 GPS 信息从而仅匹配空间位置上相近的图像。
- transitive_matcher:基于传递规则使用已有的特征匹配关系确定更完全的匹配图,即 A 与 B 匹配,B 与 C
匹配,那将直接匹配 A 和 C。 - Custom Matching:通过 text 文件指定图像的匹配关系,如果是导入的特征可以进一步指定两张图像之间特征的匹配关系。
相机位姿求解与优化
mkdir sparse
colmap mapper --database_path database.db --image_path images --output_path sparse --Mapper.ba_refine_principal_point true
colmap 代码
import os
import collections
import numpy as np
import struct
import argparse
# Lightweight record types mirroring COLMAP's reconstruction entities.
CameraModel = collections.namedtuple(
    "CameraModel", ["model_id", "model_name", "num_params"])
# Intrinsics of one camera; the params layout depends on the model name.
Camera = collections.namedtuple(
    "Camera", ["id", "model", "width", "height", "params"])
# One registered image: pose quaternion qvec (w, x, y, z), translation tvec,
# 2D keypoints xys and the ids of the 3D points they observe.
BaseImage = collections.namedtuple(
    "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
# One triangulated 3D point with color, reprojection error and its track.
Point3D = collections.namedtuple(
    "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])
class Image(BaseImage):
    """BaseImage extended with a rotation-matrix conversion helper."""

    def qvec2rotmat(self):
        # Delegates to the module-level qvec2rotmat() defined below.
        return qvec2rotmat(self.qvec)
# All camera models supported by COLMAP. The ids and parameter counts must
# stay in sync with COLMAP's camera model definitions.
CAMERA_MODELS = {
    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
    CameraModel(model_id=7, model_name="FOV", num_params=5),
    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
}
# Lookup tables: numeric id (used by the binary format) and model name
# (used by the text format) -> CameraModel.  Dict comprehensions replace
# the less idiomatic dict([...]) construction.
CAMERA_MODEL_IDS = {camera_model.model_id: camera_model
                    for camera_model in CAMERA_MODELS}
CAMERA_MODEL_NAMES = {camera_model.model_name: camera_model
                      for camera_model in CAMERA_MODELS}
def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
    """Read and unpack the next bytes from a binary file.

    :param fid: open binary file handle.
    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
    :param endian_character: Any of {@, =, <, >, !}
    :return: Tuple of read and unpacked values.
    """
    fmt = endian_character + format_char_sequence
    return struct.unpack(fmt, fid.read(num_bytes))
def write_next_bytes(fid, data, format_char_sequence, endian_character="<"):
    """Pack and write to a binary file.

    :param fid: open binary file handle.
    :param data: data to send; multiple elements should be encapsulated
        in a list or a tuple.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
        should be the same length as the data list or tuple
    :param endian_character: Any of {@, =, <, >, !}
    """
    fmt = endian_character + format_char_sequence
    # Fix: the packed buffer was previously named `bytes`, shadowing the
    # builtin type; use a non-conflicting name instead.
    if isinstance(data, (list, tuple)):
        packed = struct.pack(fmt, *data)
    else:
        packed = struct.pack(fmt, data)
    fid.write(packed)
def read_cameras_text(path):
    """Parse a COLMAP cameras.txt file into a dict {camera_id: Camera}.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasText(const std::string& path)
        void Reconstruction::ReadCamerasText(const std::string& path)
    """
    cameras = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            # Skip blank lines and '#' comment lines.
            if len(line) > 0 and line[0] != "#":
                # Line layout: CAMERA_ID MODEL WIDTH HEIGHT PARAMS[]
                elems = line.split()
                camera_id = int(elems[0])
                model = elems[1]
                width = int(elems[2])
                height = int(elems[3])
                # Remaining tokens are the model-dependent intrinsics.
                params = np.array(tuple(map(float, elems[4:])))
                cameras[camera_id] = Camera(id=camera_id, model=model,
                                            width=width, height=height,
                                            params=params)
    return cameras
def read_cameras_binary(path_to_model_file):
    """Parse a COLMAP cameras.bin file into a dict {camera_id: Camera}.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        # The file starts with the camera count as a uint64.
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_cameras):
            # Fixed-size header: camera_id (int32), model_id (int32),
            # width (uint64), height (uint64) -> 24 bytes.
            camera_properties = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_id = camera_properties[0]
            model_id = camera_properties[1]
            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
            width = camera_properties[2]
            height = camera_properties[3]
            # The number of float64 intrinsics depends on the camera model.
            num_params = CAMERA_MODEL_IDS[model_id].num_params
            params = read_next_bytes(fid, num_bytes=8*num_params,
                                     format_char_sequence="d"*num_params)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(params))
        assert len(cameras) == num_cameras
    return cameras
def write_cameras_text(cameras, path):
    """Write a dict {camera_id: Camera} as a COLMAP cameras.txt file.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasText(const std::string& path)
        void Reconstruction::ReadCamerasText(const std::string& path)
    """
    HEADER = "# Camera list with one line of data per camera:\n" + \
        "# CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n" + \
        "# Number of cameras: {}\n".format(len(cameras))
    with open(path, "w") as fid:
        fid.write(HEADER)
        # One whitespace-separated line per camera, params flattened.
        for cam in cameras.values():
            fields = [cam.id, cam.model, cam.width, cam.height, *cam.params]
            fid.write(" ".join(str(field) for field in fields) + "\n")
def write_cameras_binary(cameras, path_to_model_file):
    """Write a dict {camera_id: Camera} as a COLMAP cameras.bin file.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    with open(path_to_model_file, "wb") as fid:
        # Leading uint64: number of cameras.
        write_next_bytes(fid, len(cameras), "Q")
        for _, cam in cameras.items():
            model_id = CAMERA_MODEL_NAMES[cam.model].model_id
            camera_properties = [cam.id,
                                 model_id,
                                 cam.width,
                                 cam.height]
            write_next_bytes(fid, camera_properties, "iiQQ")
            # Model-dependent intrinsics, each as a float64.
            for p in cam.params:
                write_next_bytes(fid, float(p), "d")
    # NOTE(review): the other writers in this file return None; the return
    # value here is unusual but kept for backward compatibility.
    return cameras
def read_images_text(path):
    """Parse a COLMAP images.txt file into a dict {image_id: Image}.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesText(const std::string& path)
        void Reconstruction::WriteImagesText(const std::string& path)
    """
    images = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            # Skip blanks/comments; real entries come in pairs of lines.
            if len(line) > 0 and line[0] != "#":
                # First line: IMAGE_ID QW QX QY QZ TX TY TZ CAMERA_ID NAME
                elems = line.split()
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                camera_id = int(elems[8])
                # NOTE(review): only elems[9] is kept, so an image name that
                # contains spaces would be truncated here — confirm datasets
                # use space-free names.
                image_name = elems[9]
                # Second line: POINTS2D[] as repeated (X, Y, POINT3D_ID).
                elems = fid.readline().split()
                xys = np.column_stack([tuple(map(float, elems[0::3])),
                                       tuple(map(float, elems[1::3]))])
                point3D_ids = np.array(tuple(map(int, elems[2::3])))
                images[image_id] = Image(
                    id=image_id, qvec=qvec, tvec=tvec,
                    camera_id=camera_id, name=image_name,
                    xys=xys, point3D_ids=point3D_ids)
    return images
def read_images_binary(path_to_model_file):
    """Parse a COLMAP images.bin file into a dict {image_id: Image}.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    images = {}
    with open(path_to_model_file, "rb") as fid:
        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_reg_images):
            # Fixed header: image_id (int32), qvec (4 x float64),
            # tvec (3 x float64), camera_id (int32) -> 64 bytes.
            binary_image_properties = read_next_bytes(
                fid, num_bytes=64, format_char_sequence="idddddddi")
            image_id = binary_image_properties[0]
            qvec = np.array(binary_image_properties[1:5])
            tvec = np.array(binary_image_properties[5:8])
            camera_id = binary_image_properties[8]
            # Image name is stored as a null-terminated byte string.
            image_name = ""
            current_char = read_next_bytes(fid, 1, "c")[0]
            while current_char != b"\x00":  # look for the ASCII 0 entry
                image_name += current_char.decode("utf-8")
                current_char = read_next_bytes(fid, 1, "c")[0]
            num_points2D = read_next_bytes(fid, num_bytes=8,
                                           format_char_sequence="Q")[0]
            # Each observation is (x, y, point3D_id) = (f64, f64, int64).
            x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,
                                       format_char_sequence="ddq"*num_points2D)
            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
                                   tuple(map(float, x_y_id_s[1::3]))])
            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
            images[image_id] = Image(
                id=image_id, qvec=qvec, tvec=tvec,
                camera_id=camera_id, name=image_name,
                xys=xys, point3D_ids=point3D_ids)
    return images
def write_images_text(images, path):
    """Write a dict {image_id: Image} as a COLMAP images.txt file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesText(const std::string& path)
        void Reconstruction::WriteImagesText(const std::string& path)
    """
    # Mean number of 2D observations per image; 0 for an empty model.
    if images:
        mean_observations = sum(len(img.point3D_ids) for img in images.values()) / len(images)
    else:
        mean_observations = 0
    HEADER = "# Image list with two lines of data per image:\n" + \
        "# IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n" + \
        "# POINTS2D[] as (X, Y, POINT3D_ID)\n" + \
        "# Number of images: {}, mean observations per image: {}\n".format(len(images), mean_observations)
    with open(path, "w") as fid:
        fid.write(HEADER)
        for img in images.values():
            # First line: pose, camera id and image name.
            header_fields = [img.id, *img.qvec, *img.tvec, img.camera_id, img.name]
            fid.write(" ".join(str(field) for field in header_fields) + "\n")
            # Second line: flattened (x, y, point3D_id) triplets.
            observations = (" ".join(map(str, [*xy, point3D_id]))
                            for xy, point3D_id in zip(img.xys, img.point3D_ids))
            fid.write(" ".join(observations) + "\n")
def write_images_binary(images, path_to_model_file):
    """Write a dict {image_id: Image} as a COLMAP images.bin file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    with open(path_to_model_file, "wb") as fid:
        write_next_bytes(fid, len(images), "Q")
        for _, img in images.items():
            write_next_bytes(fid, img.id, "i")
            write_next_bytes(fid, img.qvec.tolist(), "dddd")
            write_next_bytes(fid, img.tvec.tolist(), "ddd")
            write_next_bytes(fid, img.camera_id, "i")
            # Name is written as a null-terminated string, one char at a time.
            # NOTE(review): a non-ASCII character encodes to multiple bytes,
            # which the single-byte "c" format cannot pack — assumes ASCII
            # image names; confirm against the dataset.
            for char in img.name:
                write_next_bytes(fid, char.encode("utf-8"), "c")
            write_next_bytes(fid, b"\x00", "c")
            write_next_bytes(fid, len(img.point3D_ids), "Q")
            # Each observation: x, y (float64) and point3D id (int64).
            for xy, p3d_id in zip(img.xys, img.point3D_ids):
                write_next_bytes(fid, [*xy, p3d_id], "ddq")
def read_points3D_text(path):
    """Parse a COLMAP points3D.txt file into a dict {point3D_id: Point3D}.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)
    """
    points3D = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                # Line layout: POINT3D_ID X Y Z R G B ERROR
                #              then repeated (IMAGE_ID, POINT2D_IDX) pairs.
                elems = line.split()
                point3D_id = int(elems[0])
                xyz = np.array(tuple(map(float, elems[1:4])))
                rgb = np.array(tuple(map(int, elems[4:7])))
                error = float(elems[7])
                # The track alternates image ids and 2D point indices.
                image_ids = np.array(tuple(map(int, elems[8::2])))
                point2D_idxs = np.array(tuple(map(int, elems[9::2])))
                points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb,
                                               error=error, image_ids=image_ids,
                                               point2D_idxs=point2D_idxs)
    return points3D
def read_points3D_binary(path_to_model_file):
    """Parse a COLMAP points3D.bin file into a dict {point3D_id: Point3D}.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)
    """
    points3D = {}
    with open(path_to_model_file, "rb") as fid:
        num_points = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_points):
            # Fixed header: id (uint64), xyz (3 x f64), rgb (3 x uint8),
            # error (f64) -> 8 + 24 + 3 + 8 = 43 bytes.
            binary_point_line_properties = read_next_bytes(
                fid, num_bytes=43, format_char_sequence="QdddBBBd")
            point3D_id = binary_point_line_properties[0]
            xyz = np.array(binary_point_line_properties[1:4])
            rgb = np.array(binary_point_line_properties[4:7])
            error = np.array(binary_point_line_properties[7])
            track_length = read_next_bytes(
                fid, num_bytes=8, format_char_sequence="Q")[0]
            # Track: (image_id, point2D_idx) pairs, each int32.
            track_elems = read_next_bytes(
                fid, num_bytes=8*track_length,
                format_char_sequence="ii"*track_length)
            image_ids = np.array(tuple(map(int, track_elems[0::2])))
            point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))
            points3D[point3D_id] = Point3D(
                id=point3D_id, xyz=xyz, rgb=rgb,
                error=error, image_ids=image_ids,
                point2D_idxs=point2D_idxs)
    return points3D
def write_points3D_text(points3D, path):
    """Write a dict {point3D_id: Point3D} as a COLMAP points3D.txt file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)
    """
    # Mean track length across all points; 0 for an empty model.
    if points3D:
        mean_track_length = sum(len(pt.image_ids) for pt in points3D.values()) / len(points3D)
    else:
        mean_track_length = 0
    HEADER = "# 3D point list with one line of data per point:\n" + \
        "# POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n" + \
        "# Number of points: {}, mean track length: {}\n".format(len(points3D), mean_track_length)
    with open(path, "w") as fid:
        fid.write(HEADER)
        for pt in points3D.values():
            head = [pt.id, *pt.xyz, *pt.rgb, pt.error]
            fid.write(" ".join(map(str, head)) + " ")
            # Track: flattened (image_id, point2D_idx) pairs.
            track = (" ".join(map(str, [image_id, point2D]))
                     for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs))
            fid.write(" ".join(track) + "\n")
def write_points3D_binary(points3D, path_to_model_file):
    """Write a dict {point3D_id: Point3D} as a COLMAP points3D.bin file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)
    """
    with open(path_to_model_file, "wb") as fid:
        write_next_bytes(fid, len(points3D), "Q")
        for _, pt in points3D.items():
            write_next_bytes(fid, pt.id, "Q")
            write_next_bytes(fid, pt.xyz.tolist(), "ddd")
            write_next_bytes(fid, pt.rgb.tolist(), "BBB")
            write_next_bytes(fid, pt.error, "d")
            track_length = pt.image_ids.shape[0]
            write_next_bytes(fid, track_length, "Q")
            # Track: (image_id, point2D_idx) pairs, each int32.
            for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
                write_next_bytes(fid, [image_id, point2D_id], "ii")
def detect_model_format(path, ext):
    """Return True when cameras/images/points3D files with `ext` all exist in `path`."""
    stems = ("cameras", "images", "points3D")
    if all(os.path.isfile(os.path.join(path, stem + ext)) for stem in stems):
        print("Detected model format: '" + ext + "'")
        return True
    return False
def read_model(path, ext=""):
    """Read a COLMAP model (cameras, images, points3D) from `path`.

    If `ext` is empty the format is auto-detected, preferring ".bin".
    Returns a (cameras, images, points3D) tuple of dicts.
    """
    # try to detect the extension automatically
    if ext == "":
        if detect_model_format(path, ".bin"):
            ext = ".bin"
        elif detect_model_format(path, ".txt"):
            ext = ".txt"
        else:
            # NOTE(review): a caller that unpacks the result will raise a
            # TypeError on this implicit None — consider raising instead.
            print("Provide model format: '.bin' or '.txt'")
            return

    if ext == ".txt":
        cameras = read_cameras_text(os.path.join(path, "cameras" + ext))
        images = read_images_text(os.path.join(path, "images" + ext))
        points3D = read_points3D_text(os.path.join(path, "points3D") + ext)
    else:
        cameras = read_cameras_binary(os.path.join(path, "cameras" + ext))
        images = read_images_binary(os.path.join(path, "images" + ext))
        points3D = read_points3D_binary(os.path.join(path, "points3D") + ext)
    return cameras, images, points3D
def write_model(cameras, images, points3D, path, ext=".bin"):
    """Write a COLMAP model to `path` in text (".txt") or binary format.

    Returns the inputs unchanged so calls can be chained.
    """
    if ext == ".txt":
        write_cameras_text(cameras, os.path.join(path, "cameras" + ext))
        write_images_text(images, os.path.join(path, "images" + ext))
        write_points3D_text(points3D, os.path.join(path, "points3D") + ext)
    else:
        write_cameras_binary(cameras, os.path.join(path, "cameras" + ext))
        write_images_binary(images, os.path.join(path, "images" + ext))
        write_points3D_binary(points3D, os.path.join(path, "points3D") + ext)
    return cameras, images, points3D
def qvec2rotmat(qvec):
    """Convert a (w, x, y, z) quaternion into a 3x3 rotation matrix."""
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    return np.array([
        [1 - 2 * y * y - 2 * z * z,
         2 * x * y - 2 * w * z,
         2 * z * x + 2 * w * y],
        [2 * x * y + 2 * w * z,
         1 - 2 * x * x - 2 * z * z,
         2 * y * z - 2 * w * x],
        [2 * z * x - 2 * w * y,
         2 * y * z + 2 * w * x,
         1 - 2 * x * x - 2 * y * y]])
def rotmat2qvec(R):
    """Convert a 3x3 rotation matrix into a (w, x, y, z) quaternion.

    Uses the eigen-decomposition of a symmetric 4x4 matrix built from the
    rotation entries; the eigenvector of the largest eigenvalue is the
    quaternion (the /3.0 scaling does not change the eigenvectors).
    """
    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
    K = np.array([
        [Rxx - Ryy - Rzz, 0, 0, 0],
        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
    eigvals, eigvecs = np.linalg.eigh(K)
    # Reorder (x, y, z, w) -> (w, x, y, z) and pick the dominant eigenvector.
    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
    # Canonical sign: keep the scalar component non-negative.
    if qvec[0] < 0:
        qvec *= -1
    return qvec
def main():
    """CLI entry point: read a COLMAP model, print its sizes and optionally
    write it back in the requested format."""
    parser = argparse.ArgumentParser(description="Read and write COLMAP binary and text models")
    parser.add_argument("--input_model", help="path to input model folder")
    parser.add_argument("--input_format", choices=[".bin", ".txt"],
                        help="input model format", default="")
    parser.add_argument("--output_model",
                        help="path to output model folder")
    # Fix: help text previously read "outut model format".
    parser.add_argument("--output_format", choices=[".bin", ".txt"],
                        help="output model format", default=".txt")
    args = parser.parse_args()

    cameras, images, points3D = read_model(path=args.input_model, ext=args.input_format)
    print("num_cameras:", len(cameras))
    print("num_images:", len(images))
    print("num_points3D:", len(points3D))

    if args.output_model is not None:
        write_model(cameras, images, points3D, path=args.output_model, ext=args.output_format)


if __name__ == "__main__":
    main()
colmap2nerf
#!/usr/bin/env python3
# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import argparse
from glob import glob
import os
from pathlib import Path, PurePosixPath
import numpy as np
import json
import sys
import math
import cv2
import os
import shutil
# Repository root (two directory levels above this script) and its
# scripts/ folder, used to locate helper .bat downloaders and configs.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
SCRIPTS_FOLDER = os.path.join(ROOT_DIR, "scripts")
def parse_args():
    """Build and parse the command-line arguments for this script."""
    parser = argparse.ArgumentParser(
        description="Convert a text colmap export to nerf format transforms.json; optionally convert video to images, and optionally run colmap in the first place.")
    add = parser.add_argument

    # Video / image extraction options.
    add("--video_in", default="",
        help="Run ffmpeg first to convert a provided video file into a set of images. Uses the video_fps parameter also.")
    add("--video_fps", default=2)
    add("--time_slice", default="",
        help="Time (in seconds) in the format t1,t2 within which the images should be generated from the video. E.g.: \"--time_slice '10,300'\" will generate images only from 10th second to 300th second of the video.")

    # COLMAP pipeline options.
    add("--run_colmap", action="store_true", help="run colmap first on the image folder")
    add("--colmap_matcher", default="sequential",
        choices=["exhaustive", "sequential", "spatial", "transitive", "vocab_tree"],
        help="Select which matcher colmap should use. Sequential for videos, exhaustive for ad-hoc images.")
    add("--colmap_db", default="colmap.db", help="colmap database filename")
    add("--colmap_camera_model", default="OPENCV",
        choices=["SIMPLE_PINHOLE", "PINHOLE", "SIMPLE_RADIAL", "RADIAL", "OPENCV",
                 "SIMPLE_RADIAL_FISHEYE", "RADIAL_FISHEYE", "OPENCV_FISHEYE"], help="Camera model")
    add("--colmap_camera_params", default="",
        help="Intrinsic parameters, depending on the chosen model. Format: fx,fy,cx,cy,dist")

    # Conversion / output options.
    add("--images", default="images", help="Input path to the images.")
    add("--text", default="colmap_text",
        help="Input path to the colmap text files (set automatically if --run_colmap is used).")
    add("--aabb_scale", default=32, choices=["1", "2", "4", "8", "16", "32", "64", "128"],
        help="Large scene scale factor. 1=scene fits in unit cube; power of 2 up to 128")
    add("--skip_early", default=0, help="Skip this many images from the start.")
    add("--keep_colmap_coords", action="store_true",
        help="Keep transforms.json in COLMAP's original frame of reference (this will avoid reorienting and repositioning the scene for preview and rendering).")
    add("--out", default="transforms.json", help="Output JSON file path.")
    add("--vocab_path", default="", help="Vocabulary tree path.")
    add("--overwrite", action="store_true",
        help="Do not ask for confirmation for overwriting existing images and COLMAP data.")
    add("--mask_categories", nargs="*", type=str, default=[],
        help="Object categories that should be masked out from the training images. See `scripts/category2id.json` for supported categories.")
    return parser.parse_args()
def do_system(arg):
    """Echo a shell command, run it, and exit the process on failure."""
    print(f"==== running: {arg}")
    exit_code = os.system(arg)
    if exit_code:
        print("FATAL: command failed")
        sys.exit(exit_code)
def run_ffmpeg(args):
    """Extract frames from args.video_in into args.images via ffmpeg at args.video_fps."""
    ffmpeg_binary = "ffmpeg"

    # On Windows, if FFmpeg isn't found, try automatically downloading it from the internet
    if os.name == "nt" and os.system(f"where {ffmpeg_binary} >nul 2>nul") != 0:
        ffmpeg_glob = os.path.join(ROOT_DIR, "external", "ffmpeg", "*", "bin", "ffmpeg.exe")
        candidates = glob(ffmpeg_glob)
        if not candidates:
            print("FFmpeg not found. Attempting to download FFmpeg from the internet.")
            do_system(os.path.join(SCRIPTS_FOLDER, "download_ffmpeg.bat"))
            candidates = glob(ffmpeg_glob)
        if candidates:
            ffmpeg_binary = candidates[0]

    # Resolve the image folder relative to the video file when not absolute.
    if not os.path.isabs(args.images):
        args.images = os.path.join(os.path.dirname(args.video_in), args.images)

    images = "\"" + args.images + "\""
    video = "\"" + args.video_in + "\""
    fps = float(args.video_fps) or 1.0
    print(f"running ffmpeg with input video file={video}, output image folder={images}, fps={fps}.")
    # Interactive safety prompt unless --overwrite was given; the "+ 'y'"
    # trick makes an empty answer default to yes.
    if not args.overwrite and (
            input(
                f"warning! folder '{images}' will be deleted/replaced. continue? (Y/n)").lower().strip() + "y")[
            :1] != "y":
        sys.exit(1)
    try:
        # Passing Images' Path Without Double Quotes
        shutil.rmtree(args.images)
    except:
        pass
    do_system(f"mkdir {images}")

    time_slice_value = ""
    time_slice = args.time_slice
    if time_slice:
        # Restrict extraction to the [start, end] window (seconds).
        start, end = time_slice.split(",")
        time_slice_value = f",select='between(t\,{start}\,{end})'"
    do_system(f"{ffmpeg_binary} -i {video} -qscale:v 1 -qmin 1 -vf \"fps={fps}{time_slice_value}\" {images}/%04d.jpg")
def run_colmap(args):
    """Run the full COLMAP pipeline (feature extraction, matching, mapping,
    bundle adjustment) and export the sparse model as text files."""
    colmap_binary = "colmap"

    # On Windows, if COLMAP isn't found, try automatically downloading it from the internet
    if os.name == "nt" and os.system(f"where {colmap_binary} >nul 2>nul") != 0:
        colmap_glob = os.path.join(ROOT_DIR, "external", "colmap", "*", "COLMAP.bat")
        candidates = glob(colmap_glob)
        if not candidates:
            print("COLMAP not found. Attempting to download COLMAP from the internet.")
            do_system(os.path.join(SCRIPTS_FOLDER, "download_colmap.bat"))
            candidates = glob(colmap_glob)
        if candidates:
            colmap_binary = candidates[0]

    db = args.colmap_db
    images = "\"" + args.images + "\""
    db_noext = str(Path(db).with_suffix(""))

    # NOTE(review): parse_args defaults --text to "colmap_text", so this
    # "text" comparison never fires for the default — confirm intent.
    if args.text == "text":
        args.text = db_noext + "_text"
    text = args.text
    sparse = db_noext + "_sparse"
    print(f"running colmap with:\n\tdb={db}\n\timages={images}\n\tsparse={sparse}\n\ttext={text}")
    # Interactive safety prompt unless --overwrite was given.
    if not args.overwrite and (
            input(
                f"warning! folders '{sparse}' and '{text}' will be deleted/replaced. continue? (Y/n)").lower().strip() + "y")[
            :1] != "y":
        sys.exit(1)
    if os.path.exists(db):
        os.remove(db)
    do_system(
        f"{colmap_binary} feature_extractor --ImageReader.camera_model {args.colmap_camera_model} --ImageReader.camera_params \"{args.colmap_camera_params}\" --SiftExtraction.estimate_affine_shape=true --SiftExtraction.domain_size_pooling=true --ImageReader.single_camera 1 --database_path {db} --image_path {images}")
    match_cmd = f"{colmap_binary} {args.colmap_matcher}_matcher --SiftMatching.guided_matching=true --database_path {db}"
    if args.vocab_path:
        match_cmd += f" --VocabTreeMatching.vocab_tree_path {args.vocab_path}"
    do_system(match_cmd)
    try:
        shutil.rmtree(sparse)
    except:
        pass
    do_system(f"mkdir {sparse}")
    do_system(f"{colmap_binary} mapper --database_path {db} --image_path {images} --output_path {sparse}")
    # Refine the principal point in a final global bundle adjustment pass.
    do_system(
        f"{colmap_binary} bundle_adjuster --input_path {sparse}/0 --output_path {sparse}/0 --BundleAdjustment.refine_principal_point 1")
    try:
        shutil.rmtree(text)
    except:
        pass
    do_system(f"mkdir {text}")
    # Convert the binary model in sparse/0 to text for downstream parsing.
    do_system(f"{colmap_binary} model_converter --input_path {sparse}/0 --output_path {text} --output_type TXT")
def variance_of_laplacian(image):
    """Return the variance of the image's Laplacian (a focus/blur measure)."""
    return cv2.Laplacian(image, cv2.CV_64F).var()
def sharpness(imagePath):
    """Return a sharpness score for the image at imagePath (higher = sharper)."""
    image = cv2.imread(imagePath)
    # NOTE(review): cv2.imread returns None for an unreadable path, which
    # would make cvtColor raise — confirm callers always pass valid files.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fm = variance_of_laplacian(gray)
    return fm
def qvec2rotmat(qvec):
    """Convert a quaternion (w, x, y, z) into a 3x3 rotation matrix."""
    w, x, y, z = qvec
    row0 = [1 - 2 * y ** 2 - 2 * z ** 2,
            2 * x * y - 2 * w * z,
            2 * z * x + 2 * w * y]
    row1 = [2 * x * y + 2 * w * z,
            1 - 2 * x ** 2 - 2 * z ** 2,
            2 * y * z - 2 * w * x]
    row2 = [2 * z * x - 2 * w * y,
            2 * y * z + 2 * w * x,
            1 - 2 * x ** 2 - 2 * y ** 2]
    return np.array([row0, row1, row2])
def rotmat(a, b):
    """Return a rotation matrix mapping the direction of `a` onto that of `b`."""
    a = a / np.linalg.norm(a)
    b = b / np.linalg.norm(b)
    v = np.cross(a, b)
    c = np.dot(a, b)
    # handle exception for the opposite direction input
    if c < -1 + 1e-10:
        return rotmat(a + np.random.uniform(-1e-2, 1e-2, 3), b)
    s = np.linalg.norm(v)
    # Rodrigues' formula via the skew-symmetric cross-product matrix of v.
    skew = np.array([[0, -v[2], v[1]],
                     [v[2], 0, -v[0]],
                     [-v[1], v[0], 0]])
    return np.eye(3) + skew + skew.dot(skew) * ((1 - c) / (s ** 2 + 1e-10))
def closest_point_2_lines(oa, da, ob, db):
    """Return the point closest to both rays of form o + t*d, together with
    a weight factor that goes to 0 when the rays are parallel."""
    da = da / np.linalg.norm(da)
    db = db / np.linalg.norm(db)
    c = np.cross(da, db)
    denom = np.linalg.norm(c) ** 2
    t = ob - oa
    ta = np.linalg.det([t, db, c]) / (denom + 1e-10)
    tb = np.linalg.det([t, da, c]) / (denom + 1e-10)
    # Clamp positive line parameters to zero before averaging.
    ta = min(ta, 0)
    tb = min(tb, 0)
    midpoint = (oa + ta * da + ob + tb * db) * 0.5
    return midpoint, denom
if __name__ == "__main__":
    # Pipeline: (optional) video -> frames, (optional) COLMAP run, then
    # convert the COLMAP text export into an instant-ngp transforms.json.
    args = parse_args()
    if args.video_in != "":
        run_ffmpeg(args)
    if args.run_colmap:
        run_colmap(args)
    AABB_SCALE = int(args.aabb_scale)
    SKIP_EARLY = int(args.skip_early)
    IMAGE_FOLDER = args.images
    TEXT_FOLDER = args.text
    OUT_PATH = args.out
    # Check that we can save the output before we do a lot of work
    try:
        open(OUT_PATH, "a").close()
    except Exception as e:
        print(f"Could not save transforms JSON to {OUT_PATH}: {e}")
        sys.exit(1)
    print(f"outputting to {OUT_PATH}...")
    # --- Parse camera intrinsics from cameras.txt -----------------------
    cameras = {}
    with open(os.path.join(TEXT_FOLDER, "cameras.txt"), "r") as f:
        camera_angle_x = math.pi / 2
        for line in f:
            # 1 SIMPLE_RADIAL 2048 1536 1580.46 1024 768 0.0045691
            # 1 OPENCV 3840 2160 3178.27 3182.09 1920 1080 0.159668 -0.231286 -0.00123982 0.00272224
            # 1 RADIAL 1920 1080 1665.1 960 540 0.0672856 -0.0761443
            if line[0] == "#":
                continue
            els = line.split(" ")
            camera = {}
            camera_id = int(els[0])
            camera["w"] = float(els[2])
            camera["h"] = float(els[3])
            camera["fl_x"] = float(els[4])
            camera["fl_y"] = float(els[4])
            # Distortion coefficients default to 0 and the principal point
            # to the image center; model-specific fields override below.
            camera["k1"] = 0
            camera["k2"] = 0
            camera["k3"] = 0
            camera["k4"] = 0
            camera["p1"] = 0
            camera["p2"] = 0
            camera["cx"] = camera["w"] / 2
            camera["cy"] = camera["h"] / 2
            camera["is_fisheye"] = False
            if els[1] == "SIMPLE_PINHOLE":
                camera["cx"] = float(els[5])
                camera["cy"] = float(els[6])
            elif els[1] == "PINHOLE":
                camera["fl_y"] = float(els[5])
                camera["cx"] = float(els[6])
                camera["cy"] = float(els[7])
            elif els[1] == "SIMPLE_RADIAL":
                camera["cx"] = float(els[5])
                camera["cy"] = float(els[6])
                camera["k1"] = float(els[7])
            elif els[1] == "RADIAL":
                camera["cx"] = float(els[5])
                camera["cy"] = float(els[6])
                camera["k1"] = float(els[7])
                camera["k2"] = float(els[8])
            elif els[1] == "OPENCV":
                camera["fl_y"] = float(els[5])
                camera["cx"] = float(els[6])
                camera["cy"] = float(els[7])
                camera["k1"] = float(els[8])
                camera["k2"] = float(els[9])
                camera["p1"] = float(els[10])
                camera["p2"] = float(els[11])
            elif els[1] == "SIMPLE_RADIAL_FISHEYE":
                camera["is_fisheye"] = True
                camera["cx"] = float(els[5])
                camera["cy"] = float(els[6])
                camera["k1"] = float(els[7])
            elif els[1] == "RADIAL_FISHEYE":
                camera["is_fisheye"] = True
                camera["cx"] = float(els[5])
                camera["cy"] = float(els[6])
                camera["k1"] = float(els[7])
                camera["k2"] = float(els[8])
            elif els[1] == "OPENCV_FISHEYE":
                camera["is_fisheye"] = True
                camera["fl_y"] = float(els[5])
                camera["cx"] = float(els[6])
                camera["cy"] = float(els[7])
                camera["k1"] = float(els[8])
                camera["k2"] = float(els[9])
                camera["k3"] = float(els[10])
                camera["k4"] = float(els[11])
            else:
                print("Unknown camera model ", els[1])
            # fl = 0.5 * w / tan(0.5 * angle_x);
            camera["camera_angle_x"] = math.atan(camera["w"] / (camera["fl_x"] * 2)) * 2
            camera["camera_angle_y"] = math.atan(camera["h"] / (camera["fl_y"] * 2)) * 2
            camera["fovx"] = camera["camera_angle_x"] * 180 / math.pi
            camera["fovy"] = camera["camera_angle_y"] * 180 / math.pi
            print(
                f"camera {camera_id}:\n\tres={camera['w'], camera['h']}\n\tcenter={camera['cx'], camera['cy']}\n\tfocal={camera['fl_x'], camera['fl_y']}\n\tfov={camera['fovx'], camera['fovy']}\n\tk={camera['k1'], camera['k2']} p={camera['p1'], camera['p2']} ")
            cameras[camera_id] = camera
    if len(cameras) == 0:
        print("No cameras found!")
        sys.exit(1)
    # --- Parse per-image poses from images.txt --------------------------
    with open(os.path.join(TEXT_FOLDER, "images.txt"), "r") as f:
        i = 0
        bottom = np.array([0.0, 0.0, 0.0, 1.0]).reshape([1, 4])
        if len(cameras) == 1:
            # Single camera: intrinsics go at the top level of the JSON.
            camera = cameras[camera_id]
            out = {
                "camera_angle_x": camera["camera_angle_x"],
                "camera_angle_y": camera["camera_angle_y"],
                "fl_x": camera["fl_x"],
                "fl_y": camera["fl_y"],
                "k1": camera["k1"],
                "k2": camera["k2"],
                "k3": camera["k3"],
                "k4": camera["k4"],
                "p1": camera["p1"],
                "p2": camera["p2"],
                "is_fisheye": camera["is_fisheye"],
                "cx": camera["cx"],
                "cy": camera["cy"],
                "w": camera["w"],
                "h": camera["h"],
                "aabb_scale": AABB_SCALE,
                "frames": [],
            }
        else:
            # Multiple cameras: intrinsics are merged into each frame below.
            out = {
                "frames": [],
                "aabb_scale": AABB_SCALE
            }
        up = np.zeros(3)
        for line in f:
            line = line.strip()
            if line[0] == "#":
                continue
            i = i + 1
            if i < SKIP_EARLY * 2:
                continue
            # Entries come in pairs of lines; only the odd (pose) lines are
            # used — the 2D-observation lines are skipped.
            if i % 2 == 1:
                elems = line.split(
                    " ")  # 1-4 is quat, 5-7 is trans, 9ff is filename (9, if filename contains no spaces)
                # name = str(PurePosixPath(Path(IMAGE_FOLDER, elems[9])))
                # why is this requireing a relitive path while using ^
                image_rel = os.path.relpath(IMAGE_FOLDER)
                name = str(f"./{image_rel}/{'_'.join(elems[9:])}")
                b = sharpness(name)
                print(name, "sharpness=", b)
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                # Build the 4x4 world-to-camera matrix, then invert it to
                # get camera-to-world.
                R = qvec2rotmat(-qvec)
                t = tvec.reshape([3, 1])
                m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)
                c2w = np.linalg.inv(m)
                if not args.keep_colmap_coords:
                    c2w[0:3, 2] *= -1  # flip the y and z axis
                    c2w[0:3, 1] *= -1
                    c2w = c2w[[1, 0, 2, 3], :]
                    c2w[2, :] *= -1  # flip whole world upside down

                    up += c2w[0:3, 1]
                frame = {"file_path": name, "sharpness": b, "transform_matrix": c2w}
                if len(cameras) != 1:
                    frame.update(cameras[int(elems[8])])
                out["frames"].append(frame)
    nframes = len(out["frames"])
    if args.keep_colmap_coords:
        flip_mat = np.array([
            [1, 0, 0, 0],
            [0, -1, 0, 0],
            [0, 0, -1, 0],
            [0, 0, 0, 1]
        ])
        for f in out["frames"]:
            f["transform_matrix"] = np.matmul(f["transform_matrix"], flip_mat)  # flip cameras (it just works)
    else:
        # don't keep colmap coords - reorient the scene to be easier to work with
        up = up / np.linalg.norm(up)
        print("up vector was", up)
        R = rotmat(up, [0, 0, 1])  # rotate up vector to [0,0,1]
        R = np.pad(R, [0, 1])
        R[-1, -1] = 1
        for f in out["frames"]:
            f["transform_matrix"] = np.matmul(R, f["transform_matrix"])  # rotate up to be the z axis
        # find a central point they are all looking at
        print("computing center of attention...")
        totw = 0.0
        totp = np.array([0.0, 0.0, 0.0])
        for f in out["frames"]:
            mf = f["transform_matrix"][0:3, :]
            for g in out["frames"]:
                mg = g["transform_matrix"][0:3, :]
                p, w = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2])
                if w > 0.00001:
                    totp += p * w
                    totw += w
        if totw > 0.0:
            totp /= totw
        print(totp)  # the cameras are looking at totp
        for f in out["frames"]:
            f["transform_matrix"][0:3, 3] -= totp
        avglen = 0.
        for f in out["frames"]:
            avglen += np.linalg.norm(f["transform_matrix"][0:3, 3])
        avglen /= nframes
        print("avg camera distance from origin", avglen)
        for f in out["frames"]:
            f["transform_matrix"][0:3, 3] *= 4.0 / avglen  # scale to "nerf sized"
    # JSON cannot serialize numpy arrays; convert to nested lists.
    for f in out["frames"]:
        f["transform_matrix"] = f["transform_matrix"].tolist()
    print(nframes, "frames")
    print(f"writing {OUT_PATH}")
    with open(OUT_PATH, "w") as outfile:
        json.dump(out, outfile, indent=2)
    # --- Optional: generate dynamic-object masks with detectron2 --------
    if len(args.mask_categories) > 0:
        # Check if detectron2 is installed. If not, install it.
        try:
            import detectron2
        except ModuleNotFoundError:
            try:
                import torch
            except ModuleNotFoundError:
                print("PyTorch is not installed. For automatic masking, install PyTorch from https://pytorch.org/")
                sys.exit(1)
            input("Detectron2 is not installed. Press enter to install it.")
            import subprocess
            package = 'git+https://github.com/facebookresearch/detectron2.git'
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            import detectron2
        import torch
        from pathlib import Path
        from detectron2.config import get_cfg
        from detectron2 import model_zoo
        from detectron2.engine import DefaultPredictor
        category2id = json.load(open(os.path.join(SCRIPTS_FOLDER, "category2id.json"), "r"))
        mask_ids = [category2id[c] for c in args.mask_categories]
        cfg = get_cfg()
        # Add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
        cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
        # Find a model from detectron2's model zoo.
        cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
        predictor = DefaultPredictor(cfg)
        for frame in out["frames"]:
            img = cv2.imread(frame["file_path"])
            outputs = predictor(img)
            output_mask = np.zeros((img.shape[0], img.shape[1]))
            # Union of all predicted instance masks in the chosen categories.
            for i in range(len(outputs["instances"])):
                if outputs["instances"][i].pred_classes.cpu().numpy()[0] in mask_ids:
                    pred_mask = outputs["instances"][i].pred_masks.cpu().numpy()[0]
                    output_mask = np.logical_or(output_mask, pred_mask)
            rgb_path = Path(frame["file_path"])
            mask_name = str(rgb_path.parents[0] / Path("dynamic_mask_" + rgb_path.name.replace(".jpg", ".png")))
            cv2.imwrite(mask_name, (output_mask * 255).astype(np.uint8))