diff --git a/nerfstudio/process_data/meshroom_utils.py b/nerfstudio/process_data/meshroom_utils.py
new file mode 100644
index 0000000000..43526a14f0
--- /dev/null
+++ b/nerfstudio/process_data/meshroom_utils.py
@@ -0,0 +1,276 @@
+# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper utils for processing meshroom data into the nerfstudio format."""
+
+import json
+import math
+from copy import deepcopy as dc
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from nerfstudio.process_data.process_data_utils import CAMERA_MODELS
+from nerfstudio.utils.rich_utils import CONSOLE
+
+# Rotation matrix to adjust coordinate system
+ROT_MAT = np.array([[1, 0, 0, 0], [0, 0, 1, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
+
+
+def reflect(axis, size=4):
+    """Create a reflection matrix along the specified axis."""
+    _diag = np.ones(size)
+    _diag[axis] = -1
+    refl = np.diag(_diag)
+    return refl
+
+
+def Mat2Nerf(mat):
+    """Convert a matrix to NeRF coordinate system."""
+    M = np.array(mat)
+    M = (M @ reflect(2)) @ reflect(1)
+    return M
+
+
+def closest_point_2_lines(oa, da, ob, db):
+    """Find the point closest to both rays of form o+t*d."""
+    da = da / np.linalg.norm(da)
+    db = db / np.linalg.norm(db)
+    c = np.cross(da, db)
+    denom = np.linalg.norm(c) ** 2
+    t = ob - oa
+    ta = np.linalg.det([t, db, c]) / (denom + 1e-10)
+    tb = np.linalg.det([t, da, c]) / (denom + 1e-10)
+    if ta > 0:
+        ta = 0
+    if tb > 0:
+        tb = 0
+    return (oa + ta * da + ob + tb * db) * 0.5, denom
+
+
+def central_point(out):
+    """Find a central point all cameras are looking at."""
+    CONSOLE.print("Computing center of attention...")
+    totw = 0.0
+    totp = np.array([0.0, 0.0, 0.0])
+    for f in out["frames"]:
+        mf = np.array(f["transform_matrix"])[0:3, :]
+        for g in out["frames"]:
+            mg = np.array(g["transform_matrix"])[0:3, :]
+            p, w = closest_point_2_lines(mf[:, 3], mf[:, 2], mg[:, 3], mg[:, 2])
+            if w > 0.01:
+                totp += p * w
+                totw += w
+
+    if len(out["frames"]) == 0:
+        CONSOLE.print("[bold red]No frames found when computing center of attention[/bold red]")
+        return totp
+
+    if (totw == 0) and (not totp.any()):
+        CONSOLE.print("[bold red]Center of attention is zero[/bold red]")
+        return totp
+
+    totp /= totw
+    CONSOLE.print(f"The center of attention is: {totp}")
+
+    return totp
+
+
+def build_sensor(intrinsic):
+    """Build camera intrinsics from Meshroom data."""
+    out = {}
+    out["w"] = float(intrinsic["width"])
+    out["h"] = float(intrinsic["height"])
+
+    # Focal length in mm
+    focal = float(intrinsic["focalLength"])
+
+    # Sensor width in mm
+    sensor_width = float(intrinsic["sensorWidth"])
+    sensor_height = float(intrinsic["sensorHeight"])
+
+    # Focal length in pixels
+    out["fl_x"] = (out["w"] * focal) / sensor_width
+
+    # Check W/H ratio to sensor ratio
+    if np.isclose((out["w"] / out["h"]), (sensor_width / sensor_height)):
+        out["fl_y"] = (out["h"] * focal) / sensor_height
+    else:
+        CONSOLE.print(
+            "[yellow]WARNING: W/H ratio does not match sensor ratio, this is likely a bug from Meshroom. Will use fl_x to set fl_y.[/yellow]"
+        )
+        out["fl_y"] = out["fl_x"]
+
+    camera_angle_x = math.atan(out["w"] / (out["fl_x"] * 2)) * 2  # FIX: half-angle is atan(w / (2*fl)); was atan(w/fl * 2)
+    camera_angle_y = math.atan(out["h"] / (out["fl_y"] * 2)) * 2  # FIX: same precedence bug as camera_angle_x
+
+    out["camera_angle_x"] = camera_angle_x
+    out["camera_angle_y"] = camera_angle_y
+
+    out["cx"] = float(intrinsic["principalPoint"][0]) + (out["w"] / 2.0)
+    out["cy"] = float(intrinsic["principalPoint"][1]) + (out["h"] / 2.0)
+
+    if intrinsic["type"] == "radial3":
+        for i, coef in enumerate(intrinsic["distortionParams"]):
+            out[f"k{i + 1}"] = float(coef)
+
+    return out
+
+
+def meshroom_to_json(
+    image_filename_map: Dict[str, Path],
+    json_filename: Path,
+    output_dir: Path,
+    ply_filename: Optional[Path] = None,
+    verbose: bool = False,
+) -> List[str]:
+    """Convert Meshroom data into a nerfstudio dataset.
+
+    Args:
+        image_filename_map: Mapping of original image filenames to their saved locations.
+        json_filename: Path to the Meshroom json file.
+        output_dir: Path to the output directory.
+        ply_filename: Path to the exported ply file.
+        verbose: Whether to print verbose output.
+
+    Returns:
+        Summary of the conversion.
+    """
+    summary_log = []
+
+    with open(json_filename, "r") as f:
+        data = json.load(f)
+
+    # Create output structure
+    out = {}
+    out["aabb_scale"] = 16  # Default value
+
+    # Extract transforms from Meshroom data
+    transforms = {}
+    for pose in data.get("poses", []):
+        transform = pose["pose"]["transform"]
+        rot = np.asarray(transform["rotation"])
+        rot = rot.reshape(3, 3).astype(float)
+
+        ctr = np.asarray(transform["center"])
+        ctr = ctr.astype(float)
+
+        M = np.eye(4)
+        M[:3, :3] = rot
+        M[:3, 3] = ctr
+
+        M = Mat2Nerf(M.astype(float))
+        transforms[pose["poseId"]] = np.dot(ROT_MAT, M)
+
+    # Extract intrinsics from Meshroom data
+    intrinsics = {}
+    for intrinsic in data.get("intrinsics", []):
+        intrinsics[intrinsic["intrinsicId"]] = build_sensor(intrinsic)
+
+    # Set camera model based on intrinsic type
+    if data.get("intrinsics") and "type" in data["intrinsics"][0]:
+        intrinsic_type = data["intrinsics"][0]["type"]
+        if intrinsic_type in ["radial1", "radial3"]:
+            out["camera_model"] = CAMERA_MODELS["perspective"].value
+        elif intrinsic_type in ["fisheye", "fisheye4"]:
+            out["camera_model"] = CAMERA_MODELS["fisheye"].value
+        else:
+            # Default to perspective
+            out["camera_model"] = CAMERA_MODELS["perspective"].value
+    else:
+        out["camera_model"] = CAMERA_MODELS["perspective"].value
+
+    # Build frames
+    frames = []
+    skipped_images = 0
+
+    for view in data.get("views", []):
+        # Get the image name from the path
+        path = Path(view["path"])
+        name = path.stem
+
+        # Check if the image exists in our mapping
+        if name not in image_filename_map:
+            if verbose:
+                CONSOLE.print(f"[yellow]Missing image for {name}, skipping[/yellow]")
+            skipped_images += 1
+            continue
+
+        # Get poseId and intrinsicId
+        poseId = view["poseId"]
+        intrinsicId = view["intrinsicId"]
+
+        # Check if we have the necessary data
+        if poseId not in transforms:
+            if verbose:
+                CONSOLE.print(f"[yellow]PoseId {poseId} not found in transforms, skipping image: {name}[/yellow]")
+            skipped_images += 1
+            continue
+
+        if intrinsicId not in intrinsics:
+            if verbose:
+                CONSOLE.print(f"[yellow]IntrinsicId {intrinsicId} not found, skipping image: {name}[/yellow]")
+            skipped_images += 1
+            continue
+
+        # Create camera data
+        camera = {}
+        camera.update(dc(intrinsics[intrinsicId]))
+        camera["transform_matrix"] = transforms[poseId]
+        camera["file_path"] = image_filename_map[name].as_posix()
+
+        frames.append(camera)
+
+    out["frames"] = frames
+
+    # Calculate center point
+    center = central_point(out)
+
+    # Adjust camera positions by centering
+    for f in out["frames"]:
+        f["transform_matrix"][0:3, 3] -= center
+        f["transform_matrix"] = f["transform_matrix"].tolist()
+
+    # Include point cloud if provided
+    if ply_filename is not None:
+        import open3d as o3d
+
+        # Create the applied transform
+        applied_transform = np.eye(4)[:3, :]
+        applied_transform = applied_transform[np.array([2, 0, 1]), :]
+        out["applied_transform"] = applied_transform.tolist()
+
+        # Load and transform point cloud
+        pc = o3d.io.read_point_cloud(str(ply_filename))
+        points3D = np.asarray(pc.points)
+        points3D = np.einsum("ij,bj->bi", applied_transform[:3, :3], points3D) + applied_transform[:3, 3]
+        pc.points = o3d.utility.Vector3dVector(points3D)
+        o3d.io.write_point_cloud(str(output_dir / "sparse_pc.ply"), pc)
+        out["ply_file_path"] = "sparse_pc.ply"
+        summary_log.append(f"Imported {ply_filename} as starting points")
+
+    # Write output
+    with open(output_dir / "transforms.json", "w", encoding="utf-8") as f:
+        json.dump(out, f, indent=4)
+
+    # Add summary info
+    if skipped_images == 1:
+        summary_log.append(f"{skipped_images} image skipped due to missing camera pose or intrinsic data.")
+    elif skipped_images > 1:
+        summary_log.append(f"{skipped_images} images were skipped due to missing camera poses or intrinsic data.")
+
+    summary_log.append(f"Final dataset contains {len(out['frames'])} frames.")
+
+    return summary_log
diff --git a/nerfstudio/scripts/process_data.py b/nerfstudio/scripts/process_data.py
index 1fdd36f7f2..23626e713b 100644
--- a/nerfstudio/scripts/process_data.py
+++ b/nerfstudio/scripts/process_data.py
@@ -26,6 +26,7 @@
 from typing_extensions import Annotated
 
 from nerfstudio.process_data import (
+    meshroom_utils,
     metashape_utils,
     odm_utils,
     polycam_utils,
@@ -330,6 +331,106 @@ def main(self) -> None:
         CONSOLE.rule()
 
 
+@dataclass
+class _NoDefaultProcessMeshroom:
+    """Private class to order the parameters of ProcessMeshroom in the right order for default values."""
+
+    json: Path
+    """Path to the Meshroom sfm.json file."""
+
+
+@dataclass
+class ProcessMeshroom(BaseConverterToNerfstudioDataset, _NoDefaultProcessMeshroom):
+    """Process Meshroom data into a nerfstudio dataset.
+
+    This script assumes that cameras have been aligned using Meshroom. After alignment, it is necessary to export the
+    camera poses as a `.json` file.
+
+    Optional: Meshroom does not align or constrain solved cameras, you may want to add a SfMTransform after the
+    StructureFromMotion node, set the Transformation Method to Manual, and adjust camera positioning.
+
+    When you Start Meshroom processing, it generates an output folder for the ConvertSfMFormat node
+    (right click > Open Folder). The sfm.json file needed for this script's --input function will be generated there.
+
+    This script does the following:
+
+    1. Scales images to a specified size.
+    2. Converts Meshroom poses into the nerfstudio format.
+    """
+
+    ply: Optional[Path] = None
+    """Path to the Meshroom point export ply file."""
+
+    num_downscales: int = 3
+    """Number of times to downscale the images. Downscales by 2 each time. For example a value of 3
+    will downscale the images by 2x, 4x, and 8x."""
+    max_dataset_size: int = 600
+    """Max number of images to train on. If the dataset has more, images will be sampled approximately evenly. If -1,
+    use all images."""
+
+    def main(self) -> None:
+        """Process images into a nerfstudio dataset."""
+
+        if self.json.suffix != ".json":
+            raise ValueError(f"JSON file {self.json} must have a .json extension")
+        if not self.json.exists():
+            raise ValueError(f"JSON file {self.json} doesn't exist")
+        if self.eval_data is not None:
+            raise ValueError("Cannot use eval_data since cameras were already aligned with Meshroom.")
+
+        if self.ply is not None:
+            if self.ply.suffix != ".ply":
+                raise ValueError(f"PLY file {self.ply} must have a .ply extension")
+            if not self.ply.exists():
+                raise ValueError(f"PLY file {self.ply} doesn't exist")
+
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        image_dir = self.output_dir / "images"
+        image_dir.mkdir(parents=True, exist_ok=True)
+
+        summary_log = []
+
+        # Copy (and downscale) the source images into the output directory
+        image_filenames, num_orig_images = process_data_utils.get_image_filenames(self.data, self.max_dataset_size)
+        copied_image_paths = process_data_utils.copy_images_list(
+            image_filenames,
+            image_dir=image_dir,
+            verbose=self.verbose,
+            num_downscales=self.num_downscales,
+        )
+        num_frames = len(copied_image_paths)
+
+        # Map each original image stem to its new dataset-relative path
+        copied_image_paths = [Path("images/" + copied_image_path.name) for copied_image_path in copied_image_paths]
+        original_names = [image_path.stem for image_path in image_filenames]
+        image_filename_map = dict(zip(original_names, copied_image_paths))
+
+        if self.max_dataset_size > 0 and num_frames != num_orig_images:
+            summary_log.append(f"Started with {num_frames} images out of {num_orig_images} total")
+            summary_log.append(
+                "To change the size of the dataset add the argument [yellow]--max_dataset_size[/yellow] to "
+                f"larger than the current value ({self.max_dataset_size}), or -1 to use all images."
+            )
+        else:
+            summary_log.append(f"Started with {num_frames} images")
+
+        # Save json (abort first if no image was copied)
+        if num_frames == 0:
+            CONSOLE.print("[bold red]No images found, exiting")
+            sys.exit(1)
+        summary_log.extend(
+            meshroom_utils.meshroom_to_json(
+                image_filename_map=image_filename_map,
+                json_filename=self.json,
+                output_dir=self.output_dir,
+                ply_filename=self.ply,
+                verbose=self.verbose,
+            )
+        )
+
+        CONSOLE.rule("[bold green]:tada: :tada: :tada: All DONE :tada: :tada: :tada:")
+
+        for summary in summary_log:
+            CONSOLE.print(summary, justify="center")
+        CONSOLE.rule()
+
+
 @dataclass
 class _NoDefaultProcessRealityCapture:
     """Private class to order the parameters of ProcessRealityCapture in the right order for default values."""
@@ -529,6 +630,7 @@ def main(self) -> None:
         Annotated[ProcessRealityCapture, tyro.conf.subcommand(name="realitycapture")],
         Annotated[ProcessRecord3D, tyro.conf.subcommand(name="record3d")],
         Annotated[ProcessODM, tyro.conf.subcommand(name="odm")],
+        Annotated[ProcessMeshroom, tyro.conf.subcommand(name="meshroom")],
     ]
 
 # Add aria subcommand if projectaria_tools is installed.