From 5d1a85b0fabefd22ea58262c883c3b9fb61b4380 Mon Sep 17 00:00:00 2001 From: Ryan Co Date: Mon, 9 Mar 2026 23:15:46 -0400 Subject: [PATCH] pi zarr training configs --- egomimic/algo/pi.py | 22 ++-- .../hydra/launcher/submitit_pace.yaml | 2 +- .../model/{pi0.5.yaml => pi0.5_bc_aria.yaml} | 2 +- .../hydra_configs/model/pi0.5_bc_eva.yaml | 56 +++++++++ .../hydra_configs/model/pi0.5_bc_mecka.yaml | 56 +++++++++ .../hydra_configs/model/pi0.5_bc_scale.yaml | 56 +++++++++ .../model/pi0.5_cotrain_eva_aria.yaml | 64 ++++++++++ egomimic/hydra_configs/train_zarr_pi.yaml | 114 ++++++++++++++++++ egomimic/utils/aws/aws_data_utils.py | 15 ++- 9 files changed, 374 insertions(+), 13 deletions(-) rename egomimic/hydra_configs/model/{pi0.5.yaml => pi0.5_bc_aria.yaml} (90%) create mode 100644 egomimic/hydra_configs/model/pi0.5_bc_eva.yaml create mode 100644 egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml create mode 100644 egomimic/hydra_configs/model/pi0.5_bc_scale.yaml create mode 100644 egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml create mode 100644 egomimic/hydra_configs/train_zarr_pi.yaml diff --git a/egomimic/algo/pi.py b/egomimic/algo/pi.py index f28158bb..27fef0d7 100644 --- a/egomimic/algo/pi.py +++ b/egomimic/algo/pi.py @@ -86,19 +86,19 @@ def __init__( self.camera_keys[embodiment_id] = [] self.proprio_keys[embodiment_id] = [] self.lang_keys[embodiment_id] = [] - for key in data_schematic.keys_of_type("action_keys"): + for key in data_schematic.keys_of_type("action_keys", embodiment_id): if ( data_schematic.is_key_with_embodiment(key, embodiment_id) and key == self.ac_keys[embodiment] ): self.ac_keys[embodiment_id] = key - for key in data_schematic.keys_of_type("camera_keys"): + for key in data_schematic.keys_of_type("camera_keys", embodiment_id): if data_schematic.is_key_with_embodiment(key, embodiment_id): self.camera_keys[embodiment_id].append(key) - for key in data_schematic.keys_of_type("proprio_keys"): + for key in data_schematic.keys_of_type("proprio_keys", embodiment_id): if data_schematic.is_key_with_embodiment(key, embodiment_id): self.proprio_keys[embodiment_id].append(key) - for key in data_schematic.keys_of_type("lang_keys"): + for key in data_schematic.keys_of_type("lang_keys", embodiment_id): if data_schematic.is_key_with_embodiment(key, embodiment_id): self.lang_keys[embodiment_id].append(key) @@ -165,12 +165,11 @@ def process_batch_for_training(self, batch): """ processed_batch = {} - for embodiment_id, _batch in batch.items(): + for embodiment_name, _batch in batch.items(): + embodiment_id = get_embodiment_id(embodiment_name) processed_batch[embodiment_id] = {} for key, value in _batch.items(): - key_name = self.data_schematic.lerobot_key_to_keyname( - key, embodiment_id - ) + key_name = self.data_schematic.zarr_key_to_keyname(key, embodiment_id) if key_name is not None: processed_batch[embodiment_id][key_name] = value @@ -196,6 +195,13 @@ def process_batch_for_training(self, batch): processed_batch[embodiment_id] = self.data_schematic.normalize_data( processed_batch[embodiment_id], embodiment_id ) + processed_batch[embodiment_id]["embodiment"] = torch.tensor( + [embodiment_id], device=device, dtype=torch.int64 + ) + + for key, value in processed_batch[embodiment_id].items(): + if isinstance(value, torch.Tensor) and value.dtype == torch.float64: + processed_batch[embodiment_id][key] = value.float() if not processed_batch: raise ValueError( diff --git a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml index 2d9cd957..34a1294b 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml @@ -11,7 +11,7 @@ cpus_per_task: 8 # Number of CPUs per task (ma nodes: ${launch_params.nodes} # Number of nodes tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:h200:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count (h100 for H100 GPUs) -qos: "short" # Slurm QoS +qos: "inferno" # Slurm QoS mem_per_gpu: 250G timeout_min: 2880 # Timeout in minutes (48 hours) # exclude: "protocol, puma" # Nodes to exclude diff --git a/egomimic/hydra_configs/model/pi0.5.yaml b/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml similarity index 90% rename from egomimic/hydra_configs/model/pi0.5.yaml rename to egomimic/hydra_configs/model/pi0.5_bc_aria.yaml index c1af0cc4..0e9a5966 100644 --- a/egomimic/hydra_configs/model/pi0.5.yaml +++ b/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml @@ -20,7 +20,7 @@ robomimic_model: config: pytorch_training_precision: bfloat16 - pytorch_weight_path: /storage/home/hcoda1/5/rpunamiya6/cedar-dx/rpunamiya6/Projects/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + pytorch_weight_path: /storage/project/r-dxu345-0/rco3/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch model: pi05: true action_dim: 32 diff --git a/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml b/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml new file mode 100644 index 00000000..d1886d53 --- /dev/null +++ b/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml @@ -0,0 +1,56 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.pi.PI + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + eva_bimanual: "actions_cartesian" + domains: ["eva_bimanual"] + + action_converters: + rules: + EVA_BIMANUAL: + _target_: egomimic.utils.action_utils.RobotBimanualCartesianEuler + # optional fallback if no match is found + fallback: + _target_: egomimic.utils.action_utils.BaseActionConverter + + config: + pytorch_training_precision: bfloat16 + pytorch_weight_path: /storage/project/r-dxu345-0/rco3/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + model: + pi05: true + action_dim: 32 + action_horizon: 100 + max_token_len: 180 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + +scheduler: + _target_: transformers.get_cosine_schedule_with_warmup + _partial_: true + num_warmup_steps: 1000 + num_training_steps: 100000 + num_cycles: 0.5 \ No newline at end of file diff --git a/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml b/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml new file mode 100644 index 00000000..e7d2117c --- /dev/null +++ b/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml @@ -0,0 +1,56 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.pi.PI + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "mecka" # change to base_half if using half res + extrinsics_key: "mecka" + ac_keys: + mecka_bimanual: "actions_cartesian" + domains: ["mecka_bimanual"] + + action_converters: + rules: + MECKA_BIMANUAL: + _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + # optional fallback if no match is found + fallback: + _target_: egomimic.utils.action_utils.BaseActionConverter + + config: + pytorch_training_precision: bfloat16 + pytorch_weight_path: /storage/project/r-dxu345-0/rco3/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + model: + pi05: true + action_dim: 32 + action_horizon: 100 + max_token_len: 180 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + +scheduler: + _target_: transformers.get_cosine_schedule_with_warmup + _partial_: true + num_warmup_steps: 1000 + num_training_steps: 100000 + num_cycles: 0.5 \ No newline at end of file diff --git a/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml b/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml new file mode 100644 index 00000000..521abbaf --- /dev/null +++ b/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml @@ -0,0 +1,56 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.pi.PI + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "scale" # change to base_half if using half res + extrinsics_key: "scale" + ac_keys: + scale_bimanual: "actions_cartesian" + domains: ["scale_bimanual"] + + action_converters: + rules: + SCALE_BIMANUAL: + _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + # optional fallback if no match is found + fallback: + _target_: egomimic.utils.action_utils.BaseActionConverter + + config: + pytorch_training_precision: bfloat16 + pytorch_weight_path: /storage/project/r-dxu345-0/rco3/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + model: + pi05: true + action_dim: 32 + action_horizon: 100 + max_token_len: 180 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + +scheduler: + _target_: transformers.get_cosine_schedule_with_warmup + _partial_: true + num_warmup_steps: 1000 + num_training_steps: 100000 + num_cycles: 0.5 \ No newline at end of file diff --git a/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml new file mode 100644 index 00000000..c0521f56 --- /dev/null +++ b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml @@ -0,0 +1,64 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.pi.PI + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "ariaJun7" + ac_keys: + eva_bimanual: "actions_cartesian" + aria_bimanual: "actions_cartesian" + domains: ["eva_bimanual", "aria_bimanual"] + + action_converters: + rules: + EVA_BIMANUAL: + _target_: egomimic.utils.action_utils.RobotBimanualCartesianEuler + ARIA_BIMANUAL: + _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + # optional fallback if no match is found + fallback: + _target_: egomimic.utils.action_utils.BaseActionConverter + + config: + pytorch_training_precision: bfloat16 + pytorch_weight_path: /storage/project/r-dxu345-0/rco3/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + model: + pi05: true + action_dim: 32 + action_horizon: 100 + max_token_len: 180 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-4 + betas: [0.9, 0.999] + eps: 1e-8 + weight_decay: 0.0 + +scheduler: + _target_: transformers.get_cosine_schedule_with_warmup + _partial_: true + num_warmup_steps: 1000 + num_training_steps: 100000 + num_cycles: 0.5 \ No newline at end of file diff --git a/egomimic/hydra_configs/train_zarr_pi.yaml b/egomimic/hydra_configs/train_zarr_pi.yaml new file mode 100644 index 00000000..c7d0e0b1 --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_pi.yaml @@ -0,0 +1,114 @@ +defaults: + - model: pi0.5_bc_eva + - paths: default + - trainer: debug + - debug: null + - logger: debug + - data: eva + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: test +description: test +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + base_0_rgb: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_0_rgb: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_0_rgb: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + base_0_rgb: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + base_0_rgb: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + base_0_rgb: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + viz_img_key: + eva_bimanual: + base_0_rgb + aria_bimanual: + base_0_rgb + mecka_bimanual: + base_0_rgb + scale_bimanual: + base_0_rgb + +seed: 42 \ No newline at end of file diff --git a/egomimic/utils/aws/aws_data_utils.py b/egomimic/utils/aws/aws_data_utils.py index 2cc7f631..ba0602a2 100644 --- a/egomimic/utils/aws/aws_data_utils.py +++ b/egomimic/utils/aws/aws_data_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import warnings from pathlib import Path import boto3 @@ -13,12 +14,20 @@ def _uses_r2_endpoint(endpoint_url: str | None) -> bool: return bool(endpoint_url and "r2.cloudflarestorage.com" in endpoint_url) -def load_env(path="~/.egoverse_env"): +def load_env(path="~/.egoverse_env", required: bool = False): p = Path(path).expanduser() if not p.exists(): - raise ValueError( - f"Env file {p} does not exist, run ./egomimic/utils/aws/setup_secret.sh" + if required: + raise ValueError( + f"Env file {p} does not exist, run ./egomimic/utils/aws/setup_secret.sh" + ) + warnings.warn( + f"Env file {p} does not exist; AWS/R2 env vars not set. " + "Run ./egomimic/utils/aws/setup_secret.sh if you need S3/R2.", + UserWarning, + stacklevel=2, ) + return for line in p.read_text().splitlines(): line = line.strip() if not line or line.startswith("#") or "=" not in line: