From 5bfed2190b7d3fabea52feeb57271b0af6304008 Mon Sep 17 00:00:00 2001 From: ElmoPA Date: Sat, 28 Feb 2026 00:44:36 -0500 Subject: [PATCH 1/2] Latent Flow matching that works regardless of dimension dim --- .../hpt_cotrain_flow_shared_head_latent.yaml | 75 +++++ egomimic/models/conv/temporal_enc_dec.py | 305 ++++++++++++++++++ egomimic/models/denoising_policy.py | 18 +- egomimic/models/fm_policy.py | 2 + 4 files changed, 391 insertions(+), 9 deletions(-) create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml create mode 100644 egomimic/models/conv/temporal_enc_dec.py diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml new file mode 100644 index 00000000..2d45f799 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -0,0 +1,75 @@ +defaults: + - hpt_cotrain_enc_dec_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_cartesian" + aria_bimanual: "actions_cartesian" + shared_ac_key: "actions_cartesian" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + infer_ac_dims: + eva_bimanual: 14 + aria_bimanual: 14 + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 6 + cond_dim: 256 + hidden_dim: 128 + act_dim: 14 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + latent_map: + eva_bimanual: + encoder: + _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + aria_keypoints: + encoder: + _target_: 
egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1400 + eta_min: 1e-5 diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py new file mode 100644 index 00000000..4d438c82 --- /dev/null +++ b/egomimic/models/conv/temporal_enc_dec.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +from typing import List + +import torch +import torch.nn as nn + + +class SmallTemporalEncoder(nn.Module): + """ + Fix temporal encoder for 100 seq of actiona + """ + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + use_layernorm: bool = True, + ): + super().__init__() + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3), + act, + nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2), + act, + nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3), + act, + ] + + + hidden_dim = 64 + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(action_dim*2, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: (B, T, D) or (T, D) + Output: (B, K, H) or (K, H) + """ + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + x = x.transpose(1, 2) # (B, D, T) + x = 
self.down(x) # (B, D, K) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + + return x.squeeze(0) if squeeze_B else x + +class SmallTemporalDecoder(nn.Module): + """ + Decoder that mirrors SmallTemporalEncoder: + Enc convs (over time, channels-first): + (D -> 2D) k=8 s=2 p=3 + (2D -> 2D) k=8 s=2 p=2 + (2D -> 2D) k=8 s=2 p=3 + For T=100 this encoder produces K=12. + + This decoder maps: + Input: (B, K=12, H=64) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.K = K + self.T = T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + C2 = action_dim * 2 + + self.proj = nn.Linear(hidden_dim, C2) + self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() + + self.up = nn.Sequential( + nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0), + act, + nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0), + act, + nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + if K != self.K: + raise ValueError(f"Expected K={self.K}, got {K}") + + x = self.norm(self.proj(z)) # (B, K, 2D) + x = x.transpose(1, 2) # (B, 2D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, 
expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + + return x.squeeze(0) if squeeze_B else x + +class LargeTemporalEncoder(nn.Module): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. + Output: (B, K=12, H) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.expect_T = expect_T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + D = action_dim + + self.down = nn.Sequential( + nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 + act, + nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2), # 50 -> 24 + act, + nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3), # 24 -> 12 + act, + ) + + self.proj = nn.Linear(action_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + B, T, D = x.shape + if D != self.action_dim: + raise ValueError(f"Expected D={self.action_dim}, got {D}") + if self.expect_T is not None and T != self.expect_T: + raise ValueError(f"Expected T={self.expect_T}, got {T}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D/2, K=12) + x = x.transpose(1, 2) # (B, K, D/2) + x = self.proj(x) # (B, K, H) + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalDecoder(nn.Module): + """ + Decoder that mirrors LargeTemporalEncoder: + time: 12 -> 24 -> 50 -> 100 + channels: H 
-> D/2 -> D + Input: (B, K=12, H) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.half_dim = action_dim // 2 + self.hidden_dim = hidden_dim + self.K = K + self.T = T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + self.proj = nn.Linear(hidden_dim, action_dim) + self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() + + # Mirrors paddings/strides/kernels in reverse. + # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. + self.up = nn.Sequential( + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + act, + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0), + act, + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + if K != self.K: + raise ValueError(f"Expected K={self.K}, got {K}") + + x = self.norm(self.proj(z)) # (B, K, D/2) + x = x.transpose(1, 2) # (B, D/2, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + return x.squeeze(0) if squeeze_B else x + + +def 
count_params(module: nn.Module, trainable_only: bool = False) -> int: + if trainable_only: + return sum(p.numel() for p in module.parameters() if p.requires_grad) + return sum(p.numel() for p in module.parameters()) + + +def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: + total = 0 + for name, p in module.named_parameters(): + if trainable_only and not p.requires_grad: + continue + n = p.numel() + total += n + print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") + print(f"\nTOTAL params: {total}") + +if __name__ == "__main__": + B, T, D = 8, 100, 140 + + enc = LargeTemporalEncoder(action_dim=D) + dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + \ No newline at end of file diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 645a8c44..25ccc641 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -68,7 +68,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference(self, noise, global_cond, generator=None) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. 
""" @@ -78,13 +78,13 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator) + return self.inference(noise, global_cond, generator, embodiment_name) - def forward(self, global_cond): + def forward(self, global_cond, embodiment_name): cond, embodiment = global_cond - return self.sample_action(cond, embodiment) + return self.sample_action(cond, embodiment, embodiment_name) - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. """ @@ -96,7 +96,7 @@ def loss_fn(self, pred, target): """ return F.mse_loss(pred, target) - def preprocess_compute_loss(self, global_cond, data): + def preprocess_compute_loss(self, global_cond, data, embodiment_name): if self.pooling == "mean": global_cond = global_cond.mean(dim=1) elif self.pooling == "flatten": @@ -121,7 +121,7 @@ def preprocess_compute_loss(self, global_cond, data): return actions, global_cond - def compute_loss(self, global_cond, data): - actions, global_cond = self.preprocess_compute_loss(global_cond, data) - pred, target = self.predict(actions, global_cond) + def compute_loss(self, global_cond, data, embodiment_name): + actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name) + pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index e41f4943..551853a6 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -26,12 +26,14 @@ def __init__( action_horizon, infer_ac_dims, num_inference_steps=None, + encoder_map=None, **kwargs, ): super().__init__( model, action_horizon, 
infer_ac_dims, num_inference_steps, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") + self.encoder_map = encoder_map def step(self, x_t, t, global_cond): if len(t.shape) != 1: From 69197986c5341617714d566856d13df6ff99ac54 Mon Sep 17 00:00:00 2001 From: ElmoPA Date: Thu, 5 Mar 2026 16:14:20 -0500 Subject: [PATCH 2/2] Changes for latent flow --- egomimic/algo/hpt.py | 11 +- egomimic/hydra_configs/data/aria_debug.yaml | 39 ++ .../hydra_configs/data/eva_human_cotrain.yaml | 8 +- .../data/eva_human_keypoints_cotrain.yaml | 73 +++ .../eva_human_keypoints_cotrain_wrist.yaml | 77 +++ .../hydra/launcher/submitit.yaml | 20 +- .../hydra/launcher/submitit_pace.yaml | 3 +- egomimic/hydra_configs/logger/wandb.yaml | 2 +- .../hpt_cotrain_flow_aria_keypoints.yaml | 54 ++ .../hpt_cotrain_flow_head_latent_aria.yaml | 61 ++ ...hpt_cotrain_flow_head_latent_aria_mlp.yaml | 74 +++ .../hpt_cotrain_flow_shared_head_latent.yaml | 33 +- ...cotrain_flow_shared_head_latent_large.yaml | 78 +++ ...n_flow_shared_head_latent_large_wrist.yaml | 78 +++ ...t_cotrain_flow_shared_head_latent_mlp.yaml | 74 +++ ...ain_flow_shared_head_latent_mlp_wrist.yaml | 74 +++ .../model/hpt_cotrain_keypoints_base.yaml | 160 ++++++ .../model/hpt_cotrain_keypoints_wrist.yaml | 160 ++++++ egomimic/hydra_configs/train_zarr_latent.yaml | 120 ++++ .../hydra_configs/train_zarr_latent_aria.yaml | 120 ++++ .../train_zarr_latent_wrist.yaml | 120 ++++ egomimic/hydra_configs/trainer/ddp.yaml | 2 +- egomimic/hydra_configs/trainer/debug.yaml | 2 +- egomimic/hydra_configs/trainer/default.yaml | 2 +- .../eva_cartesian_aria_keypoints.yaml | 20 +- .../eva_cartesian_aria_keypoints_wrist.yaml | 16 + egomimic/models/codec/identity.py | 10 + egomimic/models/codec/mlp.py | 16 + egomimic/models/codec/temporal_enc_dec.py | 520 ++++++++++++++++++ egomimic/models/conv/temporal_enc_dec.py | 305 ---------- egomimic/models/denoising_policy.py | 68 ++- egomimic/models/fm_policy.py | 60 +- egomimic/pl_utils/pl_model.py | 17 +- 
egomimic/rldb/embodiment/embodiment.py | 29 +- egomimic/rldb/embodiment/eva.py | 77 +-- egomimic/rldb/embodiment/human.py | 123 +++-- egomimic/rldb/zarr/action_chunk_transforms.py | 68 ++- .../scripts/tutorials/zarr_data_viz.ipynb | 354 ++---------- egomimic/scripts/tutorials/zarr_data_viz.py | 88 +++ egomimic/trainHydra.py | 7 +- egomimic/train_zarr.yaml | 111 ++++ egomimic/utils/pose_utils.py | 4 + egomimic/utils/viz_utils.py | 7 +- 43 files changed, 2564 insertions(+), 781 deletions(-) create mode 100644 egomimic/hydra_configs/data/aria_debug.yaml create mode 100644 egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml create mode 100644 egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml create mode 100644 egomimic/hydra_configs/train_zarr_latent.yaml create mode 100644 egomimic/hydra_configs/train_zarr_latent_aria.yaml create mode 100644 egomimic/hydra_configs/train_zarr_latent_wrist.yaml create mode 100644 egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml create mode 100644 egomimic/models/codec/identity.py create mode 100644 egomimic/models/codec/mlp.py create mode 100644 egomimic/models/codec/temporal_enc_dec.py 
delete mode 100644 egomimic/models/conv/temporal_enc_dec.py create mode 100644 egomimic/scripts/tutorials/zarr_data_viz.py create mode 100644 egomimic/train_zarr.yaml diff --git a/egomimic/algo/hpt.py b/egomimic/algo/hpt.py index 6ae47832..929b7afe 100644 --- a/egomimic/algo/hpt.py +++ b/egomimic/algo/hpt.py @@ -359,7 +359,7 @@ def stem_process(self, domain, data): feat_dict = {} for modality in self.modalities.get(domain, []) + self.shared_keys: if modality not in data: - continue + raise ValueError(f"Modality {modality} not found in data") if modality in self.shared_keys: domain = "shared" @@ -829,7 +829,6 @@ def __init__( self.domains = domains.copy() self.auxiliary_ac_keys = auxiliary_ac_keys.copy() self.shared_ac_key = kwargs.get("shared_ac_key", None) - self.is_6dof = kwargs.get("6dof", False) self.kinematics_solver = kwargs.get("kinematics_solver", None) model = HPTModel(**trunk) @@ -1282,13 +1281,16 @@ def compute_losses(self, predictions, batch): embodiment_name = get_embodiment(embodiment_id).lower() bc_loss = predictions[f"{embodiment_name}_loss"] scaled_bc_loss = bc_weight * bc_loss - total_action_loss += scaled_bc_loss + total_action_loss = total_action_loss + scaled_bc_loss loss_dict[f"{embodiment_name}_loss"] = bc_loss # for logging if self.ot: loss_dict["ot_loss"] = predictions["ot_loss"] loss_dict["avg_feature_distance"] = predictions["avg_feature_distance"] - total_action_loss += ot_weight * self.temperature * predictions["ot_loss"] + total_action_loss = ( + total_action_loss + + ot_weight * self.temperature * predictions["ot_loss"] + ) loss_dict["action_loss"] = total_action_loss / len(self.domains) return loss_dict @@ -1372,7 +1374,6 @@ def _robomimic_to_hpt_data( if key in batch: data[key] = batch[key] - data["is_6dof"] = self.is_6dof data["pad_mask"] = batch["pad_mask"] data["embodiment"] = batch["embodiment"] diff --git a/egomimic/hydra_configs/data/aria_debug.yaml b/egomimic/hydra_configs/data/aria_debug.yaml new file mode 100644 index 
00000000..2c0b1c54 --- /dev/null +++ b/egomimic/hydra_configs/data/aria_debug.yaml @@ -0,0 +1,39 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + episode_hash: "2025-11-27-23-44-43-234000" + mode: total +valid_datasets: + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + episode_hash: "2025-11-27-23-44-43-234000" + mode: total +train_dataloader_params: + aria_bimanual: + batch_size: 64 + num_workers: 10 +valid_dataloader_params: + aria_bimanual: + batch_size: 64 + num_workers: 10 diff --git a/egomimic/hydra_configs/data/eva_human_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_cotrain.yaml index ea70acc9..357e388b 100644 --- a/egomimic/hydra_configs/data/eva_human_cotrain.yaml +++ b/egomimic/hydra_configs/data/eva_human_cotrain.yaml @@ -4,7 +4,7 @@ train_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: 
egomimic.rldb.embodiment.eva.Eva.get_keymap transform_list: @@ -16,7 +16,7 @@ train_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.human.Aria.get_keymap mode: cartesian @@ -31,7 +31,7 @@ valid_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap transform_list: @@ -43,7 +43,7 @@ valid_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.human.Aria.get_keymap mode: cartesian diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml new file mode 100644 index 00000000..81cb1065 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml @@ -0,0 +1,73 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: 
egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 256 + num_workers: 6 + aria_bimanual: + batch_size: 256 + num_workers: 6 +valid_dataloader_params: + eva_bimanual: + batch_size: 256 + num_workers: 6 + aria_bimanual: + batch_size: 256 + num_workers: 6 diff --git 
a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml new file mode 100644 index 00000000..53610780 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml @@ -0,0 +1,77 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian_wristframe_ypr + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints_wristframe_ypr + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + lab: "rl2" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian_wristframe_ypr + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: 
egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints_wristframe_ypr + filters: + robot_name: "aria_bimanual" + task: "fold_clothes" + lab: "rl2" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 6 + aria_bimanual: + batch_size: 64 + num_workers: 6 +valid_dataloader_params: + eva_bimanual: + batch_size: 16 + num_workers: 6 + aria_bimanual: + batch_size: 16 + num_workers: 6 diff --git a/egomimic/hydra_configs/hydra/launcher/submitit.yaml b/egomimic/hydra_configs/hydra/launcher/submitit.yaml index c56f2cd5..b068685e 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit.yaml @@ -4,15 +4,15 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration -name: ${hydra.job.name} # Default job name -partition: "rl2-lab" # Slurm partition (e.g., 'gpu' or 'compute') -account: "rl2-lab" # Slurm account (e.g., 'my_account') -cpus_per_task: 12 # Number of CPUs per task -nodes: ${launch_params.nodes} # Number of nodes -tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node +name: ${hydra.job.name} # Default job name +partition: "hoffman-lab" # Slurm partition (e.g., 'gpu' or 'compute') +account: "hoffman-lab" # Slurm account (e.g., 'my_account') +cpus_per_task: 12 # Number of CPUs per task +nodes: ${launch_params.nodes} # Number of nodes +tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:a40:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count -qos: "short" # Slurm QoS 
-timeout_min: 2880 # Timeout in minutes (48 hours) -exclude: "protocol, puma" # Nodes to exclude +qos: "short" # Slurm QoS +timeout_min: 2880 # Timeout in minutes (48 hours) +exclude: "protocol, puma" # Nodes to exclude additional_parameters: - requeue: true \ No newline at end of file + requeue: true diff --git a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml index 2d9cd957..d34c2b98 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml @@ -4,6 +4,7 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration + name: ${hydra.job.name} # Default job name partition: "gpu-h200" # Slurm partition account: "gts-dxu345-rl2" # Slurm account @@ -11,7 +12,7 @@ cpus_per_task: 8 # Number of CPUs per task (ma nodes: ${launch_params.nodes} # Number of nodes tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:h200:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count (h100 for H100 GPUs) -qos: "short" # Slurm QoS +qos: "inferno" # Slurm QoS mem_per_gpu: 250G timeout_min: 2880 # Timeout in minutes (48 hours) # exclude: "protocol, puma" # Nodes to exclude diff --git a/egomimic/hydra_configs/logger/wandb.yaml b/egomimic/hydra_configs/logger/wandb.yaml index 6f574bd5..dc5d2ea3 100644 --- a/egomimic/hydra_configs/logger/wandb.yaml +++ b/egomimic/hydra_configs/logger/wandb.yaml @@ -7,7 +7,7 @@ wandb: offline: False id: "${name}_${description}_${now:%Y-%m-%d_%H-%M-%S}" # pass correct id to resume experiment! 
anonymous: null # enable anonymous logging - project: "zarr_test" + project: "keypoints_cotrain" log_model: False # upload lightning ckpts prefix: "" # a string to put at the beginning of metric keys entity: "rl2-group" # set to name of your wandb team diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml new file mode 100644 index 00000000..7c99a560 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml @@ -0,0 +1,54 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.identity.Identity + decoder: + _target_: egomimic.models.codec.identity.Identity + stem_specs: + eva_bimanual: null + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml new file mode 100644 index 00000000..deaf8e22 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml @@ -0,0 +1,61 @@ +defaults: + - hpt_cotrain_keypoints_base 
+ +robomimic_model: + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 5 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 5 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml new file mode 100644 index 00000000..b020b469 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + 
time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml index 2d45f799..b256d18d 100644 --- a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -1,11 +1,11 @@ defaults: - - hpt_cotrain_enc_dec_base + - hpt_cotrain_keypoints_base robomimic_model: ac_keys: - eva_bimanual: "actions_cartesian" - aria_bimanual: "actions_cartesian" - shared_ac_key: "actions_cartesian" + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" 6dof: true diffusion: true @@ -20,43 +20,42 @@ robomimic_model: pooling: null padding: "zero" time_dist: "beta" - infer_ac_dims: - eva_bimanual: 14 - aria_bimanual: 14 model: _target_: egomimic.models.denoising_nets.CrossTransformer nblocks: 6 
cond_dim: 256 - hidden_dim: 128 - act_dim: 14 - act_seq: 100 + hidden_dim: 256 + act_dim: 128 + act_seq: 12 n_heads: 4 dropout: 0.1 mlp_layers: 4 mlp_ratio: 4 - latent_map: + embodiment_specs: eva_bimanual: + ac_dims: 14 encoder: - _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder action_dim: 14 hidden_dim: 128 activation: "gelu" use_layernorm: false decoder: - _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder action_dim: 14 hidden_dim: 128 activation: "gelu" use_layernorm: true - aria_keypoints: + aria_bimanual: + ac_dims: 140 encoder: - _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder action_dim: 140 hidden_dim: 128 activation: "gelu" use_layernorm: false decoder: - _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder action_dim: 140 hidden_dim: 128 activation: "gelu" @@ -71,5 +70,5 @@ optimizer: scheduler: _target_: torch.optim.lr_scheduler.CosineAnnealingLR _partial_: true - T_max: 1400 + T_max: 1800 eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml new file mode 100644 index 00000000..9236502a --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml @@ -0,0 +1,78 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + 
action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 2 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 2 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 7 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 7 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml new file mode 100644 index 00000000..95a1d2fd --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml @@ -0,0 +1,78 @@ +defaults: + - hpt_cotrain_keypoints_wrist + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + 
shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 2 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 2 + aria_bimanual: + ac_dims: 138 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 138 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 7 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 138 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 7 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml new file mode 100644 index 00000000..9d3e20aa --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: 
null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml new file mode 100644 index 00000000..e7e8cc46 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + 
time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 138 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 138 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 138 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml new file mode 100644 index 00000000..c03abc5b --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml @@ -0,0 +1,160 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 
reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 140 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + # state_wrist_pose: + # _target_: egomimic.models.hpt_nets.MLPPolicyStem + # input_dim: 14 + # output_dim: 256 + # widths: [256] + # specs: + # random_horizon_masking: false + # cross_attn: + # crossattn_latent: 16 + # crossattn_heads: 8 + # crossattn_dim_head: 64 + # crossattn_modality_dropout: 0.1 + # modality_embed_dim: 256 + + eva_bimanual: + state_ee_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + 
crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml new file mode 100644 index 00000000..68dea4a4 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml @@ -0,0 +1,160 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + 
intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 126 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + state_wrist_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 12 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + eva_bimanual: + state_ee_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + 
modality_embed_dim: 256 + right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/train_zarr_latent.yaml b/egomimic/hydra_configs/train_zarr_latent.yaml new file mode 100644 index 00000000..cbcb902a --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_large + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: 
checkpoints + - override hydra/launcher: submitit_pace + - _self_ + +name: test +description: normal_latent +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 0.2 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoints: # relative to wrist pose + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_pose: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + 
embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false diff --git a/egomimic/hydra_configs/train_zarr_latent_aria.yaml b/egomimic/hydra_configs/train_zarr_latent_aria.yaml new file mode 100644 index 00000000..6fbd90f8 --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent_aria.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_head_latent_aria + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: aria_debug + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_debug +description: aria_conv_mlp_xl +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 1.0 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + 
eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_positions: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false 
diff --git a/egomimic/hydra_configs/train_zarr_latent_wrist.yaml b/egomimic/hydra_configs/train_zarr_latent_wrist.yaml new file mode 100644 index 00000000..125f0076 --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent_wrist.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_large_wrist + - visualization: eva_cartesian_aria_keypoints_wrist + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain_wrist + - callbacks: checkpoints + - override hydra/launcher: submitit_pace + - _self_ + +name: test +description: wrist_latent +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 0.2 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + 
front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoints: # relative to wrist pose + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_pose: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false diff --git a/egomimic/hydra_configs/trainer/ddp.yaml b/egomimic/hydra_configs/trainer/ddp.yaml index d3d90aca..aee487a3 100644 --- a/egomimic/hydra_configs/trainer/ddp.yaml +++ b/egomimic/hydra_configs/trainer/ddp.yaml @@ -8,4 +8,4 @@ devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True check_val_every_n_epoch: 200 -num_sanity_val_steps: 0 \ No newline at end of file +num_sanity_val_steps: 0 diff --git a/egomimic/hydra_configs/trainer/debug.yaml b/egomimic/hydra_configs/trainer/debug.yaml index e3a9a1a5..905d3711 100644 --- a/egomimic/hydra_configs/trainer/debug.yaml +++ b/egomimic/hydra_configs/trainer/debug.yaml @@ -3,7 +3,7 @@ 
defaults: strategy: ddp_find_unused_parameters_true limit_train_batches: 5 -limit_val_batches: 20 +limit_val_batches: 3 check_val_every_n_epoch: 2 profiler: simple max_epochs: 4 diff --git a/egomimic/hydra_configs/trainer/default.yaml b/egomimic/hydra_configs/trainer/default.yaml index a6b47e35..391656bf 100644 --- a/egomimic/hydra_configs/trainer/default.yaml +++ b/egomimic/hydra_configs/trainer/default.yaml @@ -11,7 +11,7 @@ devices: 1 # mixed precision for extra speed-up precision: bf16 limit_train_batches: 100 -limit_val_batches: 300 +limit_val_batches: 80 # perform a validation loop every N training epochs check_val_every_n_epoch: 200 diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml index 8c4d1c91..a2311911 100644 --- a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml @@ -1,14 +1,10 @@ eva_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: egomimic.rldb.embodiment.eva.Eva.viz - _partial_: true - mode: traj - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.eva.Eva.viz_cartesian_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints aria_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: egomimic.rldb.embodiment.human.Aria.viz - _partial_: true - mode: keypoints - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.human.Aria.viz_keypoints_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml new file mode 100644 index 00000000..c2bceea0 --- /dev/null +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml @@ -0,0 
+1,16 @@ +eva_bimanual: + _target_: egomimic.rldb.embodiment.eva.Eva.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints + transform_list: + _target_: egomimic.rldb.embodiment.eva._build_eva_bimanual_revert_eef_frame_transform_list + is_quat: false +aria_bimanual: + _target_: egomimic.rldb.embodiment.human.Aria.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human._build_aria_keypoints_revert_eef_frame_transform_list + is_quat: false diff --git a/egomimic/models/codec/identity.py b/egomimic/models/codec/identity.py new file mode 100644 index 00000000..a40b9495 --- /dev/null +++ b/egomimic/models/codec/identity.py @@ -0,0 +1,10 @@ +import torch +import torch.nn as nn + + +class Identity(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x diff --git a/egomimic/models/codec/mlp.py b/egomimic/models/codec/mlp.py new file mode 100644 index 00000000..0ab021ab --- /dev/null +++ b/egomimic/models/codec/mlp.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn + + +class MLPProjection(nn.Module): + def __init__(self, input_dim: int, hidden_dim: int, n_layers: int, output_dim: int): + super().__init__() + layers = [nn.Linear(input_dim, hidden_dim), nn.GELU()] + for _ in range(n_layers - 1): + layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.GELU()]) + layers.append(nn.Linear(hidden_dim, output_dim)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x is in (B, T, D) -> (B, T, H) + return self.net(x) diff --git a/egomimic/models/codec/temporal_enc_dec.py b/egomimic/models/codec/temporal_enc_dec.py new file mode 100644 index 00000000..2a69ab33 --- /dev/null +++ b/egomimic/models/codec/temporal_enc_dec.py @@ -0,0 +1,520 @@ +from __future__ import annotations + +import torch +import torch.nn as nn + + 
+class SmallTemporalEncoder(nn.Module): + """ + Fix temporal encoder for 100 seq of actions + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 64, + use_layernorm: bool = True, + ): + super().__init__() + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + layers = [ + nn.Conv1d(action_dim, action_dim * 2, kernel_size=8, stride=2, padding=3), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=2 + ), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=3 + ), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(action_dim * 2, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: (B, T, D) or (T, D) + Output: (B, K, H) or (K, H) + """ + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D, K) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalDecoder(nn.Module): + """ + Decoder that mirrors SmallTemporalEncoder: + Enc convs (over time, channels-first): + (D -> 2D) k=8 s=2 p=3 + (2D -> 2D) k=8 s=2 p=2 + (2D -> 2D) k=8 s=2 p=3 + For T=100 this encoder produces K=12. 
+ + This decoder maps: + Input: (B, K=12, H=64) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + C2 = action_dim * 2 + + self.proj = nn.Linear(hidden_dim, C2) + self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() + + self.up = nn.Sequential( + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, 2D) + x = x.transpose(1, 2) # (B, 2D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalEncoder(nn.Module): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. 
+ Output: (B, K=12, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.expect_T = expect_T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + D = action_dim + + self.down = nn.Sequential( + nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=2 + ), # 50 -> 24 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=3 + ), # 24 -> 12 + self.act, + ) + + self.proj = nn.Linear(action_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + B, T, D = x.shape + if D != self.action_dim: + raise ValueError(f"Expected D={self.action_dim}, got {D}") + if self.expect_T is not None and T != self.expect_T: + raise ValueError(f"Expected T={self.expect_T}, got {T}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D/2, K=12) + x = x.transpose(1, 2) # (B, K, D/2) + x = self.proj(x) # (B, K, H) + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalDecoder(nn.Module): + """ + Decoder that mirrors LargeTemporalEncoder: + time: 12 -> 24 -> 50 -> 100 + channels: H -> D/2 -> D + Input: (B, K=12, H) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + 
use_layernorm: bool = True, + T: int = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.half_dim = action_dim // 2 + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + self.proj = nn.Linear(hidden_dim, action_dim) + self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() + + # Mirrors paddings/strides/kernels in reverse. + # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. + self.up = nn.Sequential( + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=2, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, D/2) + x = x.transpose(1, 2) # (B, D/2, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalEncoder_32_256(SmallTemporalEncoder): + """ + Fix temporal encoder for 100 seq of actiona + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 
256, + use_layernorm: bool = True, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers = [ + nn.Conv1d(action_dim, 512, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(512, hidden_dim) + proj_layers = [ + nn.Linear(512, hidden_dim), + self.act, + ] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend( + [nn.Linear(hidden_dim, hidden_dim)] + ) # TODO check if I need activation here later + self.proj = nn.Sequential(*proj_layers) + + +class SmallTemporalDecoder_32_256(SmallTemporalDecoder): + """ + Decoder that mirrors SmallTemporalEncoder_32_256: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers = [ + nn.ConvTranspose1d(512, action_dim, kernel_size=9, stride=3, padding=1), + ] + + self.up = nn.Sequential(*layers) + self.norm = nn.LayerNorm(512) if use_layernorm else nn.Identity() + proj_layers = [] + for _ in range(n_layers - 1): + proj_layers.extend([nn.Linear(hidden_dim, hidden_dim), self.act]) + proj_layers.extend([nn.Linear(hidden_dim, 512)]) + self.proj = nn.Sequential(*proj_layers) + + +class LargeTemporalEncoder_32_256(LargeTemporalEncoder): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. 
+ Output: (B, K=12, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + expect_T=expect_T, + ) + layers = [ + nn.Conv1d(action_dim, 2048, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + proj_layers = [ + nn.Linear(2048, 1024), + self.act, + nn.Linear(1024, hidden_dim), + self.act, + ] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend( + [nn.Linear(hidden_dim, hidden_dim)] + ) # TODO check if I need activation here later + self.proj = nn.Sequential(*proj_layers) + + +class LargeTemporalDecoder_32_256(LargeTemporalDecoder): + """ + Decoder that mirrors LargeTemporalEncoder_32_256: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + T: int = 100, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + T=T, + ) + + layers = [ + nn.ConvTranspose1d(2048, action_dim, kernel_size=9, stride=3, padding=1), + ] + + self.up = nn.Sequential(*layers) + proj_layers = [] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend([nn.Linear(hidden_dim, 2048)]) + self.proj = nn.Sequential(*proj_layers) + self.norm = nn.LayerNorm(2048) if use_layernorm else nn.Identity() + + +def count_params(module: nn.Module, trainable_only: bool = False) -> int: + if trainable_only: + return sum(p.numel() for p in module.parameters() if p.requires_grad) + return sum(p.numel() for p in module.parameters()) + + +def 
print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: + total = 0 + for name, p in module.named_parameters(): + if trainable_only and not p.requires_grad: + continue + n = p.numel() + total += n + print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") + print(f"\nTOTAL params: {total}") + + +if __name__ == "__main__": + B, T, D = 8, 100, 140 + + enc = LargeTemporalEncoder_32_256(action_dim=D, n_layers=5) + dec = LargeTemporalDecoder_32_256(action_dim=D, use_layernorm=True, n_layers=5) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is 8, 12, 64 + print("LargeTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + B, T, D = 8, 100, 14 + enc = SmallTemporalEncoder_32_256(action_dim=D, n_layers=5) + dec = SmallTemporalDecoder_32_256(action_dim=D, use_layernorm=True, n_layers=5) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is 8, 12, 64 + + print("SmallTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py deleted file mode 100644 index 4d438c82..00000000 --- a/egomimic/models/conv/temporal_enc_dec.py +++ /dev/null @@ -1,305 +0,0 @@ -from __future__ import annotations - -from typing import List - -import torch -import torch.nn as nn - - -class SmallTemporalEncoder(nn.Module): - """ - Fix temporal encoder for 100 seq of actiona - """ - def __init__( - self, - *, - action_dim: int, - activation: str = "gelu", - use_layernorm: bool = True, - ): - super().__init__() - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3), - act, - 
nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2), - act, - nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3), - act, - ] - - - hidden_dim = 64 - self.down = nn.Sequential(*layers) - self.proj = nn.Linear(action_dim*2, hidden_dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: (B, T, D) or (T, D) - Output: (B, K, H) or (K, H) - """ - squeeze_B = False - if x.dim() == 2: - x = x.unsqueeze(0) - squeeze_B = True - elif x.dim() != 3: - raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") - - x = x.transpose(1, 2) # (B, D, T) - x = self.down(x) # (B, D, K) - x = x.transpose(1, 2) # (B, K, D) - x = self.proj(x) # (B, K, H) - - return x.squeeze(0) if squeeze_B else x - -class SmallTemporalDecoder(nn.Module): - """ - Decoder that mirrors SmallTemporalEncoder: - Enc convs (over time, channels-first): - (D -> 2D) k=8 s=2 p=3 - (2D -> 2D) k=8 s=2 p=2 - (2D -> 2D) k=8 s=2 p=3 - For T=100 this encoder produces K=12. 
- - This decoder maps: - Input: (B, K=12, H=64) or (K, H) - Output: (B, T=100, D) or (T, D) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - K: int = 12, - T: int = 100, - ): - super().__init__() - self.action_dim = action_dim - self.hidden_dim = hidden_dim - self.K = K - self.T = T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - C2 = action_dim * 2 - - self.proj = nn.Linear(hidden_dim, C2) - self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() - - self.up = nn.Sequential( - nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0), - act, - nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0), - act, - nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - ) - - def forward(self, z: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if z.dim() == 2: - z = z.unsqueeze(0) - squeeze_B = True - elif z.dim() != 3: - raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") - - B, K, H = z.shape - if H != self.hidden_dim: - raise ValueError(f"Expected H={self.hidden_dim}, got {H}") - if K != self.K: - raise ValueError(f"Expected K={self.K}, got {K}") - - x = self.norm(self.proj(z)) # (B, K, 2D) - x = x.transpose(1, 2) # (B, 2D, K) - x = self.up(x) # (B, D, T) - if x.shape[-1] != self.T: - raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") - x = x.transpose(1, 2) # (B, T, D) - - return x.squeeze(0) if squeeze_B else x - -class LargeTemporalEncoder(nn.Module): - """ - Encoder for (B, T=100, D) that halves channels: D -> D/2, - and downsamples time: 100 -> 12. 
- Output: (B, K=12, H) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - expect_T: int | None = 100, - ): - super().__init__() - if action_dim % 2 != 0: - raise ValueError(f"action_dim must be even to halve. Got {action_dim}") - - self.action_dim = action_dim - self.hidden_dim = hidden_dim - self.expect_T = expect_T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - D = action_dim - - self.down = nn.Sequential( - nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 - act, - nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2), # 50 -> 24 - act, - nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3), # 24 -> 12 - act, - ) - - self.proj = nn.Linear(action_dim, hidden_dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if x.dim() == 2: - x = x.unsqueeze(0) - squeeze_B = True - elif x.dim() != 3: - raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") - - B, T, D = x.shape - if D != self.action_dim: - raise ValueError(f"Expected D={self.action_dim}, got {D}") - if self.expect_T is not None and T != self.expect_T: - raise ValueError(f"Expected T={self.expect_T}, got {T}") - - x = x.transpose(1, 2) # (B, D, T) - x = self.down(x) # (B, D/2, K=12) - x = x.transpose(1, 2) # (B, K, D/2) - x = self.proj(x) # (B, K, H) - return x.squeeze(0) if squeeze_B else x - - -class LargeTemporalDecoder(nn.Module): - """ - Decoder that mirrors LargeTemporalEncoder: - time: 12 -> 24 -> 50 -> 100 - channels: H -> D/2 -> D - Input: (B, K=12, H) or (K, H) - Output: (B, T=100, D) or (T, D) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - K: int = 12, - T: int = 100, 
- ): - super().__init__() - if action_dim % 2 != 0: - raise ValueError(f"action_dim must be even to halve. Got {action_dim}") - - self.action_dim = action_dim - self.half_dim = action_dim // 2 - self.hidden_dim = hidden_dim - self.K = K - self.T = T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - self.proj = nn.Linear(hidden_dim, action_dim) - self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() - - # Mirrors paddings/strides/kernels in reverse. - # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. - self.up = nn.Sequential( - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - act, - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0), - act, - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - ) - - def forward(self, z: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if z.dim() == 2: - z = z.unsqueeze(0) - squeeze_B = True - elif z.dim() != 3: - raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") - - B, K, H = z.shape - if H != self.hidden_dim: - raise ValueError(f"Expected H={self.hidden_dim}, got {H}") - if K != self.K: - raise ValueError(f"Expected K={self.K}, got {K}") - - x = self.norm(self.proj(z)) # (B, K, D/2) - x = x.transpose(1, 2) # (B, D/2, K) - x = self.up(x) # (B, D, T) - if x.shape[-1] != self.T: - raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") - x = x.transpose(1, 2) # (B, T, D) - return x.squeeze(0) if squeeze_B else x - - -def count_params(module: nn.Module, trainable_only: bool = False) -> int: - if trainable_only: - return sum(p.numel() for p in module.parameters() if p.requires_grad) - return sum(p.numel() for p in module.parameters()) - - -def 
print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: - total = 0 - for name, p in module.named_parameters(): - if trainable_only and not p.requires_grad: - continue - n = p.numel() - total += n - print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") - print(f"\nTOTAL params: {total}") - -if __name__ == "__main__": - B, T, D = 8, 100, 140 - - enc = LargeTemporalEncoder(action_dim=D) - dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True) - - x = torch.randn(B, T, D) - z = enc(x) - x_hat = dec(z) - - print(count_params(enc)) - print(count_params(enc, trainable_only=True)) - print_param_breakdown(enc) - - \ No newline at end of file diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 25ccc641..5c5225b0 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from egomimic.models.denoising_nets import ConditionalUnet1D +from egomimic.rldb.embodiment.embodiment import get_embodiment class DenoisingPolicy(nn.Module): @@ -23,29 +24,59 @@ def __init__( self, model: ConditionalUnet1D, action_horizon: int, - infer_ac_dims: dict, num_inference_steps: int = None, + embodiment_specs: dict = None, **kwargs, ): super().__init__() self.model = model self.action_horizon = action_horizon - self.infer_ac_dims = infer_ac_dims self.num_inference_steps = num_inference_steps + self.embodiment_specs = embodiment_specs + self.codec_enabled = False + + _codecs = {} + if embodiment_specs is not None: + for _emb_name, _spec in embodiment_specs.items(): + if _spec.get("encoder") is not None: + _codecs[f"{_emb_name}_encoder"] = _spec["encoder"] + if _spec.get("decoder") is not None: + _codecs[f"{_emb_name}_decoder"] = _spec["decoder"] + if _codecs: + self.codecs = nn.ModuleDict(_codecs) self.padding = kwargs.get("padding", None) self.pooling = kwargs.get("pooling", None) - self.model_type = kwargs.get("model_type", None) - - if not infer_ac_dims: 
- raise ValueError("infer_ac_dims must be a non-empty dict") for name, param in self.model.named_parameters(): if not param.requires_grad: print(f"[warn] {name} has requires_grad=False") total_params = sum(p.numel() for p in self.model.parameters()) + if self.embodiment_specs is not None: + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("ac_dims") is None: + raise ValueError(f"ac_dims must be specified for {embodiment_name}") + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("encoder") is not None: + encoder_params = sum( + p.numel() for p in spec["encoder"].parameters() + ) + self.codec_enabled = True + if spec.get("decoder") is not None: + decoder_params = sum( + p.numel() for p in spec["decoder"].parameters() + ) + self.codec_enabled = True + print( + f"[{embodiment_name}] Encoder params: {encoder_params / 1e6:.2f}M" + ) + print( + f"[{embodiment_name}] Decoder params: {decoder_params / 1e6:.2f}M" + ) + total_params += encoder_params + decoder_params + print( f"[{self.__class__.__name__}] Total trainable parameters: {total_params / 1e6:.2f}M" ) @@ -60,7 +91,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ( len(global_cond), self.action_horizon, - self.infer_ac_dims[embodiment_name], + self.embodiment_specs[embodiment_name].get("ac_dims"), ), dtype=global_cond.dtype, device=global_cond.device, @@ -68,7 +99,9 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. 
""" @@ -78,13 +111,15 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator, embodiment_name) + return self.inference(noise, global_cond, embodiment_name, generator) - def forward(self, global_cond, embodiment_name): - cond, embodiment = global_cond - return self.sample_action(cond, embodiment, embodiment_name) + def forward(self, global_cond): + cond, embodiment_name = global_cond + return self.sample_action(cond, embodiment_name) - def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. """ @@ -121,7 +156,10 @@ def preprocess_compute_loss(self, global_cond, data, embodiment_name): return actions, global_cond - def compute_loss(self, global_cond, data, embodiment_name): - actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name) + def compute_loss(self, global_cond, data): + embodiment_name = get_embodiment(data["embodiment"][0].item()).lower() + actions, global_cond = self.preprocess_compute_loss( + global_cond, data, embodiment_name + ) pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index 551853a6..27e74ee5 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -24,34 +24,37 @@ def __init__( self, model: ConditionalUnet1D, action_horizon, - infer_ac_dims, num_inference_steps=None, - encoder_map=None, + embodiment_specs=None, **kwargs, ): super().__init__( - model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs + model, action_horizon, num_inference_steps, 
embodiment_specs, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") - self.encoder_map = encoder_map + self.dt = -1.0 / self.num_inference_steps - def step(self, x_t, t, global_cond): + def step(self, x_t, t, global_cond, embodiment_name): if len(t.shape) != 1: t = torch.tensor([t], device=global_cond.device) - v_t = self.model(x_t, t, global_cond) + v_t = self.denoising_model(x_t, t, global_cond, embodiment_name) return x_t + self.dt * v_t, t + self.dt @override - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: self.dt = -1.0 / self.num_inference_steps x_t = noise time = torch.ones((len(global_cond)), device=global_cond.device) while time[0] >= -self.dt / 2: - x_t, time = self.step(x_t, time, global_cond) + x_t, time = self.step(x_t, time, global_cond, embodiment_name) return x_t @override - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: noise = torch.randn(actions.shape, device=actions.device) batch_shape = (actions.shape[0],) if self.time_dist == "beta": @@ -67,8 +70,45 @@ def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions - v_t = self.model(x_t, time, global_cond) + v_t = self.denoising_model(x_t, time, global_cond, embodiment_name) target = u_t pred = v_t return pred, target + + def denoising_model(self, x_t, time, global_cond, embodiment_name): + if self.codec_enabled: + x_t = self.embodiment_specs[embodiment_name]["encoder"](x_t) + else: + x_t = x_t + v_t = self.model(x_t, time, global_cond) + if self.codec_enabled: + v_t = self.embodiment_specs[embodiment_name]["decoder"](v_t) + else: + v_t = v_t + return v_t + + +if __name__ == "__main__": + import hydra + from omegaconf import OmegaConf 
+ + cfg = OmegaConf.load( + "/coc/flash7/paphiwetsa3/projects/EgoVerse/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml" + ) + model = hydra.utils.instantiate(cfg.robomimic_model.head_specs.shared) + + # test the model + aria_input = torch.randn(8, 100, 140) + global_cond = torch.randn(8, 64, 256) + aria_output = model.step( + aria_input, torch.tensor([0.0]), global_cond, "aria_bimanual" + ) + aria_output_inference = model.inference(aria_input, global_cond, "aria_bimanual") + aria_output_predict = model.predict(aria_input, global_cond, "aria_bimanual") + + eva_input = torch.randn(8, 100, 14) + eva_output = model.step(eva_input, torch.tensor([0.0]), global_cond, "eva_bimanual") + eva_output_inference = model.inference(eva_input, global_cond, "eva_bimanual") + eva_output_predict = model.predict(eva_input, global_cond, "eva_bimanual") + breakpoint() diff --git a/egomimic/pl_utils/pl_model.py b/egomimic/pl_utils/pl_model.py index d44c1270..5c75a643 100644 --- a/egomimic/pl_utils/pl_model.py +++ b/egomimic/pl_utils/pl_model.py @@ -24,10 +24,19 @@ class ModelWrapper(LightningModule): grad_norm_mad_min_count = 100 grad_norm_mad_window = 200 - def __init__(self, robomimic_model, optimizer, scheduler): + def __init__( + self, + robomimic_model, + optimizer, + scheduler, + enable_adaptive_grad_clip: bool = True, + ): """ Args: model (PolicyAlgo): robomimic model to wrap. + enable_adaptive_grad_clip: if False, the MAD-based spike detection + and clipping in on_after_backward is skipped (grad norm is still + logged, just never clipped). 
""" super().__init__() self.save_hyperparameters() @@ -40,6 +49,7 @@ def __init__(self, robomimic_model, optimizer, scheduler): self.params = self.model.nets["policy"].params except Exception: pass + self.enable_adaptive_grad_clip = enable_adaptive_grad_clip self.grad_norm_history = deque(maxlen=self.grad_norm_mad_window) self.val_image_buffer, self.val_counter = {}, {} @@ -96,7 +106,10 @@ def on_after_backward(self): grad_norm_val = float(grad_norm) info = {"policy_grad_norms_raw": grad_norm_val} - if len(self.grad_norm_history) >= self.grad_norm_mad_min_count: + if ( + self.enable_adaptive_grad_clip + and len(self.grad_norm_history) >= self.grad_norm_mad_min_count + ): values = np.array(self.grad_norm_history, dtype=np.float32) median = float(np.median(values)) mad = float(np.median(np.abs(values - median))) diff --git a/egomimic/rldb/embodiment/embodiment.py b/egomimic/rldb/embodiment/embodiment.py index 13dc1b8f..798760d8 100644 --- a/egomimic/rldb/embodiment/embodiment.py +++ b/egomimic/rldb/embodiment/embodiment.py @@ -1,3 +1,4 @@ +import copy from abc import ABC from enum import Enum @@ -59,13 +60,31 @@ def get_keymap(): raise NotImplementedError @classmethod - def viz_cartesian_gt_preds(cls, predictions, batch, image_key, action_key): + def viz_gt_preds( + cls, + predictions, + batch, + image_key, + action_key, + transform_list=None, + mode="cartesian", + **kwargs, + ): embodiment_id = batch["embodiment"][0].item() embodiment_name = get_embodiment(embodiment_id).lower() + pred_actions = predictions[ + f"{embodiment_name}_{action_key}" + ] # TODO: make this work with groundtruth, clone batch and replace actions_keypoints with pred_actions + if transform_list is not None: + pred_batch = copy.deepcopy(batch) + pred_batch[action_key] = pred_actions + batch = cls.apply_transform(batch, transform_list) + pred_batch = cls.apply_transform(pred_batch, transform_list) + pred_actions = pred_batch[action_key] + images = batch[image_key] actions = batch[action_key] - 
pred_actions = predictions[f"{embodiment_name}_{action_key}"] ims_list = [] images = _to_numpy(images) actions = _to_numpy(actions) @@ -74,8 +93,8 @@ def viz_cartesian_gt_preds(cls, predictions, batch, image_key, action_key): image = images[i] action = actions[i] pred_action = pred_actions[i] - ims = cls.viz(image, action, mode="traj", color="Reds") - ims = cls.viz(ims, pred_action, mode="traj", color="Greens") + ims = cls.viz(image, action, mode=mode, color="Reds", **kwargs) + ims = cls.viz(ims, pred_action, mode=mode, color="Greens", **kwargs) ims_list.append(ims) ims = np.stack(ims_list, axis=0) return ims @@ -94,7 +113,7 @@ def apply_transform(cls, batch, transform_list: list[Transform]): results = [] for i in range(batch_size): sample = { - k: (v[i].numpy() if isinstance(v, torch.Tensor) else v[i]) + k: (v[i].cpu().numpy() if isinstance(v, torch.Tensor) else v[i]) if isinstance(v, (np.ndarray, torch.Tensor)) else v for k, v in batch.items() diff --git a/egomimic/rldb/embodiment/eva.py b/egomimic/rldb/embodiment/eva.py index ee762645..49676830 100644 --- a/egomimic/rldb/embodiment/eva.py +++ b/egomimic/rldb/embodiment/eva.py @@ -5,12 +5,14 @@ from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToYPR, ConcatKeys, DeleteKeys, InterpolateLinear, InterpolatePose, NumpyToTensor, PoseCoordinateFrameTransform, + QuaternionPoseToYPR, SplitKeys, Transform, XYZWXYZ_to_XYZYPR, @@ -34,16 +36,16 @@ class Eva(Embodiment): @staticmethod def get_transform_list( - mode: Literal["cartesian", "cartesian_wristframe"] = "cartesian", + mode: Literal[ + "cartesian", "cartesian_wristframe_ypr", "cartesian_wristframe_quat" + ] = "cartesian", ) -> list[Transform]: if mode == "cartesian": return _build_eva_bimanual_transform_list() - elif mode == "cartesian_wristframe": - return _build_eva_bimanual_eef_frame_transform_list() - else: - raise ValueError( - f"Unsupported 
mode '{mode}'. Expected one of: 'cartesian', 'cartesian_wristframe'." - ) + elif mode == "cartesian_wristframe_ypr": + return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + elif mode == "cartesian_wristframe_quat": + return _build_eva_bimanual_eef_frame_transform_list(is_quat=True) @classmethod def viz_transformed_batch( @@ -169,16 +171,21 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( right_obs_gripper: str = "right.obs_gripper", left_cmd_camframe: str = "left.cmd_ee_pose_camframe", right_cmd_camframe: str = "right.cmd_ee_pose_camframe", + is_quat: bool = True, ) -> list[Transform]: """Revert wrist-frame EVA actions back to camera frame for visualization.""" + if is_quat: + pose_shape = 7 + else: + pose_shape = 6 transform_list = [ # Extract obs camframe poses from the concatenated obs key SplitKeys( input_key=obs_key, output_key_list=[ - (left_obs_camframe, 6), + (left_obs_camframe, pose_shape), (left_obs_gripper, 1), - (right_obs_camframe, 6), + (right_obs_camframe, pose_shape), (right_obs_gripper, 1), ], ), @@ -186,9 +193,9 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( SplitKeys( input_key=action_key, output_key_list=[ - (left_cmd_wristframe, 6), + (left_cmd_wristframe, pose_shape), (left_gripper, 1), - (right_cmd_wristframe, 6), + (right_cmd_wristframe, pose_shape), (right_gripper, 1), ], ), @@ -254,8 +261,6 @@ def _build_eva_bimanual_eef_frame_transform_list( left_extra_batch_key = {"left_extrinsics_pose": left_extrinsics_pose} right_extra_batch_key = {"right_extrinsics_pose": right_extrinsics_pose} - mode = "xyzwxyz" if is_quat else "xyzypr" - # Step 1: transform cmd and obs into camera frame using extrinsics transform_list = [ ActionChunkCoordinateFrameTransform( @@ -263,40 +268,40 @@ def _build_eva_bimanual_eef_frame_transform_list( chunk_world=left_cmd_world, transformed_key_name=left_cmd_camframe, extra_batch_key=left_extra_batch_key, - mode=mode, + mode="xyzwxyz", ), ActionChunkCoordinateFrameTransform( 
target_world=right_target_world, chunk_world=right_cmd_world, transformed_key_name=right_cmd_camframe, extra_batch_key=right_extra_batch_key, - mode=mode, + mode="xyzwxyz", ), PoseCoordinateFrameTransform( target_world=left_target_world, pose_world=left_obs_pose, transformed_key_name=left_obs_camframe, - mode=mode, + mode="xyzwxyz", ), PoseCoordinateFrameTransform( target_world=right_target_world, pose_world=right_obs_pose, transformed_key_name=right_obs_camframe, - mode=mode, + mode="xyzwxyz", ), InterpolatePose( new_chunk_length=chunk_length, action_key=left_cmd_camframe, output_action_key=left_cmd_camframe, stride=stride, - mode=mode, + mode="xyzwxyz", ), InterpolatePose( new_chunk_length=chunk_length, action_key=right_cmd_camframe, output_action_key=right_cmd_camframe, stride=stride, - mode=mode, + mode="xyzwxyz", ), InterpolateLinear( new_chunk_length=chunk_length, @@ -315,26 +320,36 @@ def _build_eva_bimanual_eef_frame_transform_list( target_world=left_obs_camframe, chunk_world=left_cmd_camframe, transformed_key_name=left_cmd_wristframe, - mode=mode, + mode="xyzwxyz", ), ActionChunkCoordinateFrameTransform( target_world=right_obs_camframe, chunk_world=right_cmd_camframe, transformed_key_name=right_cmd_wristframe, - mode=mode, + mode="xyzwxyz", ), ] - if is_quat: - transform_list.append( - XYZWXYZ_to_XYZYPR( - keys=[ - left_cmd_wristframe, - right_cmd_wristframe, - left_obs_camframe, - right_obs_camframe, - ] - ) + if not is_quat: + transform_list.extend( + [ + BatchQuaternionPoseToYPR( + pose_key=left_cmd_wristframe, + output_key=left_cmd_wristframe, + ), + BatchQuaternionPoseToYPR( + pose_key=right_cmd_wristframe, + output_key=right_cmd_wristframe, + ), + QuaternionPoseToYPR( + pose_key=left_obs_camframe, + output_key=left_obs_camframe, + ), + QuaternionPoseToYPR( + pose_key=right_obs_camframe, + output_key=right_obs_camframe, + ), + ] ) transform_list.extend( diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py index 
7fce24ba..981ff98c 100644 --- a/egomimic/rldb/embodiment/human.py +++ b/egomimic/rldb/embodiment/human.py @@ -2,15 +2,15 @@ from typing import Literal -import numpy as np - -from egomimic.rldb.embodiment.embodiment import Embodiment, get_embodiment +from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToYPR, ConcatKeys, DeleteKeys, InterpolatePose, PoseCoordinateFrameTransform, + QuaternionPoseToYPR, Reshape, SplitKeys, Transform, @@ -30,32 +30,6 @@ class Human(Embodiment): VIZ_IMAGE_KEY = "observations.images.front_img_1" ACTION_STRIDE = 3 - @classmethod - def viz_keypoints_gt_preds( - cls, predictions, batch, image_key, action_key, transform_list=None, **kwargs - ): - if transform_list is not None: - batch = cls.apply_transform(batch, transform_list) - embodiment_id = batch["embodiment"][0].item() - embodiment_name = get_embodiment(embodiment_id).lower() - - images = batch[image_key] - actions = batch[action_key] - pred_actions = predictions[f"{embodiment_name}_{action_key}"] - ims_list = [] - images = _to_numpy(images) - actions = _to_numpy(actions) - pred_actions = _to_numpy(pred_actions) - for i in range(images.shape[0]): - image = images[i] - action = actions[i] - pred_action = pred_actions[i] - ims = cls.viz(image, action, mode="keypoints", color="Reds", **kwargs) - ims = cls.viz(ims, pred_action, mode="keypoints", color="Greens", **kwargs) - ims_list.append(ims) - ims = np.stack(ims_list, axis=0) - return ims - @classmethod def viz_transformed_batch( cls, @@ -120,7 +94,6 @@ def viz( else: colors = cls.FINGER_COLORS dot_color = cls.DOT_COLOR - return _viz_keypoints( images=images, actions=actions, @@ -257,28 +230,38 @@ class Aria(Human): } FINGER_EDGE_RANGES = [ ("thumb", 0, 3), - ("index", 3, 6), - ("middle", 6, 9), - ("ring", 9, 12), - ("pinky", 12, 15), + ("index", 3, 7), + ("middle", 7, 11), + ("ring", 11, 15), + ("pinky", 15, 19), ] 
DOT_COLOR = (255, 165, 0) @classmethod def get_transform_list( - cls, mode: Literal["cartesian", "keypoints_headframe", "keypoints_wristframe"] + cls, + mode: Literal[ + "cartesian", + "keypoints_headframe", + "keypoints_wristframe_ypr", + "keypoints_wristframe_quat", + ], ) -> list[Transform]: if mode == "cartesian": return _build_aria_cartesian_bimanual_transform_list( stride=cls.ACTION_STRIDE ) - elif mode == "keypoints": + elif mode == "keypoints_headframe": return _build_aria_keypoints_bimanual_transform_list( stride=cls.ACTION_STRIDE ) - elif mode == "keypoints_wristframe": + elif mode == "keypoints_wristframe_ypr": return _build_aria_keypoints_eef_frame_transform_list( - stride=cls.ACTION_STRIDE + stride=cls.ACTION_STRIDE, is_quat=False + ) + elif mode == "keypoints_wristframe_quat": + return _build_aria_keypoints_eef_frame_transform_list( + stride=cls.ACTION_STRIDE, is_quat=True ) else: raise ValueError( @@ -296,6 +279,7 @@ class Mecka(Human): ACTION_STRIDE = 1 +# this works for quat and ypr since actionChunkCoordinateFrameTransform works for both def _build_aria_keypoints_revert_eef_frame_transform_list( *, action_key: str = "actions_keypoints", @@ -305,12 +289,23 @@ def _build_aria_keypoints_revert_eef_frame_transform_list( right_wrist_obs_headframe: str = "right.obs_wrist_pose_headframe", left_wrist_action_headframe: str = "left.action_wrist_pose_headframe", right_wrist_action_headframe: str = "right.action_wrist_pose_headframe", + left_wrist_action_wristframe: str = "left.action_wrist_pose_wristframe", + right_wrist_action_wristframe: str = "right.action_wrist_pose_wristframe", + left_keypoints_action_headframe: str = "left.action_keypoints_headframe", + right_keypoints_action_headframe: str = "right.action_keypoints_headframe", + is_quat: bool = True, ) -> list[Transform]: + if is_quat: + pose_shape = 7 + else: + pose_shape = 6 transform_list = [ SplitKeys( input_key=action_key, output_key_list=[ + (left_wrist_action_wristframe, pose_shape), 
(left_keypoints_action_wristframe, 63), + (right_wrist_action_wristframe, pose_shape), (right_keypoints_action_wristframe, 63), ], ), @@ -327,31 +322,31 @@ def _build_aria_keypoints_revert_eef_frame_transform_list( ActionChunkCoordinateFrameTransform( target_world=left_wrist_obs_headframe, chunk_world=left_keypoints_action_wristframe, - transformed_key_name=left_wrist_action_headframe, + transformed_key_name=left_keypoints_action_headframe, mode="xyz", inverse=False, ), ActionChunkCoordinateFrameTransform( target_world=right_wrist_obs_headframe, chunk_world=right_keypoints_action_wristframe, - transformed_key_name=right_wrist_action_headframe, + transformed_key_name=right_keypoints_action_headframe, mode="xyz", inverse=False, ), Reshape( - input_key=left_wrist_action_headframe, - output_key=left_wrist_action_headframe, + input_key=left_keypoints_action_headframe, + output_key=left_keypoints_action_headframe, shape=(100, 63), ), Reshape( - input_key=right_wrist_action_headframe, - output_key=right_wrist_action_headframe, + input_key=right_keypoints_action_headframe, + output_key=right_keypoints_action_headframe, shape=(100, 63), ), ConcatKeys( key_list=[ - left_wrist_action_headframe, - right_wrist_action_headframe, + left_keypoints_action_headframe, + right_keypoints_action_headframe, ], new_key_name=action_key, delete_old_keys=True, @@ -390,6 +385,7 @@ def _build_aria_keypoints_eef_frame_transform_list( delete_target_world: bool = True, chunk_length: int = 100, stride: int = 3, + is_quat: bool = True, ) -> list[Transform]: transform_list = _build_aria_keypoints_bimanual_transform_list( target_world=target_world, @@ -486,13 +482,48 @@ def _build_aria_keypoints_eef_frame_transform_list( output_key=right_keypoints_obs_wristframe, shape=(63,), ), + ActionChunkCoordinateFrameTransform( + target_world=left_wrist_obs_headframe, + chunk_world=left_wrist_action_headframe, + transformed_key_name=left_wrist_action_wristframe, + mode="xyzwxyz", + ), + 
ActionChunkCoordinateFrameTransform( + target_world=right_wrist_obs_headframe, + chunk_world=right_wrist_action_headframe, + transformed_key_name=right_wrist_action_wristframe, + mode="xyzwxyz", + ), ] ) + if not is_quat: + transform_list.extend( + [ + BatchQuaternionPoseToYPR( + pose_key=left_wrist_action_wristframe, + output_key=left_wrist_action_wristframe, + ), + BatchQuaternionPoseToYPR( + pose_key=right_wrist_action_wristframe, + output_key=right_wrist_action_wristframe, + ), + QuaternionPoseToYPR( + pose_key=left_wrist_obs_headframe, + output_key=left_wrist_obs_headframe, + ), + QuaternionPoseToYPR( + pose_key=right_wrist_obs_headframe, + output_key=right_wrist_obs_headframe, + ), + ] + ) transform_list.extend( [ ConcatKeys( key_list=[ + left_wrist_action_wristframe, left_keypoints_action_wristframe, + right_wrist_action_wristframe, right_keypoints_action_wristframe, ], new_key_name="actions_keypoints", diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py index 6ff5fdc2..379c3a6a 100644 --- a/egomimic/rldb/zarr/action_chunk_transforms.py +++ b/egomimic/rldb/zarr/action_chunk_transforms.py @@ -31,6 +31,8 @@ _xyz_to_matrix, _xyzwxyz_to_matrix, _xyzypr_to_matrix, + wxyz_to_xyzw, + xyzw_to_wxyz, ) # --------------------------------------------------------------------------- @@ -250,11 +252,75 @@ def transform(self, batch: dict) -> dict: f"'{self.pose_key}'" ) xyz = pose[:3] - ypr = R.from_quat(pose[3:7]).as_euler("ZYX", degrees=False) + xyzw = wxyz_to_xyzw(pose[3:7]) + ypr = R.from_quat(xyzw).as_euler("ZYX", degrees=False) batch[self.output_key] = np.concatenate([xyz, ypr], axis=0) return batch +class YPRToQuaternionPose(Transform): + """Convert a single pose from xyz + ypr to xyz + quat(x,y,z,w).""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if 
pose.shape != (6,): + raise ValueError( + f"YPRToQuaternionPose expects shape (6,), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:3] + quat = R.from_euler("ZYX", pose[3:6], degrees=False).as_quat() # (x,y,z,w) + quat = xyzw_to_wxyz(quat) + batch[self.output_key] = np.concatenate([xyz, quat], axis=0) + return batch + + +class BatchQuaternionPoseToYPR(Transform): + """Convert a batch of poses from xyz + quat(x,y,z,w) to xyz + ypr.""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.ndim != 2 or pose.shape[-1] != 7: + raise ValueError( + f"BatchQuaternionPoseToYPR expects shape (N, 7), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:, :3] + xyzw = wxyz_to_xyzw(pose[:, 3:7]) + ypr = R.from_quat(xyzw).as_euler("ZYX", degrees=False) # (N, 3) + batch[self.output_key] = np.concatenate([xyz, ypr], axis=1) + return batch + + +class BatchYPRToQuaternionPose(Transform): + """Convert a batch of poses from xyz + ypr to xyz + quat(x,y,z,w).""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.ndim != 2 or pose.shape[-1] != 6: + raise ValueError( + f"BatchYPRToQuaternionPose expects shape (N, 6), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:, :3] + quat = R.from_euler("ZYX", pose[:, 3:6], degrees=False).as_quat() # (N, 4) + quat = xyzw_to_wxyz(quat) + batch[self.output_key] = np.concatenate([xyz, quat], axis=1) + return batch + + class PoseCoordinateFrameTransform(Transform): """Transform a single pose into a target frame pose.""" diff --git a/egomimic/scripts/tutorials/zarr_data_viz.ipynb b/egomimic/scripts/tutorials/zarr_data_viz.ipynb index 1c877626..b13e36f6 100644 --- 
a/egomimic/scripts/tutorials/zarr_data_viz.ipynb +++ b/egomimic/scripts/tutorials/zarr_data_viz.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "79d184b3", "metadata": {}, "outputs": [], @@ -23,19 +23,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "32d9110f", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/coc/flash7/paphiwetsa3/projects/EgoVerse/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.\n", - " import pynvml # type: ignore[import]\n" - ] - } - ], + "outputs": [], "source": [ "from pathlib import Path\n", "\n", @@ -62,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "cc9edba1", "metadata": {}, "outputs": [], @@ -73,18 +64,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "a4aa1a05", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tables in schema 'app': ['episodes']\n" - ] - } - ], + "outputs": [], "source": [ "# Point this at a single episode directory, e.g. 
/path/to/episode_hash.zarr\n", "# EPISODE_PATH = Path(\"/coc/flash7/scratch/egoverseDebugDatasets/1767495035712.zarr\")\n", @@ -112,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "58f0af00", "metadata": {}, "outputs": [], @@ -122,46 +105,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "67a60218", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([-0.2326, 0.1783, 0.3866, -0.0258, 0.0405, 0.8205, 0.0800, 0.3351,\n", - " 0.2074, 0.4526, -0.0582, -0.0042, 0.8754, 0.0000],\n", - " dtype=torch.float64)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "batch['actions_cartesian'][0,0]" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4b72f3bb", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Separate YPR visualization preview\n", "for batch in loader:\n", @@ -172,26 +129,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0d8c3da2", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "images = []\n", "for i, batch in enumerate(loader):\n", @@ -214,25 +155,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "b7384468", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tables in schema 'app': ['episodes']\n" - ] - } - ], + "outputs": [], "source": [ - "temp_dir = \"/coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest\"\n", + "temp_dir = \"/storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train\"\n", "\n", "intrinsics_key = \"base\"\n", "\n", "key_map = Aria.get_keymap(mode=\"keypoints\")\n", - "transform_list = Aria.get_transform_list(mode=\"keypoints\")\n", + "transform_list = Aria.get_transform_list(mode=\"keypoints_wristframe\")\n", "\n", "resolver = S3EpisodeResolver(\n", " temp_dir,\n", @@ -252,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "607f219c", "metadata": {}, "outputs": [], @@ -262,44 +195,30 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "1c57c9f0", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['observations.images.front_img_1', 'actions_keypoints', 'observations.state.keypoints', 'metadata.robot_name', 'embodiment'])" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "batch.keys()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, + "id": "853a2084", + "metadata": {}, + "outputs": [], + "source": [ + "batch['actions_keypoints'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "af65095a", "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'actions_cartesian'", - "output_type": "error", - "traceback": [ - 
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m ims = []\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i, batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(loader):\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m vis = \u001b[43mAria\u001b[49m\u001b[43m.\u001b[49m\u001b[43mviz_transformed_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtraj\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m ims.append(vis)\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m i > \u001b[32m10\u001b[39m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m/coc/flash7/paphiwetsa3/projects/EgoVerse/egomimic/rldb/embodiment/human.py:9\u001b[39m, in \u001b[36mviz_transformed_batch\u001b[39m\u001b[34m(cls, batch, mode, action_key, image_key)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtyping\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Literal\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01megomimic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrldb\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01membodiment\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01membodiment\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Embodiment\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[34;01megomimic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrldb\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mzarr\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01maction_chunk_transforms\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 7\u001b[39m ActionChunkCoordinateFrameTransform,\n\u001b[32m 8\u001b[39m ConcatKeys,\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m DeleteKeys,\n\u001b[32m 10\u001b[39m InterpolatePose,\n\u001b[32m 11\u001b[39m PoseCoordinateFrameTransform,\n\u001b[32m 12\u001b[39m Reshape,\n\u001b[32m 13\u001b[39m Transform,\n\u001b[32m 14\u001b[39m XYZWXYZ_to_XYZYPR,\n\u001b[32m 15\u001b[39m )\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01megomimic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mviz_utils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 17\u001b[39m _viz_axes,\n\u001b[32m 18\u001b[39m _viz_keypoints,\n\u001b[32m 19\u001b[39m _viz_traj,\n\u001b[32m 20\u001b[39m )\n\u001b[32m 21\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01megomimic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtype_utils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m _to_numpy\n", - "\u001b[31mKeyError\u001b[39m: 'actions_cartesian'" - ] - } - ], + "outputs": [], "source": [ "ims = []\n", "for i, batch in enumerate(loader):\n", @@ -331,213 +250,26 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "60723adf", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ + "from egomimic.utils.viz_utils import save_image\n", + "from egomimic.rldb.embodiment.human import _build_aria_keypoints_revert_eef_frame_transform_list\n", "ims_keypoints = []\n", "for i, batch in enumerate(loader):\n", - " vis_keypoints = Aria.viz_transformed_batch(batch, mode=\"keypoints\", action_key=\"actions_keypoints\")\n", + " vis_keypoints = Aria.viz_transformed_batch(batch, mode=\"keypoints\", color=\"Reds\",action_key=\"actions_keypoints\", transform_list=_build_aria_keypoints_revert_eef_frame_transform_list())\n", " ims_keypoints.append(vis_keypoints)\n", - " if i > 360:\n", - " break\n", - "\n", - "mpy.show_video(ims_keypoints, fps=20)" - ] - }, - { - "cell_type": "markdown", - "id": "efecaba7", - "metadata": {}, - "source": [ - "## Keypoint Visualization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e39bca03", - "metadata": {}, - "outputs": [], - "source": [ - "# Load Scale episode with raw keypoints (no action chunking needed)\n", - "\n", - "from egomimic.rldb.zarr.action_chunk_transforms import _xyzwxyz_to_matrix\n", - "\n", - "key_map_kp = {\n", - " \"images.front_1\": {\"zarr_key\": \"images.front_1\"},\n", - " \"left.obs_keypoints\": {\"zarr_key\": \"left.obs_keypoints\"},\n", - " \"right.obs_keypoints\": {\"zarr_key\": \"right.obs_keypoints\"},\n", - " \"obs_head_pose\": {\"zarr_key\": \"obs_head_pose\"},\n", - "}\n", - "\n", - "filters = {\"episode_hash\": \"2026-01-20-20-59-43-376000\"}\n", - "\n", - "resolver = S3EpisodeResolver(\n", - " temp_dir,\n", - " key_map=key_map\n", - ")\n", - "\n", - "cloudflare_ds = MultiDataset._from_resolver(\n", - " resolver, filters=filters, sync_from_s3=True, mode=\"total\"\n", - ")\n", - "\n", - "loader_kp = torch.utils.data.DataLoader(cloudflare_ds, batch_size=1, shuffle=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"848c6d74", - "metadata": {}, - "outputs": [], - "source": [ - "# ARIA Keypoint Viz\n", - "# MANO skeleton edges: (parent, child) for drawing bones\n", - "MANO_EDGES = [\n", - " (0, 1), (1, 2), (2, 3), (3, 4), # thumb\n", - " (0, 5), (5, 6), (6, 7), (7, 8), # index\n", - " (0, 9), (9, 10), (10, 11), (11, 12), # middle\n", - " (0, 13), (13, 14), (14, 15), (15, 16), # ring\n", - " (0, 17), (17, 18), (18, 19), (19, 20), # pinky\n", - "]\n", - "\n", - "# aria configuration\n", - "MANO_EDGES = [\n", - " (5, 6,), (6, 7), (7, 0), # thumb\n", - " (5, 8), (8, 9), (9, 10), (9, 1), # index\n", - " (5, 11), (11, 12), (12, 13), (13, 2), # middle\n", - " (5, 14), (14, 15), (15, 16), (16, 3), # ring\n", - " (5, 17), (17, 18), (18, 19), (19, 4), # pinky\n", - "]\n", - "\n", - "FINGER_COLORS = {\n", - " \"thumb\": (255, 100, 100), # red\n", - " \"index\": (100, 255, 100), # green\n", - " \"middle\": (100, 100, 255), # blue\n", - " \"ring\": (255, 255, 100), # yellow\n", - " \"pinky\": (255, 100, 255), # magenta\n", - "}\n", - "FINGER_EDGE_RANGES = [\n", - " (\"thumb\", 0, 3), (\"index\", 3, 6), (\"middle\", 6, 9),\n", - " (\"ring\", 9, 12), (\"pinky\", 12, 15),\n", - "]\n", - "\n", - "\n", - "def viz_keypoints(batch, image_key=\"observations.images.front_img_1\"):\n", - " \"\"\"Visualize all 21 MANO keypoints per hand, projected onto the image.\"\"\"\n", - " # Prepare image\n", - " img = batch[image_key][0].detach().cpu()\n", - " if img.shape[0] in (1, 3):\n", - " img = img.permute(1, 2, 0)\n", - " img_np = img.numpy()\n", - " if img_np.dtype != np.uint8:\n", - " if img_np.max() <= 1.0:\n", - " img_np = (img_np * 255.0).clip(0, 255).astype(np.uint8)\n", - " else:\n", - " img_np = img_np.clip(0, 255).astype(np.uint8)\n", - " if img_np.shape[-1] == 1:\n", - " img_np = np.repeat(img_np, 3, axis=-1)\n", - "\n", - " intrinsics = INTRINSICS[\"base\"]\n", - " head_pose = batch[\"obs_head_pose\"][0].detach().cpu().numpy() # (6,)\n", - "\n", - " # T_head_world: camera pose in world 
(camera-to-world)\n", - " # We need world-to-camera = inv(T_head_world)\n", - " T_head_world = _xyzwxyz_to_matrix(head_pose[None, :])[0] # (4, 4)\n", - " T_world_to_cam = np.linalg.inv(T_head_world)\n", - "\n", - " vis = img_np.copy()\n", - " h, w = vis.shape[:2]\n", - "\n", - " for hand, dot_color in [(\"left\", (0, 120, 255)), (\"right\", (255, 80, 0))]:\n", - " kps_key = f\"{hand}.obs_keypoints\"\n", - " if kps_key not in batch:\n", - " continue\n", - " kps_flat = batch[kps_key][0].detach().cpu().numpy() # (63,)\n", - " kps_world = kps_flat.reshape(21, 3)\n", - "\n", - " # Skip if keypoints are all zero (invalid, clamped from 1e9)\n", - " if np.allclose(kps_world, 0.0, atol=1e-3):\n", - " continue\n", - "\n", - " # World -> camera frame\n", - " kps_h = np.concatenate([kps_world, np.ones((21, 1))], axis=1) # (21, 4)\n", - " kps_cam = (T_world_to_cam @ kps_h.T).T[:, :3] # (21, 3)\n", - "\n", - " # Camera frame -> pixels\n", - " kps_px = cam_frame_to_cam_pixels(kps_cam, intrinsics) # (21, 3+)\n", - "\n", - " # Identify valid keypoints (z > 0 and in image bounds)\n", - " valid = (kps_cam[:, 2] > 0.01)\n", - " valid &= (kps_px[:, 0] >= 0) & (kps_px[:, 0] < w)\n", - " valid &= (kps_px[:, 1] >= 0) & (kps_px[:, 1] < h)\n", - "\n", - " # Draw skeleton edges (colored by finger)\n", - " for finger, start, end in FINGER_EDGE_RANGES:\n", - " color = FINGER_COLORS[finger]\n", - " for edge_idx in range(start, end):\n", - " i, j = MANO_EDGES[edge_idx]\n", - " if valid[i] and valid[j]:\n", - " p1 = (int(kps_px[i, 0]), int(kps_px[i, 1]))\n", - " p2 = (int(kps_px[j, 0]), int(kps_px[j, 1]))\n", - " cv2.line(vis, p1, p2, color, 2)\n", - "\n", - " # Draw keypoint dots on top\n", - " for k in range(21):\n", - " if valid[k]:\n", - " center = (int(kps_px[k, 0]), int(kps_px[k, 1]))\n", - " cv2.circle(vis, center, 4, dot_color, -1)\n", - " cv2.circle(vis, center, 4, (255, 255, 255), 1) # white border\n", - "\n", - " # Label wrist\n", - " if valid[0]:\n", - " wrist_px = (int(kps_px[0, 0]) 
+ 6, int(kps_px[0, 1]) - 6)\n", - " cv2.putText(vis, f\"{hand[0].upper()}\", wrist_px,\n", - " cv2.FONT_HERSHEY_SIMPLEX, 0.5, dot_color, 2)\n", - "\n", - " return vis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75dbfa95", - "metadata": {}, - "outputs": [], - "source": [ - "# Render keypoint video\n", - "ims_kp = []\n", - "for i, batch_kp in enumerate(loader_kp):\n", - " vis = viz_keypoints(batch_kp)\n", - " ims_kp.append(vis)\n", - " if i > 10:\n", - " break\n", - "\n", - "mpy.show_video(ims_kp, fps=30)" + " if i > 1:\n", + " save_image(vis_keypoints, \"keypoints.png\")\n", + " break\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "8f4fbaec", + "id": "3c5ee31a", "metadata": {}, "outputs": [], "source": [] @@ -559,9 +291,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.14" + "version": "3.11.15" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/egomimic/scripts/tutorials/zarr_data_viz.py b/egomimic/scripts/tutorials/zarr_data_viz.py new file mode 100644 index 00000000..6bf96f43 --- /dev/null +++ b/egomimic/scripts/tutorials/zarr_data_viz.py @@ -0,0 +1,88 @@ +import imageio_ffmpeg +import mediapy as mpy +import torch + +from egomimic.rldb.embodiment.eva import Eva +from egomimic.rldb.embodiment.human import ( + Aria, + _build_aria_keypoints_revert_eef_frame_transform_list, + _build_eva_bimanual_revert_eef_frame_transform_list, +) +from egomimic.rldb.zarr.zarr_dataset_multi import MultiDataset, S3EpisodeResolver +from egomimic.utils.aws.aws_data_utils import load_env +from egomimic.utils.viz_utils import save_image + +# Ensure mediapy can find an ffmpeg executable in this environment +mpy.set_ffmpeg(imageio_ffmpeg.get_ffmpeg_exe()) + +TEMP_DIR = "/storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train" +load_env() + +# Point this at a single episode directory, e.g. 
/path/to/episode_hash.zarr +# EPISODE_PATH = Path("/coc/flash7/scratch/egoverseDebugDatasets/1767495035712.zarr") + +key_map = Eva.get_keymap() +transform_list = Eva.get_transform_list(mode="cartesian_wristframe_ypr") + +# Build a MultiDataset with exactly one ZarrDataset inside +# single_ds = ZarrDataset(Episode_path=EPISODE_PATH, key_map=key_map, transform_list=transform_list) +# single_ds = ZarrDataset(Episode_path=EPISODE_PATH, key_map=key_map) + +# multi_ds = MultiDataset(datasets={"single_episode": single_ds}, mode="total") +resolver = S3EpisodeResolver(TEMP_DIR, key_map=key_map, transform_list=transform_list) +filters = {"episode_hash": "2025-12-26-18-07-46-296000"} +multi_ds = MultiDataset._from_resolver( + resolver, filters=filters, sync_from_s3=True, mode="total" +) + +loader = torch.utils.data.DataLoader(multi_ds, batch_size=1, shuffle=False) + + +for batch in loader: + vis_ypr = Eva.viz_transformed_batch( + batch, + mode="axes", + transform_list=_build_eva_bimanual_revert_eef_frame_transform_list( + is_quat=False + ), + ) + save_image(vis_ypr, "vis_ypr.png") + break + +temp_dir = "/storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train" + +intrinsics_key = "base" + +key_map = Aria.get_keymap(mode="keypoints") +transform_list = Aria.get_transform_list(mode="keypoints_wristframe_ypr") + +resolver = S3EpisodeResolver( + temp_dir, + key_map=key_map, + transform_list=transform_list, +) + +filters = {"episode_hash": "2026-01-20-20-59-43-376000"} # aria +# filters = {"episode_hash": "692ee048ef7557106e6c4b8d"} # mecka + +cloudflare_ds = MultiDataset._from_resolver( + resolver, filters=filters, sync_from_s3=True, mode="total" +) + +loader = torch.utils.data.DataLoader(cloudflare_ds, batch_size=1, shuffle=False) + +ims_keypoints = [] +for i, batch in enumerate(loader): + vis_keypoints = Aria.viz_transformed_batch( + batch, + mode="keypoints", + color="Reds", + action_key="actions_keypoints", + 
transform_list=_build_aria_keypoints_revert_eef_frame_transform_list( + is_quat=False + ), + ) + ims_keypoints.append(vis_keypoints) + if i > 1: + save_image(vis_keypoints, "keypoints.png") + break diff --git a/egomimic/trainHydra.py b/egomimic/trainHydra.py index 7f2644c5..49e3a896 100644 --- a/egomimic/trainHydra.py +++ b/egomimic/trainHydra.py @@ -1,6 +1,7 @@ import copy import os import signal +import subprocess from collections.abc import Mapping from typing import Any, Dict, List, Optional, Tuple @@ -114,7 +115,8 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: data_schematic.infer_norm_from_dataset( norm_dataset, dataset_name, - sample_frac=0.005, + sample_frac=cfg.norm_percentage, + num_workers=cfg.num_workers, benchmark_dir=os.path.join( cfg.trainer.default_root_dir, "benchmark_stats.json" ), @@ -216,6 +218,9 @@ def main(cfg: DictConfig) -> Optional[float]: :param cfg: DictConfig configuration composed by Hydra. :return: Optional[float] with optimized metric value. """ + script = os.path.join(os.path.dirname(__file__), "utils/aws/setup_secret.sh") + subprocess.run(["bash", script], check=True) + # apply extra utilities # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) 
extras(cfg) diff --git a/egomimic/train_zarr.yaml b/egomimic/train_zarr.yaml new file mode 100644 index 00000000..26f14b3b --- /dev/null +++ b/egomimic/train_zarr.yaml @@ -0,0 +1,111 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent + - visualization: eva_cartesian_aria_cartesian + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_flow +description: latent_flow +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose 
+ actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 diff --git a/egomimic/utils/pose_utils.py b/egomimic/utils/pose_utils.py index 4e6954ba..8dabde0b 100644 --- a/egomimic/utils/pose_utils.py +++ b/egomimic/utils/pose_utils.py @@ -8,6 +8,10 @@ def xyzw_to_wxyz(xyzw): return np.concatenate([xyzw[..., 3:4], xyzw[..., :3]], axis=-1) +def wxyz_to_xyzw(wxyz): + return np.concatenate([wxyz[..., 1:4], wxyz[..., 0:1]], axis=-1) + + def _interpolate_euler(seq: np.ndarray, chunk_length: int) -> np.ndarray: """Euler-aware interpolation for a single (T, 6) or (T, 7) sequence.""" T, D = seq.shape diff --git a/egomimic/utils/viz_utils.py b/egomimic/utils/viz_utils.py index ffba4992..c929fcd7 100644 --- a/egomimic/utils/viz_utils.py +++ b/egomimic/utils/viz_utils.py @@ -187,7 +187,12 @@ def _viz_keypoints( vis = images.copy() h, w = vis.shape[:2] - left_keypoints, right_keypoints = _split_keypoints(actions, wrist_in_data=False) + if actions.shape[-1] == 140: + left_xyz, left_wxyz, left_keypoints, right_xyz, right_wxyz, right_keypoints = ( + _split_keypoints(actions, 
wrist_in_data=True) + ) + else: + left_keypoints, right_keypoints = _split_keypoints(actions, wrist_in_data=False) keypoints = {} keypoints["left"] = left_keypoints.reshape(-1, 3) keypoints["right"] = right_keypoints.reshape(-1, 3)