diff --git a/egomimic/algo/hpt.py b/egomimic/algo/hpt.py index 6ae47832..929b7afe 100644 --- a/egomimic/algo/hpt.py +++ b/egomimic/algo/hpt.py @@ -359,7 +359,7 @@ def stem_process(self, domain, data): feat_dict = {} for modality in self.modalities.get(domain, []) + self.shared_keys: if modality not in data: - continue + raise ValueError(f"Modality {modality} not found in data") if modality in self.shared_keys: domain = "shared" @@ -829,7 +829,6 @@ def __init__( self.domains = domains.copy() self.auxiliary_ac_keys = auxiliary_ac_keys.copy() self.shared_ac_key = kwargs.get("shared_ac_key", None) - self.is_6dof = kwargs.get("6dof", False) self.kinematics_solver = kwargs.get("kinematics_solver", None) model = HPTModel(**trunk) @@ -1282,13 +1281,16 @@ def compute_losses(self, predictions, batch): embodiment_name = get_embodiment(embodiment_id).lower() bc_loss = predictions[f"{embodiment_name}_loss"] scaled_bc_loss = bc_weight * bc_loss - total_action_loss += scaled_bc_loss + total_action_loss = total_action_loss + scaled_bc_loss loss_dict[f"{embodiment_name}_loss"] = bc_loss # for logging if self.ot: loss_dict["ot_loss"] = predictions["ot_loss"] loss_dict["avg_feature_distance"] = predictions["avg_feature_distance"] - total_action_loss += ot_weight * self.temperature * predictions["ot_loss"] + total_action_loss = ( + total_action_loss + + ot_weight * self.temperature * predictions["ot_loss"] + ) loss_dict["action_loss"] = total_action_loss / len(self.domains) return loss_dict @@ -1372,7 +1374,6 @@ def _robomimic_to_hpt_data( if key in batch: data[key] = batch[key] - data["is_6dof"] = self.is_6dof data["pad_mask"] = batch["pad_mask"] data["embodiment"] = batch["embodiment"] diff --git a/egomimic/hydra_configs/data/aria_debug.yaml b/egomimic/hydra_configs/data/aria_debug.yaml new file mode 100644 index 00000000..2c0b1c54 --- /dev/null +++ b/egomimic/hydra_configs/data/aria_debug.yaml @@ -0,0 +1,39 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + episode_hash: "2025-11-27-23-44-43-234000" + mode: total +valid_datasets: + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + episode_hash: "2025-11-27-23-44-43-234000" + mode: total +train_dataloader_params: + aria_bimanual: + batch_size: 64 + num_workers: 10 +valid_dataloader_params: + aria_bimanual: + batch_size: 64 + num_workers: 10 diff --git a/egomimic/hydra_configs/data/eva_human_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_cotrain.yaml index ea70acc9..357e388b 100644 --- a/egomimic/hydra_configs/data/eva_human_cotrain.yaml +++ b/egomimic/hydra_configs/data/eva_human_cotrain.yaml @@ -4,7 +4,7 @@ train_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap transform_list: @@ -16,7 +16,7 @@ train_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.human.Aria.get_keymap mode: cartesian @@ -31,7 +31,7 @@ valid_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap transform_list: @@ -43,7 +43,7 @@ valid_datasets: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train key_map: _target_: egomimic.rldb.embodiment.human.Aria.get_keymap mode: cartesian diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml new file mode 100644 index 00000000..81cb1065 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml @@ -0,0 +1,73 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 256 + num_workers: 6 + aria_bimanual: + batch_size: 256 + num_workers: 6 +valid_dataloader_params: + eva_bimanual: + batch_size: 256 + num_workers: 6 + aria_bimanual: + batch_size: 256 + num_workers: 6 diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml new file mode 100644 index 00000000..53610780 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain_wrist.yaml @@ -0,0 +1,77 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian_wristframe_ypr + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints_wristframe_ypr + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + lab: "rl2" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian_wristframe_ypr + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/paphiwetsa3/datasets/temp_train + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints_wristframe_ypr + filters: + robot_name: "aria_bimanual" + task: "fold_clothes" + lab: "rl2" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 6 + aria_bimanual: + batch_size: 64 + num_workers: 6 +valid_dataloader_params: + eva_bimanual: + batch_size: 16 + num_workers: 6 + aria_bimanual: + batch_size: 16 + num_workers: 6 diff --git a/egomimic/hydra_configs/hydra/launcher/submitit.yaml b/egomimic/hydra_configs/hydra/launcher/submitit.yaml index c56f2cd5..b068685e 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit.yaml @@ -4,15 +4,15 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration -name: ${hydra.job.name} # Default job name -partition: "rl2-lab" # Slurm partition (e.g., 'gpu' or 'compute') -account: "rl2-lab" # Slurm account (e.g., 'my_account') -cpus_per_task: 12 # Number of CPUs per task -nodes: ${launch_params.nodes} # Number of nodes -tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node +name: ${hydra.job.name} # Default job name +partition: "hoffman-lab" # Slurm partition (e.g., 'gpu' or 'compute') +account: "hoffman-lab" # Slurm account (e.g., 'my_account') +cpus_per_task: 12 # Number of CPUs per task +nodes: ${launch_params.nodes} # Number of nodes +tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:a40:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count -qos: "short" # Slurm QoS -timeout_min: 2880 # Timeout in minutes (48 hours) -exclude: "protocol, puma" # Nodes to exclude +qos: "short" # Slurm QoS +timeout_min: 2880 # Timeout in minutes (48 hours) +exclude: "protocol, puma" # Nodes to exclude additional_parameters: - requeue: true \ No newline at end of file + requeue: true diff --git a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml index 2d9cd957..d34c2b98 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit_pace.yaml @@ -4,6 +4,7 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration + name: ${hydra.job.name} # Default job name partition: "gpu-h200" # Slurm partition account: "gts-dxu345-rl2" # Slurm account @@ -11,7 +12,7 @@ cpus_per_task: 8 # Number of CPUs per task (ma nodes: ${launch_params.nodes} # Number of nodes tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:h200:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count (h100 for H100 GPUs) -qos: "short" # Slurm QoS +qos: "inferno" # Slurm QoS mem_per_gpu: 250G timeout_min: 2880 # Timeout in minutes (48 hours) # exclude: "protocol, puma" # Nodes to exclude diff --git a/egomimic/hydra_configs/logger/wandb.yaml b/egomimic/hydra_configs/logger/wandb.yaml index 6f574bd5..dc5d2ea3 100644 --- a/egomimic/hydra_configs/logger/wandb.yaml +++ b/egomimic/hydra_configs/logger/wandb.yaml @@ -7,7 +7,7 @@ wandb: offline: False id: "${name}_${description}_${now:%Y-%m-%d_%H-%M-%S}" # pass correct id to resume experiment! anonymous: null # enable anonymous logging - project: "zarr_test" + project: "keypoints_cotrain" log_model: False # upload lightning ckpts prefix: "" # a string to put at the beginning of metric keys entity: "rl2-group" # set to name of your wandb team diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml new file mode 100644 index 00000000..7c99a560 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_aria_keypoints.yaml @@ -0,0 +1,54 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.identity.Identity + decoder: + _target_: egomimic.models.codec.identity.Identity + stem_specs: + eva_bimanual: null + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml new file mode 100644 index 00000000..deaf8e22 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria.yaml @@ -0,0 +1,61 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 5 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 5 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml new file mode 100644 index 00000000..b020b469 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_head_latent_aria_mlp.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml new file mode 100644 index 00000000..b256d18d --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 6 + cond_dim: 256 + hidden_dim: 256 + act_dim: 128 + act_seq: 12 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml new file mode 100644 index 00000000..9236502a --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml @@ -0,0 +1,78 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 2 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 2 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 7 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 7 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml new file mode 100644 index 00000000..95a1d2fd --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large_wrist.yaml @@ -0,0 +1,78 @@ +defaults: + - hpt_cotrain_keypoints_wrist + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 2 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 2 + aria_bimanual: + ac_dims: 138 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 138 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + n_layers: 7 + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 138 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + n_layers: 7 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml new file mode 100644 index 00000000..9d3e20aa --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml new file mode 100644 index 00000000..e7e8cc46 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp_wrist.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + n_layers: 2 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 2 + output_dim: 14 + aria_bimanual: + ac_dims: 138 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 138 + hidden_dim: 256 + n_layers: 7 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + n_layers: 7 + output_dim: 138 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 3e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml new file mode 100644 index 00000000..c03abc5b --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml @@ -0,0 +1,160 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 140 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + # state_wrist_pose: + # _target_: egomimic.models.hpt_nets.MLPPolicyStem + # input_dim: 14 + # output_dim: 256 + # widths: [256] + # specs: + # random_horizon_masking: false + # cross_attn: + # crossattn_latent: 16 + # crossattn_heads: 8 + # crossattn_dim_head: 64 + # crossattn_modality_dropout: 0.1 + # modality_embed_dim: 256 + + eva_bimanual: + state_ee_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml new file mode 100644 index 00000000..68dea4a4 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_wrist.yaml @@ -0,0 +1,160 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 126 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + state_wrist_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 12 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + eva_bimanual: + state_ee_pose: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/train_zarr_latent.yaml b/egomimic/hydra_configs/train_zarr_latent.yaml new file mode 100644 index 00000000..cbcb902a --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_large + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override hydra/launcher: submitit_pace + - _self_ + +name: test +description: normal_latent +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 0.2 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoints: # relative to wrist pose + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_pose: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false diff --git a/egomimic/hydra_configs/train_zarr_latent_aria.yaml b/egomimic/hydra_configs/train_zarr_latent_aria.yaml new file mode 100644 index 00000000..6fbd90f8 --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent_aria.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_head_latent_aria + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: aria_debug + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_debug +description: aria_conv_mlp_xl +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 1.0 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_positions: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false diff --git a/egomimic/hydra_configs/train_zarr_latent_wrist.yaml b/egomimic/hydra_configs/train_zarr_latent_wrist.yaml new file mode 100644 index 00000000..125f0076 --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent_wrist.yaml @@ -0,0 +1,120 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_large_wrist + - visualization: eva_cartesian_aria_keypoints_wrist + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain_wrist + - callbacks: checkpoints + - override hydra/launcher: submitit_pace + - _self_ + +name: test +description: wrist_latent +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +norm_percentage: 0.2 +num_workers: 6 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoints: # relative to wrist pose + key_type: proprio_keys + zarr_key: observations.state.keypoints + wrist_pose: + key_type: proprio_keys + zarr_key: observations.state.wrist_pose + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 + +model: + enable_adaptive_grad_clip: false diff --git a/egomimic/hydra_configs/trainer/ddp.yaml b/egomimic/hydra_configs/trainer/ddp.yaml index d3d90aca..aee487a3 100644 --- a/egomimic/hydra_configs/trainer/ddp.yaml +++ b/egomimic/hydra_configs/trainer/ddp.yaml @@ -8,4 +8,4 @@ devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True check_val_every_n_epoch: 200 -num_sanity_val_steps: 0 \ No newline at end of file +num_sanity_val_steps: 0 diff --git a/egomimic/hydra_configs/trainer/debug.yaml b/egomimic/hydra_configs/trainer/debug.yaml index e3a9a1a5..905d3711 100644 --- a/egomimic/hydra_configs/trainer/debug.yaml +++ b/egomimic/hydra_configs/trainer/debug.yaml @@ -3,7 +3,7 @@ defaults: strategy: ddp_find_unused_parameters_true limit_train_batches: 5 -limit_val_batches: 20 +limit_val_batches: 3 check_val_every_n_epoch: 2 profiler: simple max_epochs: 4 diff --git a/egomimic/hydra_configs/trainer/default.yaml b/egomimic/hydra_configs/trainer/default.yaml index a6b47e35..391656bf 100644 --- a/egomimic/hydra_configs/trainer/default.yaml +++ b/egomimic/hydra_configs/trainer/default.yaml @@ -11,7 +11,7 @@ devices: 1 # mixed precision for extra speed-up precision: bf16 limit_train_batches: 100 -limit_val_batches: 300 +limit_val_batches: 80 # perform a validation loop every N training epochs check_val_every_n_epoch: 200 diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml index 8c4d1c91..a2311911 100644 --- a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml @@ -1,14 +1,10 @@ eva_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: egomimic.rldb.embodiment.eva.Eva.viz - _partial_: true - mode: traj - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.eva.Eva.viz_cartesian_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints aria_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: egomimic.rldb.embodiment.human.Aria.viz - _partial_: true - mode: keypoints - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.human.Aria.viz_keypoints_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml new file mode 100644 index 00000000..c2bceea0 --- /dev/null +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints_wrist.yaml @@ -0,0 +1,16 @@ +eva_bimanual: + _target_: egomimic.rldb.embodiment.eva.Eva.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints + transform_list: + _target_: egomimic.rldb.embodiment.eva._build_eva_bimanual_revert_eef_frame_transform_list + is_quat: false +aria_bimanual: + _target_: egomimic.rldb.embodiment.human.Aria.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_eva_cart_aria_keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human._build_aria_keypoints_revert_eef_frame_transform_list + is_quat: false diff --git a/egomimic/models/codec/identity.py b/egomimic/models/codec/identity.py new file mode 100644 index 00000000..a40b9495 --- /dev/null +++ b/egomimic/models/codec/identity.py @@ -0,0 +1,10 @@ +import torch +import torch.nn as nn + + +class Identity(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x diff --git a/egomimic/models/codec/mlp.py b/egomimic/models/codec/mlp.py new file mode 100644 index 00000000..0ab021ab --- /dev/null +++ b/egomimic/models/codec/mlp.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn + + +class MLPProjection(nn.Module): + def __init__(self, input_dim: int, hidden_dim: int, n_layers: int, output_dim: int): + super().__init__() + layers = [nn.Linear(input_dim, hidden_dim), nn.GELU()] + for _ in range(n_layers - 1): + layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.GELU()]) + layers.append(nn.Linear(hidden_dim, output_dim)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x is in (B, T, D) -> (B, T, H) + return self.net(x) diff --git a/egomimic/models/codec/temporal_enc_dec.py b/egomimic/models/codec/temporal_enc_dec.py new file mode 100644 index 00000000..2a69ab33 --- /dev/null +++ b/egomimic/models/codec/temporal_enc_dec.py @@ -0,0 +1,520 @@ +from __future__ import annotations + +import torch +import torch.nn as nn + + +class SmallTemporalEncoder(nn.Module): + """ + Fix temporal encoder for 100 seq of actiona + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 64, + use_layernorm: bool = True, + ): + super().__init__() + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + layers = [ + nn.Conv1d(action_dim, action_dim * 2, kernel_size=8, stride=2, padding=3), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=2 + ), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=3 + ), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(action_dim * 2, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: (B, T, D) or (T, D) + Output: (B, K, H) or (K, H) + """ + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D, K) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalDecoder(nn.Module): + """ + Decoder that mirrors SmallTemporalEncoder: + Enc convs (over time, channels-first): + (D -> 2D) k=8 s=2 p=3 + (2D -> 2D) k=8 s=2 p=2 + (2D -> 2D) k=8 s=2 p=3 + For T=100 this encoder produces K=12. + + This decoder maps: + Input: (B, K=12, H=64) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + C2 = action_dim * 2 + + self.proj = nn.Linear(hidden_dim, C2) + self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() + + self.up = nn.Sequential( + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, 2D) + x = x.transpose(1, 2) # (B, 2D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalEncoder(nn.Module): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. + Output: (B, K=12, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.expect_T = expect_T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + D = action_dim + + self.down = nn.Sequential( + nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=2 + ), # 50 -> 24 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=3 + ), # 24 -> 12 + self.act, + ) + + self.proj = nn.Linear(action_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + B, T, D = x.shape + if D != self.action_dim: + raise ValueError(f"Expected D={self.action_dim}, got {D}") + if self.expect_T is not None and T != self.expect_T: + raise ValueError(f"Expected T={self.expect_T}, got {T}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D/2, K=12) + x = x.transpose(1, 2) # (B, K, D/2) + x = self.proj(x) # (B, K, H) + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalDecoder(nn.Module): + """ + Decoder that mirrors LargeTemporalEncoder: + time: 12 -> 24 -> 50 -> 100 + channels: H -> D/2 -> D + Input: (B, K=12, H) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + T: int = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.half_dim = action_dim // 2 + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + self.proj = nn.Linear(hidden_dim, action_dim) + self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() + + # Mirrors paddings/strides/kernels in reverse. + # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. + self.up = nn.Sequential( + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=2, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, D/2) + x = x.transpose(1, 2) # (B, D/2, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalEncoder_32_256(SmallTemporalEncoder): + """ + Fix temporal encoder for 100 seq of actiona + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 256, + use_layernorm: bool = True, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers = [ + nn.Conv1d(action_dim, 512, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(512, hidden_dim) + proj_layers = [ + nn.Linear(512, hidden_dim), + self.act, + ] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend( + [nn.Linear(hidden_dim, hidden_dim)] + ) # TODO check if I need activation here later + self.proj = nn.Sequential(*proj_layers) + + +class SmallTemporalDecoder_32_256(SmallTemporalDecoder): + """ + Decoder that mirrors SmallTemporalEncoder_32_128: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers = [ + nn.ConvTranspose1d(512, action_dim, kernel_size=9, stride=3, padding=1), + ] + + self.up = nn.Sequential(*layers) + self.norm = nn.LayerNorm(512) if use_layernorm else nn.Identity() + proj_layers = [] + for _ in range(n_layers - 1): + proj_layers.extend([nn.Linear(hidden_dim, hidden_dim), self.act]) + proj_layers.extend([nn.Linear(hidden_dim, 512)]) + self.proj = nn.Sequential(*proj_layers) + + +class LargeTemporalEncoder_32_256(LargeTemporalEncoder): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. + Output: (B, K=12, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + expect_T=expect_T, + ) + layers = [ + nn.Conv1d(action_dim, 2048, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + proj_layers = [ + nn.Linear(2048, 1024), + self.act, + nn.Linear(1024, hidden_dim), + self.act, + ] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend( + [nn.Linear(hidden_dim, hidden_dim)] + ) # TODO check if I need activation here later + self.proj = nn.Sequential(*proj_layers) + + +class LargeTemporalDecoder_32_256(LargeTemporalDecoder): + """ + Decoder that mirrors LargeTemporalEncoder_32_128: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + T: int = 100, + n_layers: int = 4, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + T=T, + ) + + layers = [ + nn.ConvTranspose1d(2048, action_dim, kernel_size=9, stride=3, padding=1), + ] + + self.up = nn.Sequential(*layers) + proj_layers = [] + for _ in range(n_layers - 1): + proj_layers.extend( + [ + nn.Linear(hidden_dim, hidden_dim), + self.act, + ] + ) + proj_layers.extend([nn.Linear(hidden_dim, 2048)]) + self.proj = nn.Sequential(*proj_layers) + self.norm = nn.LayerNorm(2048) if use_layernorm else nn.Identity() + + +def count_params(module: nn.Module, trainable_only: bool = False) -> int: + if trainable_only: + return sum(p.numel() for p in module.parameters() if p.requires_grad) + return sum(p.numel() for p in module.parameters()) + + +def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: + total = 0 + for name, p in module.named_parameters(): + if trainable_only and not p.requires_grad: + continue + n = p.numel() + total += n + print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") + print(f"\nTOTAL params: {total}") + + +if __name__ == "__main__": + B, T, D = 8, 100, 140 + + enc = LargeTemporalEncoder_32_256(action_dim=D, n_layers=5) + dec = LargeTemporalDecoder_32_256(action_dim=D, use_layernorm=True, n_layers=5) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is 8, 12, 64 + print("LargeTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + B, T, D = 8, 100, 14 + enc = SmallTemporalEncoder_32_256(action_dim=D, n_layers=5) + dec = SmallTemporalDecoder_32_256(action_dim=D, use_layernorm=True, n_layers=5) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is 8, 12, 64 + + print("SmallTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 645a8c44..5c5225b0 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from egomimic.models.denoising_nets import ConditionalUnet1D +from egomimic.rldb.embodiment.embodiment import get_embodiment class DenoisingPolicy(nn.Module): @@ -23,29 +24,59 @@ def __init__( self, model: ConditionalUnet1D, action_horizon: int, - infer_ac_dims: dict, num_inference_steps: int = None, + embodiment_specs: dict = None, **kwargs, ): super().__init__() self.model = model self.action_horizon = action_horizon - self.infer_ac_dims = infer_ac_dims self.num_inference_steps = num_inference_steps + self.embodiment_specs = embodiment_specs + self.codec_enabled = False + + _codecs = {} + if embodiment_specs is not None: + for _emb_name, _spec in embodiment_specs.items(): + if _spec.get("encoder") is not None: + _codecs[f"{_emb_name}_encoder"] = _spec["encoder"] + if _spec.get("decoder") is not None: + _codecs[f"{_emb_name}_decoder"] = _spec["decoder"] + if _codecs: + self.codecs = nn.ModuleDict(_codecs) self.padding = kwargs.get("padding", None) self.pooling = kwargs.get("pooling", None) - self.model_type = kwargs.get("model_type", None) - - if not infer_ac_dims: - raise ValueError("infer_ac_dims must be a non-empty dict") for name, param in self.model.named_parameters(): if not param.requires_grad: print(f"[warn] {name} has requires_grad=False") total_params = sum(p.numel() for p in self.model.parameters()) + if self.embodiment_specs is not None: + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("ac_dims") is None: + raise ValueError(f"ac_dims must be specified for {embodiment_name}") + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("encoder") is not None: + encoder_params = sum( + p.numel() for p in spec["encoder"].parameters() + ) + self.codec_enabled = True + if spec.get("decoder") is not None: + decoder_params = sum( + p.numel() for p in spec["decoder"].parameters() + ) + self.codec_enabled = True + print( + f"[{embodiment_name}] Encoder params: {encoder_params / 1e6:.2f}M" + ) + print( + f"[{embodiment_name}] Decoder params: {decoder_params / 1e6:.2f}M" + ) + total_params += encoder_params + decoder_params + print( f"[{self.__class__.__name__}] Total trainable parameters: {total_params / 1e6:.2f}M" ) @@ -60,7 +91,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ( len(global_cond), self.action_horizon, - self.infer_ac_dims[embodiment_name], + self.embodiment_specs[embodiment_name].get("ac_dims"), ), dtype=global_cond.dtype, device=global_cond.device, @@ -68,7 +99,9 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. """ @@ -78,13 +111,15 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator) + return self.inference(noise, global_cond, embodiment_name, generator) def forward(self, global_cond): - cond, embodiment = global_cond - return self.sample_action(cond, embodiment) + cond, embodiment_name = global_cond + return self.sample_action(cond, embodiment_name) - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. """ @@ -96,7 +131,7 @@ def loss_fn(self, pred, target): """ return F.mse_loss(pred, target) - def preprocess_compute_loss(self, global_cond, data): + def preprocess_compute_loss(self, global_cond, data, embodiment_name): if self.pooling == "mean": global_cond = global_cond.mean(dim=1) elif self.pooling == "flatten": @@ -122,6 +157,9 @@ def preprocess_compute_loss(self, global_cond, data): return actions, global_cond def compute_loss(self, global_cond, data): - actions, global_cond = self.preprocess_compute_loss(global_cond, data) - pred, target = self.predict(actions, global_cond) + embodiment_name = get_embodiment(data["embodiment"][0].item()).lower() + actions, global_cond = self.preprocess_compute_loss( + global_cond, data, embodiment_name + ) + pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index e41f4943..27e74ee5 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -24,32 +24,37 @@ def __init__( self, model: ConditionalUnet1D, action_horizon, - infer_ac_dims, num_inference_steps=None, + embodiment_specs=None, **kwargs, ): super().__init__( - model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs + model, action_horizon, num_inference_steps, embodiment_specs, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") + self.dt = -1.0 / self.num_inference_steps - def step(self, x_t, t, global_cond): + def step(self, x_t, t, global_cond, embodiment_name): if len(t.shape) != 1: t = torch.tensor([t], device=global_cond.device) - v_t = self.model(x_t, t, global_cond) + v_t = self.denoising_model(x_t, t, global_cond, embodiment_name) return x_t + self.dt * v_t, t + self.dt @override - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: self.dt = -1.0 / self.num_inference_steps x_t = noise time = torch.ones((len(global_cond)), device=global_cond.device) while time[0] >= -self.dt / 2: - x_t, time = self.step(x_t, time, global_cond) + x_t, time = self.step(x_t, time, global_cond, embodiment_name) return x_t @override - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: noise = torch.randn(actions.shape, device=actions.device) batch_shape = (actions.shape[0],) if self.time_dist == "beta": @@ -65,8 +70,45 @@ def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions - v_t = self.model(x_t, time, global_cond) + v_t = self.denoising_model(x_t, time, global_cond, embodiment_name) target = u_t pred = v_t return pred, target + + def denoising_model(self, x_t, time, global_cond, embodiment_name): + if self.codec_enabled: + x_t = self.embodiment_specs[embodiment_name]["encoder"](x_t) + else: + x_t = x_t + v_t = self.model(x_t, time, global_cond) + if self.codec_enabled: + v_t = self.embodiment_specs[embodiment_name]["decoder"](v_t) + else: + v_t = v_t + return v_t + + +if __name__ == "__main__": + import hydra + from omegaconf import OmegaConf + + cfg = OmegaConf.load( + "/coc/flash7/paphiwetsa3/projects/EgoVerse/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml" + ) + model = hydra.utils.instantiate(cfg.robomimic_model.head_specs.shared) + + # test the model + aria_input = torch.randn(8, 100, 140) + global_cond = torch.randn(8, 64, 256) + aria_output = model.step( + aria_input, torch.tensor([0.0]), global_cond, "aria_bimanual" + ) + aria_output_inference = model.inference(aria_input, global_cond, "aria_bimanual") + aria_output_predict = model.predict(aria_input, global_cond, "aria_bimanual") + + eva_input = torch.randn(8, 100, 14) + eva_output = model.step(eva_input, torch.tensor([0.0]), global_cond, "eva_bimanual") + eva_output_inference = model.inference(eva_input, global_cond, "eva_bimanual") + eva_output_predict = model.predict(eva_input, global_cond, "eva_bimanual") + breakpoint() diff --git a/egomimic/pl_utils/pl_model.py b/egomimic/pl_utils/pl_model.py index d44c1270..5c75a643 100644 --- a/egomimic/pl_utils/pl_model.py +++ b/egomimic/pl_utils/pl_model.py @@ -24,10 +24,19 @@ class ModelWrapper(LightningModule): grad_norm_mad_min_count = 100 grad_norm_mad_window = 200 - def __init__(self, robomimic_model, optimizer, scheduler): + def __init__( + self, + robomimic_model, + optimizer, + scheduler, + enable_adaptive_grad_clip: bool = True, + ): """ Args: model (PolicyAlgo): robomimic model to wrap. + enable_adaptive_grad_clip: if False, the MAD-based spike detection + and clipping in on_after_backward is skipped (grad norm is still + logged, just never clipped). """ super().__init__() self.save_hyperparameters() @@ -40,6 +49,7 @@ def __init__(self, robomimic_model, optimizer, scheduler): self.params = self.model.nets["policy"].params except Exception: pass + self.enable_adaptive_grad_clip = enable_adaptive_grad_clip self.grad_norm_history = deque(maxlen=self.grad_norm_mad_window) self.val_image_buffer, self.val_counter = {}, {} @@ -96,7 +106,10 @@ def on_after_backward(self): grad_norm_val = float(grad_norm) info = {"policy_grad_norms_raw": grad_norm_val} - if len(self.grad_norm_history) >= self.grad_norm_mad_min_count: + if ( + self.enable_adaptive_grad_clip + and len(self.grad_norm_history) >= self.grad_norm_mad_min_count + ): values = np.array(self.grad_norm_history, dtype=np.float32) median = float(np.median(values)) mad = float(np.median(np.abs(values - median))) diff --git a/egomimic/rldb/embodiment/embodiment.py b/egomimic/rldb/embodiment/embodiment.py index 13dc1b8f..798760d8 100644 --- a/egomimic/rldb/embodiment/embodiment.py +++ b/egomimic/rldb/embodiment/embodiment.py @@ -1,3 +1,4 @@ +import copy from abc import ABC from enum import Enum @@ -59,13 +60,31 @@ def get_keymap(): raise NotImplementedError @classmethod - def viz_cartesian_gt_preds(cls, predictions, batch, image_key, action_key): + def viz_gt_preds( + cls, + predictions, + batch, + image_key, + action_key, + transform_list=None, + mode="cartesian", + **kwargs, + ): embodiment_id = batch["embodiment"][0].item() embodiment_name = get_embodiment(embodiment_id).lower() + pred_actions = predictions[ + f"{embodiment_name}_{action_key}" + ] # TODO: make this work with groundtruth, clone batch and replace actions_keypoints with pred_actions + if transform_list is not None: + pred_batch = copy.deepcopy(batch) + pred_batch[action_key] = pred_actions + batch = cls.apply_transform(batch, transform_list) + pred_batch = cls.apply_transform(pred_batch, transform_list) + pred_actions = pred_batch[action_key] + images = batch[image_key] actions = batch[action_key] - pred_actions = predictions[f"{embodiment_name}_{action_key}"] ims_list = [] images = _to_numpy(images) actions = _to_numpy(actions) @@ -74,8 +93,8 @@ def viz_cartesian_gt_preds(cls, predictions, batch, image_key, action_key): image = images[i] action = actions[i] pred_action = pred_actions[i] - ims = cls.viz(image, action, mode="traj", color="Reds") - ims = cls.viz(ims, pred_action, mode="traj", color="Greens") + ims = cls.viz(image, action, mode=mode, color="Reds", **kwargs) + ims = cls.viz(ims, pred_action, mode=mode, color="Greens", **kwargs) ims_list.append(ims) ims = np.stack(ims_list, axis=0) return ims @@ -94,7 +113,7 @@ def apply_transform(cls, batch, transform_list: list[Transform]): results = [] for i in range(batch_size): sample = { - k: (v[i].numpy() if isinstance(v, torch.Tensor) else v[i]) + k: (v[i].cpu().numpy() if isinstance(v, torch.Tensor) else v[i]) if isinstance(v, (np.ndarray, torch.Tensor)) else v for k, v in batch.items() diff --git a/egomimic/rldb/embodiment/eva.py b/egomimic/rldb/embodiment/eva.py index ee762645..49676830 100644 --- a/egomimic/rldb/embodiment/eva.py +++ b/egomimic/rldb/embodiment/eva.py @@ -5,12 +5,14 @@ from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToYPR, ConcatKeys, DeleteKeys, InterpolateLinear, InterpolatePose, NumpyToTensor, PoseCoordinateFrameTransform, + QuaternionPoseToYPR, SplitKeys, Transform, XYZWXYZ_to_XYZYPR, @@ -34,16 +36,16 @@ class Eva(Embodiment): @staticmethod def get_transform_list( - mode: Literal["cartesian", "cartesian_wristframe"] = "cartesian", + mode: Literal[ + "cartesian", "cartesian_wristframe_ypr", "cartesian_wristframe_quat" + ] = "cartesian", ) -> list[Transform]: if mode == "cartesian": return _build_eva_bimanual_transform_list() - elif mode == "cartesian_wristframe": - return _build_eva_bimanual_eef_frame_transform_list() - else: - raise ValueError( - f"Unsupported mode '{mode}'. Expected one of: 'cartesian', 'cartesian_wristframe'." - ) + elif mode == "cartesian_wristframe_ypr": + return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + elif mode == "cartesian_wristframe_quat": + return _build_eva_bimanual_eef_frame_transform_list(is_quat=True) @classmethod def viz_transformed_batch( @@ -169,16 +171,21 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( right_obs_gripper: str = "right.obs_gripper", left_cmd_camframe: str = "left.cmd_ee_pose_camframe", right_cmd_camframe: str = "right.cmd_ee_pose_camframe", + is_quat: bool = True, ) -> list[Transform]: """Revert wrist-frame EVA actions back to camera frame for visualization.""" + if is_quat: + pose_shape = 7 + else: + pose_shape = 6 transform_list = [ # Extract obs camframe poses from the concatenated obs key SplitKeys( input_key=obs_key, output_key_list=[ - (left_obs_camframe, 6), + (left_obs_camframe, pose_shape), (left_obs_gripper, 1), - (right_obs_camframe, 6), + (right_obs_camframe, pose_shape), (right_obs_gripper, 1), ], ), @@ -186,9 +193,9 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( SplitKeys( input_key=action_key, output_key_list=[ - (left_cmd_wristframe, 6), + (left_cmd_wristframe, pose_shape), (left_gripper, 1), - (right_cmd_wristframe, 6), + (right_cmd_wristframe, pose_shape), (right_gripper, 1), ], ), @@ -254,8 +261,6 @@ def _build_eva_bimanual_eef_frame_transform_list( left_extra_batch_key = {"left_extrinsics_pose": left_extrinsics_pose} right_extra_batch_key = {"right_extrinsics_pose": right_extrinsics_pose} - mode = "xyzwxyz" if is_quat else "xyzypr" - # Step 1: transform cmd and obs into camera frame using extrinsics transform_list = [ ActionChunkCoordinateFrameTransform( @@ -263,40 +268,40 @@ def _build_eva_bimanual_eef_frame_transform_list( chunk_world=left_cmd_world, transformed_key_name=left_cmd_camframe, extra_batch_key=left_extra_batch_key, - mode=mode, + mode="xyzwxyz", ), ActionChunkCoordinateFrameTransform( target_world=right_target_world, chunk_world=right_cmd_world, transformed_key_name=right_cmd_camframe, extra_batch_key=right_extra_batch_key, - mode=mode, + mode="xyzwxyz", ), PoseCoordinateFrameTransform( target_world=left_target_world, pose_world=left_obs_pose, transformed_key_name=left_obs_camframe, - mode=mode, + mode="xyzwxyz", ), PoseCoordinateFrameTransform( target_world=right_target_world, pose_world=right_obs_pose, transformed_key_name=right_obs_camframe, - mode=mode, + mode="xyzwxyz", ), InterpolatePose( new_chunk_length=chunk_length, action_key=left_cmd_camframe, output_action_key=left_cmd_camframe, stride=stride, - mode=mode, + mode="xyzwxyz", ), InterpolatePose( new_chunk_length=chunk_length, action_key=right_cmd_camframe, output_action_key=right_cmd_camframe, stride=stride, - mode=mode, + mode="xyzwxyz", ), InterpolateLinear( new_chunk_length=chunk_length, @@ -315,26 +320,36 @@ def _build_eva_bimanual_eef_frame_transform_list( target_world=left_obs_camframe, chunk_world=left_cmd_camframe, transformed_key_name=left_cmd_wristframe, - mode=mode, + mode="xyzwxyz", ), ActionChunkCoordinateFrameTransform( target_world=right_obs_camframe, chunk_world=right_cmd_camframe, transformed_key_name=right_cmd_wristframe, - mode=mode, + mode="xyzwxyz", ), ] - if is_quat: - transform_list.append( - XYZWXYZ_to_XYZYPR( - keys=[ - left_cmd_wristframe, - right_cmd_wristframe, - left_obs_camframe, - right_obs_camframe, - ] - ) + if not is_quat: + transform_list.extend( + [ + BatchQuaternionPoseToYPR( + pose_key=left_cmd_wristframe, + output_key=left_cmd_wristframe, + ), + BatchQuaternionPoseToYPR( + pose_key=right_cmd_wristframe, + output_key=right_cmd_wristframe, + ), + QuaternionPoseToYPR( + pose_key=left_obs_camframe, + output_key=left_obs_camframe, + ), + QuaternionPoseToYPR( + pose_key=right_obs_camframe, + output_key=right_obs_camframe, + ), + ] ) transform_list.extend( diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py index 7fce24ba..981ff98c 100644 --- a/egomimic/rldb/embodiment/human.py +++ b/egomimic/rldb/embodiment/human.py @@ -2,15 +2,15 @@ from typing import Literal -import numpy as np - -from egomimic.rldb.embodiment.embodiment import Embodiment, get_embodiment +from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToYPR, ConcatKeys, DeleteKeys, InterpolatePose, PoseCoordinateFrameTransform, + QuaternionPoseToYPR, Reshape, SplitKeys, Transform, @@ -30,32 +30,6 @@ class Human(Embodiment): VIZ_IMAGE_KEY = "observations.images.front_img_1" ACTION_STRIDE = 3 - @classmethod - def viz_keypoints_gt_preds( - cls, predictions, batch, image_key, action_key, transform_list=None, **kwargs - ): - if transform_list is not None: - batch = cls.apply_transform(batch, transform_list) - embodiment_id = batch["embodiment"][0].item() - embodiment_name = get_embodiment(embodiment_id).lower() - - images = batch[image_key] - actions = batch[action_key] - pred_actions = predictions[f"{embodiment_name}_{action_key}"] - ims_list = [] - images = _to_numpy(images) - actions = _to_numpy(actions) - pred_actions = _to_numpy(pred_actions) - for i in range(images.shape[0]): - image = images[i] - action = actions[i] - pred_action = pred_actions[i] - ims = cls.viz(image, action, mode="keypoints", color="Reds", **kwargs) - ims = cls.viz(ims, pred_action, mode="keypoints", color="Greens", **kwargs) - ims_list.append(ims) - ims = np.stack(ims_list, axis=0) - return ims - @classmethod def viz_transformed_batch( cls, @@ -120,7 +94,6 @@ def viz( else: colors = cls.FINGER_COLORS dot_color = cls.DOT_COLOR - return _viz_keypoints( images=images, actions=actions, @@ -257,28 +230,38 @@ class Aria(Human): } FINGER_EDGE_RANGES = [ ("thumb", 0, 3), - ("index", 3, 6), - ("middle", 6, 9), - ("ring", 9, 12), - ("pinky", 12, 15), + ("index", 3, 7), + ("middle", 7, 11), + ("ring", 11, 15), + ("pinky", 15, 19), ] DOT_COLOR = (255, 165, 0) @classmethod def get_transform_list( - cls, mode: Literal["cartesian", "keypoints_headframe", "keypoints_wristframe"] + cls, + mode: Literal[ + "cartesian", + "keypoints_headframe", + "keypoints_wristframe_ypr", + "keypoints_wristframe_quat", + ], ) -> list[Transform]: if mode == "cartesian": return _build_aria_cartesian_bimanual_transform_list( stride=cls.ACTION_STRIDE ) - elif mode == "keypoints": + elif mode == "keypoints_headframe": return _build_aria_keypoints_bimanual_transform_list( stride=cls.ACTION_STRIDE ) - elif mode == "keypoints_wristframe": + elif mode == "keypoints_wristframe_ypr": return _build_aria_keypoints_eef_frame_transform_list( - stride=cls.ACTION_STRIDE + stride=cls.ACTION_STRIDE, is_quat=False + ) + elif mode == "keypoints_wristframe_quat": + return _build_aria_keypoints_eef_frame_transform_list( + stride=cls.ACTION_STRIDE, is_quat=True ) else: raise ValueError( @@ -296,6 +279,7 @@ class Mecka(Human): ACTION_STRIDE = 1 +# this works for quat and ypr since actionChunkCoordinateFrameTransform works for both def _build_aria_keypoints_revert_eef_frame_transform_list( *, action_key: str = "actions_keypoints", @@ -305,12 +289,23 @@ def _build_aria_keypoints_revert_eef_frame_transform_list( right_wrist_obs_headframe: str = "right.obs_wrist_pose_headframe", left_wrist_action_headframe: str = "left.action_wrist_pose_headframe", right_wrist_action_headframe: str = "right.action_wrist_pose_headframe", + left_wrist_action_wristframe: str = "left.action_wrist_pose_wristframe", + right_wrist_action_wristframe: str = "right.action_wrist_pose_wristframe", + left_keypoints_action_headframe: str = "left.action_keypoints_headframe", + right_keypoints_action_headframe: str = "right.action_keypoints_headframe", + is_quat: bool = True, ) -> list[Transform]: + if is_quat: + pose_shape = 7 + else: + pose_shape = 6 transform_list = [ SplitKeys( input_key=action_key, output_key_list=[ + (left_wrist_action_wristframe, pose_shape), (left_keypoints_action_wristframe, 63), + (right_wrist_action_wristframe, pose_shape), (right_keypoints_action_wristframe, 63), ], ), @@ -327,31 +322,31 @@ def _build_aria_keypoints_revert_eef_frame_transform_list( ActionChunkCoordinateFrameTransform( target_world=left_wrist_obs_headframe, chunk_world=left_keypoints_action_wristframe, - transformed_key_name=left_wrist_action_headframe, + transformed_key_name=left_keypoints_action_headframe, mode="xyz", inverse=False, ), ActionChunkCoordinateFrameTransform( target_world=right_wrist_obs_headframe, chunk_world=right_keypoints_action_wristframe, - transformed_key_name=right_wrist_action_headframe, + transformed_key_name=right_keypoints_action_headframe, mode="xyz", inverse=False, ), Reshape( - input_key=left_wrist_action_headframe, - output_key=left_wrist_action_headframe, + input_key=left_keypoints_action_headframe, + output_key=left_keypoints_action_headframe, shape=(100, 63), ), Reshape( - input_key=right_wrist_action_headframe, - output_key=right_wrist_action_headframe, + input_key=right_keypoints_action_headframe, + output_key=right_keypoints_action_headframe, shape=(100, 63), ), ConcatKeys( key_list=[ - left_wrist_action_headframe, - right_wrist_action_headframe, + left_keypoints_action_headframe, + right_keypoints_action_headframe, ], new_key_name=action_key, delete_old_keys=True, @@ -390,6 +385,7 @@ def _build_aria_keypoints_eef_frame_transform_list( delete_target_world: bool = True, chunk_length: int = 100, stride: int = 3, + is_quat: bool = True, ) -> list[Transform]: transform_list = _build_aria_keypoints_bimanual_transform_list( target_world=target_world, @@ -486,13 +482,48 @@ def _build_aria_keypoints_eef_frame_transform_list( output_key=right_keypoints_obs_wristframe, shape=(63,), ), + ActionChunkCoordinateFrameTransform( + target_world=left_wrist_obs_headframe, + chunk_world=left_wrist_action_headframe, + transformed_key_name=left_wrist_action_wristframe, + mode="xyzwxyz", + ), + ActionChunkCoordinateFrameTransform( + target_world=right_wrist_obs_headframe, + chunk_world=right_wrist_action_headframe, + transformed_key_name=right_wrist_action_wristframe, + mode="xyzwxyz", + ), ] ) + if not is_quat: + transform_list.extend( + [ + BatchQuaternionPoseToYPR( + pose_key=left_wrist_action_wristframe, + output_key=left_wrist_action_wristframe, + ), + BatchQuaternionPoseToYPR( + pose_key=right_wrist_action_wristframe, + output_key=right_wrist_action_wristframe, + ), + QuaternionPoseToYPR( + pose_key=left_wrist_obs_headframe, + output_key=left_wrist_obs_headframe, + ), + QuaternionPoseToYPR( + pose_key=right_wrist_obs_headframe, + output_key=right_wrist_obs_headframe, + ), + ] + ) transform_list.extend( [ ConcatKeys( key_list=[ + left_wrist_action_wristframe, left_keypoints_action_wristframe, + right_wrist_action_wristframe, right_keypoints_action_wristframe, ], new_key_name="actions_keypoints", diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py index 6ff5fdc2..379c3a6a 100644 --- a/egomimic/rldb/zarr/action_chunk_transforms.py +++ b/egomimic/rldb/zarr/action_chunk_transforms.py @@ -31,6 +31,8 @@ _xyz_to_matrix, _xyzwxyz_to_matrix, _xyzypr_to_matrix, + wxyz_to_xyzw, + xyzw_to_wxyz, ) # --------------------------------------------------------------------------- @@ -250,11 +252,75 @@ def transform(self, batch: dict) -> dict: f"'{self.pose_key}'" ) xyz = pose[:3] - ypr = R.from_quat(pose[3:7]).as_euler("ZYX", degrees=False) + xyzw = wxyz_to_xyzw(pose[3:7]) + ypr = R.from_quat(xyzw).as_euler("ZYX", degrees=False) batch[self.output_key] = np.concatenate([xyz, ypr], axis=0) return batch +class YPRToQuaternionPose(Transform): + """Convert a single pose from xyz + ypr to xyz + quat(x,y,z,w).""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.shape != (6,): + raise ValueError( + f"YPRToQuaternionPose expects shape (6,), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:3] + quat = R.from_euler("ZYX", pose[3:6], degrees=False).as_quat() # (x,y,z,w) + quat = xyzw_to_wxyz(quat) + batch[self.output_key] = np.concatenate([xyz, quat], axis=0) + return batch + + +class BatchQuaternionPoseToYPR(Transform): + """Convert a batch of poses from xyz + quat(x,y,z,w) to xyz + ypr.""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.ndim != 2 or pose.shape[-1] != 7: + raise ValueError( + f"BatchQuaternionPoseToYPR expects shape (N, 7), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:, :3] + xyzw = wxyz_to_xyzw(pose[:, 3:7]) + ypr = R.from_quat(xyzw).as_euler("ZYX", degrees=False) # (N, 3) + batch[self.output_key] = np.concatenate([xyz, ypr], axis=1) + return batch + + +class BatchYPRToQuaternionPose(Transform): + """Convert a batch of poses from xyz + ypr to xyz + quat(x,y,z,w).""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.ndim != 2 or pose.shape[-1] != 6: + raise ValueError( + f"BatchYPRToQuaternionPose expects shape (N, 6), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + xyz = pose[:, :3] + quat = R.from_euler("ZYX", pose[:, 3:6], degrees=False).as_quat() # (N, 4) + quat = xyzw_to_wxyz(quat) + batch[self.output_key] = np.concatenate([xyz, quat], axis=1) + return batch + + class PoseCoordinateFrameTransform(Transform): """Transform a single pose into a target frame pose.""" diff --git a/egomimic/scripts/tutorials/zarr_data_viz.ipynb b/egomimic/scripts/tutorials/zarr_data_viz.ipynb index 1c877626..b13e36f6 100644 --- a/egomimic/scripts/tutorials/zarr_data_viz.ipynb +++ b/egomimic/scripts/tutorials/zarr_data_viz.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "79d184b3", "metadata": {}, "outputs": [], @@ -23,19 +23,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "32d9110f", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/coc/flash7/paphiwetsa3/projects/EgoVerse/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.\n", - " import pynvml # type: ignore[import]\n" - ] - } - ], + "outputs": [], "source": [ "from pathlib import Path\n", "\n", @@ -62,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "cc9edba1", "metadata": {}, "outputs": [], @@ -73,18 +64,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "a4aa1a05", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tables in schema 'app': ['episodes']\n" - ] - } - ], + "outputs": [], "source": [ "# Point this at a single episode directory, e.g. /path/to/episode_hash.zarr\n", "# EPISODE_PATH = Path(\"/coc/flash7/scratch/egoverseDebugDatasets/1767495035712.zarr\")\n", @@ -112,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "58f0af00", "metadata": {}, "outputs": [], @@ -122,46 +105,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "67a60218", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([-0.2326, 0.1783, 0.3866, -0.0258, 0.0405, 0.8205, 0.0800, 0.3351,\n", - " 0.2074, 0.4526, -0.0582, -0.0042, 0.8754, 0.0000],\n", - " dtype=torch.float64)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "batch['actions_cartesian'][0,0]" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4b72f3bb", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "