From 2f3e1c0e886c7557ba22270b8c9b42ffda89d35b Mon Sep 17 00:00:00 2001
From: mengwei
Date: Thu, 4 Dec 2025 10:15:46 +0800
Subject: [PATCH 01/12] [feat] Add training code for InternVLA-N1

---
 README.md                                     |  177 ++-
 .../dataset/internvla_n1_dataset_lerobot.py   | 1345 +++++++++++++++++
 internnav/dataset/rope2d.py                   |  334 ++++
 .../env/utils/habitat_extensions/measures.py  |   52 +
 internnav/evaluator/habitat_vln_evaluator.py  |  763 ++++++----
 .../basemodel/internvla_n1/internvla_n1.py    |  352 +++--
 .../internvla_n1/internvla_n1_arch.py         |  198 +++
 .../model/basemodel/internvla_n1/navdp.py     |  248 +--
 .../internvla_n1/nextdit_crossattn_traj.py    |   95 ++
 .../basemodel/internvla_n1/nextdit_traj.py    |  381 +++++
 .../depth_anything_v2/dinov2.py               |   37 +-
 internnav/qwenvl_trainer/base.py              |  329 ++++
 .../qwenvl_trainer/internvla_n1_argument.py   |   52 +
 .../qwenvl_trainer/internvla_n1_trainer.py    |  239 +++
 scripts/eval/bash/eval_dual_system.sh         |   12 +-
 scripts/eval/bash/eval_system2.sh             |   11 +-
 scripts/eval/configs/vln_rxr.yaml             |   80 +
 scripts/eval/eval_habitat.py                  |   23 +-
 .../train_internvla_n1/train_dual_system.sh   |   83 +
 scripts/train_internvla_n1/train_system2.sh   |   80 +
 scripts/train_internvla_n1/zero2.json         |   23 +
 scripts/train_internvla_n1/zero3.json         |   28 +
 scripts/train_internvla_n1/zero3_offload.json |   56 +
 23 files changed, 4318 insertions(+), 680 deletions(-)
 create mode 100644 internnav/dataset/internvla_n1_dataset_lerobot.py
 create mode 100644 internnav/dataset/rope2d.py
 create mode 100644 internnav/model/basemodel/internvla_n1/internvla_n1_arch.py
 create mode 100644 internnav/model/basemodel/internvla_n1/nextdit_crossattn_traj.py
 create mode 100644 internnav/model/basemodel/internvla_n1/nextdit_traj.py
 create mode 100644 internnav/qwenvl_trainer/base.py
 create mode 100644 internnav/qwenvl_trainer/internvla_n1_argument.py
 create mode 100644 internnav/qwenvl_trainer/internvla_n1_trainer.py
 create mode 100644 scripts/eval/configs/vln_rxr.yaml
 create mode 100644 scripts/train_internvla_n1/train_dual_system.sh
 create mode 100644 scripts/train_internvla_n1/train_system2.sh
 create mode 100644 scripts/train_internvla_n1/zero2.json
 create mode 100644 scripts/train_internvla_n1/zero3.json
 create mode 100644 scripts/train_internvla_n1/zero3_offload.json

diff --git a/README.md b/README.md
index 1518bb2..0195fe7 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ The toolbox supports the most comprehensive 6 datasets \& benchmarks and 10+ pop
 The toolbox supports the most advanced high-quality navigation dataset, InternData-N1, which includes 3k+ scenes and 830k VLN data covering diverse embodiments and scenes, and the first dual-system navigation foundation model with leading performance on all the benchmarks and zero-shot generalization capability in the real world, InternVLA-N1.
 
 ## πŸ”₯ News
+- [2025/12] Training code for InternVLA-N1 is now available. This official release provides two dual-system configurations: **InternVLA-N1 (Dual System) with NavDP\*** and **InternVLA-N1 (Dual System) DualVLN**. For the model architecture and training details, please refer to the [DualVLN paper](TO_BE_UPDATED).
 - [2025/10] Add a simple [inference-only demo](scripts/notebooks/inference_only_demo.ipynb) of InternVLA-N1.
 - [2025/10] InternVLA-N1 [technical report](https://internrobotics.github.io/internvla-n1.github.io/static/pdfs/InternVLA_N1.pdf) is released. Please check our [homepage](https://internrobotics.github.io/internvla-n1.github.io/).
 - [2025/09] Real-world deployment code of InternVLA-N1 is released.
 Upload 3D printing [files](assets/3d_printing_files/go2_stand.STEP) for Unitree Go2.
@@ -55,133 +56,147 @@ The toolbox supports the most advanced high-quality navigation dataset, InternDa
 Please refer to the [documentation](https://internrobotics.github.io/user_guide/internnav/quick_start/index.html) for quick start with InternNav, from installation to training or evaluating supported models.
 
-## πŸ“¦ Overview of Benchmark and Model Zoo
+## πŸ“¦ Overview
 
-### Datasets \& Benchmarks
+### πŸ§ͺ Supported Benchmarks
-System2 (VLN-CE)
+VLN Benchmarks
-System1 (VN)
-Whole-system (VLN)
+VN Benchmarks
-### Models
+### πŸ€— Model Zoo & Downloads
-System2 (VLN-CE)
+🧠 VLN Single-System
-System1 (VN)
+🎯 VN System (System1)
-Whole-system (VLN)
+🀝 VLN Multi-System
-### Benchmark Results
-
-#### VLN-CE Task
-| Model | Dataset/Benchmark | NE | OS | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | --------- | -- | --------- |
-| `InternVLA-N1 (S2)` | R2R | 4.89 | 60.6 | 55.4 | 52.1| [Model](https://huggingface.co/InternRobotics/InternVLA-N1-S2) |
-| `InternVLA-N1` | R2R | **4.83** | **63.3** | **58.2** | **54.0** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `InternVLA-N1 (S2)` | RxR | 6.67 | 56.5 | 48.6 | 42.6 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-S2) |
-| `InternVLA-N1` | RxR | **5.91** | **60.8** | **53.5** | **46.1** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `InternVLA-N1-Preview (S2)` | R2R | 5.09 | 60.9 | 53.7 | 49.7 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview-S2) |
-| `InternVLA-N1-Preview` | R2R | **4.76** | **63.4** | **56.7** | **52.6** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1-Preview (S2)` | RxR | 6.39 | 60.1 | 50.5 | 43.3 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview-S2) |
-| `InternVLA-N1-Preview` | RxR | **5.65** | **63.2** | **53.5** | **45.7** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-
-#### VLN-PE Task
-| Model | Dataset/Benchmark | NE | OS | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | -- | --- | --- |
-| `Seq2Seq` | Flash | 8.27 | 43.0 | 15.7 | 9.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `CMA` | Flash | 7.52 | 45.0 | 24.4 | 18.2 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `RDP` | Flash | 6.98 | 42.5 | 24.9 | 17.5 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `InternVLA-N1-Preview` | Flash | **4.21** | **68.0** | **59.8** | **54.0** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1` | Flash | **4.13** | **67.6** | **60.4** | **54.9** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `Seq2Seq` | Physical | 7.88 | 28.1 | 15.1 | 10.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `CMA` | Physical | 7.26 | 31.4 | 22.1 | 18.6 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `RDP` | Physical | 6.72 | 36.9 | 25.2 | 17.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `InternVLA-N1-Preview` | Physical | **5.31** | **49.0** | **42.6** | **35.8** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1` | Physical | **4.73** | **56.7** | **50.6** | **43.3** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-
-#### Visual Navigation Task - PointGoal Navigation
-| Model | Dataset/Benchmark | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | --------- |
-| `iPlanner` | ClutteredEnv | 84.8 | 83.6 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `ViPlanner` | ClutteredEnv | 72.4 | 72.3 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `InternVLA-N1 (S1)` | ClutteredEnv | **89.8** | **87.7** | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `iPlanner` | InternScenes | 48.8 | 46.7 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `ViPlanner` | InternScenes | 54.3 | 52.5 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `InternVLA-N1 (S1)` | InternScenes | **65.7** | **60.7** | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-
-
-
-**NOTE:**
-- VLN-CE RxR benchmark and StreamVLN will be supported soon.
+
+
+
+### πŸ“Š Benchmark Results
+
+
+#### VLN-CE Benchmarks
+
+**πŸ“ R2R Dataset**
+| Model | Observation | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|-------------|------|------|------|-------|
+| InternVLA-N1-wo-dagger (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 4.89 | 60.6 | 55.4 | 52.1 |
+| InternVLA-N1-wo-dagger (Dual System) with NavDP* | RGB-D | 4.83 | 63.3 | 58.2 | 54.0 |
+| InternVLA-N1 (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 4.25 | 68.3 | 60.9 | 55.2 |
+| InternVLA-N1 (Dual System) with NavDP* | RGB-D | 4.22 | 70.4 | 64.1 | 58.1 |
+| InternVLA-N1 (Dual System) DualVLN | RGB | **4.05** | **70.7** | **64.3** | **58.5** |
+
+**πŸ“ RxR Dataset**
+| Model | Observation | NE ↓ | SR ↑ | SPL ↑ | nDTW ↑ |
+|-------|-------------|------|------|------|-------|
+| InternVLA-N1 (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 5.71 | 63.5 | 55.0 | 46.8 |
+| InternVLA-N1 (Dual System) with NavDP* | RGB-D | 4.70 | 59.7 | 50.6 | 69.7 |
+| InternVLA-N1 (Dual System) DualVLN | RGB | **4.58** | **61.4** | **51.8** | **70.0** |
+
+---
+
+#### VLN-PE Benchmarks
+
+**πŸ“ Flash Controller on R2R Unseen**
+| Model | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|------|------|------|-------|
+| Seq2Seq | 8.27 | 43.0 | 15.7 | 9.7 |
+| CMA | 7.52 | 45.0 | 24.4 | 18.2 |
+| RDP | 6.98 | 42.5 | 24.9 | 17.5 |
+| InternVLA-N1 (System 2) + iPlanner | 4.91 | 55.53 | 47.07 | 41.09 |
+| InternVLA-N1 (System 2) + NavDP | 4.22 | 67.33 | 58.72 | 50.98 |
+| InternVLA-N1 (Dual System) DualVLN | **3.90** | **69.93** | **63.62** | **56.49** |
+
+**πŸ“ Physical Controller on R2R Unseen**
+| Model | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|------|------|------|-------|
+| Seq2Seq | 7.88 | 28.1 | 15.1 | 10.7 |
+| CMA | 7.26 | 31.4 | 22.1 | 18.6 |
+| RDP | 6.72 | 36.9 | 25.2 | 17.7 |
+| InternVLA-N1 (Dual System) DualVLN | **4.66** | **55.9** | **51.6** | **42.49** |
+
+
+#### Visual Navigation Benchmarks
+
+**πŸ“ ClutteredEnv Dataset**
+| Model | SR ↑ | SPL ↑ |
+|-------|------|-------|
+| iPlanner | 84.8 | 83.6 |
+| ViPlanner | 72.4 | 72.3 |
+| NavDP | **89.8** | **87.7** |
+
+**πŸ“ InternScenes Dataset**
+| Model | SR ↑ | SPL ↑ |
+|-------|------|-------|
+| iPlanner | 48.8 | 46.7 |
+| ViPlanner | 54.3 | 52.5 |
+| NavDP | **65.7** | **60.7** |
+
+---
 
 ## πŸ”§ Customization
@@ -236,6 +251,12 @@ If you use the specific pretrained models and benchmarks, please kindly cite the
   year = {2025},
   booktitle={arXiv},
 }
+@misc{dualvln,
+  title = {{InternVLA-N1}: An Open Dual-System Navigation Foundation Model with Learned Latent Plans},
+  author = {InternVLA-N1 Team},
+  year = {2025},
+  booktitle={arXiv},
+}
 ```

diff --git a/internnav/dataset/internvla_n1_dataset_lerobot.py b/internnav/dataset/internvla_n1_dataset_lerobot.py
new file mode 100644
index 0000000..8cd39a6
--- /dev/null
+++ b/internnav/dataset/internvla_n1_dataset_lerobot.py
@@ -0,0 +1,1345 @@
+import copy
+import itertools
+import json
+import os
+import random
+import re
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import transformers
+from decord import VideoReader
+from PIL import Image
+from torch.utils.data import Dataset
+from torchcodec.decoders import VideoDecoder
+from transformers.image_utils import to_numpy_array
+
+from .rope2d import get_rope_index_2, get_rope_index_25
+
+# Define placeholders for dataset paths
+CAMBRIAN_737K = {
+    "annotation_path": "PATH_TO_CAMBRIAN_737K_ANNOTATION",
+    "data_path": "",
+}
+
+CAMBRIAN_737K_PACK = {
+    "annotation_path": "PATH_TO_CAMBRIAN_737K_ANNOTATION_PACKED",
+    "data_path": "",
+}
+
+MP_DOC = {
+    "annotation_path": "PATH_TO_MP_DOC_ANNOTATION",
+    "data_path": "PATH_TO_MP_DOC_DATA",
+}
+
+CLEVR_MC = {
+    "annotation_path": "PATH_TO_CLEVR_MC_ANNOTATION",
+    "data_path": "PATH_TO_CLEVR_MC_DATA",
+}
+
+VIDEOCHATGPT = {
+    "annotation_path": "PATH_TO_VIDEOCHATGPT_ANNOTATION",
+    "data_path": "PATH_TO_VIDEOCHATGPT_DATA",
+}
+
+
+R2R_125CM_0_30 = {
+    "data_path": "traj_data/r2r",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+R2R_125CM_0_45 = {
+    "data_path": "traj_data/r2r",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+R2R_60CM_15_15 = {
+    "data_path": "traj_data/r2r",
+    "height": 60,
+    "pitch_1": 15,
+    "pitch_2": 15,
+}
+
+R2R_60CM_30_30 = {
+    "data_path": "traj_data/r2r",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+RxR_125CM_0_30 = {
+    "data_path": "traj_data/rxr",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+RxR_125CM_0_45 = {
+    "data_path": "traj_data/rxr",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+RxR_60CM_15_15 = {
+    "data_path": "traj_data/rxr",
+    "height": 60,
+    "pitch_1": 15,
+    "pitch_2": 15,
+}
+
+RxR_60CM_30_30 = {
+    "data_path": "traj_data/rxr",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+SCALEVLN_125CM_0_30 = {
+    "data_path": "traj_data/scalevln",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+SCALEVLN_125CM_0_45 = {
+    "data_path": "traj_data/scalevln",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+SCALEVLN_60CM_30_30 = {
+    "data_path": "traj_data/scalevln",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+data_dict = {
+    "cambrian_737k": CAMBRIAN_737K,
+    "cambrian_737k_pack": CAMBRIAN_737K_PACK,
+    "mp_doc": MP_DOC,
+    "clevr_mc": CLEVR_MC,
+    "videochatgpt": VIDEOCHATGPT,
+    "r2r_125cm_0_30": R2R_125CM_0_30,
+    "r2r_125cm_0_45": R2R_125CM_0_45,
+    "r2r_60cm_15_15": R2R_60CM_15_15,
+    "r2r_60cm_30_30": R2R_60CM_30_30,
+    "rxr_125cm_0_30": RxR_125CM_0_30,
+    "rxr_125cm_0_45": RxR_125CM_0_45,
+    "rxr_60cm_15_15": RxR_60CM_15_15,
+    "rxr_60cm_30_30": RxR_60CM_30_30,
+    "scalevln_125cm_0_30": SCALEVLN_125CM_0_30,
+    "scalevln_125cm_0_45": SCALEVLN_125CM_0_45,
+    "scalevln_60cm_30_30": SCALEVLN_60CM_30_30,
+}
+
+
+def parse_sampling_rate(dataset_name):
+    match = re.search(r"%(\d+)$", dataset_name)
+    if match:
+        return int(match.group(1)) / 100.0
+    return 1.0
+
+
+def data_list(dataset_names):
+    config_list = []
+    for dataset_name in dataset_names:
+        sampling_rate = parse_sampling_rate(dataset_name)
+        dataset_name = re.sub(r"%(\d+)$", "", dataset_name)
+        if dataset_name in data_dict:
+            config = data_dict[dataset_name].copy()
+            config["sampling_rate"] = sampling_rate
+            config_list.append(config)
+        else:
+            raise ValueError(f"dataset {dataset_name} not found")
+    return config_list
+
+
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+TRAJ_TOKEN_INDEX = 151667
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
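+# NOTE: The token indices above follow the Qwen2-VL tokenizer layout
+# (151655/151656 are the image/video pad token ids); TRAJ_TOKEN_INDEX is
+# assumed here to be the extra trajectory token added for InternVLA-N1.
+#
+# Minimal usage sketch for the dataset registry above (the dataset names
+# below are illustrative, not a prescribed training mix): `data_list`
+# resolves each name to its config and parses an optional "%NN" suffix
+# as a per-dataset sampling rate.
+#
+#   configs = data_list(["r2r_125cm_0_30%50", "rxr_60cm_15_15"])
+#   configs[0]["sampling_rate"]  # -> 0.5  ("%50" samples 50% of the split)
+#   configs[1]["sampling_rate"]  # -> 1.0  (no suffix keeps the full split)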