From 2f3e1c0e886c7557ba22270b8c9b42ffda89d35b Mon Sep 17 00:00:00 2001
From: mengwei
Date: Thu, 4 Dec 2025 10:15:46 +0800
Subject: [PATCH 01/12] [feat] Add training code for InternVLA-N1

---
 README.md                                     |  177 ++-
 .../dataset/internvla_n1_dataset_lerobot.py   | 1345 +++++++++++++++++
 internnav/dataset/rope2d.py                   |  334 ++++
 .../env/utils/habitat_extensions/measures.py  |   52 +
 internnav/evaluator/habitat_vln_evaluator.py  |  763 ++++++----
 .../basemodel/internvla_n1/internvla_n1.py    |  352 +++--
 .../internvla_n1/internvla_n1_arch.py         |  198 +++
 .../model/basemodel/internvla_n1/navdp.py     |  248 +--
 .../internvla_n1/nextdit_crossattn_traj.py    |   95 ++
 .../basemodel/internvla_n1/nextdit_traj.py    |  381 +++++
 .../depth_anything_v2/dinov2.py               |   37 +-
 internnav/qwenvl_trainer/base.py              |  329 ++++
 .../qwenvl_trainer/internvla_n1_argument.py   |   52 +
 .../qwenvl_trainer/internvla_n1_trainer.py    |  239 +++
 scripts/eval/bash/eval_dual_system.sh         |   12 +-
 scripts/eval/bash/eval_system2.sh             |   11 +-
 scripts/eval/configs/vln_rxr.yaml             |   80 +
 scripts/eval/eval_habitat.py                  |   23 +-
 .../train_internvla_n1/train_dual_system.sh   |   83 +
 scripts/train_internvla_n1/train_system2.sh   |   80 +
 scripts/train_internvla_n1/zero2.json         |   23 +
 scripts/train_internvla_n1/zero3.json         |   28 +
 scripts/train_internvla_n1/zero3_offload.json |   56 +
 23 files changed, 4318 insertions(+), 680 deletions(-)
 create mode 100644 internnav/dataset/internvla_n1_dataset_lerobot.py
 create mode 100644 internnav/dataset/rope2d.py
 create mode 100644 internnav/model/basemodel/internvla_n1/internvla_n1_arch.py
 create mode 100644 internnav/model/basemodel/internvla_n1/nextdit_crossattn_traj.py
 create mode 100644 internnav/model/basemodel/internvla_n1/nextdit_traj.py
 create mode 100644 internnav/qwenvl_trainer/base.py
 create mode 100644 internnav/qwenvl_trainer/internvla_n1_argument.py
 create mode 100644 internnav/qwenvl_trainer/internvla_n1_trainer.py
 create mode 100644 scripts/eval/configs/vln_rxr.yaml
 create mode 100644 scripts/train_internvla_n1/train_dual_system.sh
 create mode 100644 scripts/train_internvla_n1/train_system2.sh
 create mode 100644 scripts/train_internvla_n1/zero2.json
 create mode 100644 scripts/train_internvla_n1/zero3.json
 create mode 100644 scripts/train_internvla_n1/zero3_offload.json

diff --git a/README.md b/README.md
index 1518bb2..0195fe7 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ The toolbox supports the most comprehensive 6 datasets \& benchmarks and 10+ pop
 The toolbox supports the most advanced high-quality navigation dataset, InternData-N1, which includes 3k+ scenes and 830k VLN data covering diverse embodiments and scenes, and the first dual-system navigation foundation model with leading performance on all the benchmarks and zero-shot generalization capability in the real world, InternVLA-N1.
 
 ## πŸ”₯ News
+- [2025/12] Training code for InternVLA-N1 is now available. This official release provides two dual-system configurations: **InternVLA-N1 (Dual System) with NavDP\*** and **InternVLA-N1 (Dual System) DualVLN**. For the model architecture and training details, please refer to the [DualVLN paper](TO_BE_UPDATED).
 - [2025/10] Add a simple [inference-only demo](scripts/notebooks/inference_only_demo.ipynb) of InternVLA-N1.
 - [2025/10] InternVLA-N1 [technical report](https://internrobotics.github.io/internvla-n1.github.io/static/pdfs/InternVLA_N1.pdf) is released. Please check our [homepage](https://internrobotics.github.io/internvla-n1.github.io/).
 - [2025/09] Real-world deployment code of InternVLA-N1 is released.
 Upload 3D printing [files](assets/3d_printing_files/go2_stand.STEP) for Unitree Go2.
@@ -55,133 +56,147 @@ The toolbox supports the most advanced high-quality navigation dataset, InternDa
 Please refer to the [documentation](https://internrobotics.github.io/user_guide/internnav/quick_start/index.html) for quick start with InternNav, from installation to training or evaluating supported models.
 
-## πŸ“¦ Overview of Benchmark and Model Zoo
+## πŸ“¦ Overview
 
-### Datasets \& Benchmarks
+### πŸ§ͺ Supported Benchmarks
-System2 (VLN-CE)
+VLN Benchmarks
-System1 (VN)
-Whole-system (VLN)
+VN Benchmarks
-### Models
+### πŸ€— Model Zoo & Downloads
-System2 (VLN-CE)
+🧠 VLN Single-System
-System1 (VN)
+🎯 VN System (System1)
-Whole-system (VLN)
+🀝 VLN Multi-System
-### Benchmark Results
-
-#### VLN-CE Task
-| Model | Dataset/Benchmark | NE | OS | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | --------- | -- | --------- |
-| `InternVLA-N1 (S2)` | R2R | 4.89 | 60.6 | 55.4 | 52.1| [Model](https://huggingface.co/InternRobotics/InternVLA-N1-S2) |
-| `InternVLA-N1` | R2R | **4.83** | **63.3** | **58.2** | **54.0** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `InternVLA-N1 (S2)` | RxR | 6.67 | 56.5 | 48.6 | 42.6 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-S2) |
-| `InternVLA-N1` | RxR | **5.91** | **60.8** | **53.5** | **46.1** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `InternVLA-N1-Preview (S2)` | R2R | 5.09 | 60.9 | 53.7 | 49.7 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview-S2) |
-| `InternVLA-N1-Preview` | R2R | **4.76** | **63.4** | **56.7** | **52.6** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1-Preview (S2)` | RxR | 6.39 | 60.1 | 50.5 | 43.3 | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview-S2) |
-| `InternVLA-N1-Preview` | RxR | **5.65** | **63.2** | **53.5** | **45.7** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-
-#### VLN-PE Task
-| Model | Dataset/Benchmark | NE | OS | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | -- | --- | --- |
-| `Seq2Seq` | Flash | 8.27 | 43.0 | 15.7 | 9.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `CMA` | Flash | 7.52 | 45.0 | 24.4 | 18.2 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `RDP` | Flash | 6.98 | 42.5 | 24.9 | 17.5 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `InternVLA-N1-Preview` | Flash | **4.21** | **68.0** | **59.8** | **54.0** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1` | Flash | **4.13** | **67.6** | **60.4** | **54.9** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-| `Seq2Seq` | Physical | 7.88 | 28.1 | 15.1 | 10.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `CMA` | Physical | 7.26 | 31.4 | 22.1 | 18.6 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `RDP` | Physical | 6.72 | 36.9 | 25.2 | 17.7 | [Model](https://huggingface.co/InternRobotics/VLN-PE) |
-| `InternVLA-N1-Preview` | Physical | **5.31** | **49.0** | **42.6** | **35.8** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1-Preview) |
-| `InternVLA-N1` | Physical | **4.73** | **56.7** | **50.6** | **43.3** | [Model](https://huggingface.co/InternRobotics/InternVLA-N1) |
-
-#### Visual Navigation Task - PointGoal Navigation
-| Model | Dataset/Benchmark | SR | SPL | Download |
-| ------ | ----------------- | -- | -- | --------- |
-| `iPlanner` | ClutteredEnv | 84.8 | 83.6 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `ViPlanner` | ClutteredEnv | 72.4 | 72.3 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `InternVLA-N1 (S1)` | ClutteredEnv | **89.8** | **87.7** | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `iPlanner` | InternScenes | 48.8 | 46.7 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `ViPlanner` | InternScenes | 54.3 | 52.5 | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-| `InternVLA-N1 (S1)` | InternScenes | **65.7** | **60.7** | [Model](https://github.com/InternRobotics/NavDP?tab=readme-ov-file#%EF%B8%8F-installation-of-baseline-library) |
-
-
-
-**NOTE:**
-- VLN-CE RxR benchmark and StreamVLN will be supported soon.
+
+
+
+### πŸ“Š Benchmark Results
+
+
+#### VLN-CE Benchmarks
+
+**πŸ“ R2R Dataset**
+| Model | Observation | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|-------------|------|------|------|-------|
+| InternVLA-N1-wo-dagger (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 4.89 | 60.6 | 55.4 | 52.1 |
+| InternVLA-N1-wo-dagger (Dual System) with NavDP* | RGB-D | 4.83 | 63.3 | 58.2 | 54.0 |
+| InternVLA-N1 (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 4.25 | 68.3 | 60.9 | 55.2 |
+| InternVLA-N1 (Dual System) with NavDP* | RGB-D | 4.22 | 70.4 | 64.1 | 58.1 |
+| InternVLA-N1 (Dual System) DualVLN | RGB | **4.05** | **70.7** | **64.3** | **58.5** |
+
+**πŸ“ RxR Dataset**
+| Model | Observation | NE ↓ | SR ↑ | SPL ↑ | nDTW ↑ |
+|-------|-------------|------|------|------|-------|
+| InternVLA-N1 (S2) + [ShortestPathFollower](https://aihabitat.org/docs/habitat-lab/habitat.tasks.nav.shortest_path_follower.ShortestPathFollower.html) | - | 5.71 | 63.5 | 55.0 | 46.8 |
+| InternVLA-N1 (Dual System) with NavDP* | RGB-D | 4.70 | 59.7 | 50.6 | 69.7 |
+| InternVLA-N1 (Dual System) DualVLN | RGB | **4.58** | **61.4** | **51.8** | **70.0** |
+
+---
+
+#### VLN-PE Benchmarks
+
+**πŸ“ Flash Controller on R2R Unseen**
+| Model | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|------|------|------|-------|
+| Seq2Seq | 8.27 | 43.0 | 15.7 | 9.7 |
+| CMA | 7.52 | 45.0 | 24.4 | 18.2 |
+| RDP | 6.98 | 42.5 | 24.9 | 17.5 |
+| InternVLA-N1 (System 2) + iPlanner | 4.91 | 55.53 | 47.07 | 41.09 |
+| InternVLA-N1 (System 2) + NavDP | 4.22 | 67.33 | 58.72 | 50.98 |
+| InternVLA-N1 (Dual System) DualVLN | **3.90** | **69.93** | **63.62** | **56.49** |
+
+**πŸ“ Physical Controller on R2R Unseen**
+| Model | NE ↓ | OS ↑ | SR ↑ | SPL ↑ |
+|-------|------|------|------|-------|
+| Seq2Seq | 7.88 | 28.1 | 15.1 | 10.7 |
+| CMA | 7.26 | 31.4 | 22.1 | 18.6 |
+| RDP | 6.72 | 36.9 | 25.2 | 17.7 |
+| InternVLA-N1 (Dual System) DualVLN | **4.66** | **55.9** | **51.6** | **42.49** |
+
+
+#### Visual Navigation Benchmarks
+
+**πŸ“ ClutteredEnv Dataset**
+| Model | SR ↑ | SPL ↑ |
+|-------|------|-------|
+| iPlanner | 84.8 | 83.6 |
+| ViPlanner | 72.4 | 72.3 |
+| NavDP | **89.8** | **87.7** |
+
+**πŸ“ InternScenes Dataset**
+| Model | SR ↑ | SPL ↑ |
+|-------|------|-------|
+| iPlanner | 48.8 | 46.7 |
+| ViPlanner | 54.3 | 52.5 |
+| NavDP | **65.7** | **60.7** |
+
+---
 
 ## πŸ”§ Customization
@@ -236,6 +251,12 @@ If you use the specific pretrained models and benchmarks, please kindly cite the
   year = {2025},
   booktitle={arXiv},
 }
+@misc{dualvln,
+  title = {{InternVLA-N1}: An Open Dual-System Navigation Foundation Model with Learned Latent Plans},
+  author = {InternVLA-N1 Team},
+  year = {2025},
+  booktitle={arXiv},
+}
 ```

diff --git a/internnav/dataset/internvla_n1_dataset_lerobot.py b/internnav/dataset/internvla_n1_dataset_lerobot.py
new file mode 100644
index 0000000..8cd39a6
--- /dev/null
+++ b/internnav/dataset/internvla_n1_dataset_lerobot.py
@@ -0,0 +1,1345 @@
+import copy
+import itertools
+import json
+import os
+import random
+import re
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import transformers
+from decord import VideoReader
+from PIL import Image
+from torch.utils.data import Dataset
+from torchcodec.decoders import VideoDecoder
+from transformers.image_utils import to_numpy_array
+
+from .rope2d import get_rope_index_2, get_rope_index_25
+
+# Define placeholders for dataset paths
+CAMBRIAN_737K = {
+    "annotation_path": "PATH_TO_CAMBRIAN_737K_ANNOTATION",
+    "data_path": "",
+}
+
+CAMBRIAN_737K_PACK = {
+    "annotation_path": "PATH_TO_CAMBRIAN_737K_ANNOTATION_PACKED",
+    "data_path": "",
+}
+
+MP_DOC = {
+    "annotation_path": "PATH_TO_MP_DOC_ANNOTATION",
+    "data_path": "PATH_TO_MP_DOC_DATA",
+}
+
+CLEVR_MC = {
+    "annotation_path": "PATH_TO_CLEVR_MC_ANNOTATION",
+    "data_path": "PATH_TO_CLEVR_MC_DATA",
+}
+
+VIDEOCHATGPT = {
+    "annotation_path": "PATH_TO_VIDEOCHATGPT_ANNOTATION",
+    "data_path": "PATH_TO_VIDEOCHATGPT_DATA",
+}
+
+
+R2R_125CM_0_30 = {
+    "data_path": "traj_data/r2r",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+R2R_125CM_0_45 = {
+    "data_path": "traj_data/r2r",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+R2R_60CM_15_15 = {
+    "data_path": "traj_data/r2r",
+    "height": 60,
+    "pitch_1": 15,
+    "pitch_2": 15,
+}
+
+R2R_60CM_30_30 = {
+    "data_path": "traj_data/r2r",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+RxR_125CM_0_30 = {
+    "data_path": "traj_data/rxr",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+RxR_125CM_0_45 = {
+    "data_path": "traj_data/rxr",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+RxR_60CM_15_15 = {
+    "data_path": "traj_data/rxr",
+    "height": 60,
+    "pitch_1": 15,
+    "pitch_2": 15,
+}
+
+RxR_60CM_30_30 = {
+    "data_path": "traj_data/rxr",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+SCALEVLN_125CM_0_30 = {
+    "data_path": "traj_data/scalevln",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+SCALEVLN_125CM_0_45 = {
+    "data_path": "traj_data/scalevln",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 45,
+}
+
+SCALEVLN_60CM_30_30 = {
+    "data_path": "traj_data/scalevln",
+    "height": 60,
+    "pitch_1": 30,
+    "pitch_2": 30,
+}
+
+data_dict = {
+    "cambrian_737k": CAMBRIAN_737K,
+    "cambrian_737k_pack": CAMBRIAN_737K_PACK,
+    "mp_doc": MP_DOC,
+    "clevr_mc": CLEVR_MC,
+    "videochatgpt": VIDEOCHATGPT,
+    "r2r_125cm_0_30": R2R_125CM_0_30,
+    "r2r_125cm_0_45": R2R_125CM_0_45,
+    "r2r_60cm_15_15": R2R_60CM_15_15,
+    "r2r_60cm_30_30": R2R_60CM_30_30,
+    "rxr_125cm_0_30": RxR_125CM_0_30,
+    "rxr_125cm_0_45": RxR_125CM_0_45,
+    "rxr_60cm_15_15": RxR_60CM_15_15,
+    "rxr_60cm_30_30": RxR_60CM_30_30,
+    "scalevln_125cm_0_30": SCALEVLN_125CM_0_30,
+    "scalevln_125cm_0_45": SCALEVLN_125CM_0_45,
+    "scalevln_60cm_30_30": SCALEVLN_60CM_30_30,
+}
+
+
+def parse_sampling_rate(dataset_name):
+    match = re.search(r"%(\d+)$", dataset_name)
+    if match:
+        return int(match.group(1)) / 100.0
+    return 1.0
+
+
+def data_list(dataset_names):
+    config_list = []
+    for dataset_name in dataset_names:
+        sampling_rate = parse_sampling_rate(dataset_name)
+        dataset_name = re.sub(r"%(\d+)$", "", dataset_name)
+        if dataset_name in data_dict:
+            config = data_dict[dataset_name].copy()
+            config["sampling_rate"] = sampling_rate
+            config_list.append(config)
+        else:
+            raise ValueError(f"dataset {dataset_name} not found")
+    return config_list
+
+
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+TRAJ_TOKEN_INDEX = 151667
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
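+# NOTE: The token indices above follow the Qwen2-VL tokenizer layout
+# (151655/151656 are the image/video pad token ids); TRAJ_TOKEN_INDEX is
+# assumed here to be the extra trajectory token added for InternVLA-N1.
+#
+# Minimal usage sketch for the dataset registry above (the dataset names
+# below are illustrative, not a prescribed training mix): `data_list`
+# resolves each name to its config and parses an optional "%NN" suffix
+# as a per-dataset sampling rate.
+#
+#   configs = data_list(["r2r_125cm_0_30%50", "rxr_60cm_15_15"])
+#   configs[0]["sampling_rate"]  # -> 0.5  ("%50" samples 50% of the split)
+#   configs[1]["sampling_rate"]  # -> 1.0  (no suffix keeps the full split)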