diff --git a/internnav/agent/__init__.py b/internnav/agent/__init__.py index 74aa4bc..8ccc4f9 100644 --- a/internnav/agent/__init__.py +++ b/internnav/agent/__init__.py @@ -1,13 +1,8 @@ from internnav.agent.base import Agent from internnav.agent.cma_agent import CmaAgent +from internnav.agent.dialog_agent import DialogAgent +from internnav.agent.internvla_n1_agent import InternVLAN1Agent from internnav.agent.rdp_agent import RdpAgent from internnav.agent.seq2seq_agent import Seq2SeqAgent -from internnav.agent.internvla_n1_agent import InternVLAN1Agent -__all__ = [ - 'Agent', - 'CmaAgent', - 'RdpAgent', - 'Seq2SeqAgent', - 'InternVLAN1Agent' -] +__all__ = ['Agent', 'DialogAgent', 'CmaAgent', 'RdpAgent', 'Seq2SeqAgent', 'InternVLAN1Agent'] diff --git a/internnav/agent/base.py b/internnav/agent/base.py index a68626f..02a566d 100644 --- a/internnav/agent/base.py +++ b/internnav/agent/base.py @@ -25,6 +25,7 @@ def decorator(agent_class): if agent_type in cls.agents: raise ValueError(f"Agent {agent_type} already registered.") cls.agents[agent_type] = agent_class + return agent_class return decorator diff --git a/internnav/agent/dialog_agent.py b/internnav/agent/dialog_agent.py new file mode 100644 index 0000000..952448a --- /dev/null +++ b/internnav/agent/dialog_agent.py @@ -0,0 +1,468 @@ +import argparse +import copy +import itertools +import random +import re +import time +from collections import OrderedDict +from typing import Any, Dict + +import numpy as np +import quaternion +import torch +from PIL import Image, ImageDraw +from transformers import ( + AutoProcessor, + AutoTokenizer, + Qwen2_5_VLForConditionalGeneration, +) + +from internnav.agent import Agent +from internnav.configs.agent import AgentCfg +from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM + +try: + from depth_camera_filtering import filter_depth + from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower +except Exception as e: + print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. 
Ignore this if not using Habitat.")
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+
+def split_and_clean(text):
+    # Split on <image>, keeping the delimiter
+    import re
+
+    parts = re.split(r'(<image>)', text)
+    results = []
+    for part in parts:
+        if part == '<image>':
+            results.append(part)
+        else:
+            # Remove all newlines and strip surrounding whitespace
+            clean_part = part.replace('\n', '').strip()
+            if clean_part:  # skip empty strings
+                results.append(clean_part)
+    return results
+
+
+@Agent.register('dialog')
+class DialogAgent(Agent):
+    """
+    agent template, override the functions for custom policy
+    """
+
+    def __init__(self, agent_config: AgentCfg):
+        self.agent_config = agent_config
+        self.task_name = self.agent_config.model_settings['task_name']
+
+        # sensor config
+        self.sim_sensors_config = self.agent_config.model_settings['sim_sensors_config']
+        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
+        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
+        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
+        self._camera_fov = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
+        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(self._camera_fov / 2))
+
+        # model
+        self.model_args = argparse.Namespace(**self.agent_config.model_settings)
+
+        self.task = self.model_args.task
+        self.append_look_down = self.model_args.append_look_down
+        self.resize_h = self.model_args.resize_h
+        self.resize_w = self.model_args.resize_w
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_args.model_path, use_fast=True)
+        processor = AutoProcessor.from_pretrained("/mnt/inspurfs/efm_t/weimeng/Qwen2.5-VL-7B-Instruct")
+        processor.tokenizer = tokenizer
+        processor.tokenizer.padding_side = 'left'
+
+        self.device = torch.device('cuda', self.agent_config.model_settings['local_rank'])
+        if self.model_args.mode == 'dual_system':
+            model = InternVLAN1ForCausalLM.from_pretrained(
+                self.model_args.model_path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                device_map={"": self.device},
+            )
+        elif self.model_args.mode == 'system2':
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                self.model_args.model_path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                device_map={"": self.device},
+            )
+        else:
+            raise ValueError(f"Invalid mode: {self.model_args.mode}")
+
+        model.eval()
+
+        self.model = model
+        self.processor = processor
+        self.num_history = self.model_args.num_history
+
+        # prompt
+        if 'dialog' in self.task_name or self.agent_config.model_settings['dialog_enabled']:
+            prompt = "You are an autonomous navigation assistant. Your task is to . Where should you go next to stay on track? There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 15 degrees. Please output STOP when you have successfully completed the task."
+        else:
+            prompt = "You are an autonomous navigation assistant. Your task is to . Where should you go next to stay on track? When you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. 
In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 15 degrees. Please output STOP when you have successfully completed the task." + answer = "" + self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}] + + self.conjunctions = [ + 'you can see ', + 'in front of you is ', + 'there is ', + 'you can spot ', + 'you are toward the ', + 'ahead of you is ', + 'in your sight is ', + ] + + self.actions2idx = OrderedDict( + { + 'STOP': [0], + "↑": [1], + "←": [2], + "→": [3], + "↓": [5], + } + ) + + def convert_input(self, obs, info): + # update new information after env.step + depth = obs["depth"] + depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) + depth = depth * (self._max_depth - self._min_depth) + self._min_depth + self.depth = depth * 1000 # get depth + + rgb = obs["rgb"] + image = Image.fromarray(rgb).convert('RGB') # raw observation image + self.save_raw_image = image.copy() # get rgb + + x, y = obs["gps"] + camera_yaw = obs["compass"][0] + agent_state = info['agent state'] + height = agent_state.position[1] - self.initial_height # Habitat GPS makes west negative, so flip y + camera_position = np.array([x, -y, self._camera_height + height]) + self.tf_camera_to_episodic = ( + self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30)) @ self.get_axis_align_matrix() + ) # get transformation from camera to agent + + if self.last_action == 5: + self.look_down_image = image + self.save_raw_image = self.look_down_image.copy() + elif self.last_action != 6: + image = image.resize((self.resize_w, self.resize_h)) + self.rgb_list.append(image) + return obs + + def convert_output(self, env, llm_outputs: str): + if '' in llm_outputs: + self.question = llm_outputs.replace('', '') + return 6 + else: + if bool(re.search(r'\d', llm_outputs)): # output pixel goal + # get pixel goal + self.forward_action = 0 + coord = [int(c) for c in re.findall(r'\d+', llm_outputs)] + print('coords:', coord) + try: + pixel_goal = [int(coord[1]), int(coord[0])] # switch the goal o + except Exception as e: + print(f"Invalid pixel goal: len(coor)!=2: {e}") + return 0 + + # trans pixel goal to global goal + try: + self.goal = self.pixel_to_gps( + pixel_goal, self.depth / 1000, self.intrinsic_matrix, self.tf_camera_to_episodic + ) + except Exception as e: + print(f"Invalid pixel goal: out of image size range: {e}") + return 0 + self.goal = (self.transformation_matrix @ np.array([-self.goal[1], 0, -self.goal[0], 1]))[:3] + if not env._env.sim.pathfinder.is_navigable(np.array(self.goal)): + self.goal = np.array(env._env.sim.pathfinder.snap_point(np.array(self.goal))) + + # paint pixel goal + draw = ImageDraw.Draw(self.save_raw_image, 'RGB') + x, y, r = pixel_goal[0], pixel_goal[1], 2 + draw.ellipse([(x - r, y - r), (x + r, y + r)], fill=(255, 0, 0)) + + # look down --> horizontal + env.step(4) + env.step(4) + + if self.append_look_down and self.look_down_image is not None: + self.prev_look_image = self.look_down_image.resize((self.resize_w, self.resize_h)) + action = self.agent.get_next_action(self.goal) + if action == 0: + self.goal = None + self.messages = [] + print('conduct a random action 2') + self.last_action = 2 + return 2 + print('predicted goal', pixel_goal, self.goal, flush=True) + else: + self.action_seq = self.parse_actions(llm_outputs) + print('actions', self.action_seq, flush=True) + + def inference(self, obs, info): + if self.last_action == 6: + self.dialogs.append({'role': 'navigator', 
'message': self.question, 'true_idx': self.step_id}) + self.dialogs.append({'role': 'oracle', 'message': obs['npc_answer'], 'true_idx': self.step_id}) + self.messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': self.last_llm_outputs}]}) + self.messages.append({'role': 'user', 'content': [{'type': 'text', 'text': obs['npc_answer']}]}) + elif self.last_action == 5: + sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}] + self.input_images += [self.look_down_image] + self.messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': self.last_llm_outputs}]}) + input_img_id = -1 + else: + sources = copy.deepcopy(self.conversation) + sources[0]["value"] = sources[0]["value"].replace('', info['episode_instruction']) + cur_images = self.rgb_list[-1:] # current observation + if self.step_id == 0: + history_id = [] + else: + history_id = np.unique(np.linspace(0, self.step_id - 1, self.num_history, dtype=np.int32)).tolist() + # add dialod history + dialogs_idx = np.sort(list(set([i['true_idx'] for i in self.dialogs]))).tolist() + history_id = np.sort(np.unique(np.concatenate([history_id, dialogs_idx]).astype(np.int32))).tolist() + placeholder = [''] * (len(history_id) + 1) + for n in dialogs_idx: + pos = history_id.index(n) + output = "" + for dialog in self.dialogs: + if dialog['true_idx'] == n: + output += f"<|{dialog['role']}|>{dialog['message']}" + placeholder[pos + 1] = "<|dialog_start|>" + output + "<|dialog_end|>" + # add image history + placeholder = (DEFAULT_IMAGE_TOKEN + '\n').join(placeholder) + sources[0]["value"] += f' These are your historical observations: {placeholder}.' + if self.append_look_down: + if self.prev_look_image is not None: + sources[0]["value"] += f' Your previous look down image is:{DEFAULT_IMAGE_TOKEN}.' + else: + sources[0]["value"] += ' Your previous look down image is not here.' + history_id = sorted(history_id) + print('history_id', self.step_id, history_id) + # prepare images + if self.append_look_down: + if self.prev_look_image is not None: + self.input_images = [self.rgb_list[i] for i in history_id] + [self.prev_look_image] + cur_images + else: + self.input_images = [self.rgb_list[i] for i in history_id] + cur_images + else: + self.input_images = [self.rgb_list[i] for i in history_id] + cur_images + input_img_id = 0 + + if self.last_action != 6: + # prompt text + prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN + sources[0]["value"] += f" {prompt}." 
+            prompt_instruction = copy.deepcopy(sources[0]["value"])
+
+            # prompt images
+            parts = split_and_clean(prompt_instruction)
+            content = []
+            for i in range(len(parts)):
+                if parts[i] == "<image>":
+                    content.append({"type": "image", "image": self.input_images[input_img_id]})
+                    input_img_id += 1
+                else:
+                    content.append({"type": "text", "text": parts[i]})
+
+            self.messages.append({'role': 'user', 'content': content})
+        # inference
+        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
+        print('step_id', self.step_id, ' ', text)
+        # for image_idx, input_image in enumerate(self.input_images):
+        #     input_image.save(os.path.join('/'.join(info['output_path'].split('/')[:-3]), 'debug_images', f'image_{image_idx}.jpg'))
+        inputs = self.processor(text=[text], images=self.input_images, return_tensors="pt").to(self.model.device)
+        with torch.no_grad():
+            output_ids = self.model.generate(
+                **inputs, max_new_tokens=self.agent_config.model_settings['max_new_tokens'], do_sample=False
+            )
+        llm_outputs = self.processor.tokenizer.decode(
+            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+        )
+        print('step_id:', self.step_id, 'output text:', llm_outputs)
+        return llm_outputs
+
+    def step(self, obs: Dict[str, Any], env, info):
+        print(f'{self.agent_config.model_name} Agent step')
+        start = time.time()
+        # convert obs to model input
+        self.step_id = info['step']
+        obs = self.convert_input(obs, info)
+        if len(self.action_seq) == 0 and self.goal is None:
+            llm_outputs = self.inference(obs, info)
+            self.last_llm_outputs = llm_outputs
+            action = self.convert_output(env, llm_outputs)
+            with open(info['output_path'], 'a') as f:
+                f.write(str(self.step_id) + " " + llm_outputs + "\n")
+        else:
+            action = None
+
+        if action is None:
+            if len(self.action_seq) != 0:
+                action = self.action_seq.pop(0)
+            elif self.goal is not None:
+                action = self.agent.get_next_action(self.goal)
+                action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
+                action = action[0] if hasattr(action, "__len__") else action
+
+                self.forward_action += 1
+                print('forward_action', self.forward_action, flush=True)
+                if self.forward_action > 8:
+                    self.goal = None
+                    self.messages = []
+                    self.forward_action = 0
+                    end = time.time()
+                    print(f'time: {round(end-start, 4)}s')
+                    return 7
+                if action == 0:
+                    self.goal = None
+                    self.messages = []
+                    end = time.time()
+                    print(f'time: {round(end-start, 4)}s')
+                    return 7
+            else:
+                action = 0
+
+        end = time.time()
+        print(f'time: {round(end-start, 4)}s')
+        self.last_action = action
+        return action
+
+    def reset(self, env):
+        self.intrinsic_matrix = self.get_intrinsic_matrix(self.sim_sensors_config.rgb_sensor)
+        self.agent = ShortestPathFollower(env._env.sim, 0.25, False)
+
+        # params saving and initialization
+        agent_state = env._env.sim.get_agent_state()
+        rotation_matrix = quaternion.as_rotation_matrix(agent_state.rotation)
+        self.transformation_matrix = np.eye(4)
+        self.transformation_matrix[:3, :3] = rotation_matrix
+        self.transformation_matrix[:3, 3] = agent_state.position  # get transformation from world to agent
+        self.initial_height = agent_state.position[1]  # get initial height
+
+        self.last_action = None
+        self.messages = []
+        self.rgb_list = []
+        self.action_seq = []
+        self.goal = None
+        self.prev_look_image = None
+        self.look_down_image = None  # params for qwen model
+
+        self.dialogs = []
+
+        # params for saving
+        self.save_raw_image = None
+
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
+        width = 
sensor_cfg.width + height = sensor_cfg.height + fov = sensor_cfg.hfov + fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0)) + fy = fx # Assuming square pixels (fx = fy) + cx = (width - 1.0) / 2.0 + cy = (height - 1.0) / 2.0 + + intrinsic_matrix = np.array( + [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]] + ) + return intrinsic_matrix + + def get_axis_align_matrix(self): + ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]]) + return ma + + def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray: + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(yaw), -np.sin(yaw), 0, x], + [np.sin(yaw), np.cos(yaw), 0, y], + [0, 0, 1, z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray: + """Converts a given position and pitch angle to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. + """ + + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(pitch), 0, np.sin(pitch), x], + [0, 1, 0, y], + [-np.sin(pitch), 0, np.cos(pitch), z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray: + """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + yaw (float): The yaw angle in radians. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. + """ + x, y, z = xyz + rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3] + rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3] + transformation_matrix = np.eye(4) + transformation_matrix[:3, :3] = rot1 @ rot2 + transformation_matrix[:3, 3] = xyz + return transformation_matrix + + def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic): + ''' + Args: + pixel: (2,) - [u, v] pixel coordinates + depth: (H, W) - depth image where depth[v, u] gives depth in meters + intrinsic: (4, 4) - camera intrinsic matrix + tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame + Returns: + (x, y): (x, y) coordinates in the episodic frame + ''' + v, u = pixel + z = depth[v, u] + print("depth", z) + + x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0] + y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1] + point_camera = np.array([x, y, z, 1.0]) + + # Transform to episodic frame + point_episodic = tf_camera_to_episodic @ point_camera + point_episodic = point_episodic[:3] / point_episodic[3] + + x = point_episodic[0] + y = point_episodic[1] + + return (x, y) # same as habitat gps + + def parse_actions(self, output): + action_patterns = '|'.join(re.escape(action) for action in self.actions2idx) + regex = re.compile(action_patterns) + matches = regex.findall(output) + actions = [self.actions2idx[match] for match in matches] + actions = itertools.chain.from_iterable(actions) + return list(actions) diff --git a/internnav/configs/evaluator/__init__.py b/internnav/configs/evaluator/__init__.py index 27e63a3..fc3f7e7 100644 --- a/internnav/configs/evaluator/__init__.py +++ b/internnav/configs/evaluator/__init__.py @@ -39,8 +39,8 @@ class MetricCfg(BaseModel): class TaskCfg(BaseModel): task_name: Optional[str] = None - task_settings: Dict[str, Any] - scene: SceneCfg + 
task_settings: Dict[str, Any] = None + scene: SceneCfg = None robot_name: Optional[str] = None robot: Optional[RobotCfg] = None robot_flash: Optional[bool] = None @@ -56,6 +56,7 @@ class EvalDatasetCfg(BaseModel): class EvalCfg(BaseModel): + remote_agent: Optional[bool] = None eval_type: Optional[str] = None eval_settings: Optional[Dict[str, Any]] = {} agent: Optional[AgentCfg] = None diff --git a/internnav/env/__init__.py b/internnav/env/__init__.py index 798723e..d055f53 100644 --- a/internnav/env/__init__.py +++ b/internnav/env/__init__.py @@ -1,4 +1,5 @@ from internnav.env.base import Env +from internnav.env.habitat_env import HabitatEnv from internnav.env.internutopia_env import InternutopiaEnv -__all__ = ['Env', 'InternutopiaEnv'] +__all__ = ['Env', 'InternutopiaEnv', 'HabitatEnv'] diff --git a/internnav/env/base.py b/internnav/env/base.py index a5b9d6e..5e2a019 100644 --- a/internnav/env/base.py +++ b/internnav/env/base.py @@ -42,6 +42,7 @@ def decorator(env_class): if env_type in cls.envs: raise ValueError(f"Env {env_type} already registered.") cls.envs[env_type] = env_class + return env_class return decorator diff --git a/internnav/env/dialog_mp3d.py b/internnav/env/dialog_mp3d.py new file mode 100644 index 0000000..b642e94 --- /dev/null +++ b/internnav/env/dialog_mp3d.py @@ -0,0 +1,179 @@ +import cv2 +import numpy as np + + +def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray: + """ + Identifies regions in the depth image that have a value of 0 and fills them in + with 1 if the region is smaller than a given area threshold. + + Args: + depth_img (np.ndarray): The input depth image + area_thresh (int): The area threshold for filling in holes + + Returns: + np.ndarray: The depth image with small holes filled in + """ + # Create a binary image where holes are 1 and the rest is 0 + binary_img = np.where(depth_img == 0, 1, 0).astype("uint8") + + # Find contours in the binary image + contours, _ = cv2.findContours(binary_img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + filled_holes = np.zeros_like(binary_img) + + for cnt in contours: + # If the area of the contour is smaller than the threshold + if cv2.contourArea(cnt) < area_thresh: + # Fill the contour + cv2.drawContours(filled_holes, [cnt], 0, 1, -1) + + # Create the filled depth image + filled_depth_img = np.where(filled_holes == 1, 1, depth_img) + + return filled_depth_img + + +class MP3DGTPerception: + def __init__(self, max_depth, min_depth, fx, fy): + self.max_depth = max_depth + self.min_depth = min_depth + self.fx = fx + self.fy = fy + + def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500): + ''' + Get the gt semantic map of the target objects + image: (H, W, 3) current rgb frame + depth: (H, W) current depth frame + targets: (N, 6) bboxes of the target objects, first 3 are coordinates of min corner, last 3 are coordinates of max corner + area_threshold: int + return: (N, H, W) gt semantic map of the target objects + ''' + # get the point clouds of current frame + filled_depth = fill_small_holes(depth, area_threshold) + scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth + mask = scaled_depth < self.max_depth + point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, self.fx, self.fy) + point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame) + + # mark the points in the target objects' bboxes + semantic_images = [] + for target in targets: + min_x, min_y, min_z = target[:3] + max_x, max_y, max_z = target[3:] + + in_bbox 
= ( + (point_cloud_ply_frame[:, 0] >= min_x) + & (point_cloud_ply_frame[:, 0] <= max_x) + & (point_cloud_ply_frame[:, 1] >= min_y) + & (point_cloud_ply_frame[:, 1] <= max_y) + & (point_cloud_ply_frame[:, 2] >= min_z) + & (point_cloud_ply_frame[:, 2] <= max_z) + ) + in_bbox_points = point_cloud_ply_frame[in_bbox] + semantic_image = np.zeros(depth.shape, dtype=np.uint8) + if len(in_bbox_points) > 0: + # map the marked points back to the image to get the semantic map + in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points) + in_box_image_coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape) + try: + mask = [ + in_box_image_coords[i, 0] < 480 and in_box_image_coords[i, 1] < 640 + for i in range(len(in_box_image_coords)) + ] + in_box_image_coords = in_box_image_coords[mask] + semantic_image[in_box_image_coords[:, 0], in_box_image_coords[:, 1]] = 1 + except Exception as e: + print(e) + semantic_image = fill_small_holes(semantic_image, area_threshold) + semantic_images.append(semantic_image) + if len(semantic_images) > 0: + semantic_images = np.stack(semantic_images, axis=0) + else: + semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8) + return semantic_images + + +def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray: + # Add a homogeneous coordinate of 1 to each point for matrix multiplication + homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1)))) + + # Apply the transformation matrix to the points + transformed_points = np.dot(transformation_matrix, homogeneous_points.T).T + + # Remove the added homogeneous coordinate and divide by the last coordinate + return transformed_points[:, :3] / transformed_points[:, 3:] + + +def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray: + """Calculates the 3D coordinates (x, y, z) of points in the depth image based on + the horizontal field of view (HFOV), the image width and height, the depth values, + and the pixel x and y coordinates. + + Args: + depth_image (np.ndarray): 2D depth image. + mask (np.ndarray): 2D binary mask identifying relevant pixels. + fx (float): Focal length in the x direction. + fy (float): Focal length in the y direction. + + Returns: + np.ndarray: Array of 3D coordinates (x, y, z) of the points in the image plane. 
+ """ + v, u = np.where(mask) + z = depth_image[v, u] + x = (u - depth_image.shape[1] // 2) * z / fx + y = (v - depth_image.shape[0] // 2) * z / fy + cloud = np.stack((x, -y, -z), axis=-1) + + return cloud + + +def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray: + """Convert point cloud from episodic coordinate system to camera coordinate system + + Args: + transformation_matrix (np.ndarray): 4x4 transformation matrix + points (np.ndarray): Point cloud coordinates (N, 3) + + Returns: + np.ndarray: Point cloud coordinates in camera coordinate system (N, 3) + """ + # Calculate the inverse of the transformation matrix + inv_matrix = np.linalg.inv(transformation_matrix) + + # Add a homogeneous coordinate of 1 to each point for matrix multiplication + homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1)))) + + # Apply the inverse transformation + transformed_points = np.dot(inv_matrix, homogeneous_points.T).T + + # Remove the added homogeneous coordinate + return transformed_points[:, :3] / transformed_points[:, 3:] + + +def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray: + """Project points from camera coordinate system to image plane + + Args: + points (np.ndarray): Points in camera coordinate system (N, 3) + fx (float): x-axis focal length + fy (float): y-axis focal length + image_shape (tuple): Image dimensions (height, width) + + Returns: + np.ndarray: Image coordinates (N, 2) + """ + points = np.stack((points[:, 0], -points[:, 1], -points[:, 2]), axis=-1) + # Ensure points are in front of the camera + valid_mask = points[:, 2] > 0 # z > 0 + + # Calculate image coordinates + u = points[:, 0] * fx / points[:, 2] + image_shape[1] // 2 + v = points[:, 1] * fy / points[:, 2] + image_shape[0] // 2 + + # Combine coordinates + image_coords = np.stack((v, u), axis=-1) + image_coords = image_coords.astype(np.int32) + # Return valid points only + return image_coords[valid_mask] diff --git a/internnav/env/habitat_env.py b/internnav/env/habitat_env.py new file mode 100644 index 0000000..fe60462 --- /dev/null +++ b/internnav/env/habitat_env.py @@ -0,0 +1,175 @@ +import json +import os +from typing import Any, Dict, List, Optional + +import numpy as np +import quaternion +from depth_camera_filtering import filter_depth +from habitat.config.default import get_agent_config + +from internnav.configs.evaluator import EnvCfg, TaskCfg +from internnav.env import base + +from .dialog_mp3d import MP3DGTPerception + + +@base.Env.register('habitat') +class HabitatEnv(base.Env): + def __init__(self, env_config: EnvCfg, task_config: TaskCfg = None): + """ + env_settings include: + - habitat_config: loaded from get_habitat_config + - rank: int, rank index for sharding + - world_size: int, total number of ranks + """ + try: + from habitat import Env + except ImportError as e: + raise RuntimeError( + "Habitat modules could not be imported. " "Make sure both repositories are installed and on PYTHONPATH." 
+ ) from e + + super().__init__(env_config, task_config) + self.config = env_config.env_settings['habitat_config'] + self._env = Env(self.config) + + self.rank = env_config.env_settings.get('rank', 0) + self.world_size = env_config.env_settings.get('world_size', 1) + self._current_episode_index: int = 0 + self._last_obs: Optional[Dict[str, Any]] = None + + self.is_running = True + self.output_path = env_config.env_settings.get('output_path', './output') + + agent_config = get_agent_config(self.config.simulator) + self.min_depth = agent_config.sim_sensors.depth_sensor.min_depth + self.max_depth = agent_config.sim_sensors.depth_sensor.max_depth + self._camera_fov = np.deg2rad(agent_config.sim_sensors.depth_sensor.hfov) + self._fx = self._fy = agent_config.sim_sensors.depth_sensor.width / (2 * np.tan(self._camera_fov / 2)) + self._camera_height = agent_config.sim_sensors.rgb_sensor.position[1] + self.segmentation = MP3DGTPerception(self.max_depth, self.min_depth, self._fx, self._fy) + + # generate episodes + # self._env.episodes = self._env.episodes[0:1] # for debug + self.episodes = self.generate_episodes() + # print(self.episodes) + + def generate_episodes(self) -> List[Any]: + """ + Generate list of episodes for the current split, already: + - grouped by scene + - filtered by done_res (the path is self.output_path/progress.json) + - sharded by (rank, world_size) + """ + all_episodes = [] + + # group episodes by scene + scene_episode_dict: Dict[str, List[Any]] = {} + for episode in self._env.episodes: + scene_episode_dict.setdefault(episode.scene_id, []).append(episode) + + # load done_res + done_res = set() + result_path = os.path.join(self.output_path, 'progress.json') + if os.path.exists(result_path): + with open(result_path, 'r') as f: + for line in f: + res = json.loads(line) + # only skip if current format has scene_id + if "scene_id" in res: + done_res.add((res["scene_id"], res["episode_id"])) + + # iterate scenes in order, collect all episodes + for scene in sorted(scene_episode_dict.keys()): + per_scene_eps = scene_episode_dict[scene] + scene_id = scene.split('/')[-2] + + # shard by rank index / world_size + for episode in per_scene_eps[self.rank :: self.world_size]: + episode_id = int(episode.episode_id) + if (scene_id, episode_id) in done_res: + continue + all_episodes.append(episode) + + return all_episodes + + def reset(self): + """ + load next episode and return first observation + """ + # no more episodes + if not (0 <= self._current_episode_index < len(self.episodes)): + self.is_running = False + return + + # Manually set to next episode in habitat + self._env.current_episode = self.episodes[self._current_episode_index] + self._current_episode_index += 1 + + # Habitat reset + self._last_obs = self._env.reset() + if "instance" in self.task_config.task_name: + self._last_obs['semantic'] = self.get_semantic(self._last_obs) + return self._last_obs + + def step(self, action: List[Any]): + """ + step the environment with given action + + Args: action: List[Any], action for each env in the batch + + Return: obs, reward, done, info + """ + obs = self._env.step(action) + if "instance" in self.task_config.task_name: + obs['semantic'] = self.get_semantic(obs) + done = self._env.episode_over + info = self._env.get_metrics() + reward = info.get('reward', 0.0) + return obs, reward, done, info + + def close(self): + print('Habitat Env close') + self._env.close() + + def render(self): + self._env.render() + + def get_observation(self) -> Dict[str, Any]: + return self._env.get_observations() 
+ + def get_metrics(self) -> Dict[str, Any]: + return self._env.get_metrics() + + def get_current_episode(self): + return self._env.current_episode + + def get_tf_episodic_to_global(self): + agent_state = self._env.sim.get_agent_state() + rotation = agent_state.rotation + translation = agent_state.position + rotation_matrix = quaternion.as_rotation_matrix(rotation) + tf_episodic_to_global = np.eye(4) + tf_episodic_to_global[:3, :3] = rotation_matrix + tf_episodic_to_global[:3, 3] = translation + return tf_episodic_to_global + + def get_semantic(self, obs: dict): + targets = [ + self.get_current_episode().goals[idx].bbox + for idx, _ in enumerate(self.get_current_episode().instruction.instance_id) + ] + targets = np.array( + [ + [target[0], min(-target[2], -target[5]), target[1], target[3], max(-target[5], -target[2]), target[4]] + for target in targets + ] + ) + depth = filter_depth(obs["depth"].reshape(obs["depth"].shape[:2]), blur_type=None) + tf_camera_to_global = self.get_tf_episodic_to_global() + tf_camera_to_global[1, 3] = self._camera_height + self._env.sim.get_agent_state().position[1] + tf_camera_to_ply = np.dot( + np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]), tf_camera_to_global + ) + semantic = self.segmentation.predict(depth, targets, tf_camera_to_ply) + return semantic diff --git a/internnav/evaluator/__init__.py b/internnav/evaluator/__init__.py index 35968e8..6ba1c8c 100644 --- a/internnav/evaluator/__init__.py +++ b/internnav/evaluator/__init__.py @@ -1,6 +1,7 @@ from internnav.evaluator.base import Evaluator from internnav.evaluator.distributed_base import DistributedEvaluator from internnav.evaluator.vln_distributed_evaluator import VLNDistributedEvaluator +from internnav.evaluator.habitat_dialog_evaluator import HabitatDialogEvaluator # register habitat try: @@ -9,4 +10,4 @@ print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. 
Ignore this if not using Habitat.") -__all__ = ['Evaluator', 'DistributedEvaluator', 'VLNDistributedEvaluator', 'HabitatVLNEvaluator'] +__all__ = ['Evaluator', 'DistributedEvaluator', 'VLNDistributedEvaluator', 'HabitatVLNEvaluator', 'HabitatDialogEvaluator'] diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py index 9df4e74..307adbb 100644 --- a/internnav/evaluator/distributed_base.py +++ b/internnav/evaluator/distributed_base.py @@ -64,6 +64,7 @@ def __init__(self, eval_cfg: EvalCfg, init_env: bool = True, init_agent: bool = from internnav.agent import Agent eval_cfg.agent.model_settings['local_rank'] = self.local_rank + eval_cfg.agent.model_settings['task_name'] = eval_cfg.task.task_name self.agent = Agent.init(eval_cfg.agent) def eval(self): diff --git a/internnav/evaluator/habitat_dialog_evaluator.py b/internnav/evaluator/habitat_dialog_evaluator.py new file mode 100644 index 0000000..518b6df --- /dev/null +++ b/internnav/evaluator/habitat_dialog_evaluator.py @@ -0,0 +1,457 @@ +import argparse +import json +import os +import sys + +import copy +import itertools +import random +import re +from collections import OrderedDict + +import numpy as np +import quaternion +import torch +import tqdm +from depth_camera_filtering import filter_depth +from PIL import Image, ImageDraw, ImageFont +from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration +from transformers.image_utils import to_numpy_array + +from internnav.configs.evaluator import EvalCfg +from internnav.evaluator import DistributedEvaluator, Evaluator +from internnav.model.utils.vln_utils import ( + chunk_token, + open_image, + split_and_clean, + traj_to_actions, +) +from internnav.agent.dialog_agent import DialogAgent +from internnav.internnav_habitat.dialog_utils import get_config, get_path_description_ +from internnav.internnav_habitat.simple_npc.simple_npc import SimpleNPC + +try: + import habitat + from habitat.config.default import get_agent_config + from habitat.config.default_structured_configs import ( + CollisionsMeasurementConfig, + FogOfWarConfig, + TopDownMapMeasurementConfig, + ) + from habitat.utils.visualizations.utils import observations_to_image + + # Import for Habitat registry side effects — do not remove + import internnav.internnav_habitat.measures # noqa: F401 + from internnav.internnav_habitat.dialog_dataset import DialogDatasetV1 + # isort: skip +except Exception as e: + print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. 
Ignore this if not using Habitat.") + +DEFAULT_IMAGE_TOKEN = "" + +@Evaluator.register('habitat_dialog') +class HabitatDialogEvaluator(DistributedEvaluator): + def __init__(self, cfg: EvalCfg): + args = argparse.Namespace(**cfg.eval_settings) + self.epoch = args.epoch + self.max_steps_per_episode = args.max_steps_per_episode + self.scene_summary = args.scene_summary + self.output_path = args.output_path + + self.task = cfg.task.task_name + self.turn = args.turn + self.dialog_enabled = cfg.agent.model_settings['dialog_enabled'] + self.save_video = args.save_video + + self.npc = SimpleNPC( + max_interaction_turn=10, + model_name=args.model_name, + openai_api_key=args.openai_api_key, + base_url=args.base_url, + ) + + # create habitat config + self.config_path = cfg.env.env_settings['habitat_config_path'] + self.config = get_config(self.config_path, cfg.env.env_settings['baseline_config_path']) + + with habitat.config.read_write(self.config): + self.config.exp.task = self.task + self.config.habitat.dataset.split = args.eval_split + self.config.habitat.task.measurements.update( + { + "top_down_map": TopDownMapMeasurementConfig( + map_padding=3, + map_resolution=1024, + draw_source=True, + draw_border=True, + draw_shortest_path=True, + draw_view_points=True, + draw_goal_positions=True, + draw_goal_aabbs=True, + fog_of_war=FogOfWarConfig( + draw=True, + visibility_dist=5.0, + fov=90, + ), + ), + "collisions": CollisionsMeasurementConfig(), + } + ) + cfg.env.env_settings['habitat_config'] = self.config.habitat + cfg.env.env_settings['output_path'] = self.output_path + + # init agent and env + cfg.agent.model_settings['task'] = self.task + cfg.agent.model_settings['sim_sensors_config'] = self.config.habitat.simulator.agents.main_agent.sim_sensors + self.objectnav_instruction = "search for {target_object}." + super().__init__(cfg, init_agent=True, init_env=True) + + def eval_action(self): + """ + Run local episodes on this rank. + + Returns dict[str, Tensor] on GPU (1D tensors of same length). 
+ """ + sucs, spls, oss, nes = [], [], [], [] + done_res = [] + if os.path.exists(os.path.join(self.output_path, f'result.json')): + with open(os.path.join(self.output_path, f'result.json'),'r') as f: + for line in f.readlines(): + res = json.loads(line) + done_res.append([res["scene_id"], res["episode_id"], res["episode_instruction"]]) + sucs.append(res['success']) + spls.append(res['spl']) + oss.append(res['os']) + nes.append(res['ne']) + env = self.env + + while env.is_running: + obs = env.reset() + if not env.is_running or obs is None: + break + + # recover from last evaluated episode + episode = env._env.current_episode + scene_id = episode.scene_id.split('/')[-2] + if 'coin' in self.task: + episode_instruction = self.objectnav_instruction.format(target_object=episode.object_category.replace('_', ' '))+", "+episode.instruction + elif 'objectnav' in self.task: + episode_instruction = self.objectnav_instruction.format(target_object=episode.object_category.replace('_', ' ')) + else: + episode_instruction = episode.instruction.instruction_text[:-1] + episode_id = int(episode.episode_id) + if [scene_id, episode_id, episode_instruction] in done_res: + continue + # make directories + os.makedirs(os.path.join(self.output_path, 'check_sim'), exist_ok=True) + Image.fromarray(obs['rgb']).save(os.path.join(self.output_path, 'check_sim', f'rgb_{self.rank}.jpg')) + os.makedirs(os.path.join(self.output_path, 'action', f'{scene_id}'), exist_ok=True) + # os.makedirs(os.path.join(self.output_path, 'debug_images'), exist_ok=True) + + if self.save_video: + os.makedirs(os.path.join(self.output_path, 'vis', f'{scene_id}'), exist_ok=True) + + # get agent ready + self.agent.reset(env) + + # info for npc + if 'dialog' in self.task or self.dialog_enabled: # gt of env for npc + with open(os.path.join(self.scene_summary, scene_id, 'object_dict.json'), 'r', encoding='utf-8') as f: + object_dict = json.load(f) + with open(os.path.join(self.scene_summary, scene_id, 'region_dict.json'), 'r', encoding='utf-8') as f: + region_dict = json.load(f) + + # initialization + step_id = 0 + + path_list = [] + vis_frames = [] + action_list = [] # params for saving results + + while not env._env.episode_over and step_id <= self.max_steps_per_episode: + agent_state = env._env.sim.get_agent_state() + path_list.append(agent_state.position.tolist()) + info = {'step': step_id, 'agent state': agent_state, 'episode_instruction': episode_instruction, 'output_path': os.path.join(self.output_path, 'action', f'{scene_id}', f'{episode_id}.txt'), 'info': env.get_metrics()} + action = self.agent.step(obs, env, info=info) + print("step_id", step_id, "action", action) + action_list.append(action) + if action in [0, 1, 2, 3]: + obs, reward, done, info = env.step(action) + elif action == 5: + env.step(action) + obs, reward, done, info = env.step(action) + continue + elif action == 6: + if len(self.agent.dialogs)/2>=self.turn: + npc_answer = 'Sorry, you have reached the question limit. No further answers are available.' + else: + path_description, pl = get_path_description_(env._env, object_dict, region_dict) + task_finish = obs['semantic'][0].sum() > 0 and pl < 3 + npc_answer = self.npc.answer_question( + question=self.agent.question, + instance_id=env._env.current_episode.instruction.instance_id[0], + object_dict=object_dict, + task_done=task_finish, + path_description=path_description, + mode="two_turn", + ) + if npc_answer is None: + npc_answer = 'Sorry, I can not answer your question now.' 
+ + with open(os.path.join(self.output_path, 'action', f'{scene_id}', f'{episode_id}.txt'), 'a') as f: + f.write(npc_answer + "\n") + obs['npc_answer'] = npc_answer + continue + + step_id += 1 + self.agent.messages = [] + + m = env.get_metrics() + sucs.append(m["success"]) + spls.append(m["spl"]) + oss.append(m["oracle_success"]) + nes.append(m["distance_to_goal"]) + result = { + "scene_id": scene_id, + "episode_id": episode_id, + "success": m["success"], + "spl": m["spl"], + "os": m['oracle_success'], + "ne": m["distance_to_goal"], + "steps": step_id, + "episode_instruction": episode_instruction, + "path": path_list, + "action": action_list, + "object_category": episode.object_category if 'vln' not in self.task else '' + } + with open(os.path.join(self.output_path, f'result.json'), 'a') as f: + f.write(json.dumps(result) + "\n") + + env.close() + return ( + torch.tensor(sucs).to(self.device), + torch.tensor(spls).to(self.device), + torch.tensor(oss).to(self.device), + torch.tensor(nes).to(self.device), + torch.tensor(len(sucs)).to(self.device), + ) + + def calc_metrics(self, global_metrics: dict) -> dict: + """ + global_metrics["sucs"] etc. are global 1-D CPU tensors with all episodes. + """ + sucs_all = global_metrics["sucs"] + spls_all = global_metrics["spls"] + oss_all = global_metrics["oss"] + nes_all = global_metrics["nes"] + + # avoid /0 if no episodes + denom = max(len(sucs_all), 1) + + return { + "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0, + "spls_all": float(spls_all.mean().item()) if denom > 0 else 0.0, + "oss_all": float(oss_all.mean().item()) if denom > 0 else 0.0, + "nes_all": float(nes_all.mean().item()) if denom > 0 else 0.0, + # "length" will be filled by base class + } + + def parse_actions(self, output): + action_patterns = '|'.join(re.escape(action) for action in self.actions2idx) + regex = re.compile(action_patterns) + matches = regex.findall(output) + actions = [self.actions2idx[match] for match in matches] + actions = itertools.chain.from_iterable(actions) + return list(actions) + + def preprocess_depth_image_v2( + self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None + ): + if target_height is None: + target_height = self.image_processor.crop_size['height'] # 384 + target_width = self.image_processor.crop_size['width'] # 384 + + resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST) + + img = to_numpy_array(resized_depth_image) + if do_depth_scale: + img = img / depth_scale + + return img, (target_width, target_height) + + def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray: + width = sensor_cfg.width + height = sensor_cfg.height + fov = sensor_cfg.hfov + fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0)) + fy = fx # Assuming square pixels (fx = fy) + cx = (width - 1.0) / 2.0 + cy = (height - 1.0) / 2.0 + + intrinsic_matrix = np.array( + [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]] + ) + return intrinsic_matrix + + def get_axis_align_matrix(self): + ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]]) + return ma + + def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray: + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(yaw), -np.sin(yaw), 0, x], + [np.sin(yaw), np.cos(yaw), 0, y], + [0, 0, 1, z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray: + """Converts a given position and pitch 
angle to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. + """ + + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(pitch), 0, np.sin(pitch), x], + [0, 1, 0, y], + [-np.sin(pitch), 0, np.cos(pitch), z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray: + """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + yaw (float): The yaw angle in radians. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. + """ + x, y, z = xyz + rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3] + rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3] + transformation_matrix = np.eye(4) + transformation_matrix[:3, :3] = rot1 @ rot2 + transformation_matrix[:3, 3] = xyz + return transformation_matrix + + def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic): + ''' + Args: + pixel: (2,) - [u, v] pixel coordinates + depth: (H, W) - depth image where depth[v, u] gives depth in meters + intrinsic: (4, 4) - camera intrinsic matrix + tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame + Returns: + (x, y): (x, y) coordinates in the episodic frame + ''' + v, u = pixel + z = depth[v, u] + print("depthhhhhhhhhhhhhh", z) + + x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0] + y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1] + point_camera = np.array([x, y, z, 1.0]) + + # Transform to episodic frame + point_episodic = tf_camera_to_episodic @ point_camera + point_episodic = point_episodic[:3] / point_episodic[3] + + x = point_episodic[0] + y = point_episodic[1] + + return (x, y) # same as habitat gps + + def dot_matrix_two_dimensional( + self, + image_or_image_path, + save_path=None, + dots_size_w=8, + dots_size_h=8, + save_img=False, + font_path='fonts/arial.ttf', + pixel_goal=None, + ): + """ + takes an original image as input, save the processed image to save_path. Each dot is labeled with two-dimensional Cartesian coordinates (x,y). Suitable for single-image tasks. + control args: + 1. dots_size_w: the number of columns of the dots matrix + 2. 
dots_size_h: the number of rows of the dots matrix + """ + with open_image(image_or_image_path) as img: + if img.mode != 'RGB': + img = img.convert('RGB') + draw = ImageDraw.Draw(img, 'RGB') + + width, height = img.size + grid_size_w = dots_size_w + 1 + grid_size_h = dots_size_h + 1 + cell_width = width / grid_size_w + cell_height = height / grid_size_h + + font = ImageFont.truetype(font_path, width // 40) # Adjust font size if needed; default == width // 40 + + target_i = target_j = None + if pixel_goal is not None: + y_pixel, x_pixel = pixel_goal[0], pixel_goal[1] + # Validate pixel coordinates + if not (0 <= x_pixel < width and 0 <= y_pixel < height): + raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})") + + # Convert to grid coordinates + target_i = round(x_pixel / cell_width) + target_j = round(y_pixel / cell_height) + + # Validate grid bounds + if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h): + raise ValueError( + f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), " + f"valid range is (1,1)-({dots_size_h},{dots_size_w})" + ) + + count = 0 + + for j in range(1, grid_size_h): + for i in range(1, grid_size_w): + x = int(i * cell_width) + y = int(j * cell_height) + + pixel_color = img.getpixel((x, y)) + # choose a more contrasting color from black and white + if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2: + opposite_color = (0, 0, 0) + else: + opposite_color = (255, 255, 255) + + if pixel_goal is not None and i == target_i and j == target_j: + opposite_color = (255, 0, 0) # Red for target + + circle_radius = width // 240 # Adjust dot size if needed; default == width // 240 + draw.ellipse( + [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)], + fill=opposite_color, + ) + + text_x, text_y = x + 3, y + count_w = count // dots_size_w + count_h = count % dots_size_w + label_str = f"({count_w+1},{count_h+1})" + draw.text((text_x, text_y), label_str, fill=opposite_color, font=font) + count += 1 + if save_img: + print(">>> dots overlaid image processed, stored in", save_path) + img.save(save_path) + return img diff --git a/internnav/internnav_habitat/README.md b/internnav/internnav_habitat/README.md new file mode 100644 index 0000000..b7c5d73 --- /dev/null +++ b/internnav/internnav_habitat/README.md @@ -0,0 +1,136 @@ +# Habitat in InternNav + +This package adapts [Meta AI Habitat](https://aihabitat.org) environments and +metrics so they can be used from InternNav's evaluation framework. It provides +an environment wrapper, custom measurements, and evaluator implementations that +bridge Habitat simulations with InternNav agents and distributed evaluation +utilities. + +## Package structure + +``` +internnav_habitat/ +├── __init__.py +├── habitat_env.py +├── habitat_default_evaluator.py +├── habitat_vln_evaluator.py +├── measures.py +└── refactor_notes.md +``` + +* `__init__.py` re-exports the public entry points for the environment and the + VLN evaluator so they can be imported as + `from internnav.internnav_habitat import HabitatEnv`. +* `habitat_env.py` implements the `Env` subclass that wraps Habitat's + `Env` object. It bootstraps episodes, handles sharding across distributed + ranks, and adapts Habitat's observations to InternNav's expectations. +* `habitat_default_evaluator.py` contains a lightweight evaluator that runs a + conventional Habitat agent inside the InternNav evaluator loop. 
+* `habitat_vln_evaluator.py` is the task-specific evaluator used for Vision- + and-Language Navigation (VLN). It loads InternNav vision-language models, + orchestrates inference, and logs results during distributed evaluation. +* `measures.py` registers additional Habitat measurements (path length, + oracle metrics, step counts) that are required by the evaluators. + +The `refactor_notes.md` file captures design notes and TODOs collected during a +previous refactoring pass. + +## Habitat environment wrapper + +`HabitatEnv` is registered under the key `"habitat"` via the shared +`Env.register` decorator. When InternNav builds an environment from an +`EnvCfg`, the wrapper: + +1. Imports and instantiates the Habitat `Env` using the configuration object + provided in `env_settings['habitat_config']`. +2. Stores the distributed context (`local_rank`, `world_size`) and any output + directory override (`output_path`). +3. Pre-computes the episode list by grouping Habitat episodes by scene, + filtering completed episodes via `progress.json`, and sharding the remaining + work by rank. +4. Implements the standard reset/step/close/render accessors expected by the + InternNav `Env` base class while delegating to the underlying Habitat + simulator. + +This design keeps the Habitat-specific logic isolated from the rest of the +framework and ensures that distributed evaluation proceeds deterministically +across ranks. + +## Evaluation pipeline + +InternNav evaluators extend the shared `DistributedEvaluator` base class, which +handles distributed initialization, environment instantiation, metric +aggregation, and result logging. The Habitat integration provides two +implementations: + +### `HabitatVlnEvaluator` + +The VLN evaluator (`habitat_vln_evaluator.py`) is responsible for coordinating +model inference in Habitat scenes. + +* **Configuration:** During initialization the evaluator reads an `EvalCfg` + whose `env.env_settings['config_path']` points to a Habitat YAML file. The + config is loaded with Habitat's baseline utilities, sensor intrinsics are + cached, and custom measurements (`top_down_map`, `collisions`) are enabled. +* **Environment binding:** The Habitat configuration is injected back into the + `EnvCfg` so the shared `DistributedEvaluator` base class can create the + `HabitatEnv` wrapper with the correct settings. +* **Model loading:** Depending on `cfg.agent.model_settings.mode`, the evaluator + loads either the InternVLA dual-system model or a Qwen2.5-VL model using + Hugging Face Transformers. The processor is configured with left padding and + the model is moved to the rank-local GPU. +* **Episode loop:** + 1. `HabitatEnv.reset()` advances to the next episode and returns the first + observation. + 2. The evaluator reads episode metadata (scene, instruction) from Habitat, + constructs prompt messages, and collects RGB/depth history for the + language model. + 3. Visual inputs are prepared (resizing, optional look-down depth capture) and + depth maps are filtered through `filter_depth` to remove sensor noise. + 4. The evaluator queries the loaded model for the next action sequence, + translates model tokens to Habitat actions via `traj_to_actions`, and + steps the environment. + 5. Per-episode metrics (`success`, `SPL`, oracle success, navigation error) + are appended and checkpointed to `progress.json` for resumability. +* **Aggregation:** After all ranks finish, inherited utilities gather per-rank + tensors, compute global averages, and write `result.json` in + `output_path`. 
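
The same contract applies to any new evaluator: subclass `DistributedEvaluator`, register it, and return per-episode metric tensors from `eval_action` for the base class to gather. A minimal sketch follows; the class name, registry key, and per-episode stepping are illustrative placeholders, not part of this package, and the metric keys mirror the ones used by the evaluators in this patch:

```python
import torch

from internnav.configs.evaluator import EvalCfg
from internnav.evaluator import DistributedEvaluator, Evaluator


@Evaluator.register('habitat_custom')  # hypothetical registry key
class CustomHabitatEvaluator(DistributedEvaluator):
    def __init__(self, cfg: EvalCfg):
        # Inject the loaded Habitat config into cfg.env.env_settings here
        # (as HabitatVlnEvaluator does) before the base class builds HabitatEnv.
        super().__init__(cfg, init_agent=True, init_env=True)

    def eval_action(self):
        sucs, spls, oss, nes = [], [], [], []
        while self.env.is_running:
            obs = self.env.reset()
            if not self.env.is_running or obs is None:
                break
            # ... reset the agent and step agent/env until the episode ends ...
            m = self.env.get_metrics()
            sucs.append(m["success"])
            spls.append(m["spl"])
            oss.append(m["oracle_success"])
            nes.append(m["distance_to_goal"])
        self.env.close()
        # One 1-D tensor per metric plus the episode count, all on the rank-local device.
        return (
            torch.tensor(sucs).to(self.device),
            torch.tensor(spls).to(self.device),
            torch.tensor(oss).to(self.device),
            torch.tensor(nes).to(self.device),
            torch.tensor(len(sucs)).to(self.device),
        )

    def calc_metrics(self, global_metrics: dict) -> dict:
        # The base class gathers per-rank tensors into global 1-D CPU tensors.
        return {
            "success": float(global_metrics["sucs"].mean().item()),
            "spl": float(global_metrics["spls"].mean().item()),
            "oracle_success": float(global_metrics["oss"].mean().item()),
            "navigation_error": float(global_metrics["nes"].mean().item()),
        }
```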
+ +### `HabitatVlnEvaluator` (baseline) + +The default evaluator in `habitat_default_evaluator.py` offers a simpler loop +where a pre-built InternNav agent interacts with the Habitat environment. +InternNav's agent abstraction is reset with each new Habitat episode, and +per-step actions are produced via `agent.act()`. The evaluator records the same +metrics as the VLN evaluator, making it useful for baselines or sanity checks. + +## Custom Habitat measurements + +`measures.py` registers a suite of metrics with Habitat's registry so that they +are available in the Habitat configuration: + +* `PathLength`: cumulative Euclidean distance traveled by the agent. +* `OracleNavigationError`: minimum geodesic distance to the goal along the + trajectory. +* `OracleSuccess`: binary success metric derived from oracle navigation error + relative to a goal radius (default 3.0 meters). +* `OracleSPL`: best Success weighted by Path Length value observed during the + trajectory. +* `StepsTaken`: number of actions issued by the agent, including STOP. + +These metrics complement Habitat's built-in success and SPL scores, allowing +InternNav to report a richer set of statistics. + +## Extending the integration + +* **Adding evaluators:** Subclass `DistributedEvaluator`, supply + Habitat-specific initialization similar to `HabitatVlnEvaluator`, and + implement `eval_action` and `calc_metrics`. +* **Custom sensors or observations:** Augment the Habitat YAML configuration and + update `HabitatEnv` or the evaluator to consume the new observation keys. +* **Additional metrics:** Register new measures in `measures.py` and enable them + in the Habitat config via `config.habitat.task.measurements.update(...)`. + +By centralizing Habitat-specific logic in this package, InternNav can swap in +other simulators or extend Habitat support without touching the rest of the +training and evaluation stack. diff --git a/internnav/internnav_habitat/__init__.py b/internnav/internnav_habitat/__init__.py new file mode 100644 index 0000000..092071f --- /dev/null +++ b/internnav/internnav_habitat/__init__.py @@ -0,0 +1 @@ +from internnav.internnav_habitat.habitat_vln_evaluator import HabitatVlnEvaluator diff --git a/internnav/internnav_habitat/dialog_dataset.py b/internnav/internnav_habitat/dialog_dataset.py new file mode 100644 index 0000000..33b489d --- /dev/null +++ b/internnav/internnav_habitat/dialog_dataset.py @@ -0,0 +1,83 @@ +import gzip +import json +import os +from typing import TYPE_CHECKING, List, Optional + +import attr +from habitat.core.dataset import Dataset +from habitat.core.registry import registry +from habitat.datasets.utils import VocabDict + +from .dialog_episodes import ( + AgentPosition, + DialogEpisode, + DialogGoal, + DialogViewLocation, +) + +if TYPE_CHECKING: + from omegaconf import DictConfig + + +DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" + + +@attr.s(auto_attribs=True, kw_only=True) +class DialogInstructionData: + task_type: str + instruction_text: str + instance_id: List[str] + instruction_info: Optional[List[str]] = None + + +@registry.register_dataset(name="dialog") +class DialogDatasetV1(Dataset): + r"""Class inherited from Dataset that loads a Vision and Language + Navigation dataset. 
+ """ + + episodes: List[DialogEpisode] + instruction_vocab: VocabDict + + @staticmethod + def check_config_paths_exist(config: "DictConfig") -> bool: + return os.path.exists(config.data_path.format(split=config.split)) and os.path.exists(config.scenes_dir) + + def __init__(self, config: Optional["DictConfig"] = None) -> None: + self.episodes = [] + + if config is None: + return + + dataset_filename = config.data_path.format(split=config.split) + with gzip.open(dataset_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.scenes_dir) + + self.episodes = list(filter(self.build_content_scenes_filter(config), self.episodes)) + + def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None: + + deserialized = json.loads(json_str) + # self.instruction_vocab = VocabDict( + # word_list=deserialized["instruction_vocab"]["word_list"] + # ) + if "category_to_task_category_id" in deserialized: + self.category_to_task_category_id = deserialized["category_to_task_category_id"] + + for episode in deserialized["episodes"]: + episode = DialogEpisode(**episode) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[len(DEFAULT_SCENE_PATH_PREFIX) :] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + episode.instruction = DialogInstructionData(**episode.instruction) + for g_index, goal in enumerate(episode.goals): + view_points = [] + for view_point in goal['view_points']: + view_point = DialogViewLocation(**{'agent_state': AgentPosition(**view_point['agent_state'])}) + view_points.append(view_point) + goal['view_points'] = view_points + episode.goals[g_index] = DialogGoal(**goal) + self.episodes.append(episode) diff --git a/internnav/internnav_habitat/dialog_episodes.py b/internnav/internnav_habitat/dialog_episodes.py new file mode 100644 index 0000000..b1eb019 --- /dev/null +++ b/internnav/internnav_habitat/dialog_episodes.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +from typing import List, Optional, Union + +import attr +import numpy as np +from habitat.core.dataset import Episode +from habitat.core.utils import not_none_validator +from habitat.tasks.nav.nav import NavigationGoal + + +@attr.s(auto_attribs=True) +class AgentPosition: + position: Union[None, List[float], np.ndarray] + + +@attr.s(auto_attribs=True) +class DialogViewLocation: + agent_state: AgentPosition + + +@attr.s(auto_attribs=True, kw_only=True) +class DialogGoal(NavigationGoal): + r"""Base class for a goal specification hierarchy.""" + + position: List[float] = attr.ib(default=None, validator=not_none_validator) + radius: Optional[float] = None + bbox: Optional[List[float]] = None + view_points: Optional[List[DialogViewLocation]] = None + + +@attr.s(auto_attribs=True, kw_only=True) +class DialogEpisode(Episode): + object_category: Optional[str] = None + goals: List[DialogGoal] = attr.ib( + default=None, + validator=not_none_validator, + on_setattr=Episode._reset_shortest_path_cache_hook, + ) + instruction: Optional[dict] = [] + frames: Optional[int] = [] diff --git a/internnav/internnav_habitat/dialog_utils.py b/internnav/internnav_habitat/dialog_utils.py new file mode 100644 index 0000000..2b34d87 --- /dev/null +++ b/internnav/internnav_habitat/dialog_utils.py @@ -0,0 +1,174 @@ +import inspect +import os +from typing import Optional + +import cv2 +import habitat_sim +import numpy as np +import quaternion +from habitat_baselines.config.default import get_config as get_habitat_config +from npc.utils.get_description 
import ( + get_path_description, + get_path_description_without_additional_info, +) +from omegaconf import DictConfig, OmegaConf, open_dict + +DEFAULT_IMAGE_TOKEN = "" + + +def get_config( + habitat_config_path: str, + baseline_config_path: str, + opts: Optional[list] = None, + configs_dir: str = os.path.dirname(inspect.getabsfile(inspect.currentframe())), +) -> DictConfig: + """ + Returns habitat_baselines config object composed of configs from yaml file (config_path) and overrides. + + :param config_path: path to the yaml config file. + :param overrides: list of config overrides. For example, :py:`overrides=["habitat_baselines.trainer_name=ddppo"]`. + :param configs_dir: path to the config files root directory (defaults to :ref:`_BASELINES_CFG_DIR`). + :return: composed config object. + """ + habitat_config = get_habitat_config(habitat_config_path, overrides=opts, configs_dir=configs_dir) + baseline_config = OmegaConf.load(baseline_config_path) + + with open_dict(habitat_config): + config = OmegaConf.merge(habitat_config, baseline_config) + + return config + + +def calculate_path_length(path): + accumulated_length = [0] + for i, p in enumerate(path[1:]): + accumulated_length.append(accumulated_length[i] + np.linalg.norm(np.array(p) - np.array(path[i]))) + return accumulated_length + + +def get_shortest_path(env, start_position, target_position): + """ + 在habitat环境中找到从当前位置到目标位置的最短路径 + + 参数: + env: habitat环境实例 + start_position: 起点位置坐标, numpy数组 [x, y, z] + target_position: 目标位置坐标, numpy数组 [x, y, z] + + 返回: + path: 路径点列表 + success: 是否找到有效路径 + """ + # 创建路径规划器 + shortest_path = habitat_sim.ShortestPath() + shortest_path.requested_start = start_position + shortest_path.requested_end = target_position + + # 计算最短路径 + success = env.sim.pathfinder.find_path(shortest_path) + return shortest_path.points, success + + +def get_navigable_path(env, start_position, target_positions: list, object_info: dict): + start_position = [float(i) for i in start_position] + target_positions = sorted( + target_positions, + key=lambda x: np.linalg.norm(np.array(x['agent_state']['position']) - np.array(object_info['position'])), + ) + success = False + while not success and len(target_positions) > 0: + target_position = target_positions.pop(0) + shortest_path, success = get_shortest_path(env, start_position, target_position['agent_state']['position']) + if success: + return shortest_path, True + else: + return [], False + + +def get_path_description_(env, object_dict, region_dict): + goal_path, success = get_navigable_path( + env, + env.sim.get_agent_state().position, + [{'agent_state': {'position': vp.agent_state.position}} for vp in env.current_episode.goals[0].view_points], + {'position': env.current_episode.goals[0].position}, + ) + if not success or len(np.unique(goal_path, axis=0)) == 1: + print('no shortest path') + return None, 0 + path_length = calculate_path_length(goal_path) + pl = path_length[-1] + goal_index = max([i for i, c in enumerate(path_length) if c < 4]) + # goal_index = len(goal_path)-1 + if goal_index == 0: + goal_index = len(goal_path) - 1 + questioned_path = goal_path[: goal_index + 1] + current_yaw = 2 * np.arctan2(env.sim.get_agent_state().rotation.y, env.sim.get_agent_state().rotation.w) + _, idx = np.unique(questioned_path, axis=0, return_index=True) + idx_sorted = np.sort(idx) + questioned_path = list(np.array(questioned_path)[idx_sorted]) + try: + path_description, _ = get_path_description( + quaternion.from_euler_angles([0, current_yaw, 0]), + questioned_path, + object_dict, + 
region_dict, + return_finish=False, + height_list=[env.sim.get_agent_state().position[1]] * len(questioned_path), + ) + except Exception as e: + print(e) + path_description, _ = get_path_description_without_additional_info( + quaternion.from_euler_angles([0, current_yaw, 0]), + questioned_path, + height_list=[env.sim.get_agent_state().position[1]] * len(questioned_path), + ) + return path_description, pl + + +def unify_to_first( + vis_frames, + method: str = "resize", # "resize" 或 "letterbox" + pad_color=(0, 0, 0), # letterbox 的填充色 (B,G,R) + assume_rgb: bool = True, # 如果后续用 OpenCV 写视频,通常 True 表示当前是 RGB,需要转 BGR +): + assert len(vis_frames) > 0, "vis_frames 为空" + h0, w0 = vis_frames[0].shape[:2] + out = [] + + for i, f in enumerate(vis_frames): + f = np.asarray(f) + + # 保障三通道 + if f.ndim == 2: # 灰度 -> 3通道 + f = np.stack([f] * 3, axis=2) + if f.shape[2] > 3: + f = f[:, :, :3] # 多通道时只取前三个 + + # dtype 归一:转 uint8 + if f.dtype != np.uint8: + # 若是 [0,1] 浮点,×255;若已是 0-255 浮点,直接裁剪 + fmax = float(np.nanmax(f)) if f.size else 1.0 + f = (f * 255.0) if fmax <= 1.5 else np.clip(f, 0, 255) + f = f.astype(np.uint8) + + h, w = f.shape[:2] + if (h, w) == (h0, w0): + out.append(np.ascontiguousarray(f)) + continue + + if method == "letterbox": + # 等比缩放 + 居中贴到画布 + scale = min(w0 / w, h0 / h) + nw, nh = int(round(w * scale)), int(round(h * scale)) + resized = cv2.resize(f, (nw, nh), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR) + canvas = np.full((h0, w0, 3), pad_color, dtype=np.uint8) + top, left = (h0 - nh) // 2, (w0 - nw) // 2 + canvas[top : top + nh, left : left + nw] = resized + f_out = canvas + else: + # 直接拉伸到目标大小 + f_out = cv2.resize(f, (w0, h0), interpolation=cv2.INTER_AREA if (h * w) > (h0 * w0) else cv2.INTER_LINEAR) + + out.append(np.ascontiguousarray(f_out)) + + return out diff --git a/internnav/internnav_habitat/habitat_default_evaluator.py b/internnav/internnav_habitat/habitat_default_evaluator.py new file mode 100644 index 0000000..2f65236 --- /dev/null +++ b/internnav/internnav_habitat/habitat_default_evaluator.py @@ -0,0 +1,128 @@ +import argparse +import sys + +sys.path.append('./src/diffusion-policy') + + +# Import for Habitat registry side effects — do not remove +import internnav.env.utils.habitat_extensions.measures # noqa: F401 +from internnav.configs.evaluator import EvalCfg +from internnav.evaluator import DistributedEvaluator, Evaluator + +try: + import habitat + from habitat.config.default import get_agent_config + from habitat.config.default_structured_configs import ( + CollisionsMeasurementConfig, + FogOfWarConfig, + TopDownMapMeasurementConfig, + ) + from habitat_baselines.config.default import get_config as get_habitat_config +except Exception as e: + print("Habitat Error:", e) + print("Habitat Evaluation is not loaded.") + + +DEFAULT_IMAGE_TOKEN = "" + + +@Evaluator.register('habitat_vlln') +class HabitatVlnEvaluator(DistributedEvaluator): + def __init__(self, cfg: EvalCfg): + args = argparse.Namespace(**cfg.eval_settings) + self.args = args + self.save_video = args.save_video + self.epoch = args.epoch + self.max_steps_per_episode = args.max_steps_per_episode + self.output_path = args.output_path + + # create habitat config + self.config_path = cfg.env.env_settings['config_path'] + self.config = get_habitat_config(self.config_path) + self.agent_config = get_agent_config(self.config.habitat.simulator) + self.sim_sensors_config = self.config.habitat.simulator.agents.main_agent.sim_sensors + + with habitat.config.read_write(self.config): + 
self.config.habitat.task.measurements.update( + { + "top_down_map": TopDownMapMeasurementConfig( + map_padding=3, + map_resolution=1024, + draw_source=True, + draw_border=True, + draw_shortest_path=True, + draw_view_points=True, + draw_goal_positions=True, + draw_goal_aabbs=True, + fog_of_war=FogOfWarConfig( + draw=True, + visibility_dist=5.0, + fov=90, + ), + ), + "collisions": CollisionsMeasurementConfig(), + } + ) + cfg.env.env_settings['habitat_config'] = self.config + cfg.env.env_settings['output_path'] = self.output_path + + # init agent and env + super().__init__(cfg) + + def eval_action(self): + """ + Run local episodes on this rank. + + Returns dict[str, Tensor] on GPU (1D tensors of same length). + """ + sucs, spls, oss, nes = [], [], [], [] + env = self.env + + while env.is_running: + obs = env.reset() + if not env.is_running or obs is None: + break + + episode = env.env.current_episode + self.agent.reset(episode, env) + + done = False + step_id = 0 + while not done and step_id <= self.max_steps_per_episode: + action = self.agent.act(obs, env, info=None) + obs, reward, done, info = env.step(action) + step_id += 1 + + m = env.get_metrics() + sucs.append(m["success"]) + spls.append(m["spl"]) + oss.append(m["oracle_success"]) + nes.append(m["distance_to_goal"]) + + env.close() + return { + "sucs": sucs, # shape [N_local] + "spls": spls, # shape [N_local] + "oss": oss, # shape [N_local] + "nes": nes, # shape [N_local] + } + + def calc_metrics(self, global_metrics: dict) -> dict: + """ + global_metrics["sucs"] etc. are global 1-D CPU tensors with all episodes. + """ + sucs_all = global_metrics["sucs"] + spls_all = global_metrics["spls"] + oss_all = global_metrics["oss"] + nes_all = global_metrics["nes"] + + # avoid /0 if no episodes + denom = max(len(sucs_all), 1) + + return { + "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0, + "spls_all": float(spls_all.mean().item()) if denom > 0 else 0.0, + "oss_all": float(oss_all.mean().item()) if denom > 0 else 0.0, + "nes_all": float(nes_all.mean().item()) if denom > 0 else 0.0, + # "length" will be filled by base class + } diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py new file mode 100644 index 0000000..0be81f4 --- /dev/null +++ b/internnav/internnav_habitat/habitat_vln_evaluator.py @@ -0,0 +1,837 @@ +import argparse +import json +import os +import sys + +sys.path.append('./src/diffusion-policy') +import copy +import itertools +import random +import re +from collections import OrderedDict + +import numpy as np +import quaternion +import torch +import tqdm +from depth_camera_filtering import filter_depth +from PIL import Image, ImageDraw, ImageFont +from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration +from transformers.image_utils import to_numpy_array + +from internnav.configs.evaluator import EvalCfg +from internnav.evaluator import DistributedEvaluator, Evaluator +from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM +from internnav.model.utils.vln_utils import ( + chunk_token, + open_image, + split_and_clean, + traj_to_actions, +) + +try: + import habitat + from habitat.config.default import get_agent_config + from habitat.config.default_structured_configs import ( + CollisionsMeasurementConfig, + FogOfWarConfig, + TopDownMapMeasurementConfig, + ) + from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower + from habitat.utils.visualizations.utils import observations_to_image + 
from habitat_baselines.config.default import get_config as get_habitat_config + + # Import for Habitat registry side effects — do not remove + import internnav.internnav_habitat.measures # noqa: F401 # isort: skip +except Exception as e: + print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat.") + + +DEFAULT_IMAGE_TOKEN = "" + + +@Evaluator.register('habitat_vln') +class HabitatVlnEvaluator(DistributedEvaluator): + def __init__(self, cfg: EvalCfg): + args = argparse.Namespace(**cfg.eval_settings) + self.save_video = args.save_video + self.epoch = args.epoch + self.max_steps_per_episode = args.max_steps_per_episode + self.output_path = args.output_path + + # create habitat config + self.config_path = cfg.env.env_settings['config_path'] + self.config = get_habitat_config(self.config_path) + self.agent_config = get_agent_config(self.config.habitat.simulator) + self.sim_sensors_config = self.config.habitat.simulator.agents.main_agent.sim_sensors + + with habitat.config.read_write(self.config): + self.config.habitat.task.measurements.update( + { + "top_down_map": TopDownMapMeasurementConfig( + map_padding=3, + map_resolution=1024, + draw_source=True, + draw_border=True, + draw_shortest_path=True, + draw_view_points=True, + draw_goal_positions=True, + draw_goal_aabbs=True, + fog_of_war=FogOfWarConfig( + draw=True, + visibility_dist=5.0, + fov=90, + ), + ), + "collisions": CollisionsMeasurementConfig(), + } + ) + cfg.env.env_settings['habitat_config'] = self.config + cfg.env.env_settings['output_path'] = self.output_path + + # init agent and env + super().__init__(cfg, init_agent=False) + + # ------------------------------------- model ------------------------------------------ + self.model_args = argparse.Namespace(**cfg.agent.model_settings) + + processor = AutoProcessor.from_pretrained(self.model_args.model_path) + processor.tokenizer.padding_side = 'left' + + device = torch.device(f"cuda:{self.local_rank}") + if self.model_args.mode == 'dual_system': + model = InternVLAN1ForCausalLM.from_pretrained( + self.model_args.model_path, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + device_map={"": device}, + ) + elif self.model_args.mode == 'system2': + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + self.model_args.model_path, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + device_map={"": device}, + ) + else: + raise ValueError(f"Invalid mode: {self.model_args.mode}") + + model.eval() + self.device = device + + self.model = model + self.processor = processor + + # refactor: this part used in three places + prompt = "You are an autonomous navigation assistant. Your task is to . Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task." 
+ answer = "" + self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}] + + self.conjunctions = [ + 'you can see ', + 'in front of you is ', + 'there is ', + 'you can spot ', + 'you are toward the ', + 'ahead of you is ', + 'in your sight is ', + ] + + self.actions2idx = OrderedDict( + { + 'STOP': [0], + "↑": [1], + "←": [2], + "→": [3], + "↓": [5], + } + ) + + self.objectnav_instructions = ["Search for the {target_object}."] + + self.num_frames = self.model_args.num_frames + self.num_future_steps = self.model_args.num_future_steps + self.num_history = self.model_args.num_history + + self._camera_height = self.sim_sensors_config.rgb_sensor.position[1] + self._min_depth = self.sim_sensors_config.depth_sensor.min_depth + self._max_depth = self.sim_sensors_config.depth_sensor.max_depth + + camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov) + self._camera_fov = camera_fov_rad + self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2)) + + def eval_action(self): + """ + Run local episodes on this rank. + + Returns dict[str, Tensor] on GPU (1D tensors of same length). + """ + # Old behavior was something like: + # sucs, spls, oss, nes, ep_num = self.eval_action(self.rank) + # Now just implement the actual eval here and return dict. + + sucs, spls, oss, nes, _ = self._run_local_eval() + + return { + "sucs": sucs, # shape [N_local] + "spls": spls, # shape [N_local] + "oss": oss, # shape [N_local] + "nes": nes, # shape [N_local] + } + + def calc_metrics(self, global_metrics: dict) -> dict: + """ + global_metrics["sucs"] etc. are global 1-D CPU tensors with all episodes. + """ + sucs_all = global_metrics["sucs"] + spls_all = global_metrics["spls"] + oss_all = global_metrics["oss"] + nes_all = global_metrics["nes"] + + # avoid /0 if no episodes + denom = max(len(sucs_all), 1) + + return { + "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0, + "spls_all": float(spls_all.mean().item()) if denom > 0 else 0.0, + "oss_all": float(oss_all.mean().item()) if denom > 0 else 0.0, + "nes_all": float(nes_all.mean().item()) if denom > 0 else 0.0, + # "length" will be filled by base class + } + + def _run_local_eval(self) -> None: # noqa: C901 + """ + Run local evaluation on this rank. + + Important: if resuming from previous results, need to read from / write to "self.output_path/progress.json". + For each episode, save the result dict in jsonl format to that file. + In Env, the episodes are already filtered by this file, tasks that have the same (scene_id, episode_id) are skipped. 
+ + + Returns + ------- + dict[str, Tensor]: + { + "sucs": [N_local], + "spls": [N_local], + "oss": [N_local], + "nes": [N_local], + } + """ + # Create / get env + # self.env = self.env # HabitatEnv from DistributedEvaluator + + sucs, spls, oss, nes = [], [], [], [] + self.model.eval() + + # resume from previous results + # TODO: Current read write op is not distributed safe + if os.path.exists(os.path.join(self.output_path, 'progress.json')): + with open(os.path.join(self.output_path, 'progress.json'), 'r') as f: + for line in f.readlines(): + res = json.loads(line) + if "scene_id" not in res: + print("This evaluation has already finished!") + return ( + torch.tensor(sucs).to(self.device), + torch.tensor(spls).to(self.device), + torch.tensor(oss).to(self.device), + torch.tensor(nes).to(self.device), + torch.tensor(len(sucs)).to(self.device), + ) + if self.rank == 0: # noqa: F405 TODO this need to keep in evaluator + sucs.append(res['success']) + spls.append(res['spl']) + oss.append(res['os']) + nes.append(res['ne']) + + # Episode loop is now driven by env.reset() + env.is_running + process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {self.rank}") + while self.env.is_running: + + # ------------ 1. Start of episode ------------ + observations = self.env.reset() + if not self.env.is_running or observations is None: + break + + # ---- episode meta (scene_id, episode_id, instruction) ---- + # we get it from the underlying habitat env + episode = self.env.get_current_episode() + scene_id = episode.scene_id.split('/')[-2] + episode_id = int(episode.episode_id) + episode_instruction = ( + episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category + ) + print("episode start", episode_instruction) + + agent_state = self.env._env.sim.get_agent_state() + rotation = agent_state.rotation + translation = agent_state.position + rotation_matrix = quaternion.as_rotation_matrix(rotation) + transformation_matrix = np.eye(4) + transformation_matrix[:3, :3] = rotation_matrix + transformation_matrix[:3, 3] = translation + + agent = ShortestPathFollower(self.env._env.sim, 0.25, False) + + # save first frame per rank to validate sim quality + os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True) + Image.fromarray(observations['rgb']).save( + os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{self.rank}.jpg') + ) + + vis_frames = [] + step_id = 0 + + if self.save_video: + os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True) + initial_height = self.env._env.sim.get_agent_state().position[1] + + rgb_list = [] + action_seq = [] + output_ids = None + + goal = None + action = None + messages = [] + local_actions = [] + + done = False + + # ---------- 2. 
Episode step loop ----------- + while (not done) and (step_id <= self.max_steps_per_episode): + # refactor agent get action + rgb = observations["rgb"] + depth = observations["depth"] + x, y = observations["gps"] + camera_yaw = observations["compass"][0] + depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) + depth = depth * (self._max_depth - self._min_depth) + self._min_depth + depth = depth * 1000 + + agent_state = self.env._env.sim.get_agent_state() + height = agent_state.position[1] - initial_height + camera_position = np.array([x, -y, self._camera_height + height]) + tf_camera_to_episodic = ( + self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30)) + @ self.get_axis_align_matrix() + ) + + image = Image.fromarray(rgb).convert('RGB') + save_raw_image = image.copy() + + save_dot = False + if action == 5: + look_down_image = image + save_raw_image = look_down_image.copy() + look_down_depth, resize_shape = self.preprocess_depth_image_v2( + Image.fromarray(depth.astype(np.uint16), mode='I;16'), + do_depth_scale=True, + depth_scale=1000, + target_height=224, + target_width=224, + ) + look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() + look_down_depth[look_down_depth > 5.0] = 5.0 + else: + image = image.resize((self.model_args.resize_w, self.model_args.resize_h)) + rgb_list.append(image) + + if self.model_args.mode == 'dual_system': + down_observations, _, done, _ = self.env.step(5) + down_observations, _, done, _ = self.env.step(5) + + look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB') + depth = down_observations["depth"] + depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None) + depth = depth * (self._max_depth - self._min_depth) + self._min_depth + depth = depth * 1000 + look_down_depth, resize_shape = self.preprocess_depth_image_v2( + Image.fromarray(depth.astype(np.uint16), mode='I;16'), + do_depth_scale=True, + depth_scale=1000, + target_height=224, + target_width=224, + ) + look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float() + look_down_depth[look_down_depth > 5.0] = 5.0 + + self.env.step(4) + self.env.step(4) + + info = self.env.get_metrics() + + if len(action_seq) == 0 and goal is None: + if action != 5: + sources = copy.deepcopy(self.conversation) + sources[0]["value"] = sources[0]["value"].replace( + '.', episode.instruction.instruction_text[:-1] + ) + cur_images = rgb_list[-1:] + if step_id == 0: + history_id = [] + else: + history_id = np.unique( + np.linspace(0, step_id - 1, self.num_history, dtype=np.int32) + ).tolist() + placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id) + sources[0]["value"] += f' These are your historical observations: {placeholder}.' + + history_id = sorted(history_id) + print('history_idddddddd', step_id, history_id) + input_images = [rgb_list[i] for i in history_id] + cur_images + input_img_id = 0 + else: + assert action == 5 + sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}] + input_images += [look_down_image] + # messages.append( + # {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]} # noqa: F405 + # ) + input_img_id = -1 + + prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN + sources[0]["value"] += f" {prompt}." 
+ print('sources', step_id, sources) + prompt_instruction = copy.deepcopy(sources[0]["value"]) + parts = split_and_clean(prompt_instruction) + + content = [] + for i in range(len(parts)): + if parts[i] == "": + content.append({"type": "image", "image": input_images[input_img_id]}) + input_img_id += 1 + else: + content.append({"type": "text", "text": parts[i]}) + + messages.append({'role': 'user', 'content': content}) + + print('step_id', step_id, 'messages:', messages) + + text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(self.model.device) + + with torch.no_grad(): + output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False) + + llm_outputs = self.processor.tokenizer.decode( + output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True + ) + print('step_id:', step_id, 'output text:', llm_outputs) + + if bool(re.search(r'\d', llm_outputs)): + forward_action = 0 + coord = [int(c) for c in re.findall(r'\d+', llm_outputs)] + pixel_goal = [int(coord[1]), int(coord[0])] + + intrinsic_matrix = self.get_intrinsic_matrix( + self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor + ) + goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic) + print('before', goal, depth.shape) + goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3] + + if not self.env._env.sim.pathfinder.is_navigable(np.array(goal)): + goal = np.array(self.env._env.sim.pathfinder.snap_point(np.array(goal))) + + # look down --> horizontal + self.env.step(4) + self.env.step(4) + + # Forking logic based on mode + if self.model_args.mode == 'system2': + action = agent.get_next_action(goal) + if action == 0: + goal = None + output_ids = None + action = 2 # random action + print('conduct a random action 2') + observations, _, done, _ = self.env.step(action) + step_id += 1 + messages = [] + continue + else: # dual-system logic + local_actions = [] + pixel_values = inputs.pixel_values + image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0) + + with torch.no_grad(): + traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw) + + # prepocess align with navdp + image_dp = ( + torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255 + ) + pix_goal_image = copy.copy(image_dp) + images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device) + depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16) + pix_goal_depth = copy.copy(depth_dp) + depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device) + + with torch.no_grad(): + dp_actions = self.model.generate_traj( + traj_latents, images_dp, depths_dp, use_async=True + ) + + random_choice = np.random.choice(dp_actions.shape[0]) + if self.model_args.continuous_traj: + action_list = traj_to_actions(dp_actions) + if len(action_list) < 8: + action_list += [0] * (8 - len(action_list)) + else: + action_list = chunk_token(dp_actions[random_choice]) + + local_actions = action_list + if len(local_actions) >= 4: + local_actions = local_actions[:4] + action = local_actions[0] + if action == 0: + goal = None + output_ids = None + action = 2 # random action + print('conduct a random action 2') + observations, _, done, _ = self.env.step(action) + step_id += 1 + messages = [] + continue + + print('predicted goal', pixel_goal, goal, flush=True) + else: + 
action_seq = self.parse_actions(llm_outputs) + print('actions', action_seq, flush=True) + + if len(action_seq) != 0: + action = action_seq[0] + action_seq.pop(0) + elif goal is not None: + # Forking logic based on mode + if self.model_args.mode == 'system2': + action = agent.get_next_action(goal) + action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action + action = action[0] if hasattr(action, "__len__") else action + else: # dual-system logic + if len(local_actions) == 0: + # navdp + local_actions = [] + image_dp = ( + torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255 + ) + + images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device) + depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16) + + depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device) + with torch.no_grad(): + dp_actions = self.model.generate_traj( + traj_latents, images_dp, depths_dp, use_async=True + ) + + random_choice = np.random.choice(dp_actions.shape[0]) + if self.model_args.continuous_traj: + action_list = traj_to_actions(dp_actions) + if len(action_list) < 8: + action_list += [0] * (8 - len(action_list)) + else: + action_list = chunk_token(dp_actions[random_choice]) + print("first action_list", action_list) + + local_actions = action_list + if len(local_actions) >= 4: + local_actions = local_actions[:4] + # if len(local_actions) >= 2: + # local_actions = local_actions[:2] + + print("local_actions", local_actions) + + action = local_actions.pop(0) + # navdp + else: + action = local_actions.pop(0) + + forward_action += 1 + print('forward_action', forward_action, flush=True) + if forward_action > 8: + goal = None + output_ids = None + messages = [] + step_id += 1 + forward_action = 0 + local_actions = [] + continue + if action == 0: + goal = None + output_ids = None + messages = [] + step_id += 1 + forward_action = 0 + local_actions = [] + continue + else: + action = 0 + + if info['top_down_map'] is not None: + if save_dot: + save_raw_image = self.dot_matrix_two_dimensional( + save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal + ) + if self.save_video: + frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info) + vis_frames.append(frame) + + print("step_id", step_id, "action", action) + + # refactor: core + if action == 5: + self.env.step(action) + observations, _, done, _ = self.env.step(action) + else: + observations, _, done, _ = self.env.step(action) + step_id += 1 + messages = [] + + # ---------- 3. 
End of episode ----------- + # Update result and write progress to the output_path/progress.json + + process_bar.update(1) + + # After the episode finishes, collect metrics: + metrics = self.env.get_metrics() + + sucs.append(metrics['success']) + spls.append(metrics['spl']) + oss.append(metrics['oracle_success']) + nes.append(metrics["distance_to_goal"]) + + print( + f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, " + f"spl: {metrics['spl']}, os: {metrics['oracle_success']}, " + f"ne: {metrics['distance_to_goal']}" + ) + + # Write per-episode result.json entry (still per-rank) + result = { + "scene_id": scene_id, + "episode_id": episode_id, + "success": metrics["success"], + "spl": metrics["spl"], + "os": metrics['oracle_success'], + "ne": metrics["distance_to_goal"], + "steps": step_id, + "episode_instruction": episode_instruction, + } + os.makedirs(self.output_path, exist_ok=True) + with open(os.path.join(self.output_path, 'progress.json'), 'a') as f: + f.write(json.dumps(result) + "\n") + + self.env.close() + + return ( + torch.tensor(sucs).to(self.device), + torch.tensor(spls).to(self.device), + torch.tensor(oss).to(self.device), + torch.tensor(nes).to(self.device), + torch.tensor(len(sucs)).to(self.device), + ) + + def parse_actions(self, output): + action_patterns = '|'.join(re.escape(action) for action in self.actions2idx) + # import ipdb; ipdb.set_trace() + regex = re.compile(action_patterns) + matches = regex.findall(output) + actions = [self.actions2idx[match] for match in matches] + actions = itertools.chain.from_iterable(actions) + return list(actions) + + def preprocess_depth_image_v2( + self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None + ): + if target_height is None: + target_height = self.image_processor.crop_size['height'] # 384 + target_width = self.image_processor.crop_size['width'] # 384 + + resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST) + + img = to_numpy_array(resized_depth_image) + if do_depth_scale: + img = img / depth_scale + + return img, (target_width, target_height) + + def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray: + width = sensor_cfg.width + height = sensor_cfg.height + fov = sensor_cfg.hfov + fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0)) + fy = fx # Assuming square pixels (fx = fy) + cx = (width - 1.0) / 2.0 + cy = (height - 1.0) / 2.0 + + intrinsic_matrix = np.array( + [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]] + ) + return intrinsic_matrix + + def get_axis_align_matrix(self): + ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]]) + return ma + + def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray: + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(yaw), -np.sin(yaw), 0, x], + [np.sin(yaw), np.cos(yaw), 0, y], + [0, 0, 1, z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray: + """Converts a given position and pitch angle to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. 
+ """ + + x, y, z = xyz + transformation_matrix = np.array( + [ + [np.cos(pitch), 0, np.sin(pitch), x], + [0, 1, 0, y], + [-np.sin(pitch), 0, np.cos(pitch), z], + [0, 0, 0, 1], + ] + ) + return transformation_matrix + + def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray: + """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix. + + Args: + xyz (np.ndarray): A 3D vector representing the position. + yaw (float): The yaw angle in radians. + pitch (float): The pitch angle in radians for y axis. + Returns: + np.ndarray: A 4x4 transformation matrix. + """ + x, y, z = xyz + rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3] + rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3] + transformation_matrix = np.eye(4) + transformation_matrix[:3, :3] = rot1 @ rot2 + transformation_matrix[:3, 3] = xyz + return transformation_matrix + + def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic): + ''' + Args: + pixel: (2,) - [u, v] pixel coordinates + depth: (H, W) - depth image where depth[v, u] gives depth in meters + intrinsic: (4, 4) - camera intrinsic matrix + tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame + Returns: + (x, y): (x, y) coordinates in the episodic frame + ''' + v, u = pixel + z = depth[v, u] + print("depthhhhhhhhhhhhhh", z) + + x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0] + y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1] + point_camera = np.array([x, y, z, 1.0]) + + # Transform to episodic frame + point_episodic = tf_camera_to_episodic @ point_camera + point_episodic = point_episodic[:3] / point_episodic[3] + + x = point_episodic[0] + y = point_episodic[1] + + return (x, y) # same as habitat gps + + def dot_matrix_two_dimensional( + self, + image_or_image_path, + save_path=None, + dots_size_w=8, + dots_size_h=8, + save_img=False, + font_path='fonts/arial.ttf', + pixel_goal=None, + ): + """ + takes an original image as input, save the processed image to save_path. Each dot is labeled with two-dimensional Cartesian coordinates (x,y). Suitable for single-image tasks. + control args: + 1. dots_size_w: the number of columns of the dots matrix + 2. 
dots_size_h: the number of rows of the dots matrix + """ + with open_image(image_or_image_path) as img: + if img.mode != 'RGB': + img = img.convert('RGB') + draw = ImageDraw.Draw(img, 'RGB') + + width, height = img.size + grid_size_w = dots_size_w + 1 + grid_size_h = dots_size_h + 1 + cell_width = width / grid_size_w + cell_height = height / grid_size_h + + font = ImageFont.truetype(font_path, width // 40) # Adjust font size if needed; default == width // 40 + + target_i = target_j = None + if pixel_goal is not None: + y_pixel, x_pixel = pixel_goal[0], pixel_goal[1] + # Validate pixel coordinates + if not (0 <= x_pixel < width and 0 <= y_pixel < height): + raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})") + + # Convert to grid coordinates + target_i = round(x_pixel / cell_width) + target_j = round(y_pixel / cell_height) + + # Validate grid bounds + if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h): + raise ValueError( + f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), " + f"valid range is (1,1)-({dots_size_h},{dots_size_w})" + ) + + count = 0 + + for j in range(1, grid_size_h): + for i in range(1, grid_size_w): + x = int(i * cell_width) + y = int(j * cell_height) + + pixel_color = img.getpixel((x, y)) + # choose a more contrasting color from black and white + if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2: + opposite_color = (0, 0, 0) + else: + opposite_color = (255, 255, 255) + + if pixel_goal is not None and i == target_i and j == target_j: + opposite_color = (255, 0, 0) # Red for target + + circle_radius = width // 240 # Adjust dot size if needed; default == width // 240 + draw.ellipse( + [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)], + fill=opposite_color, + ) + + text_x, text_y = x + 3, y + count_w = count // dots_size_w + count_h = count % dots_size_w + label_str = f"({count_w+1},{count_h+1})" + draw.text((text_x, text_y), label_str, fill=opposite_color, font=font) + count += 1 + if save_img: + print(">>> dots overlaid image processed, stored in", save_path) + img.save(save_path) + return img diff --git a/internnav/internnav_habitat/measures.py b/internnav/internnav_habitat/measures.py new file mode 100644 index 0000000..5dddcae --- /dev/null +++ b/internnav/internnav_habitat/measures.py @@ -0,0 +1,560 @@ +from typing import Any, List, Union + +import numpy as np +from habitat.core.embodied_task import EmbodiedTask, Measure +from habitat.core.registry import registry +from habitat.core.simulator import Simulator +from habitat.core.utils import try_cv2_import +from habitat.tasks.nav.nav import DistanceToGoal +from numpy import ndarray + +cv2 = try_cv2_import() + + +def euclidean_distance(pos_a: Union[List[float], ndarray], pos_b: Union[List[float], ndarray]) -> float: + return np.linalg.norm(np.array(pos_b) - np.array(pos_a), ord=2) + + +@registry.register_measure +class PathLength(Measure): + """Path Length (PL) + PL = sum(geodesic_distance(agent_prev_position, agent_position) + over all agent positions. 
+ """ + + cls_uuid: str = "path_length" + + def __init__(self, sim: Simulator, *args: Any, **kwargs: Any): + self._sim = sim + super().__init__(**kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, *args: Any, **kwargs: Any): + self._previous_position = self._sim.get_agent_state().position + self._metric = 0.0 + + def update_metric(self, *args: Any, **kwargs: Any): + current_position = self._sim.get_agent_state().position + self._metric += euclidean_distance(current_position, self._previous_position) + self._previous_position = current_position + + +@registry.register_measure +class OracleNavigationError(Measure): + """Oracle Navigation Error (ONE) + ONE = min(geosdesic_distance(agent_pos, goal)) over all points in the + agent path. + """ + + cls_uuid: str = "oracle_navigation_error" + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid]) + self._metric = float("inf") + self.update_metric(task=task) + + def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + distance_to_target = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric() + self._metric = min(self._metric, distance_to_target) + + +@registry.register_measure +class OracleSuccess(Measure): + """Oracle Success Rate (OSR). OSR = I(ONE <= goal_radius)""" + + cls_uuid: str = "oracle_success" + + # def __init__(self, *args: Any, config: Config, **kwargs: Any): + # self._config = config + # super().__init__() + + def __init__(self, *args: Any, config: Any, **kwargs: Any): + print(f"in oracle success init: args = {args}, kwargs = {kwargs}") + self._config = config + super().__init__() + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid]) + self._metric = 0.0 + self.update_metric(task=task) + + def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + d = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric() + # self._metric = float(self._metric or d < self._config["success_distance"]) + self._metric = float(self._metric or d < 3.0) + + +@registry.register_measure +class OracleSPL(Measure): + """OracleSPL (Oracle Success weighted by Path Length) + OracleSPL = max(SPL) over all points in the agent path. + """ + + cls_uuid: str = "oracle_spl" + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + task.measurements.check_measure_dependencies(self.uuid, ["spl"]) + self._metric = 0.0 + + def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): + spl = task.measurements.measures["spl"].get_metric() + self._metric = max(self._metric, spl) + + +@registry.register_measure +class StepsTaken(Measure): + """Counts the number of times update_metric() is called. This is equal to + the number of times that the agent takes an action. STOP counts as an + action. 
+ """ + + cls_uuid: str = "steps_taken" + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, *args: Any, **kwargs: Any): + self._metric = 0.0 + + def update_metric(self, *args: Any, **kwargs: Any): + self._metric += 1.0 + + +# import gzip +# import json +# import pickle +# from dtw import dtw +# from fastdtw import fastdtw +# from habitat.config import Config +# from utils import maps +# from habitat_extensions.task import RxRVLNCEDatasetV1 +# from habitat.tasks.nav.nav import DistanceToGoal, Success +# from habitat.tasks.utils import cartesian_to_polar +# from habitat.utils.geometry_utils import quaternion_rotate_vector +# from habitat.utils.visualizations import fog_of_war +# from habitat.utils.visualizations import maps as habitat_maps +# from habitat.core.dataset import Episode +# from habitat.core.embodied_task import Action, EmbodiedTask, Measure +# from habitat.core.logging import logger + +# @registry.register_measure +# class WaypointRewardMeasure(Measure): +# """A reward measure used for training VLN-CE agents via RL.""" + +# def __init__( +# self, *args: Any, sim: Simulator, config: Config, **kwargs: Any +# ) -> None: +# self._sim = sim +# self._slack_reward = config.slack_reward +# self._use_distance_scaled_slack_reward = ( +# config.use_distance_scaled_slack_reward +# ) +# self._scale_slack_on_prediction = config.scale_slack_on_prediction +# self._success_reward = config.success_reward +# self._distance_scalar = config.distance_scalar +# self._prev_position = None +# super().__init__() + +# def reset_metric( +# self, *args: Any, task: EmbodiedTask, **kwargs: Any +# ) -> None: +# task.measurements.check_measure_dependencies( +# self.uuid, [DistanceToGoal.cls_uuid, Success.cls_uuid] +# ) +# self._previous_distance_to_goal = task.measurements.measures[ +# "distance_to_goal" +# ].get_metric() +# self._metric = 0.0 +# self._prev_position = np.take( +# self._sim.get_agent_state().position, [0, 2] +# ) + +# def _get_scaled_slack_reward(self, action: Action) -> float: +# if isinstance(action["action"], int): +# return self._slack_reward + +# if not self._use_distance_scaled_slack_reward: +# return self._slack_reward + +# agent_pos = np.take(self._sim.get_agent_state().position, [0, 2]) +# slack_distance = ( +# action["action_args"]["r"] +# if self._scale_slack_on_prediction and action["action"] != "STOP" +# else np.linalg.norm(self._prev_position - agent_pos) +# ) +# scaled_slack_reward = self._slack_reward * slack_distance / 0.25 +# self._prev_position = agent_pos +# return min(self._slack_reward, scaled_slack_reward) + +# def _progress_to_goal(self, task: EmbodiedTask) -> float: +# distance_to_goal = task.measurements.measures[ +# "distance_to_goal" +# ].get_metric() +# distance_to_goal_delta = ( +# self._previous_distance_to_goal - distance_to_goal +# ) +# if np.isnan(distance_to_goal_delta) or np.isinf( +# distance_to_goal_delta +# ): +# l = self._sim.get_agent_state().position +# logger.error( +# f"\nNaN or inf encountered in distance measure. 
agent location: {l}", +# ) +# distance_to_goal_delta = -1.0 +# self._previous_distance_to_goal = distance_to_goal +# return self._distance_scalar * distance_to_goal_delta + +# def update_metric( +# self, *args: Any, action: Action, task: EmbodiedTask, **kwargs: Any +# ) -> None: +# reward = self._get_scaled_slack_reward(action) +# reward += self._progress_to_goal(task) +# reward += ( +# self._success_reward +# * task.measurements.measures["success"].get_metric() +# ) +# self._metric = reward + +# @staticmethod +# def _get_uuid(*args: Any, **kwargs: Any) -> str: +# return "waypoint_reward_measure" + + +# @registry.register_measure +# class NDTW(Measure): +# """NDTW (Normalized Dynamic Time Warping) +# ref: https://arxiv.org/abs/1907.05446 +# """ + +# cls_uuid: str = "ndtw" + +# def __init__( +# self, *args: Any, sim: Simulator, config: Config, **kwargs: Any +# ): +# self._sim = sim +# self._config = config +# self.dtw_func = fastdtw if config.FDTW else dtw + +# if "{role}" in config.GT_PATH: +# self.gt_json = {} +# for role in RxRVLNCEDatasetV1.annotation_roles: +# with gzip.open( +# config.GT_PATH.format(split=config.SPLIT, role=role), "rt" +# ) as f: +# self.gt_json.update(json.load(f)) +# else: +# with gzip.open( +# config.GT_PATH.format(split=config.SPLIT), "rt" +# ) as f: +# self.gt_json = json.load(f) + +# super().__init__() + +# def _get_uuid(self, *args: Any, **kwargs: Any) -> str: +# return self.cls_uuid + +# def reset_metric(self, *args: Any, episode, **kwargs: Any): +# self.locations = [] +# self.gt_locations = self.gt_json[episode.episode_id]["locations"] +# self.update_metric() + +# def update_metric(self, *args: Any, **kwargs: Any): +# current_position = self._sim.get_agent_state().position.tolist() +# if len(self.locations) == 0: +# self.locations.append(current_position) +# else: +# if current_position == self.locations[-1]: +# return +# self.locations.append(current_position) + +# dtw_distance = self.dtw_func( +# self.locations, self.gt_locations, dist=euclidean_distance +# )[0] + +# nDTW = np.exp( +# -dtw_distance +# / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE) +# ) +# self._metric = nDTW + + +# @registry.register_measure +# class SDTW(Measure): +# """SDTW (Success Weighted be nDTW) +# ref: https://arxiv.org/abs/1907.05446 +# """ + +# cls_uuid: str = "sdtw" + +# def _get_uuid(self, *args: Any, **kwargs: Any) -> str: +# return self.cls_uuid + +# def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): +# task.measurements.check_measure_dependencies( +# self.uuid, [NDTW.cls_uuid, Success.cls_uuid] +# ) +# self.update_metric(task=task) + +# def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): +# ep_success = task.measurements.measures[Success.cls_uuid].get_metric() +# nDTW = task.measurements.measures[NDTW.cls_uuid].get_metric() +# self._metric = ep_success * nDTW + + +# @registry.register_measure +# class TopDownMapVLNCE(Measure): +# """A top down map that optionally shows VLN-related visual information +# such as MP3D node locations and MP3D agent traversals. 
+# """ + +# cls_uuid: str = "top_down_map_vlnce" + +# def __init__( +# self, *args: Any, sim: Simulator, config: Config, **kwargs: Any +# ) -> None: +# self._sim = sim +# self._config = config +# self._step_count = None +# self._map_resolution = config.MAP_RESOLUTION +# self._previous_xy_location = None +# self._top_down_map = None +# self._meters_per_pixel = None +# self.current_node = "" +# with open(self._config.GRAPHS_FILE, "rb") as f: +# self._conn_graphs = pickle.load(f) +# super().__init__() + +# def _get_uuid(self, *args: Any, **kwargs: Any) -> str: +# return self.cls_uuid + +# def get_original_map(self) -> ndarray: +# top_down_map = habitat_maps.get_topdown_map_from_sim( +# self._sim, +# map_resolution=self._map_resolution, +# draw_border=self._config.DRAW_BORDER, +# meters_per_pixel=self._meters_per_pixel, +# ) + +# self._fog_of_war_mask = None +# if self._config.FOG_OF_WAR.DRAW: +# self._fog_of_war_mask = np.zeros_like(top_down_map) + +# return top_down_map + +# def reset_metric( +# self, *args: Any, episode: Episode, **kwargs: Any +# ) -> None: +# self._scene_id = episode.scene_id.split("/")[-2] +# self._step_count = 0 +# self._metric = None +# self._meters_per_pixel = habitat_maps.calculate_meters_per_pixel( +# self._map_resolution, self._sim +# ) +# self._top_down_map = self.get_original_map() +# agent_position = self._sim.get_agent_state().position +# scene_id = episode.scene_id.split("/")[-1].split(".")[0] +# a_x, a_y = habitat_maps.to_grid( +# agent_position[2], +# agent_position[0], +# self._top_down_map.shape[0:2], +# sim=self._sim, +# ) +# self._previous_xy_location = (a_y, a_x) + +# if self._config.FOG_OF_WAR.DRAW: +# self._fog_of_war_mask = fog_of_war.reveal_fog_of_war( +# self._top_down_map, +# self._fog_of_war_mask, +# np.array([a_x, a_y]), +# self.get_polar_angle(), +# fov=self._config.FOG_OF_WAR.FOV, +# max_line_len=self._config.FOG_OF_WAR.VISIBILITY_DIST +# / habitat_maps.calculate_meters_per_pixel( +# self._map_resolution, sim=self._sim +# ), +# ) + +# if self._config.DRAW_FIXED_WAYPOINTS: +# maps.draw_mp3d_nodes( +# self._top_down_map, +# self._sim, +# episode, +# self._conn_graphs[scene_id], +# self._meters_per_pixel, +# ) + +# if self._config.DRAW_SHORTEST_PATH: +# shortest_path_points = self._sim.get_straight_shortest_path_points( +# agent_position, episode.goals[0].position +# ) +# maps.draw_straight_shortest_path_points( +# self._top_down_map, +# self._sim, +# self._map_resolution, +# shortest_path_points, +# ) + +# if self._config.DRAW_REFERENCE_PATH: +# maps.draw_reference_path( +# self._top_down_map, +# self._sim, +# episode, +# self._map_resolution, +# self._meters_per_pixel, +# ) + +# # draw source and target points last to avoid overlap +# if self._config.DRAW_SOURCE_AND_TARGET: +# maps.draw_source_and_target( +# self._top_down_map, +# self._sim, +# episode, +# self._meters_per_pixel, +# ) + +# # MP3D START NODE +# self._nearest_node = maps.get_nearest_node( +# self._conn_graphs[scene_id], np.take(agent_position, (0, 2)) +# ) +# nn_position = self._conn_graphs[self._scene_id].nodes[ +# self._nearest_node +# ]["position"] +# self.s_x, self.s_y = habitat_maps.to_grid( +# nn_position[2], +# nn_position[0], +# self._top_down_map.shape[0:2], +# self._sim, +# ) +# self.update_metric() + +# def update_metric(self, *args: Any, **kwargs: Any) -> None: +# self._step_count += 1 +# ( +# house_map, +# map_agent_pos, +# ) = self.update_map(self._sim.get_agent_state().position) + +# self._metric = { +# "map": house_map, +# "fog_of_war_mask": self._fog_of_war_mask, 
+# "agent_map_coord": map_agent_pos, +# "agent_angle": self.get_polar_angle(), +# "bounds": { +# k: v +# for k, v in zip( +# ["lower", "upper"], +# self._sim.pathfinder.get_bounds(), +# ) +# }, +# "meters_per_px": self._meters_per_pixel, +# } + +# def get_polar_angle(self) -> float: +# agent_state = self._sim.get_agent_state() +# # quaternion is in x, y, z, w format +# ref_rotation = agent_state.rotation + +# heading_vector = quaternion_rotate_vector( +# ref_rotation.inverse(), np.array([0, 0, -1]) +# ) + +# phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] +# z_neg_z_flip = np.pi +# return np.array(phi) + z_neg_z_flip + +# def update_map(self, agent_position: List[float]) -> None: +# a_x, a_y = habitat_maps.to_grid( +# agent_position[2], +# agent_position[0], +# self._top_down_map.shape[0:2], +# self._sim, +# ) +# # Don't draw over the source point +# gradient_color = 15 + min( +# self._step_count * 245 // self._config.MAX_EPISODE_STEPS, 245 +# ) +# if self._top_down_map[a_x, a_y] != maps.MAP_SOURCE_POINT_INDICATOR: +# maps.drawline( +# self._top_down_map, +# self._previous_xy_location, +# (a_y, a_x), +# gradient_color, +# thickness=int( +# self._map_resolution * 1.4 / maps.MAP_THICKNESS_SCALAR +# ), +# style="filled", +# ) + +# if self._config.FOG_OF_WAR.DRAW: +# self._fog_of_war_mask = fog_of_war.reveal_fog_of_war( +# self._top_down_map, +# self._fog_of_war_mask, +# np.array([a_x, a_y]), +# self.get_polar_angle(), +# self._config.FOG_OF_WAR.FOV, +# max_line_len=self._config.FOG_OF_WAR.VISIBILITY_DIST +# / habitat_maps.calculate_meters_per_pixel( +# self._map_resolution, sim=self._sim +# ), +# ) + +# point_padding = int(0.2 / self._meters_per_pixel) +# prev_nearest_node = self._nearest_node +# self._nearest_node = maps.update_nearest_node( +# self._conn_graphs[self._scene_id], +# self._nearest_node, +# np.take(agent_position, (0, 2)), +# ) +# if ( +# self._nearest_node != prev_nearest_node +# and self._config.DRAW_MP3D_AGENT_PATH +# ): +# nn_position = self._conn_graphs[self._scene_id].nodes[ +# self._nearest_node +# ]["position"] +# (prev_s_x, prev_s_y) = (self.s_x, self.s_y) +# self.s_x, self.s_y = habitat_maps.to_grid( +# nn_position[2], +# nn_position[0], +# self._top_down_map.shape[0:2], +# self._sim, +# ) +# self._top_down_map[ +# self.s_x +# - int(2.0 / 3.0 * point_padding) : self.s_x +# + int(2.0 / 3.0 * point_padding) +# + 1, +# self.s_y +# - int(2.0 / 3.0 * point_padding) : self.s_y +# + int(2.0 / 3.0 * point_padding) +# + 1, +# ] = gradient_color + +# maps.drawline( +# self._top_down_map, +# (prev_s_y, prev_s_x), +# (self.s_y, self.s_x), +# gradient_color, +# thickness=int( +# 1.0 +# / 2.0 +# * np.round( +# self._map_resolution / maps.MAP_THICKNESS_SCALAR +# ) +# ), +# ) + +# self._previous_xy_location = (a_y, a_x) +# map_agent_pos = (a_x, a_y) +# return self._top_down_map, map_agent_pos diff --git a/internnav/internnav_habitat/refactor_notes.md b/internnav/internnav_habitat/refactor_notes.md new file mode 100644 index 0000000..7a197b4 --- /dev/null +++ b/internnav/internnav_habitat/refactor_notes.md @@ -0,0 +1,66 @@ +# Refactoring `habitat_vln_evaluator` + +This note explains how to split the current `VLNEvaluator` implementation into the +framework's `Env` and `Agent` abstractions. + +## 1. Construction and configuration (lines 45-135) +* **Environment**: Habitat config loading, dataset split selection, sensor parameter + extraction, and measurement registration belong to the environment setup. 
These + responsibilities configure and own the simulator state and therefore should be + moved into a `HabitatVLNEnv` class that extends `Env`.【F:internnav/evaluator/habitat_vln_evaluator.py†L45-L103】 +* **Agent**: Model handles, prompt bootstrapping, conversation history, action + vocabulary, and instruction templates are part of the policy logic and should be + carried by a dedicated `HabitatVLNAgent` subclass. These fields initialize the + reasoning model rather than the simulator.【F:internnav/evaluator/habitat_vln_evaluator.py†L104-L135】 + +## 2. Perception utilities (lines 137-236) +Depth pre-processing, intrinsic matrix computation, coordinate transforms, and GPS +projection are tied to the simulator sensor geometry. They should move into the +`HabitatVLNEnv` so that observation tensors returned to the agent are already in a +consistent world frame.【F:internnav/evaluator/habitat_vln_evaluator.py†L137-L236】 + +## 3. Visualization helper (lines 238-309) +The dot-matrix overlay operates purely on rendered frames and can stay as an +environment utility. The helper should become a method of the environment (or a +separate visualization module) so evaluators can call it regardless of the agent. +【F:internnav/evaluator/habitat_vln_evaluator.py†L238-L309】 + +## 4. Low-level point navigation (lines 311-347) +The `_pointnav` helper controls a waypoint-following controller that consumes +processed observations and outputs low-level actions. Because it interacts with the +robot's state (goal resets, depth resizing, point-goal calculation), it fits inside +the environment. The agent can request point-goal actions through a method such as +`HabitatVLNEnv.pointnav(goal, depth, ...)`.【F:internnav/evaluator/habitat_vln_evaluator.py†L311-L347】 + +## 5. Main evaluation loop (lines 349-520) +* **Environment**: Episode iteration, resetting, stepping, intrinsic assembly, and + metric gathering should be owned by the environment. Wrapping Habitat's episode + lifecycle in `HabitatVLNEnv` keeps the evaluator thin and deterministic. +* **Agent**: Generating waypoint predictions, maintaining conversation turns, and + deciding discrete actions are policy responsibilities. The evaluator should ask + the new agent for an action by passing observations (RGB, depth, state metadata) + returned by the environment wrapper.【F:internnav/evaluator/habitat_vln_evaluator.py†L349-L520】 + +## 6. Language and action parsing (lines 522-680) +Instruction processing (`split_and_clean`, dynamic prompt assembly) and action string +parsing convert model text into executable commands. These should be encapsulated in +`HabitatVLNAgent` so the evaluator only receives structured actions (e.g., STOP, +MOVE, LOOK).【F:internnav/evaluator/habitat_vln_evaluator.py†L522-L680】 + +## 7. Metric aggregation and exports (lines 682-745) +Writing JSON lines, aggregating SPL/OS/NE, and optional video dumping can remain in +the evaluator, but the raw metrics originate from the environment through +`HabitatVLNEnv.get_metrics()` and rendering helpers. The evaluator should simply +post-process the aggregated numbers.【F:internnav/evaluator/habitat_vln_evaluator.py†L682-L745】 + +## Resulting structure +1. **`internnav/env/habitat_vln_env.py`**: wraps Habitat configuration, episode + control, sensor processing, point-nav helper, and visualization utilities. +2. **`internnav/agent/habitat_vln_agent.py`**: encapsulates the vision-language + model, prompt management, observation parsing, and action decoding. +3. 
**`internnav/evaluator/habitat_vln_evaluator.py`**: becomes a thin coordinator + that instantiates the env/agent via the registry, loops over episodes, and logs + metrics. + +This split brings the Habitat evaluator in line with the existing framework while +keeping domain-specific functionality in focused components. diff --git a/internnav/internnav_habitat/simple_npc/__init__.py b/internnav/internnav_habitat/simple_npc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/internnav/internnav_habitat/simple_npc/prompt.py b/internnav/internnav_habitat/simple_npc/prompt.py new file mode 100644 index 0000000..a2d09e0 --- /dev/null +++ b/internnav/internnav_habitat/simple_npc/prompt.py @@ -0,0 +1,87 @@ +TEMPLATE = { + 'one_turn_prompt': """ +You are a helpful assistant in helping agent to finish its navigation task. + +## Here is the ground truth information you know more than the agent +'TASK DONE' shows if the agent has finished the task, if it is false, you need to know that the agent hasn't found the goal object. +'GOAL INFORMATION' shows the goal object's information. +'CORRECT PATH' shows the correct path description to the goal object. + +TASK DONE: +{task_done} + +GOAL INFORMATION: +{goal_information} + +CORRECT PATH: +{path_description} + +## Some constraints you MUST follow: +1. Only output the answer to the question. +2. Don't be verbose. + +## Here is the question you need to answer +QUESTION: {question} +""", + "two_turn_prompt_0": """ +You are a helpful assistant in helping agent to finish its navigation task. You will be given a question among the following three types: +1. Disambiguation: This question is asked to check whether the agent has found the goal object. Like "Is it the object you are looking for?" +2. Path: This question is asked to get the path to the goal object. Like "Where should I go now?" +3. Information: This question is asked to get more information about the goal object. Like "Where is the goal object?", "What is the color of the goal object?" + +You need to classify the question into one of the three types. Only output the name of the type(disambiguation, path, information). Don't be verbose. + +## Here is the question you need to answer +QUESTION: {question} +""", + "two_turn_prompt_1": """ +You are a helpful assistant in answering the question. Here follows the ground truth information about the goal object. You need to answer the question based on the ground truth information. 
+ +## Here is the ground truth information about the goal object +GOAL INFORMATION: +{goal_information} + +## Here is the question you need to answer +QUESTION: {question} +""", +} + +DISAMBIGUATION_PROMPT = { + 'yes': [ + "Yes, you are in the correct position.", + "That's right, you are at the intended location.", + "Yes, you have reached the right spot.", + "Correct, you are in the proper place.", + "Yes, you are exactly where you need to be.", + "Yes, you are aligned correctly.", + "Yes, you are positioned accurately.", + "Everything looks good, you are at the correct location.", + "You are in the right area.", + "Yes, you are currently at the correct position.", + "That's perfect, you are in the right spot.", + "Yes, your position is accurate.", + "You have reached the proper location.", + "Yes, you are at the specified position.", + "Everything is aligned properly, you're in the correct spot.", + "Yes, you are where you should be.", + "Yes, this is the right place.", + ], + 'no': [ + "This is not the intended location.", + "You are not in the proper place.", + "No, you are not where you need to be.", + "No, you are not aligned correctly.", + "No, you are positioned incorrectly.", + "You are not at the correct location.", + "No, you are situated incorrectly.", + "You are in the wrong area.", + "No, you are not currently at the correct position.", + "That's not the right spot.", + "No, you are not at the intended destination.", + "Your position is inaccurate.", + "You haven't reached the proper location.", + "No, you are not at the specified position.", + "The alignment is off, you are in the wrong spot.", + "This is not the right place.", + ], +} diff --git a/internnav/internnav_habitat/simple_npc/scene_summary b/internnav/internnav_habitat/simple_npc/scene_summary new file mode 120000 index 0000000..ef6d02e --- /dev/null +++ b/internnav/internnav_habitat/simple_npc/scene_summary @@ -0,0 +1 @@ +/mnt/petrelfs/huangwensi/vln_llava/data/scene_summary \ No newline at end of file diff --git a/internnav/internnav_habitat/simple_npc/simple_npc.py b/internnav/internnav_habitat/simple_npc/simple_npc.py new file mode 100644 index 0000000..a1e0d4d --- /dev/null +++ b/internnav/internnav_habitat/simple_npc/simple_npc.py @@ -0,0 +1,174 @@ +import base64 +import random + +from .prompt import DISAMBIGUATION_PROMPT, TEMPLATE + + +class SimpleNPC: + def __init__( + self, + max_interaction_turn: int, + model_name: str, + openai_api_key: str, + base_url: str = None, + ) -> None: + try: + from openai import OpenAI + except ModuleNotFoundError: + print('ModuleNotFoundError: No module named \'openai\'. 
Please install it first.') + return + self.model_name = model_name + self.max_turn = max_interaction_turn + self.history_messages = [] + with open(openai_api_key, 'r', encoding='utf-8') as file: + openai_api_key = file.read().strip() + try: + self.llm = OpenAI(api_key=openai_api_key, base_url=base_url) + except Exception as e: + print(f'Failed to initialize OpenAI: {e}') + + def get_room_name(self, room): + room_name_dict = { + "living region": "living room", + "stair region": "stairs", + "bathing region": "bathroom", + "storage region": "storage room", + "study region": "study room", + "cooking region": "kitchen", + "sports region": "sports room", + "corridor region": "corridor", + "toliet region": "toilet", + "dinning region": "dining room", + "resting region": "resting room", + "open area region": "open area", + "other region": "area", + } + return room_name_dict[room] + + def answer_question( + self, question: str, instance_id: str, object_dict: dict, task_done: bool, path_description: str, mode: str + ): + if mode == 'one_turn': + goal_information = '' + goal_information += 'room: ' + self.get_room_name(object_dict[instance_id]['room']) + '\n' + goal_information += '\n'.join( + [ + f'{a.lower()}: {i.lower()}' + for a, i in object_dict[instance_id]['unique_description'].items() + if a in ['color', 'texture', 'material', 'shape', 'placement'] and len(i) > 0 + ] + ) + nearby_objects = [ + object_dict[obj]['unique_description']['fine grained category'].lower() + for obj, _ in object_dict[instance_id]['nearby_objects'].items() + if obj in object_dict and isinstance(object_dict[obj]['unique_description'], dict) + ] + if len(nearby_objects) > 0: + goal_information += '\nnearby objects: ' + ','.join(nearby_objects) + goal_information += 'whole description: ' + object_dict[instance_id]['caption'] + answer = self.ask_directly( + template_type="one_turn_prompt", + question=question, + goal_information=goal_information, + path_description=path_description, + task_done=task_done, + ) + return answer + elif mode == 'two_turn': + answer = self.ask_directly( + template_type="two_turn_prompt_0", + question=question, + ) + if 'path' in answer.lower(): + return path_description + elif 'disambiguation' in answer.lower(): + if task_done: + return random.choice(DISAMBIGUATION_PROMPT['yes']) + else: + return random.choice(DISAMBIGUATION_PROMPT['no']) + elif 'information' in answer.lower(): + goal_information = '' + goal_information += 'room: ' + self.get_room_name(object_dict[instance_id]['room']) + '\n' + goal_information += '\n'.join( + [ + f'{a.lower()}: {i.lower()}' + for a, i in object_dict[instance_id]['unique_description'].items() + if a in ['color', 'texture', 'material', 'shape', 'placement'] and len(i) > 0 + ] + ) + nearby_objects = [ + object_dict[obj]['unique_description']['fine grained category'].lower() + for obj, _ in object_dict[instance_id]['nearby_objects'].items() + if obj in object_dict and isinstance(object_dict[obj]['unique_description'], dict) + ] + if len(nearby_objects) > 0: + goal_information += '\nnearby objects: ' + ','.join(nearby_objects) + goal_information += 'whole description: ' + object_dict[instance_id]['caption'] + answer = self.ask_directly( + template_type="one_turn_prompt", + question=question, + goal_information=goal_information, + path_description=path_description, + task_done=task_done, + ) + answer = self.answer_question(question, instance_id, object_dict, task_done, answer, 'one_turn') + return answer + else: + raise ValueError(f"Invalid mode: {mode}") + + def 
ask_directly(self, template_type, **kwargs): + def generate_prompt(template_type, **kwargs): + """ + Generate a complete prompt based on the template type and provided content. + + Parameters: + template_type (str): The type of template to use. + **kwargs: The content to fill in the template. + + Returns: + str: The complete prompt. + """ + + prompt = TEMPLATE.get(template_type, None) + if prompt is None: + raise ValueError(f"Template type '{template_type}' not found.") + prompt = prompt.format(**kwargs) + return prompt + + messages = [] + image_bufs = kwargs.get('images', None) + cnt = 0 + prompt = generate_prompt(template_type, **kwargs) + content = [{'type': 'text', 'text': prompt}] + if image_bufs is not None: + for im_id, image_buf in enumerate(image_bufs): + img_encoded = base64.b64encode(image_buf.getvalue()).decode('utf-8') + image_buf.close() + item = { + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/png;base64,{img_encoded}', + 'detail': 'high', + }, + 'index': im_id, + } + content.append(item) + messages.append({'role': 'user', 'content': content}) + + while cnt < self.max_turn: + try: + response = self.llm.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=2048, + top_p=1, + frequency_penalty=0, + presence_penalty=0, + ) + result = response.choices[0].message.content + break + except Exception as e: + print(e) + cnt += 1 + result = None + return result diff --git a/scripts/eval/bash/srun_eval_dialog.sh b/scripts/eval/bash/srun_eval_dialog.sh new file mode 100755 index 0000000..cc54fb1 --- /dev/null +++ b/scripts/eval/bash/srun_eval_dialog.sh @@ -0,0 +1,15 @@ +# use to run distributed eval with 8 gpus on single node +export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet +export NCCL_SOCKET_IFNAME=bond0 +export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5 + +srun -p mozi_t \ + --gres=gpu:1 \ + --ntasks=1 \ + --time=0-20:00:00 \ + --ntasks-per-node=1 \ + --cpus-per-task=8 \ + --exclude=HOST-10-140-66-53,HOST-10-140-66-69 \ + --kill-on-bad-exit=1 \ + python scripts/eval/eval.py \ + --config scripts/eval/configs/habitat_dialog_cfg.py \ diff --git a/scripts/eval/bash/srun_eval_object.sh b/scripts/eval/bash/srun_eval_object.sh new file mode 100755 index 0000000..165c6c9 --- /dev/null +++ b/scripts/eval/bash/srun_eval_object.sh @@ -0,0 +1,15 @@ +# use to run distributed eval with 8 gpus on single node +export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet +export NCCL_SOCKET_IFNAME=bond0 +export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5 + +srun -p efm_t \ + --gres=gpu:8 \ + --ntasks=8 \ + --time=0-20:00:00 \ + --ntasks-per-node=8 \ + --cpus-per-task=8 \ + --exclude=HOST-10-140-66-53,HOST-10-140-66-69 \ + --kill-on-bad-exit=1 \ + python scripts/eval/eval.py \ + --config scripts/eval/configs/habitat_object_cfg.py \ diff --git a/scripts/eval/configs/gen_videos.yaml b/scripts/eval/configs/gen_videos.yaml new file mode 100644 index 0000000..91fdff3 --- /dev/null +++ b/scripts/eval/configs/gen_videos.yaml @@ -0,0 +1,64 @@ +exp: + num_environments: 1 + dump_location: datadump_qualitative + exp_name: llava3d + no_gpu: 0 + seed: 1 + visualize: True + print_images: 0 + num_sem_categories: 10 + goal_type: object + goal_map_only: False + detector: mp3d_gt + frame_width: 90 + frame_height: 160 + vocabulary: hm3d + task: object + +agent: + max_steps: 500 + pointnav_stop_radius: 0.9 + num_goal_categories: 6 + use_vision_language_feat: False + model_path: checkpoints/llava-v1.5-7b-waypoint-position-p-encoder + conv_mode: llava_v1 + seq: "," + temperature: 0.2 + top_p: null + 
num_beams: 1 + max_new_tokens: 512 + semantic_map: + semantic_categories: hm3d_9cat # "hm3d_90cat" #map semantic channel categories ("coco_indoor", "longtail_indoor", "mukul_indoor") + num_sem_categories: 10 # number of map semantic channel categories (16, 257, 35) + agent_height_project: False + map_size_cm: 4800 # global map size (in centimeters) + map_resolution: 5 # size of map bins (in centimeters) + vision_range: 100 # diameter of local map region visible by the agent (in cells) + global_downscaling: 2 # ratio of global over local map + du_scale: 1 # frame downscaling before projecting to point cloud + cat_pred_threshold: 5.0 # number of depth points to be in bin to classify it as a certain semantic category + exp_pred_threshold: 1.0 # number of depth points to be in bin to consider it as explored + map_pred_threshold: 0.1 # number of depth points to be in bin to consider it as obstacle + explored_radius: 150 # radius (in centimeters) of visually explored region + been_close_to_radius: 200 # radius (in centimeters) of been close to region + must_explore_close: False + min_obs_height_cm: 25 # minimum height (in centimeters) of obstacle to be considered as obstacle + global_update_steps: 500 + # erosion and filtering to reduce the number of spurious artifacts + dilate_obstacles: False + dilate_size: 3 + dilate_iter: 1 + goal_reached_dist: 20 + + planner: + collision_threshold: 0.20 # forward move distance under which we consider there's a collision (in meters) + min_obs_dilation_selem_radius: 3 # radius (in cells) of obstacle dilation structuring element + obs_dilation_selem_radius: 5 # radius (in cells) of obstacle dilation structuring element + goal_dilation_selem_radius: 5 # radius (in cells) of goal dilation structuring element + use_dilation_for_stg: True # use dilated goals for estimating short-term goals - or just reaching + map_downsample_factor: 1 # optional downsampling of traversible and goal map before fmm distance call (1 for no downsampling, 2 for halving resolution) + map_update_frequency: 1 # compute fmm distance map every n steps + step_size: 10 # maximum distance of the short-term goal selected by the planner + magnify_goal_when_hard: 100 + discrete_actions: True + verbose: False diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py new file mode 100644 index 0000000..a079c0c --- /dev/null +++ b/scripts/eval/configs/habitat_cfg.py @@ -0,0 +1,40 @@ +from internnav.configs.agent import AgentCfg +from internnav.configs.evaluator import EnvCfg, EvalCfg + +eval_cfg = EvalCfg( + agent=AgentCfg( + server_port=8087, + model_name='internvla_n1', + ckpt_path='', + model_settings={ + "mode": "dual_system", # inference mode: dual_system or system2 + "model_path": "checkpoints/InternVLA-N1", # path to model checkpoint + "num_future_steps": 4, # number of future steps for prediction + "num_frames": 32, # number of frames used in evaluation + "num_history": 8, + "resize_w": 384, # image resize width + "resize_h": 384, # image resize height + "predict_step_nums": 32, # number of steps to predict + "continuous_traj": True, # whether to use continuous trajectory + "max_new_tokens": 1024, # maximum number of tokens for generation + }, + ), + env=EnvCfg( + env_type='habitat', + env_settings={ + # habitat sim specifications - agent, sensors, tasks, measures etc. 
are defined in the habitat config file + 'config_path': 'scripts/eval/configs/vln_r2r.yaml', + }, + ), + eval_type='habitat_vln', + eval_settings={ + # all current parse args + "output_path": "./logs/habitat/test", # output directory for logs/results + "save_video": False, # whether to save videos + "epoch": 0, # epoch number for logging + "max_steps_per_episode": 500, # maximum steps per episode + # distributed settings + "port": "2333", # communication port + "dist_url": "env://", # url for distributed setup + }, +) diff --git a/scripts/eval/configs/habitat_dialog_cfg.py b/scripts/eval/configs/habitat_dialog_cfg.py new file mode 100755 index 0000000..4e9f5d1 --- /dev/null +++ b/scripts/eval/configs/habitat_dialog_cfg.py @@ -0,0 +1,57 @@ +from internnav.configs.agent import AgentCfg +from internnav.configs.evaluator import EnvCfg, EvalCfg, TaskCfg + +eval_cfg = EvalCfg( + remote_agent=False, + agent=AgentCfg( + server_port=8087, + model_name='dialog', + ckpt_path='', + model_settings={ + "mode": "system2", # inference mode: dual_system or system2 + "dialog_enabled": True, + "model_path": "checkpoints/Vlln-dialog", # path to model checkpoint + "append_look_down": False, + "num_history": 8, + "resize_w": 384, # image resize width + "resize_h": 384, # image resize height + "max_new_tokens": 128, # maximum number of tokens for generation + }, + ), + env=EnvCfg( + env_type='habitat', + env_settings={ + # habitat sim specifications - agent, sensors, tasks, measures etc. are defined in the habitat config file + 'baseline_config_path': 'scripts/eval/configs/gen_videos.yaml', + 'habitat_config_path': 'scripts/eval/configs/instance_dialog.yaml', + # 'habitat_config_path': 'scripts/eval/configs/objectnav_hm3d.yaml', + # 'habitat_config_path': 'scripts/eval/configs/instance.yaml', + }, + ), + task=TaskCfg( + task_name="instance_dialog" + # task_name = "object", + # task_name = "instance", + ), + eval_type="habitat_dialog", + eval_settings={ + # all current parse args + "output_path": "./logs/habitat/dialog", # output directory for logs/results + "epoch": 0, # epoch number for logging + "max_steps_per_episode": 500, # maximum steps per episode + # task setting + "eval_split": "easy_same", + # "eval_split": "val", + # "eval_split": "easy_same", + "turn": 5, + "save_video": False, # whether to save videos + # npc setting + "base_url": 'http://35.220.164.252:3888/v1', + "model_name": "gpt-4o", + "openai_api_key": 'internnav/internnav_habitat/simple_npc/api_key.txt', + "scene_summary": 'internnav/internnav_habitat/simple_npc/scene_summary', + # distributed settings + "port": "2333", # communication port + "dist_url": "env://", # url for distributed setup + }, +) diff --git a/scripts/eval/configs/habitat_object_cfg.py b/scripts/eval/configs/habitat_object_cfg.py new file mode 100755 index 0000000..ae04806 --- /dev/null +++ b/scripts/eval/configs/habitat_object_cfg.py @@ -0,0 +1,57 @@ +from internnav.configs.agent import AgentCfg +from internnav.configs.evaluator import EnvCfg, EvalCfg, TaskCfg + +eval_cfg = EvalCfg( + remote_agent=False, + agent=AgentCfg( + server_port=8087, + model_name='dialog', + ckpt_path='', + model_settings={ + "mode": "system2", # inference mode: dual_system or system2 + "dialog_enabled": False, + "model_path": "checkpoints/Vlln-object", # path to model checkpoint + "append_look_down": True, + "num_history": 8, + "resize_w": 384, # image resize width + "resize_h": 384, # image resize height + "max_new_tokens": 128, # maximum number of tokens for generation + }, + ), + env=EnvCfg( + 
env_type='habitat', + env_settings={ + # habitat sim specifications - agent, sensors, tasks, measures etc. are defined in the habitat config file + 'baseline_config_path': 'scripts/eval/configs/gen_videos.yaml', + # 'habitat_config_path': 'scripts/eval/configs/instance_dialog.yaml', + 'habitat_config_path': 'scripts/eval/configs/objectnav_hm3d.yaml', + # 'habitat_config_path': 'scripts/eval/configs/instance.yaml', + }, + ), + task=TaskCfg( + # task_name = "instance_dialog" + task_name="objectnav", + # task_name = "instance", + ), + eval_type="habitat_dialog", + eval_settings={ + # all current parse args + "output_path": "./logs/habitat/object", # output directory for logs/results + "epoch": 0, # epoch number for logging + "max_steps_per_episode": 500, # maximum steps per episode + # task setting + # "eval_split": "easy_same", + "eval_split": "val", + # "eval_split": "easy_same", + "turn": 5, + "save_video": False, # whether to save videos + # npc setting + "base_url": 'http://35.220.164.252:3888/v1', + "model_name": "gpt-4o", + "openai_api_key": 'internnav/internnav_habitat/simple_npc/api_key.txt', + "scene_summary": 'internnav/internnav_habitat/simple_npc/scene_summary', + # distributed settings + "port": "2333", # communication port + "dist_url": "env://", # url for distributed setup + }, +) diff --git a/scripts/eval/configs/instance_dialog.yaml b/scripts/eval/configs/instance_dialog.yaml new file mode 100644 index 0000000..baa96ab --- /dev/null +++ b/scripts/eval/configs/instance_dialog.yaml @@ -0,0 +1,79 @@ +# @package _global_ + +defaults: + - /habitat: habitat_config_base + - /habitat/task: objectnav + # - habitat/task/measurements: dialog_success + - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgbd_agent + - /habitat/dataset/objectnav: mp3d + - /habitat/task/lab_sensors: + - gps_sensor + - compass_sensor + - _self_ + +habitat: + environment: + max_episode_steps: 2000 + iterator_options: + max_scene_repeat_steps: 50000 + shuffle: False + simulator: + agents: + main_agent: + sim_sensors: + rgb_sensor: + width: 640 + height: 480 + hfov: 79 + depth_sensor: + width: 640 + height: 480 + hfov: 79 + min_depth: 0.0 + max_depth: 10.0 + forward_step_size: 0.25 + turn_angle: 30 + tilt_angle: 15 + action_space_config: "v1" + habitat_sim_v0: + allow_sliding: True + gpu_device_id: 0 + task: + measurements: + distance_to_goal: + type: DistanceToGoal + distance_to: VIEW_POINTS + success: + type: Success + success_distance: 0.25 + spl: + type: SPL + oracle_success: + type: OracleSuccess + oracle_navigation_error: + type: OracleNavigationError + actions: + stop: + type: StopAction + agent_index: 0 + move_forward: + type: MoveForwardAction + agent_index: 0 + turn_left: + type: TurnLeftAction + agent_index: 0 + turn_right: + type: TurnRightAction + agent_index: 0 + look_up: + type: LookUpAction + agent_index: 0 + look_down: + type: LookDownAction + agent_index: 0 + + dataset: + type: dialog + split: unseen_mini + scenes_dir: data/scene_datasets/ + data_path: data/datasets/instance_goal_dialog/unseen/final_VLLN_testset.json.gz diff --git a/scripts/eval/configs/objectnav_hm3d.yaml b/scripts/eval/configs/objectnav_hm3d.yaml new file mode 100644 index 0000000..dba3dbf --- /dev/null +++ b/scripts/eval/configs/objectnav_hm3d.yaml @@ -0,0 +1,83 @@ +# @package _global_ + +defaults: + - /habitat: habitat_config_base + - /habitat/task: objectnav + - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgbd_agent + - /habitat/dataset/objectnav: hm3d + - /habitat/task/lab_sensors: + - 
gps_sensor + - compass_sensor + - _self_ + +habitat: + environment: + max_episode_steps: 2000 + iterator_options: + max_scene_repeat_steps: 50000 + shuffle: False + simulator: + agents: + main_agent: + sim_sensors: + rgb_sensor: + width: 640 + height: 480 + hfov: 79 + position: [0, 1.25, 0] + depth_sensor: + width: 640 + height: 480 + hfov: 79 + min_depth: 0.0 + max_depth: 10.0 + position: [0, 1.25, 0] + height: 1.25 + radius: 0.18 + forward_step_size: 0.25 + turn_angle: 30 + tilt_angle: 15 + action_space_config: "v1" + habitat_sim_v0: + gpu_device_id: 0 + allow_sliding: True + task: + measurements: + distance_to_goal: + type: DistanceToGoal + distance_to: VIEW_POINTS + success: + type: Success + success_distance: 0.25 + spl: + type: SPL + oracle_success: + type: OracleSuccess + oracle_navigation_error: + type: OracleNavigationError + actions: + stop: + type: StopAction + agent_index: 0 + move_forward: + type: MoveForwardAction + agent_index: 0 + turn_left: + type: TurnLeftAction + agent_index: 0 + turn_right: + type: TurnRightAction + agent_index: 0 + look_up: + type: LookUpAction + agent_index: 0 + look_down: + type: LookDownAction + agent_index: 0 + + dataset: + type: ObjectNav-v1 + split: val + scenes_dir: data/scene_datasets/ + data_path: data/datasets/objectnav_hm3d_v2/{split}/{split}.json.gz + # data_path: data/datasets/objectnav/mp3d/v1/val/val.json.gz diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 4c17026..8367178 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -18,6 +18,9 @@ def parse_args(): default='scripts/eval/configs/h1_rdp_cfg.py', help='eval config file path, e.g. scripts/eval/configs/h1_cma_cfg.py', ) + parser.add_argument('--port', type=int, default=None) + parser.add_argument('--host', type=str, default=None) + parser.add_argument('--dist_eval', action="store_true", default=False) return parser.parse_args()
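
To make the split described in `internnav/internnav_habitat/refactor_notes.md` concrete, here is a minimal sketch of what the proposed `HabitatVLNEnv` / `HabitatVLNAgent` interfaces and the thin evaluator loop could look like. This is illustrative only: the class names, method signatures, and helper `run_episode` below are assumptions drawn from the notes, not existing framework APIs.

```python
# Illustrative sketch only -- names and signatures are assumptions based on
# refactor_notes.md, not the framework's actual API.
from typing import Any, Dict, Tuple

import numpy as np


class HabitatVLNEnv:
    """Hypothetical Env wrapper that owns the Habitat simulator state (sections 1-4)."""

    def __init__(self, env_settings: Dict[str, Any]) -> None:
        # Would load the Habitat config referenced by env_settings['config_path'],
        # build the simulator, and register measurements here.
        self.config_path = env_settings["config_path"]

    def reset(self) -> Dict[str, np.ndarray]:
        # Returns the first observation of an episode (RGB, depth, GPS/compass),
        # already transformed into a consistent world frame (section 2).
        raise NotImplementedError

    def step(self, action: int) -> Tuple[Dict[str, np.ndarray], bool]:
        # Applies one discrete action and returns (observation, episode_done).
        raise NotImplementedError

    def pointnav(self, goal_xy: np.ndarray, depth: np.ndarray) -> int:
        # Low-level waypoint following on processed observations (section 4).
        raise NotImplementedError

    def get_metrics(self) -> Dict[str, float]:
        # Raw SPL / OS / NE values consumed by the evaluator (section 7).
        raise NotImplementedError


class HabitatVLNAgent:
    """Hypothetical Agent wrapper that owns the vision-language model (sections 1, 5, 6)."""

    def __init__(self, model_settings: Dict[str, Any]) -> None:
        self.model_settings = model_settings
        self.instruction = ""
        self.history: list = []

    def reset(self, instruction: str) -> None:
        self.instruction = instruction
        self.history.clear()

    def act(self, observation: Dict[str, np.ndarray]) -> int:
        # Prompt assembly, generation, and action-string parsing (section 6)
        # would live here; the evaluator only receives the decoded action id.
        raise NotImplementedError


def run_episode(
    env: HabitatVLNEnv,
    agent: HabitatVLNAgent,
    instruction: str,
    max_steps: int = 500,
) -> Dict[str, float]:
    """Thin coordinator loop as described in the 'Resulting structure' section."""
    obs = env.reset()
    agent.reset(instruction)
    for _ in range(max_steps):
        action = agent.act(obs)
        obs, done = env.step(action)
        if done:
            break
    return env.get_metrics()
```

Under such a split, `habitat_vln_evaluator.py` would only instantiate the env and agent (e.g. via the framework's registries), loop over episodes with something like `run_episode`, and aggregate the returned metrics, which is the behavior the notes describe in sections 5-7.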