NVIDIA
diff --git a/‎STYLE_GUIDE.md‎ ‎STYLEGUIDE.md‎STYLE_GUIDE.md renamed to STYLEGUIDE.md b/‎STYLE_GUIDE.md‎ ‎STYLEGUIDE.md‎STYLE_GUIDE.md renamed to STYLEGUIDE.md
diff --git a/‎crates/navigator-cli/src/run.rs‎
Lines changed: 29 additions & 11 deletions b/‎crates/navigator-cli/src/run.rs‎
Lines changed: 29 additions & 11 deletions
diff --git a/‎crates/navigator-cli/src/ssh.rs‎
Lines changed: 11 additions & 0 deletions b/‎crates/navigator-cli/src/ssh.rs‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎crates/navigator-server/src/ssh_tunnel.rs‎
Lines changed: 48 additions & 4 deletions b/‎crates/navigator-server/src/ssh_tunnel.rs‎
Lines changed: 48 additions & 4 deletions
diff --git a/‎deploy/docker/Dockerfile.cluster‎
Lines changed: 7 additions & 4 deletions b/‎deploy/docker/Dockerfile.cluster‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎deploy/docker/cluster-entrypoint.sh‎
Lines changed: 40 additions & 0 deletions b/‎deploy/docker/cluster-entrypoint.sh‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎deploy/kube/manifests/navigator-helmchart.yaml‎
Lines changed: 1 addition & 0 deletions b/‎deploy/kube/manifests/navigator-helmchart.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tasks/ci.toml‎
Lines changed: 20 additions & 18 deletions b/‎tasks/ci.toml‎
Lines changed: 20 additions & 18 deletions
diff --git a/‎tasks/cluster.toml‎
Lines changed: 0 additions & 11 deletions b/‎tasks/cluster.toml‎
Lines changed: 0 additions & 11 deletions
diff --git a/‎tasks/publish.toml‎
Lines changed: 0 additions & 24 deletions b/‎tasks/publish.toml‎
Lines changed: 0 additions & 24 deletions
@@ -143,12 +143,12 @@ impl LogDisplay {
     fn finish_phase(&mut self, phase: &str) {
         self.phase = phase.to_string();
         self.latest_log.clear();
-        self.spinner
-            .finish_with_message(format_phase_label(&self.phase));
-    }
-
-    fn shutdown(&self) {
-        self.spinner.disable_steady_tick();
+        // Print the final phase as a static line above the spinner, then
+        // clear the spinner.  The printed line stays in scrollback, whereas
+        // finish_and_clear() on its own would erase the phase label too.
+        let _ = self
+            .mp
+            .println(format!("  {}", format_phase_label(&self.phase)));
         self.spinner.finish_and_clear();
     }
 
@@ -1085,6 +1085,11 @@ pub async fn sandbox_create(
         println!("  {}", format_phase_label(phase_name(sandbox.phase)));
     }
 
+    // Don't use stop_on_terminal on the server — the Kubernetes CRD may
+    // briefly report a stale Ready status before the controller reconciles
+    // a newly created sandbox.  Instead we handle termination client-side:
+    // we wait until we have observed at least one non-Ready phase followed
+    // by Ready (a genuine Provisioning → Ready transition).
     let mut stream = client
         .watch_sandbox(WatchSandboxRequest {
             id: sandbox.id.clone(),
@@ -1093,10 +1098,8 @@ pub async fn sandbox_create(
             follow_events: true,
             log_tail_lines: 200,
             event_tail: 0,
-            stop_on_terminal: true,
+            stop_on_terminal: false,
             log_since_ms: 0,
-            // Only show gateway logs during provisioning — sandbox logs would
-            // keep the stream alive indefinitely and prevent stop_on_terminal.
             log_sources: vec!["gateway".to_string()],
             log_min_level: String::new(),
         })
@@ -1106,6 +1109,8 @@ pub async fn sandbox_create(
 
     let mut last_phase = sandbox.phase;
     let mut last_error_reason = String::new();
+    // Track whether we have seen a non-Ready phase during the watch.
+    let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready);
     let start_time = Instant::now();
     let provision_timeout = Duration::from_secs(120);
 
@@ -1125,10 +1130,16 @@ pub async fn sandbox_create(
         let evt = item.into_diagnostic()?;
         match evt.payload {
             Some(navigator_core::proto::sandbox_stream_event::Payload::Sandbox(s)) => {
+                let phase = SandboxPhase::try_from(s.phase).unwrap_or(SandboxPhase::Unknown);
                 last_phase = s.phase;
+
+                if phase != SandboxPhase::Ready {
+                    saw_non_ready = true;
+                }
+
                 // Capture error reason from conditions only when phase is Error
                 // to avoid showing stale transient error reasons
-                if SandboxPhase::try_from(s.phase) == Ok(SandboxPhase::Error)
+                if phase == SandboxPhase::Error
                     && let Some(status) = &s.status
                 {
                     for condition in &status.conditions {
@@ -1145,6 +1156,12 @@ pub async fn sandbox_create(
                 } else {
                     println!("  {}", format_phase_label(phase_name(s.phase)));
                 }
+
+                // Only accept Ready as terminal after we've observed a
+                // non-Ready phase, proving the controller has reconciled.
+                if saw_non_ready && phase == SandboxPhase::Ready {
+                    break;
+                }
             }
             Some(navigator_core::proto::sandbox_stream_event::Payload::Log(line)) => {
                 if let Some(d) = display.as_mut() {
@@ -1180,7 +1197,6 @@ pub async fn sandbox_create(
     // Finish up - check final phase
     if let Some(d) = display.as_mut() {
         d.finish_phase(phase_name(last_phase));
-        d.shutdown();
     }
     drop(display);
     let _ = std::io::stdout().flush();
@@ -1229,9 +1245,11 @@ pub async fn sandbox_create(
             }
 
             if command.is_empty() {
+                eprintln!("Connecting...");
                 return sandbox_connect(&effective_server, &sandbox_name, &effective_tls).await;
             }
 
+            eprintln!("Connecting...");
             let exec_result = sandbox_exec(
                 &effective_server,
                 &sandbox_name,
 
@@ -235,6 +235,17 @@ pub async fn sandbox_exec(
         .stdout(std::process::Stdio::inherit())
         .stderr(std::process::Stdio::inherit());
 
+    // For interactive TTY sessions, replace this process with SSH via exec()
+    // to avoid signal handling issues (e.g. Ctrl+C killing the parent ncl
+    // process and orphaning the SSH child).
+    if tty && std::io::stdin().is_terminal() {
+        #[cfg(unix)]
+        {
+            let err = ssh.exec();
+            return Err(miette::miette!("failed to exec ssh: {err}"));
+        }
+    }
+
     let status = tokio::task::spawn_blocking(move || ssh.status())
         .await
         .into_diagnostic()?
 
@@ -11,6 +11,7 @@ use hyper_util::rt::TokioIo;
 use navigator_core::proto::{Sandbox, SandboxPhase, SshSession};
 use std::net::SocketAddr;
 use std::sync::Arc;
+use std::time::Duration;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::net::TcpStream;
 use tracing::{info, warn};
@@ -127,10 +128,53 @@ async fn handle_tunnel(
     secret: &str,
     sandbox_id: &str,
 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    let mut upstream = match target {
-        ConnectTarget::Ip(addr) => TcpStream::connect(addr).await?,
-        ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), port)).await?,
-    };
+    // The sandbox pod may not be network-reachable immediately after the CRD
+    // reports Ready (DNS propagation, pod IP assignment, SSH server startup).
+    // Retry the TCP connection with exponential backoff.
+    let mut upstream = None;
+    let mut last_err = None;
+    let delays = [
+        Duration::from_millis(100),
+        Duration::from_millis(250),
+        Duration::from_millis(500),
+        Duration::from_secs(1),
+        Duration::from_secs(2),
+        Duration::from_secs(5),
+        Duration::from_secs(10),
+        Duration::from_secs(15),
+    ];
+    for (attempt, delay) in std::iter::once(&Duration::ZERO)
+        .chain(delays.iter())
+        .enumerate()
+    {
+        if !delay.is_zero() {
+            tokio::time::sleep(*delay).await;
+        }
+        let result = match &target {
+            ConnectTarget::Ip(addr) => TcpStream::connect(addr).await,
+            ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), *port)).await,
+        };
+        match result {
+            Ok(stream) => {
+                if attempt > 0 {
+                    info!(
+                        sandbox_id = %sandbox_id,
+                        attempts = attempt + 1,
+                        "SSH tunnel connected after retry"
+                    );
+                }
+                upstream = Some(stream);
+                break;
+            }
+            Err(err) => {
+                last_err = Some(err);
+            }
+        }
+    }
+    let mut upstream = upstream.ok_or_else(|| match last_err {
+        Some(err) => format!("failed to connect to sandbox after retries: {err}"),
+        None => String::from("failed to connect to sandbox: no connection attempt was made"),
+    })?;
     upstream.set_nodelay(true)?;
     let preface = build_preface(token, secret)?;
     upstream.write_all(preface.as_bytes()).await?;
 
@@ -18,11 +18,12 @@
 ARG K3S_VERSION=v1.29.8-k3s1
 FROM rancher/k3s:${K3S_VERSION}
 
-# Create directories for manifests and configuration
+# Create directories for manifests, charts, and configuration
 RUN mkdir -p /var/lib/rancher/k3s/server/manifests \
              /var/lib/rancher/k3s/server/static/charts \
              /etc/rancher/k3s \
-             /opt/navigator/manifests
+             /opt/navigator/manifests \
+             /opt/navigator/charts
 
 # Copy entrypoint script that configures DNS for Docker environments
 # This script detects the host gateway IP and configures CoreDNS to use it
@@ -36,8 +37,10 @@ RUN chmod +x /usr/local/bin/cluster-healthcheck.sh
 # Registry credentials for pulling component images at runtime are generated
 # by the entrypoint script at /etc/rancher/k3s/registries.yaml.
 
-# Copy packaged helm chart to static directory for serving via k3s API
-COPY deploy/docker/.build/charts/*.tgz /var/lib/rancher/k3s/server/static/charts/
+# Copy packaged helm charts to a staging directory that won't be
+# overwritten by the /var/lib/rancher/k3s volume mount. The entrypoint
+# script copies them into the k3s static charts directory at container start.
+COPY deploy/docker/.build/charts/*.tgz /opt/navigator/charts/
 
 # Copy Kubernetes manifests to a persistent location that won't be overwritten by the volume mount.
 # The bootstrap code will copy these to /var/lib/rancher/k3s/server/manifests/ after cluster start.
 
@@ -127,6 +127,35 @@ else
     echo "Warning: REGISTRY_HOST not set; skipping registry config"
 fi
 
+# Copy bundled Helm chart tarballs to the k3s static charts directory.
+# These are stored in /opt/navigator/charts/ because the volume mount
+# on /var/lib/rancher/k3s overwrites any files baked into that path.
+# Without this, a persistent volume from a previous deploy would keep
+# serving stale chart tarballs.
+K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts"
+BUNDLED_CHARTS="/opt/navigator/charts"
+CHART_CHECKSUM=""
+
+if [ -d "$BUNDLED_CHARTS" ]; then
+    echo "Copying bundled charts to k3s..."
+    for chart in "$BUNDLED_CHARTS"/*.tgz; do
+        [ ! -f "$chart" ] && continue
+        cp "$chart" "$K3S_CHARTS/"
+    done
+    # Compute a checksum of the navigator chart so we can inject it into the
+    # HelmChart manifest below. When the chart content changes between image
+    # versions the checksum changes, which modifies the HelmChart CR spec and
+    # forces the k3s Helm controller to re-install.
+    NAV_CHART="$BUNDLED_CHARTS/navigator-0.1.0.tgz"
+    if [ -f "$NAV_CHART" ]; then
+        if command -v sha256sum >/dev/null 2>&1; then
+            CHART_CHECKSUM=$(sha256sum "$NAV_CHART" | cut -d ' ' -f 1)
+        elif command -v shasum >/dev/null 2>&1; then
+            CHART_CHECKSUM=$(shasum -a 256 "$NAV_CHART" | cut -d ' ' -f 1)
+        fi
+    fi
+fi
+
 # Copy bundled manifests to k3s manifests directory.
 # These are stored in /opt/navigator/manifests/ because the volume mount
 # on /var/lib/rancher/k3s overwrites any files baked into that path.
@@ -239,5 +268,16 @@ if [ -f "$HELMCHART" ]; then
     fi
 fi
 
+# Inject chart checksum into the HelmChart manifest so that a changed chart
+# tarball causes the HelmChart CR spec to differ, forcing the k3s Helm
+# controller to upgrade the release.
+if [ -n "$CHART_CHECKSUM" ] && [ -f "$HELMCHART" ]; then
+    echo "Injecting chart checksum: ${CHART_CHECKSUM}"
+    sed -i "s|__CHART_CHECKSUM__|${CHART_CHECKSUM}|g" "$HELMCHART"
+elif [ -f "$HELMCHART" ]; then
+    # No checksum available: drop the placeholder line so the literal token is never used as a value.
+    sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART"
+fi
+
 # Execute k3s with explicit resolv-conf.
 exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF"
@@ -22,6 +22,7 @@ spec:
   targetNamespace: navigator
   createNamespace: true
   valuesContent: |-
+    chartChecksum: __CHART_CHECKSUM__
     image:
       repository: d1i0nduu2f6qxk.cloudfront.net/navigator/server
       tag: latest
 
@@ -1,7 +1,26 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-# CI and quality tasks
+# CI, build, and quality tasks
+
+[build]
+description = "Build all Rust crates"
+run = "cargo build --workspace"
+hide = true
+
+["build:release"]
+description = "Build all Rust crates in release mode"
+run = "cargo build --workspace --release"
+hide = true
+
+[check]
+description = "Run fast compile and type checks"
+depends = ["rust:check", "python:typecheck"]
+hide = true
+
+[clean]
+description = "Clean build artifacts"
+run = "cargo clean"
 
 [fmt]
 description = "Format Rust and Python code"
@@ -26,20 +45,3 @@ hide = true
 description = "Alias for ci"
 depends = ["ci"]
 hide = true
-
-[sandbox]
-description = "Create a sandbox on the running cluster"
-raw = true
-usage = """
-arg "[command]" var=#true help="Command to run in the sandbox (default: interactive agent)"
-"""
-run = """
-#!/usr/bin/env bash
-set -euo pipefail
-CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")}
-CONTAINER_NAME="navigator-cluster-${CLUSTER_NAME}"
-if ! docker ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then
-  mise run cluster
-fi
-ncl sandbox create -- ${usage_command:-claude}
-"""
 
@@ -16,17 +16,6 @@ depends = [
 run = "tasks/scripts/cluster-bootstrap.sh build"
 hide = true
 
-["cluster:sandbox"]
-description = "Run the sandbox container with an interactive shell"
-depends = ["docker:build:sandbox"]
-raw = true
-usage = """
-flag "-e --env <env>" var=#true help="Environment variables to pass into the sandbox"
-arg "[command]" var=#true help="Command to run in the sandbox (default: /bin/bash)"
-"""
-run = "bash tasks/scripts/run-sandbox.sh"
-hide = true
-
 ["cluster:deploy"]
 description = "Alias for cluster (incremental deploy)"
 run = "tasks/scripts/cluster.sh"
 
@@ -3,30 +3,6 @@
 
 # Publishing / release tasks
 
-["python:publish"]
-description = "Build and publish Python wheels"
-run = """
-#!/usr/bin/env bash
-set -euo pipefail
-VERSION=$(uv run python tasks/scripts/release.py get-version --python)
-CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
-NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all
-uv run python tasks/scripts/release.py python-publish --version "$VERSION"
-"""
-hide = true
-
-["python:publish:macos"]
-description = "Build and publish macOS arm64 Python wheel"
-run = """
-#!/usr/bin/env bash
-set -euo pipefail
-VERSION=$(uv run python tasks/scripts/release.py get-version --python)
-CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
-NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
-uv run python tasks/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl"
-"""
-hide = true
-
 ["publish:main"]
 description = "Main branch publish job (images with :dev, :latest, and version tag)"
 run = """