diff --git a/STYLE_GUIDE.md b/STYLEGUIDE.md similarity index 100% rename from STYLE_GUIDE.md rename to STYLEGUIDE.md diff --git a/crates/navigator-cli/src/run.rs b/crates/navigator-cli/src/run.rs index c769bb61..4cceb407 100644 --- a/crates/navigator-cli/src/run.rs +++ b/crates/navigator-cli/src/run.rs @@ -143,12 +143,12 @@ impl LogDisplay { fn finish_phase(&mut self, phase: &str) { self.phase = phase.to_string(); self.latest_log.clear(); - self.spinner - .finish_with_message(format_phase_label(&self.phase)); - } - - fn shutdown(&self) { - self.spinner.disable_steady_tick(); + // Print the final phase as a static line above the spinner, then + // clear the spinner itself. This leaves the phase label visible + // in scrollback instead of erasing it with finish_and_clear(). + let _ = self + .mp + .println(format!(" {}", format_phase_label(&self.phase))); self.spinner.finish_and_clear(); } @@ -1085,6 +1085,11 @@ pub async fn sandbox_create( println!(" {}", format_phase_label(phase_name(sandbox.phase))); } + // Don't use stop_on_terminal on the server — the Kubernetes CRD may + // briefly report a stale Ready status before the controller reconciles + // a newly created sandbox. Instead we handle termination client-side: + // we wait until we have observed at least one non-Ready phase followed + // by Ready (a genuine Provisioning → Ready transition). let mut stream = client .watch_sandbox(WatchSandboxRequest { id: sandbox.id.clone(), @@ -1093,10 +1098,8 @@ pub async fn sandbox_create( follow_events: true, log_tail_lines: 200, event_tail: 0, - stop_on_terminal: true, + stop_on_terminal: false, log_since_ms: 0, - // Only show gateway logs during provisioning — sandbox logs would - // keep the stream alive indefinitely and prevent stop_on_terminal. log_sources: vec!["gateway".to_string()], log_min_level: String::new(), }) @@ -1106,6 +1109,8 @@ pub async fn sandbox_create( let mut last_phase = sandbox.phase; let mut last_error_reason = String::new(); + // Track whether we have seen a non-Ready phase during the watch. + let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready); let start_time = Instant::now(); let provision_timeout = Duration::from_secs(120); @@ -1125,10 +1130,16 @@ pub async fn sandbox_create( let evt = item.into_diagnostic()?; match evt.payload { Some(navigator_core::proto::sandbox_stream_event::Payload::Sandbox(s)) => { + let phase = SandboxPhase::try_from(s.phase).unwrap_or(SandboxPhase::Unknown); last_phase = s.phase; + + if phase != SandboxPhase::Ready { + saw_non_ready = true; + } + // Capture error reason from conditions only when phase is Error // to avoid showing stale transient error reasons - if SandboxPhase::try_from(s.phase) == Ok(SandboxPhase::Error) + if phase == SandboxPhase::Error && let Some(status) = &s.status { for condition in &status.conditions { @@ -1145,6 +1156,12 @@ pub async fn sandbox_create( } else { println!(" {}", format_phase_label(phase_name(s.phase))); } + + // Only accept Ready as terminal after we've observed a + // non-Ready phase, proving the controller has reconciled. + if saw_non_ready && phase == SandboxPhase::Ready { + break; + } } Some(navigator_core::proto::sandbox_stream_event::Payload::Log(line)) => { if let Some(d) = display.as_mut() { @@ -1180,7 +1197,6 @@ pub async fn sandbox_create( // Finish up - check final phase if let Some(d) = display.as_mut() { d.finish_phase(phase_name(last_phase)); - d.shutdown(); } drop(display); let _ = std::io::stdout().flush(); @@ -1229,9 +1245,11 @@ pub async fn sandbox_create( } if command.is_empty() { + eprintln!("Connecting..."); return sandbox_connect(&effective_server, &sandbox_name, &effective_tls).await; } + eprintln!("Connecting..."); let exec_result = sandbox_exec( &effective_server, &sandbox_name, diff --git a/crates/navigator-cli/src/ssh.rs b/crates/navigator-cli/src/ssh.rs index 9a0d2486..45994a99 100644 --- a/crates/navigator-cli/src/ssh.rs +++ b/crates/navigator-cli/src/ssh.rs @@ -235,6 +235,17 @@ pub async fn sandbox_exec( .stdout(std::process::Stdio::inherit()) .stderr(std::process::Stdio::inherit()); + // For interactive TTY sessions, replace this process with SSH via exec() + // to avoid signal handling issues (e.g. Ctrl+C killing the parent ncl + // process and orphaning the SSH child). + if tty && std::io::stdin().is_terminal() { + #[cfg(unix)] + { + let err = ssh.exec(); + return Err(miette::miette!("failed to exec ssh: {err}")); + } + } + let status = tokio::task::spawn_blocking(move || ssh.status()) .await .into_diagnostic()? diff --git a/crates/navigator-server/src/ssh_tunnel.rs b/crates/navigator-server/src/ssh_tunnel.rs index 8a07af46..caf8171e 100644 --- a/crates/navigator-server/src/ssh_tunnel.rs +++ b/crates/navigator-server/src/ssh_tunnel.rs @@ -11,6 +11,7 @@ use hyper_util::rt::TokioIo; use navigator_core::proto::{Sandbox, SandboxPhase, SshSession}; use std::net::SocketAddr; use std::sync::Arc; +use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::TcpStream; use tracing::{info, warn}; @@ -127,10 +128,53 @@ async fn handle_tunnel( secret: &str, sandbox_id: &str, ) -> Result<(), Box> { - let mut upstream = match target { - ConnectTarget::Ip(addr) => TcpStream::connect(addr).await?, - ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), port)).await?, - }; + // The sandbox pod may not be network-reachable immediately after the CRD + // reports Ready (DNS propagation, pod IP assignment, SSH server startup). + // Retry the TCP connection with exponential backoff. + let mut upstream = None; + let mut last_err = None; + let delays = [ + Duration::from_millis(100), + Duration::from_millis(250), + Duration::from_millis(500), + Duration::from_secs(1), + Duration::from_secs(2), + Duration::from_secs(5), + Duration::from_secs(10), + Duration::from_secs(15), + ]; + for (attempt, delay) in std::iter::once(&Duration::ZERO) + .chain(delays.iter()) + .enumerate() + { + if !delay.is_zero() { + tokio::time::sleep(*delay).await; + } + let result = match &target { + ConnectTarget::Ip(addr) => TcpStream::connect(addr).await, + ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), *port)).await, + }; + match result { + Ok(stream) => { + if attempt > 0 { + info!( + sandbox_id = %sandbox_id, + attempts = attempt + 1, + "SSH tunnel connected after retry" + ); + } + upstream = Some(stream); + break; + } + Err(err) => { + last_err = Some(err); + } + } + } + let mut upstream = upstream.ok_or_else(|| { + let err = last_err.unwrap(); + format!("failed to connect to sandbox after retries: {err}") + })?; upstream.set_nodelay(true)?; let preface = build_preface(token, secret)?; upstream.write_all(preface.as_bytes()).await?; diff --git a/deploy/docker/Dockerfile.cluster b/deploy/docker/Dockerfile.cluster index 5e696b26..823b6259 100644 --- a/deploy/docker/Dockerfile.cluster +++ b/deploy/docker/Dockerfile.cluster @@ -18,11 +18,12 @@ ARG K3S_VERSION=v1.29.8-k3s1 FROM rancher/k3s:${K3S_VERSION} -# Create directories for manifests and configuration +# Create directories for manifests, charts, and configuration RUN mkdir -p /var/lib/rancher/k3s/server/manifests \ /var/lib/rancher/k3s/server/static/charts \ /etc/rancher/k3s \ - /opt/navigator/manifests + /opt/navigator/manifests \ + /opt/navigator/charts # Copy entrypoint script that configures DNS for Docker environments # This script detects the host gateway IP and configures CoreDNS to use it @@ -36,8 +37,10 @@ RUN chmod +x /usr/local/bin/cluster-healthcheck.sh # Registry credentials for pulling component images at runtime are generated # by the entrypoint script at /etc/rancher/k3s/registries.yaml. -# Copy packaged helm chart to static directory for serving via k3s API -COPY deploy/docker/.build/charts/*.tgz /var/lib/rancher/k3s/server/static/charts/ +# Copy packaged helm charts to a staging directory that won't be +# overwritten by the /var/lib/rancher/k3s volume mount. The entrypoint +# script copies them into the k3s static charts directory at container start. +COPY deploy/docker/.build/charts/*.tgz /opt/navigator/charts/ # Copy Kubernetes manifests to a persistent location that won't be overwritten by the volume mount. # The bootstrap code will copy these to /var/lib/rancher/k3s/server/manifests/ after cluster start. diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 0027133d..1e0db386 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -127,6 +127,35 @@ else echo "Warning: REGISTRY_HOST not set; skipping registry config" fi +# Copy bundled Helm chart tarballs to the k3s static charts directory. +# These are stored in /opt/navigator/charts/ because the volume mount +# on /var/lib/rancher/k3s overwrites any files baked into that path. +# Without this, a persistent volume from a previous deploy would keep +# serving stale chart tarballs. +K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts" +BUNDLED_CHARTS="/opt/navigator/charts" +CHART_CHECKSUM="" + +if [ -d "$BUNDLED_CHARTS" ]; then + echo "Copying bundled charts to k3s..." + for chart in "$BUNDLED_CHARTS"/*.tgz; do + [ ! -f "$chart" ] && continue + cp "$chart" "$K3S_CHARTS/" + done + # Compute a checksum of the navigator chart so we can inject it into the + # HelmChart manifest below. When the chart content changes between image + # versions the checksum changes, which modifies the HelmChart CR spec and + # forces the k3s Helm controller to re-install. + NAV_CHART="$BUNDLED_CHARTS/navigator-0.1.0.tgz" + if [ -f "$NAV_CHART" ]; then + if command -v sha256sum >/dev/null 2>&1; then + CHART_CHECKSUM=$(sha256sum "$NAV_CHART" | cut -d ' ' -f 1) + elif command -v shasum >/dev/null 2>&1; then + CHART_CHECKSUM=$(shasum -a 256 "$NAV_CHART" | cut -d ' ' -f 1) + fi + fi +fi + # Copy bundled manifests to k3s manifests directory. # These are stored in /opt/navigator/manifests/ because the volume mount # on /var/lib/rancher/k3s overwrites any files baked into that path. @@ -239,5 +268,16 @@ if [ -f "$HELMCHART" ]; then fi fi +# Inject chart checksum into the HelmChart manifest so that a changed chart +# tarball causes the HelmChart CR spec to differ, forcing the k3s Helm +# controller to upgrade the release. +if [ -n "$CHART_CHECKSUM" ] && [ -f "$HELMCHART" ]; then + echo "Injecting chart checksum: ${CHART_CHECKSUM}" + sed -i "s|__CHART_CHECKSUM__|${CHART_CHECKSUM}|g" "$HELMCHART" +else + # Remove the placeholder line entirely so invalid YAML isn't left behind + sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" +fi + # Execute k3s with explicit resolv-conf. exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF" diff --git a/deploy/kube/manifests/navigator-helmchart.yaml b/deploy/kube/manifests/navigator-helmchart.yaml index b810d550..f07b3726 100644 --- a/deploy/kube/manifests/navigator-helmchart.yaml +++ b/deploy/kube/manifests/navigator-helmchart.yaml @@ -22,6 +22,7 @@ spec: targetNamespace: navigator createNamespace: true valuesContent: |- + chartChecksum: __CHART_CHECKSUM__ image: repository: d1i0nduu2f6qxk.cloudfront.net/navigator/server tag: latest diff --git a/tasks/ci.toml b/tasks/ci.toml index 00841987..515b5abd 100644 --- a/tasks/ci.toml +++ b/tasks/ci.toml @@ -1,7 +1,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# CI and quality tasks +# CI, build, and quality tasks + +[build] +description = "Build all Rust crates" +run = "cargo build --workspace" +hide = true + +["build:release"] +description = "Build all Rust crates in release mode" +run = "cargo build --workspace --release" +hide = true + +[check] +description = "Run fast compile and type checks" +depends = ["rust:check", "python:typecheck"] +hide = true + +[clean] +description = "Clean build artifacts" +run = "cargo clean" [fmt] description = "Format Rust and Python code" @@ -26,20 +45,3 @@ hide = true description = "Alias for ci" depends = ["ci"] hide = true - -[sandbox] -description = "Create a sandbox on the running cluster" -raw = true -usage = """ -arg "[command]" var=#true help="Command to run in the sandbox (default: interactive agent)" -""" -run = """ -#!/usr/bin/env bash -set -euo pipefail -CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")} -CONTAINER_NAME="navigator-cluster-${CLUSTER_NAME}" -if ! docker ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then - mise run cluster -fi -ncl sandbox create -- ${usage_command:-claude} -""" diff --git a/tasks/cluster.toml b/tasks/cluster.toml index 46ba4393..12e4d1a9 100644 --- a/tasks/cluster.toml +++ b/tasks/cluster.toml @@ -16,17 +16,6 @@ depends = [ run = "tasks/scripts/cluster-bootstrap.sh build" hide = true -["cluster:sandbox"] -description = "Run the sandbox container with an interactive shell" -depends = ["docker:build:sandbox"] -raw = true -usage = """ -flag "-e --env " var=#true help="Environment variables to pass into the sandbox" -arg "[command]" var=#true help="Command to run in the sandbox (default: /bin/bash)" -""" -run = "bash tasks/scripts/run-sandbox.sh" -hide = true - ["cluster:deploy"] description = "Alias for cluster (incremental deploy)" run = "tasks/scripts/cluster.sh" diff --git a/tasks/publish.toml b/tasks/publish.toml index b4ac7955..bce96d52 100644 --- a/tasks/publish.toml +++ b/tasks/publish.toml @@ -3,30 +3,6 @@ # Publishing / release tasks -["python:publish"] -description = "Build and publish Python wheels" -run = """ -#!/usr/bin/env bash -set -euo pipefail -VERSION=$(uv run python tasks/scripts/release.py get-version --python) -CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo) -NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all -uv run python tasks/scripts/release.py python-publish --version "$VERSION" -""" -hide = true - -["python:publish:macos"] -description = "Build and publish macOS arm64 Python wheel" -run = """ -#!/usr/bin/env bash -set -euo pipefail -VERSION=$(uv run python tasks/scripts/release.py get-version --python) -CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo) -NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos -uv run python tasks/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl" -""" -hide = true - ["publish:main"] description = "Main branch publish job (images with :dev, :latest, and version tag)" run = """ diff --git a/tasks/python.toml b/tasks/python.toml index fd4fab37..9d045714 100644 --- a/tasks/python.toml +++ b/tasks/python.toml @@ -207,6 +207,30 @@ depends = ["python:proto"] run = "uv run ty check {{vars.python_paths}}" hide = true +["python:publish"] +description = "Build and publish Python wheels" +run = """ +#!/usr/bin/env bash +set -euo pipefail +VERSION=$(uv run python tasks/scripts/release.py get-version --python) +CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo) +NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all +uv run python tasks/scripts/release.py python-publish --version "$VERSION" +""" +hide = true + +["python:publish:macos"] +description = "Build and publish macOS arm64 Python wheel" +run = """ +#!/usr/bin/env bash +set -euo pipefail +VERSION=$(uv run python tasks/scripts/release.py get-version --python) +CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo) +NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos +uv run python tasks/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl" +""" +hide = true + ["python:proto"] description = "Generate Python protobuf stubs from .proto files" env = { UV_NO_SYNC = "1" } diff --git a/tasks/rust.toml b/tasks/rust.toml index 8d708f46..69214ce7 100644 --- a/tasks/rust.toml +++ b/tasks/rust.toml @@ -1,22 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Rust build, check, lint, and format tasks - -[build] -description = "Build all Rust crates" -run = "cargo build --workspace" -hide = true - -["build:release"] -description = "Build all Rust crates in release mode" -run = "cargo build --workspace --release" -hide = true - -[check] -description = "Run fast compile and type checks" -depends = ["rust:check", "python:typecheck"] -hide = true +# Rust check, lint, and format tasks ["rust:check"] description = "Check all Rust crates for errors" @@ -37,7 +22,3 @@ hide = true description = "Check Rust formatting" run = "cargo fmt --all -- --check" hide = true - -[clean] -description = "Clean build artifacts" -run = "cargo clean" diff --git a/tasks/sandbox.toml b/tasks/sandbox.toml new file mode 100644 index 00000000..b58f5558 --- /dev/null +++ b/tasks/sandbox.toml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Sandbox tasks + +[sandbox] +description = "Create or reconnect to the dev sandbox (redeploys cluster if dirty)" +raw = true +usage = """ +arg "[command]" var=#true help="Command to run in the sandbox (default: claude)" +""" +run = "bash tasks/scripts/sandbox.sh" diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 7a8cfe88..104fce6d 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -166,6 +166,26 @@ compute_fingerprint() { local path local digest + # Include the committed state of relevant source paths via git tree + # hashes. This ensures that committed changes (e.g. after `git pull` + # or amend) are detected even when there are no uncommitted edits. + local committed_trees="" + case "${component}" in + server) + committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/cross-build.sh crates/navigator-core/ crates/navigator-providers/ crates/navigator-router/ crates/navigator-server/ deploy/docker/Dockerfile.server 2>/dev/null || true) + ;; + sandbox) + committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/cross-build.sh crates/navigator-core/ crates/navigator-providers/ crates/navigator-sandbox/ deploy/docker/sandbox/ deploy/docker/openclaw-start.sh python/ pyproject.toml uv.lock dev-sandbox-policy.rego 2>/dev/null || true) + ;; + helm) + committed_trees=$(git ls-tree HEAD deploy/helm/navigator/ 2>/dev/null || true) + ;; + esac + if [[ -n "${committed_trees}" ]]; then + payload+="${committed_trees}"$'\n' + fi + + # Layer uncommitted changes on top so dirty files trigger a rebuild too. for path in "${changed_files[@]}"; do case "${component}" in server) diff --git a/tasks/scripts/run-sandbox.sh b/tasks/scripts/run-sandbox.sh deleted file mode 100644 index 45d2df9e..00000000 --- a/tasks/scripts/run-sandbox.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -TTY_FLAG="" -if [ -t 0 ]; then - TTY_FLAG="-it" -fi - -CMD=(${usage_command:-/bin/bash}) -ENV_FLAGS="" -for var in ${usage_env:-}; do - if [ -n "${!var+x}" ]; then - ENV_FLAGS="${ENV_FLAGS} -e ${var}=${!var}" - else - echo "Warning: ${var} is not set in your environment, skipping" >&2 - fi -done - -docker run ${TTY_FLAG} \ - --cap-add=SYS_ADMIN \ - --cap-add=NET_ADMIN \ - --cap-add=SYS_PTRACE \ - -v ${PWD}/dev-sandbox-policy.rego:/var/navigator/policy.rego:ro \ - -v ${PWD}/dev-sandbox-policy.yaml:/var/navigator/data.yaml:ro \ - -v ${PWD}/inference-routes.yaml:/var/navigator/inference-routes.yaml:ro \ - -v ${PWD}/tmp:/sandbox/tmp \ - -e HOME=/sandbox \ - -w /sandbox \ - -e NEMOCLAW_POLICY_RULES=/var/navigator/policy.rego \ - -e NEMOCLAW_POLICY_DATA=/var/navigator/data.yaml \ - -e NEMOCLAW_INFERENCE_ROUTES=/var/navigator/inference-routes.yaml \ - -e NVIDIA_API_KEY="${NVIDIA_API_KEY:-}" \ - ${ENV_FLAGS} \ - navigator/sandbox:${IMAGE_TAG:-dev} -i -- ${CMD[@]}