Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
40 changes: 29 additions & 11 deletions crates/navigator-cli/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,12 +143,12 @@ impl LogDisplay {
/// Mark the current phase as complete: record the new phase name, drop the
/// buffered latest log line, and freeze the spinner showing the phase label.
fn finish_phase(&mut self, phase: &str) {
    self.phase = phase.to_string();
    // The most recent streamed log line belongs to the phase that just
    // finished; clear it so it is not carried over under the next phase.
    self.latest_log.clear();
    // NOTE(review): presumably an indicatif-style spinner — finish_with_message
    // stops it and leaves the formatted phase label visible; confirm against
    // the spinner type's docs.
    self.spinner
        .finish_with_message(format_phase_label(&self.phase));
}

fn shutdown(&self) {
self.spinner.disable_steady_tick();
// Print the final phase as a static line above the spinner, then
// clear the spinner itself. This leaves the phase label visible
// in scrollback instead of erasing it with finish_and_clear().
let _ = self
.mp
.println(format!(" {}", format_phase_label(&self.phase)));
self.spinner.finish_and_clear();
}

Expand Down Expand Up @@ -1085,6 +1085,11 @@ pub async fn sandbox_create(
println!(" {}", format_phase_label(phase_name(sandbox.phase)));
}

// Don't use stop_on_terminal on the server — the Kubernetes CRD may
// briefly report a stale Ready status before the controller reconciles
// a newly created sandbox. Instead we handle termination client-side:
// we wait until we have observed at least one non-Ready phase followed
// by Ready (a genuine Provisioning → Ready transition).
let mut stream = client
.watch_sandbox(WatchSandboxRequest {
id: sandbox.id.clone(),
Expand All @@ -1093,10 +1098,8 @@ pub async fn sandbox_create(
follow_events: true,
log_tail_lines: 200,
event_tail: 0,
stop_on_terminal: true,
stop_on_terminal: false,
log_since_ms: 0,
// Only show gateway logs during provisioning — sandbox logs would
// keep the stream alive indefinitely and prevent stop_on_terminal.
log_sources: vec!["gateway".to_string()],
log_min_level: String::new(),
})
Expand All @@ -1106,6 +1109,8 @@ pub async fn sandbox_create(

let mut last_phase = sandbox.phase;
let mut last_error_reason = String::new();
// Track whether we have seen a non-Ready phase during the watch.
let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready);
let start_time = Instant::now();
let provision_timeout = Duration::from_secs(120);

Expand All @@ -1125,10 +1130,16 @@ pub async fn sandbox_create(
let evt = item.into_diagnostic()?;
match evt.payload {
Some(navigator_core::proto::sandbox_stream_event::Payload::Sandbox(s)) => {
let phase = SandboxPhase::try_from(s.phase).unwrap_or(SandboxPhase::Unknown);
last_phase = s.phase;

if phase != SandboxPhase::Ready {
saw_non_ready = true;
}

// Capture error reason from conditions only when phase is Error
// to avoid showing stale transient error reasons
if SandboxPhase::try_from(s.phase) == Ok(SandboxPhase::Error)
if phase == SandboxPhase::Error
&& let Some(status) = &s.status
{
for condition in &status.conditions {
Expand All @@ -1145,6 +1156,12 @@ pub async fn sandbox_create(
} else {
println!(" {}", format_phase_label(phase_name(s.phase)));
}

// Only accept Ready as terminal after we've observed a
// non-Ready phase, proving the controller has reconciled.
if saw_non_ready && phase == SandboxPhase::Ready {
break;
}
}
Some(navigator_core::proto::sandbox_stream_event::Payload::Log(line)) => {
if let Some(d) = display.as_mut() {
Expand Down Expand Up @@ -1180,7 +1197,6 @@ pub async fn sandbox_create(
// Finish up - check final phase
if let Some(d) = display.as_mut() {
d.finish_phase(phase_name(last_phase));
d.shutdown();
}
drop(display);
let _ = std::io::stdout().flush();
Expand Down Expand Up @@ -1229,9 +1245,11 @@ pub async fn sandbox_create(
}

if command.is_empty() {
eprintln!("Connecting...");
return sandbox_connect(&effective_server, &sandbox_name, &effective_tls).await;
}

eprintln!("Connecting...");
let exec_result = sandbox_exec(
&effective_server,
&sandbox_name,
Expand Down
11 changes: 11 additions & 0 deletions crates/navigator-cli/src/ssh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,17 @@ pub async fn sandbox_exec(
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());

// For interactive TTY sessions, replace this process with SSH via exec()
// to avoid signal handling issues (e.g. Ctrl+C killing the parent ncl
// process and orphaning the SSH child).
if tty && std::io::stdin().is_terminal() {
#[cfg(unix)]
{
let err = ssh.exec();
return Err(miette::miette!("failed to exec ssh: {err}"));
}
}

let status = tokio::task::spawn_blocking(move || ssh.status())
.await
.into_diagnostic()?
Expand Down
52 changes: 48 additions & 4 deletions crates/navigator-server/src/ssh_tunnel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use hyper_util::rt::TokioIo;
use navigator_core::proto::{Sandbox, SandboxPhase, SshSession};
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
use tracing::{info, warn};
Expand Down Expand Up @@ -127,10 +128,53 @@ async fn handle_tunnel(
secret: &str,
sandbox_id: &str,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let mut upstream = match target {
ConnectTarget::Ip(addr) => TcpStream::connect(addr).await?,
ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), port)).await?,
};
// The sandbox pod may not be network-reachable immediately after the CRD
// reports Ready (DNS propagation, pod IP assignment, SSH server startup).
// Retry the TCP connection with exponential backoff.
let mut upstream = None;
let mut last_err = None;
let delays = [
Duration::from_millis(100),
Duration::from_millis(250),
Duration::from_millis(500),
Duration::from_secs(1),
Duration::from_secs(2),
Duration::from_secs(5),
Duration::from_secs(10),
Duration::from_secs(15),
];
for (attempt, delay) in std::iter::once(&Duration::ZERO)
.chain(delays.iter())
.enumerate()
{
if !delay.is_zero() {
tokio::time::sleep(*delay).await;
}
let result = match &target {
ConnectTarget::Ip(addr) => TcpStream::connect(addr).await,
ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), *port)).await,
};
match result {
Ok(stream) => {
if attempt > 0 {
info!(
sandbox_id = %sandbox_id,
attempts = attempt + 1,
"SSH tunnel connected after retry"
);
}
upstream = Some(stream);
break;
}
Err(err) => {
last_err = Some(err);
}
}
}
let mut upstream = upstream.ok_or_else(|| {
let err = last_err.unwrap();
format!("failed to connect to sandbox after retries: {err}")
})?;
upstream.set_nodelay(true)?;
let preface = build_preface(token, secret)?;
upstream.write_all(preface.as_bytes()).await?;
Expand Down
11 changes: 7 additions & 4 deletions deploy/docker/Dockerfile.cluster
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
ARG K3S_VERSION=v1.29.8-k3s1
FROM rancher/k3s:${K3S_VERSION}

# Create directories for manifests and configuration
# Create directories for manifests, charts, and configuration
RUN mkdir -p /var/lib/rancher/k3s/server/manifests \
/var/lib/rancher/k3s/server/static/charts \
/etc/rancher/k3s \
/opt/navigator/manifests
/opt/navigator/manifests \
/opt/navigator/charts

# Copy entrypoint script that configures DNS for Docker environments
# This script detects the host gateway IP and configures CoreDNS to use it
Expand All @@ -36,8 +37,10 @@ RUN chmod +x /usr/local/bin/cluster-healthcheck.sh
# Registry credentials for pulling component images at runtime are generated
# by the entrypoint script at /etc/rancher/k3s/registries.yaml.

# Copy packaged helm chart to static directory for serving via k3s API
COPY deploy/docker/.build/charts/*.tgz /var/lib/rancher/k3s/server/static/charts/
# Copy packaged helm charts to a staging directory that won't be
# overwritten by the /var/lib/rancher/k3s volume mount. The entrypoint
# script copies them into the k3s static charts directory at container start.
COPY deploy/docker/.build/charts/*.tgz /opt/navigator/charts/

# Copy Kubernetes manifests to a persistent location that won't be overwritten by the volume mount.
# The bootstrap code will copy these to /var/lib/rancher/k3s/server/manifests/ after cluster start.
Expand Down
40 changes: 40 additions & 0 deletions deploy/docker/cluster-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,35 @@ else
echo "Warning: REGISTRY_HOST not set; skipping registry config"
fi

# Copy bundled Helm chart tarballs to the k3s static charts directory.
# These are stored in /opt/navigator/charts/ because the volume mount
# on /var/lib/rancher/k3s overwrites any files baked into that path.
# Without this, a persistent volume from a previous deploy would keep
# serving stale chart tarballs.
K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts"
BUNDLED_CHARTS="/opt/navigator/charts"
CHART_CHECKSUM=""

if [ -d "$BUNDLED_CHARTS" ]; then
    echo "Copying bundled charts to k3s..."
    # On a fresh persistent volume the static charts directory baked into the
    # image is hidden by the volume mount and may not exist yet; create it so
    # the cp below cannot fail (fatal if the script runs under errexit).
    mkdir -p "$K3S_CHARTS"
    for chart in "$BUNDLED_CHARTS"/*.tgz; do
        # When no .tgz files exist the glob stays unexpanded; skip it.
        [ ! -f "$chart" ] && continue
        cp "$chart" "$K3S_CHARTS/"
    done
    # Compute a checksum of the navigator chart so we can inject it into the
    # HelmChart manifest below. When the chart content changes between image
    # versions the checksum changes, which modifies the HelmChart CR spec and
    # forces the k3s Helm controller to re-install.
    # NOTE(review): chart filename/version is hard-coded; a chart version bump
    # in the image build must be mirrored here — TODO confirm.
    NAV_CHART="$BUNDLED_CHARTS/navigator-0.1.0.tgz"
    if [ -f "$NAV_CHART" ]; then
        # Prefer sha256sum (GNU coreutils); fall back to shasum (perl/BSD).
        if command -v sha256sum >/dev/null 2>&1; then
            CHART_CHECKSUM=$(sha256sum "$NAV_CHART" | cut -d ' ' -f 1)
        elif command -v shasum >/dev/null 2>&1; then
            CHART_CHECKSUM=$(shasum -a 256 "$NAV_CHART" | cut -d ' ' -f 1)
        fi
    fi
fi

# Copy bundled manifests to k3s manifests directory.
# These are stored in /opt/navigator/manifests/ because the volume mount
# on /var/lib/rancher/k3s overwrites any files baked into that path.
Expand Down Expand Up @@ -239,5 +268,16 @@ if [ -f "$HELMCHART" ]; then
fi
fi

# Inject chart checksum into the HelmChart manifest so that a changed chart
# tarball causes the HelmChart CR spec to differ, forcing the k3s Helm
# controller to upgrade the release.
#
# Both branches must be guarded on the manifest existing: the original else
# branch ran `sed -i` on "$HELMCHART" unconditionally, which errors when the
# manifest is absent (fatal if the script runs under errexit).
if [ -f "$HELMCHART" ]; then
    if [ -n "$CHART_CHECKSUM" ]; then
        echo "Injecting chart checksum: ${CHART_CHECKSUM}"
        sed -i "s|__CHART_CHECKSUM__|${CHART_CHECKSUM}|g" "$HELMCHART"
    else
        # No checksum was computed: remove the placeholder line entirely so
        # invalid YAML isn't left behind in the manifest.
        sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART"
    fi
fi

# Execute k3s with explicit resolv-conf.
exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF"
1 change: 1 addition & 0 deletions deploy/kube/manifests/navigator-helmchart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ spec:
targetNamespace: navigator
createNamespace: true
valuesContent: |-
chartChecksum: __CHART_CHECKSUM__
image:
repository: d1i0nduu2f6qxk.cloudfront.net/navigator/server
tag: latest
Expand Down
38 changes: 20 additions & 18 deletions tasks/ci.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# CI and quality tasks
# CI, build, and quality tasks

[build]
description = "Build all Rust crates"
run = "cargo build --workspace"
hide = true

["build:release"]
description = "Build all Rust crates in release mode"
run = "cargo build --workspace --release"
hide = true

[check]
description = "Run fast compile and type checks"
depends = ["rust:check", "python:typecheck"]
hide = true

[clean]
description = "Clean build artifacts"
run = "cargo clean"

[fmt]
description = "Format Rust and Python code"
Expand All @@ -26,20 +45,3 @@ hide = true
description = "Alias for ci"
depends = ["ci"]
hide = true

[sandbox]
description = "Create a sandbox on the running cluster"
raw = true
usage = """
arg "[command]" var=#true help="Command to run in the sandbox (default: interactive agent)"
"""
run = """
#!/usr/bin/env bash
set -euo pipefail
CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")}
CONTAINER_NAME="navigator-cluster-${CLUSTER_NAME}"
if ! docker ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then
mise run cluster
fi
ncl sandbox create -- ${usage_command:-claude}
"""
11 changes: 0 additions & 11 deletions tasks/cluster.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,6 @@ depends = [
run = "tasks/scripts/cluster-bootstrap.sh build"
hide = true

["cluster:sandbox"]
description = "Run the sandbox container with an interactive shell"
depends = ["docker:build:sandbox"]
raw = true
usage = """
flag "-e --env <env>" var=#true help="Environment variables to pass into the sandbox"
arg "[command]" var=#true help="Command to run in the sandbox (default: /bin/bash)"
"""
run = "bash tasks/scripts/run-sandbox.sh"
hide = true

["cluster:deploy"]
description = "Alias for cluster (incremental deploy)"
run = "tasks/scripts/cluster.sh"
Expand Down
24 changes: 0 additions & 24 deletions tasks/publish.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,6 @@

# Publishing / release tasks

["python:publish"]
description = "Build and publish Python wheels"
run = """
#!/usr/bin/env bash
set -euo pipefail
VERSION=$(uv run python tasks/scripts/release.py get-version --python)
CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all
uv run python tasks/scripts/release.py python-publish --version "$VERSION"
"""
hide = true

["python:publish:macos"]
description = "Build and publish macOS arm64 Python wheel"
run = """
#!/usr/bin/env bash
set -euo pipefail
VERSION=$(uv run python tasks/scripts/release.py get-version --python)
CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
uv run python tasks/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl"
"""
hide = true

["publish:main"]
description = "Main branch publish job (images with :dev, :latest, and version tag)"
run = """
Expand Down
Loading
Loading