Skip to content

Commit fdae1a8

Browse files
authored
chore: more contributing improvements (#103)
1 parent 1dae6a7 commit fdae1a8

File tree

15 files changed

+213
-130
lines changed

15 files changed

+213
-130
lines changed
File renamed without changes.

crates/navigator-cli/src/run.rs

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,12 @@ impl LogDisplay {
143143
fn finish_phase(&mut self, phase: &str) {
144144
self.phase = phase.to_string();
145145
self.latest_log.clear();
146-
self.spinner
147-
.finish_with_message(format_phase_label(&self.phase));
148-
}
149-
150-
fn shutdown(&self) {
151-
self.spinner.disable_steady_tick();
146+
// Print the final phase as a static line above the spinner, then
147+
// clear the spinner itself. This leaves the phase label visible
148+
// in scrollback instead of erasing it with finish_and_clear().
149+
let _ = self
150+
.mp
151+
.println(format!(" {}", format_phase_label(&self.phase)));
152152
self.spinner.finish_and_clear();
153153
}
154154

@@ -1085,6 +1085,11 @@ pub async fn sandbox_create(
10851085
println!(" {}", format_phase_label(phase_name(sandbox.phase)));
10861086
}
10871087

1088+
// Don't use stop_on_terminal on the server — the Kubernetes CRD may
1089+
// briefly report a stale Ready status before the controller reconciles
1090+
// a newly created sandbox. Instead we handle termination client-side:
1091+
// we wait until we have observed at least one non-Ready phase followed
1092+
// by Ready (a genuine Provisioning → Ready transition).
10881093
let mut stream = client
10891094
.watch_sandbox(WatchSandboxRequest {
10901095
id: sandbox.id.clone(),
@@ -1093,10 +1098,8 @@ pub async fn sandbox_create(
10931098
follow_events: true,
10941099
log_tail_lines: 200,
10951100
event_tail: 0,
1096-
stop_on_terminal: true,
1101+
stop_on_terminal: false,
10971102
log_since_ms: 0,
1098-
// Only show gateway logs during provisioning — sandbox logs would
1099-
// keep the stream alive indefinitely and prevent stop_on_terminal.
11001103
log_sources: vec!["gateway".to_string()],
11011104
log_min_level: String::new(),
11021105
})
@@ -1106,6 +1109,8 @@ pub async fn sandbox_create(
11061109

11071110
let mut last_phase = sandbox.phase;
11081111
let mut last_error_reason = String::new();
1112+
// Track whether we have seen a non-Ready phase during the watch.
1113+
let mut saw_non_ready = SandboxPhase::try_from(sandbox.phase) != Ok(SandboxPhase::Ready);
11091114
let start_time = Instant::now();
11101115
let provision_timeout = Duration::from_secs(120);
11111116

@@ -1125,10 +1130,16 @@ pub async fn sandbox_create(
11251130
let evt = item.into_diagnostic()?;
11261131
match evt.payload {
11271132
Some(navigator_core::proto::sandbox_stream_event::Payload::Sandbox(s)) => {
1133+
let phase = SandboxPhase::try_from(s.phase).unwrap_or(SandboxPhase::Unknown);
11281134
last_phase = s.phase;
1135+
1136+
if phase != SandboxPhase::Ready {
1137+
saw_non_ready = true;
1138+
}
1139+
11291140
// Capture error reason from conditions only when phase is Error
11301141
// to avoid showing stale transient error reasons
1131-
if SandboxPhase::try_from(s.phase) == Ok(SandboxPhase::Error)
1142+
if phase == SandboxPhase::Error
11321143
&& let Some(status) = &s.status
11331144
{
11341145
for condition in &status.conditions {
@@ -1145,6 +1156,12 @@ pub async fn sandbox_create(
11451156
} else {
11461157
println!(" {}", format_phase_label(phase_name(s.phase)));
11471158
}
1159+
1160+
// Only accept Ready as terminal after we've observed a
1161+
// non-Ready phase, proving the controller has reconciled.
1162+
if saw_non_ready && phase == SandboxPhase::Ready {
1163+
break;
1164+
}
11481165
}
11491166
Some(navigator_core::proto::sandbox_stream_event::Payload::Log(line)) => {
11501167
if let Some(d) = display.as_mut() {
@@ -1180,7 +1197,6 @@ pub async fn sandbox_create(
11801197
// Finish up - check final phase
11811198
if let Some(d) = display.as_mut() {
11821199
d.finish_phase(phase_name(last_phase));
1183-
d.shutdown();
11841200
}
11851201
drop(display);
11861202
let _ = std::io::stdout().flush();
@@ -1229,9 +1245,11 @@ pub async fn sandbox_create(
12291245
}
12301246

12311247
if command.is_empty() {
1248+
eprintln!("Connecting...");
12321249
return sandbox_connect(&effective_server, &sandbox_name, &effective_tls).await;
12331250
}
12341251

1252+
eprintln!("Connecting...");
12351253
let exec_result = sandbox_exec(
12361254
&effective_server,
12371255
&sandbox_name,

crates/navigator-cli/src/ssh.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,17 @@ pub async fn sandbox_exec(
235235
.stdout(std::process::Stdio::inherit())
236236
.stderr(std::process::Stdio::inherit());
237237

238+
// For interactive TTY sessions, replace this process with SSH via exec()
239+
// to avoid signal handling issues (e.g. Ctrl+C killing the parent ncl
240+
// process and orphaning the SSH child).
241+
if tty && std::io::stdin().is_terminal() {
242+
#[cfg(unix)]
243+
{
244+
let err = ssh.exec();
245+
return Err(miette::miette!("failed to exec ssh: {err}"));
246+
}
247+
}
248+
238249
let status = tokio::task::spawn_blocking(move || ssh.status())
239250
.await
240251
.into_diagnostic()?

crates/navigator-server/src/ssh_tunnel.rs

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use hyper_util::rt::TokioIo;
1111
use navigator_core::proto::{Sandbox, SandboxPhase, SshSession};
1212
use std::net::SocketAddr;
1313
use std::sync::Arc;
14+
use std::time::Duration;
1415
use tokio::io::{AsyncReadExt, AsyncWriteExt};
1516
use tokio::net::TcpStream;
1617
use tracing::{info, warn};
@@ -127,10 +128,53 @@ async fn handle_tunnel(
127128
secret: &str,
128129
sandbox_id: &str,
129130
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
130-
let mut upstream = match target {
131-
ConnectTarget::Ip(addr) => TcpStream::connect(addr).await?,
132-
ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), port)).await?,
133-
};
131+
// The sandbox pod may not be network-reachable immediately after the CRD
132+
// reports Ready (DNS propagation, pod IP assignment, SSH server startup).
133+
// Retry the TCP connection with exponential backoff.
134+
let mut upstream = None;
135+
let mut last_err = None;
136+
let delays = [
137+
Duration::from_millis(100),
138+
Duration::from_millis(250),
139+
Duration::from_millis(500),
140+
Duration::from_secs(1),
141+
Duration::from_secs(2),
142+
Duration::from_secs(5),
143+
Duration::from_secs(10),
144+
Duration::from_secs(15),
145+
];
146+
for (attempt, delay) in std::iter::once(&Duration::ZERO)
147+
.chain(delays.iter())
148+
.enumerate()
149+
{
150+
if !delay.is_zero() {
151+
tokio::time::sleep(*delay).await;
152+
}
153+
let result = match &target {
154+
ConnectTarget::Ip(addr) => TcpStream::connect(addr).await,
155+
ConnectTarget::Host(host, port) => TcpStream::connect((host.as_str(), *port)).await,
156+
};
157+
match result {
158+
Ok(stream) => {
159+
if attempt > 0 {
160+
info!(
161+
sandbox_id = %sandbox_id,
162+
attempts = attempt + 1,
163+
"SSH tunnel connected after retry"
164+
);
165+
}
166+
upstream = Some(stream);
167+
break;
168+
}
169+
Err(err) => {
170+
last_err = Some(err);
171+
}
172+
}
173+
}
174+
let mut upstream = upstream.ok_or_else(|| {
175+
let err = last_err.unwrap();
176+
format!("failed to connect to sandbox after retries: {err}")
177+
})?;
134178
upstream.set_nodelay(true)?;
135179
let preface = build_preface(token, secret)?;
136180
upstream.write_all(preface.as_bytes()).await?;

deploy/docker/Dockerfile.cluster

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
ARG K3S_VERSION=v1.29.8-k3s1
1919
FROM rancher/k3s:${K3S_VERSION}
2020

21-
# Create directories for manifests and configuration
21+
# Create directories for manifests, charts, and configuration
2222
RUN mkdir -p /var/lib/rancher/k3s/server/manifests \
2323
/var/lib/rancher/k3s/server/static/charts \
2424
/etc/rancher/k3s \
25-
/opt/navigator/manifests
25+
/opt/navigator/manifests \
26+
/opt/navigator/charts
2627

2728
# Copy entrypoint script that configures DNS for Docker environments
2829
# This script detects the host gateway IP and configures CoreDNS to use it
@@ -36,8 +37,10 @@ RUN chmod +x /usr/local/bin/cluster-healthcheck.sh
3637
# Registry credentials for pulling component images at runtime are generated
3738
# by the entrypoint script at /etc/rancher/k3s/registries.yaml.
3839

39-
# Copy packaged helm chart to static directory for serving via k3s API
40-
COPY deploy/docker/.build/charts/*.tgz /var/lib/rancher/k3s/server/static/charts/
40+
# Copy packaged helm charts to a staging directory that won't be
41+
# overwritten by the /var/lib/rancher/k3s volume mount. The entrypoint
42+
# script copies them into the k3s static charts directory at container start.
43+
COPY deploy/docker/.build/charts/*.tgz /opt/navigator/charts/
4144

4245
# Copy Kubernetes manifests to a persistent location that won't be overwritten by the volume mount.
4346
# The bootstrap code will copy these to /var/lib/rancher/k3s/server/manifests/ after cluster start.

deploy/docker/cluster-entrypoint.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,35 @@ else
127127
echo "Warning: REGISTRY_HOST not set; skipping registry config"
128128
fi
129129

130+
# Copy bundled Helm chart tarballs to the k3s static charts directory.
131+
# These are stored in /opt/navigator/charts/ because the volume mount
132+
# on /var/lib/rancher/k3s overwrites any files baked into that path.
133+
# Without this, a persistent volume from a previous deploy would keep
134+
# serving stale chart tarballs.
135+
K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts"
136+
BUNDLED_CHARTS="/opt/navigator/charts"
137+
CHART_CHECKSUM=""
138+
139+
if [ -d "$BUNDLED_CHARTS" ]; then
140+
echo "Copying bundled charts to k3s..."
141+
for chart in "$BUNDLED_CHARTS"/*.tgz; do
142+
[ ! -f "$chart" ] && continue
143+
cp "$chart" "$K3S_CHARTS/"
144+
done
145+
# Compute a checksum of the navigator chart so we can inject it into the
146+
# HelmChart manifest below. When the chart content changes between image
147+
# versions the checksum changes, which modifies the HelmChart CR spec and
148+
# forces the k3s Helm controller to re-install.
149+
NAV_CHART="$BUNDLED_CHARTS/navigator-0.1.0.tgz"
150+
if [ -f "$NAV_CHART" ]; then
151+
if command -v sha256sum >/dev/null 2>&1; then
152+
CHART_CHECKSUM=$(sha256sum "$NAV_CHART" | cut -d ' ' -f 1)
153+
elif command -v shasum >/dev/null 2>&1; then
154+
CHART_CHECKSUM=$(shasum -a 256 "$NAV_CHART" | cut -d ' ' -f 1)
155+
fi
156+
fi
157+
fi
158+
130159
# Copy bundled manifests to k3s manifests directory.
131160
# These are stored in /opt/navigator/manifests/ because the volume mount
132161
# on /var/lib/rancher/k3s overwrites any files baked into that path.
@@ -239,5 +268,16 @@ if [ -f "$HELMCHART" ]; then
239268
fi
240269
fi
241270

271+
# Inject chart checksum into the HelmChart manifest so that a changed chart
272+
# tarball causes the HelmChart CR spec to differ, forcing the k3s Helm
273+
# controller to upgrade the release.
274+
if [ -n "$CHART_CHECKSUM" ] && [ -f "$HELMCHART" ]; then
275+
echo "Injecting chart checksum: ${CHART_CHECKSUM}"
276+
sed -i "s|__CHART_CHECKSUM__|${CHART_CHECKSUM}|g" "$HELMCHART"
277+
else
278+
# Remove the placeholder line entirely so invalid YAML isn't left behind
279+
sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART"
280+
fi
281+
242282
# Execute k3s with explicit resolv-conf.
243283
exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF"

deploy/kube/manifests/navigator-helmchart.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ spec:
2222
targetNamespace: navigator
2323
createNamespace: true
2424
valuesContent: |-
25+
chartChecksum: __CHART_CHECKSUM__
2526
image:
2627
repository: d1i0nduu2f6qxk.cloudfront.net/navigator/server
2728
tag: latest

tasks/ci.toml

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,26 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
# CI and quality tasks
4+
# CI, build, and quality tasks
5+
6+
[build]
7+
description = "Build all Rust crates"
8+
run = "cargo build --workspace"
9+
hide = true
10+
11+
["build:release"]
12+
description = "Build all Rust crates in release mode"
13+
run = "cargo build --workspace --release"
14+
hide = true
15+
16+
[check]
17+
description = "Run fast compile and type checks"
18+
depends = ["rust:check", "python:typecheck"]
19+
hide = true
20+
21+
[clean]
22+
description = "Clean build artifacts"
23+
run = "cargo clean"
524

625
[fmt]
726
description = "Format Rust and Python code"
@@ -26,20 +45,3 @@ hide = true
2645
description = "Alias for ci"
2746
depends = ["ci"]
2847
hide = true
29-
30-
[sandbox]
31-
description = "Create a sandbox on the running cluster"
32-
raw = true
33-
usage = """
34-
arg "[command]" var=#true help="Command to run in the sandbox (default: interactive agent)"
35-
"""
36-
run = """
37-
#!/usr/bin/env bash
38-
set -euo pipefail
39-
CLUSTER_NAME=${CLUSTER_NAME:-$(basename "$PWD")}
40-
CONTAINER_NAME="navigator-cluster-${CLUSTER_NAME}"
41-
if ! docker ps -q --filter "name=${CONTAINER_NAME}" | grep -q .; then
42-
mise run cluster
43-
fi
44-
ncl sandbox create -- ${usage_command:-claude}
45-
"""

tasks/cluster.toml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,6 @@ depends = [
1616
run = "tasks/scripts/cluster-bootstrap.sh build"
1717
hide = true
1818

19-
["cluster:sandbox"]
20-
description = "Run the sandbox container with an interactive shell"
21-
depends = ["docker:build:sandbox"]
22-
raw = true
23-
usage = """
24-
flag "-e --env <env>" var=#true help="Environment variables to pass into the sandbox"
25-
arg "[command]" var=#true help="Command to run in the sandbox (default: /bin/bash)"
26-
"""
27-
run = "bash tasks/scripts/run-sandbox.sh"
28-
hide = true
29-
3019
["cluster:deploy"]
3120
description = "Alias for cluster (incremental deploy)"
3221
run = "tasks/scripts/cluster.sh"

tasks/publish.toml

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,6 @@
33

44
# Publishing / release tasks
55

6-
["python:publish"]
7-
description = "Build and publish Python wheels"
8-
run = """
9-
#!/usr/bin/env bash
10-
set -euo pipefail
11-
VERSION=$(uv run python tasks/scripts/release.py get-version --python)
12-
CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
13-
NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all
14-
uv run python tasks/scripts/release.py python-publish --version "$VERSION"
15-
"""
16-
hide = true
17-
18-
["python:publish:macos"]
19-
description = "Build and publish macOS arm64 Python wheel"
20-
run = """
21-
#!/usr/bin/env bash
22-
set -euo pipefail
23-
VERSION=$(uv run python tasks/scripts/release.py get-version --python)
24-
CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo)
25-
NEMOCLAW_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
26-
uv run python tasks/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl"
27-
"""
28-
hide = true
29-
306
["publish:main"]
317
description = "Main branch publish job (images with :dev, :latest, and version tag)"
328
run = """

0 commit comments

Comments
 (0)