Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 228 additions & 6 deletions bin/nemoclaw.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ const registry = require("./lib/registry");
const nim = require("./lib/nim");
const policies = require("./lib/policies");
const { parseGatewayInference } = require("./lib/inference-config");
const onboardSession = require("./lib/onboard-session");
const { parseLiveSandboxNames } = require("./lib/runtime-recovery");

// ── Global commands ──────────────────────────────────────────────

Expand All @@ -44,6 +46,7 @@ const GLOBAL_COMMANDS = new Set([

const REMOTE_UNINSTALL_URL = "https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh";
let OPENSHELL_BIN = null;
const MIN_LOGS_OPENSHELL_VERSION = "0.0.7";

function getOpenshellBinary() {
if (!OPENSHELL_BIN) {
Expand Down Expand Up @@ -83,11 +86,164 @@ function captureOpenshell(args, opts = {}) {
};
}

function parseVersionFromText(value = "") {
const match = String(value || "").match(/([0-9]+\.[0-9]+\.[0-9]+)/);
return match ? match[1] : null;
}

function versionGte(left = "0.0.0", right = "0.0.0") {
const lhs = String(left).split(".").map((part) => Number.parseInt(part, 10) || 0);
const rhs = String(right).split(".").map((part) => Number.parseInt(part, 10) || 0);
const length = Math.max(lhs.length, rhs.length);
for (let index = 0; index < length; index += 1) {
const a = lhs[index] || 0;
const b = rhs[index] || 0;
if (a > b) return true;
if (a < b) return false;
}
return true;
}

function getInstalledOpenshellVersion() {
const versionResult = captureOpenshell(["--version"], { ignoreError: true });
return parseVersionFromText(versionResult.output);
}

function stripAnsi(value = "") {
// eslint-disable-next-line no-control-regex
return String(value).replace(/\x1b\[[0-9;]*m/g, "");
}

function buildRecoveredSandboxEntry(name, metadata = {}) {
return {
name,
model: metadata.model || null,
provider: metadata.provider || null,
gpuEnabled: metadata.gpuEnabled === true,
policies: Array.isArray(metadata.policies)
? metadata.policies
: Array.isArray(metadata.policyPresets)
? metadata.policyPresets
: [],
nimContainer: metadata.nimContainer || null,
};
Comment on lines +117 to +129
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Recovered nimContainer metadata is still lost on first insert.

Line 128 includes nimContainer, but Line 138 inserts through registry.registerSandbox(entry), and bin/lib/registry.js currently does not persist nimContainer in registerSandbox(). Result: newly recovered entries lose container linkage immediately.

🧩 Cross-file fix (`bin/lib/registry.js`)
 function registerSandbox(entry) {
   return withLock(() => {
     const data = load();
     const normalized = normalizeEntry(entry);
     if (!normalized) {
       throw new Error(`Invalid sandbox entry: ${entry.name}`);
     }
     data.sandboxes[normalized.name] = {
       name: normalized.name,
       model: normalized.model,
       provider: normalized.provider,
       gpuEnabled: normalized.gpuEnabled,
       policies: normalized.policies || [],
+      nimContainer: normalized.nimContainer,
     };
     if (!data.defaultSandbox) {
       data.defaultSandbox = entry.name;
     }
     save(data);
   });
 }

Also applies to: 132-139

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/nemoclaw.js` around lines 117 - 129, buildRecoveredSandboxEntry correctly
includes nimContainer in the recovered entry, but registerSandbox in
bin/lib/registry.js does not persist nimContainer, so recovered entries lose
their container linkage; update the registerSandbox(sandbox) implementation to
accept and store sandbox.nimContainer (alongside name, model, provider,
gpuEnabled, policies) when creating/persisting the sandbox record and ensure any
normalization/DB write paths (e.g., the code paths that previously handled
model/provider/policies) include nimContainer so the value from
buildRecoveredSandboxEntry is retained after registry.registerSandbox(entry) is
called.

}

function upsertRecoveredSandbox(name, metadata = {}) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I notice resolveReconnectSandboxName runs validateName() on sandbox names that come from user input — nice. Looking at upsertRecoveredSandbox, though, names sourced from onboard-session.json or the gateway sandbox list output flow straight to registry.registerSandbox() without the same validation. Is there a reason to skip validation here, or would it be worth being consistent across both paths?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is a fair consistency question. I originally skipped validation there because those names come from NemoClaw/OpenShell-managed sources rather than raw user input, and the gateway path is already constrained by OpenShell sandbox naming rules. I do agree that validating before registry writes would be a reasonable hardening step; I just did not want to expand this PR again after the UX direction changed to remove reconnect and move recovery onto connect.

let validName;
try {
validName = validateName(name, "sandbox name");
} catch {
return false;
}

const entry = buildRecoveredSandboxEntry(validName, metadata);
if (registry.getSandbox(validName)) {
registry.updateSandbox(validName, entry);
return false;
}
registry.registerSandbox(entry);
return true;
}

function shouldRecoverRegistryEntries(current, session, requestedSandboxName) {
const hasSessionSandbox = Boolean(session?.sandboxName);
const missingSessionSandbox =
hasSessionSandbox &&
!current.sandboxes.some((sandbox) => sandbox.name === session.sandboxName);
const missingRequestedSandbox =
Boolean(requestedSandboxName) &&
!current.sandboxes.some((sandbox) => sandbox.name === requestedSandboxName);
const hasRecoverySeed = current.sandboxes.length > 0 || hasSessionSandbox || Boolean(requestedSandboxName);
return {
missingRequestedSandbox,
shouldRecover:
hasRecoverySeed &&
(current.sandboxes.length === 0 || missingRequestedSandbox || missingSessionSandbox),
};
}

function seedRecoveryMetadata(current, session, requestedSandboxName) {
const metadataByName = new Map(current.sandboxes.map((sandbox) => [sandbox.name, sandbox]));
let recoveredFromSession = false;

if (!session?.sandboxName) {
return { metadataByName, recoveredFromSession };
}

metadataByName.set(
session.sandboxName,
buildRecoveredSandboxEntry(session.sandboxName, {
model: session.model || null,
provider: session.provider || null,
nimContainer: session.nimContainer || null,
policyPresets: session.policyPresets || null,
})
);
const sessionSandboxMissing = !current.sandboxes.some((sandbox) => sandbox.name === session.sandboxName);
const shouldRecoverSessionSandbox =
current.sandboxes.length === 0 || sessionSandboxMissing || requestedSandboxName === session.sandboxName;
if (shouldRecoverSessionSandbox) {
recoveredFromSession = upsertRecoveredSandbox(session.sandboxName, metadataByName.get(session.sandboxName));
}
Comment on lines +174 to +188
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

REGISTRY_FILE="$(fd '^registry\.js$' bin | head -n1)"
echo "Inspecting: $REGISTRY_FILE"
rg -n -C6 'function normalizeEntry|function updateSandbox|gpuEnabled|policies|nimContainer' "$REGISTRY_FILE"

Repository: NVIDIA/NemoClaw

Length of output: 929


🏁 Script executed:

cat -n bin/lib/registry.js | sed -n '165,200p'

Repository: NVIDIA/NemoClaw

Length of output: 1211


🏁 Script executed:

cat -n bin/nemoclaw.js | sed -n '174,215p'

Repository: NVIDIA/NemoClaw

Length of output: 1878


🏁 Script executed:

rg -n -A15 'function buildRecoveredSandboxEntry|function upsertRecoveredSandbox' bin/nemoclaw.js

Repository: NVIDIA/NemoClaw

Length of output: 999


Session recovery can reset critical sandbox fields when updating existing sandboxes.

Line 142 in bin/lib/registry.js calls updateSandbox() with an entry created by buildRecoveredSandboxEntry(). Although updateSandbox() uses Object.assign() (a merge operation at line 172), the recovered entry is built from session snapshots that don't include all original fields. Specifically:

  • buildRecoveredSandboxEntry() defaults gpuEnabled to false (line 122) if missing from metadata
  • It defaults policies to [] (line 123–127) if missing from metadata

When seedRecoveryMetadata() processes a session at lines 174–182 and later passes it through upsertRecoveredSandbox(), these incomplete snapshots will overwrite existing sandbox fields with defaults, silently resetting gpuEnabled, policies, or nimContainer on already-registered sandboxes.

Preserve existing fields by not overwriting entries in metadataByName when they already exist, or ensure session snapshots only update fields that were actually saved in the session.

return { metadataByName, recoveredFromSession };
}

async function recoverRegistryFromLiveGateway(metadataByName) {
if (!resolveOpenshell()) {
return 0;
}
const recovery = await recoverNamedGatewayRuntime();
const canInspectLiveGateway =
recovery.recovered ||
recovery.before?.state === "healthy_named" ||
recovery.after?.state === "healthy_named";
if (!canInspectLiveGateway) {
return 0;
}

let recoveredFromGateway = 0;
const liveList = captureOpenshell(["sandbox", "list"], { ignoreError: true });
const liveNames = Array.from(parseLiveSandboxNames(liveList.output));
for (const name of liveNames) {
const metadata = metadataByName.get(name) || {};
if (upsertRecoveredSandbox(name, metadata)) {
recoveredFromGateway += 1;
}
}
return recoveredFromGateway;
}

function applyRecoveredDefault(currentDefaultSandbox, requestedSandboxName, session) {
const recovered = registry.listSandboxes();
const preferredDefault = requestedSandboxName || (!currentDefaultSandbox ? session?.sandboxName || null : null);
if (preferredDefault && recovered.sandboxes.some((sandbox) => sandbox.name === preferredDefault)) {
registry.setDefault(preferredDefault);
}
return registry.listSandboxes();
Comment on lines +217 to +223
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

connect recovery shouldn't silently retarget the default sandbox.

Line 219 prefers requestedSandboxName even when currentDefaultSandbox is already set. That means beta connect can repair the registry and also change later global commands to operate on beta, which is a surprising side effect for a recovery path.

🛠️ Suggested fix
 function applyRecoveredDefault(currentDefaultSandbox, requestedSandboxName, session) {
   const recovered = registry.listSandboxes();
-  const preferredDefault = requestedSandboxName || (!currentDefaultSandbox ? session?.sandboxName || null : null);
+  const preferredDefault = currentDefaultSandbox
+    ? null
+    : requestedSandboxName || session?.sandboxName || null;
   if (preferredDefault && recovered.sandboxes.some((sandbox) => sandbox.name === preferredDefault)) {
     registry.setDefault(preferredDefault);
   }
   return registry.listSandboxes();
 }

Also applies to: 1052-1057

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/nemoclaw.js` around lines 217 - 223, The recovery logic in
applyRecoveredDefault currently prefers requestedSandboxName even when a
currentDefaultSandbox exists, causing silent global default changes; change it
so the default is only set when there is no currentDefaultSandbox (i.e., only
use requestedSandboxName or session.sandboxName when currentDefaultSandbox is
falsy), then call registry.setDefault(preferredDefault) only if preferredDefault
is truthy and exists in recovered.sandboxes; update the same pattern at the
other occurrence (around lines 1052-1057) to avoid silently retargeting the
global default during connect recovery.

}

async function recoverRegistryEntries({ requestedSandboxName = null } = {}) {
const current = registry.listSandboxes();
const session = onboardSession.loadSession();
const recoveryCheck = shouldRecoverRegistryEntries(current, session, requestedSandboxName);
if (!recoveryCheck.shouldRecover) {
return { ...current, recoveredFromSession: false, recoveredFromGateway: 0 };
}

const seeded = seedRecoveryMetadata(current, session, requestedSandboxName);
const shouldProbeLiveGateway = current.sandboxes.length > 0 || Boolean(session?.sandboxName);
const recoveredFromGateway = shouldProbeLiveGateway
? await recoverRegistryFromLiveGateway(seeded.metadataByName)
: 0;
const recovered = applyRecoveredDefault(current.defaultSandbox, requestedSandboxName, session);
return {
...recovered,
recoveredFromSession: seeded.recoveredFromSession,
recoveredFromGateway,
};
}

function hasNamedGateway(output = "") {
return stripAnsi(output).includes("Gateway: nemoclaw");
}
Expand Down Expand Up @@ -302,6 +458,13 @@ async function ensureLiveSandboxOrExit(sandboxName) {
process.exit(1);
}

function printOldLogsCompatibilityGuidance(installedVersion = null) {
const versionText = installedVersion ? ` (${installedVersion})` : "";
console.error(` Installed OpenShell${versionText} is too old or incompatible with \`nemoclaw logs\`.`);
console.error(` NemoClaw expects \`openshell logs <name>\` and live streaming via \`--tail\`.`);
console.error(" Upgrade OpenShell by rerunning `nemoclaw onboard`, or reinstall the OpenShell CLI and try again.");
}

function resolveUninstallScript() {
const candidates = [
path.join(ROOT, "uninstall.sh"),
Expand Down Expand Up @@ -551,11 +714,18 @@ function showStatus() {
run(`bash "${SCRIPTS}/start-services.sh" --status`);
}

function listSandboxes() {
const { sandboxes, defaultSandbox } = registry.listSandboxes();
async function listSandboxes() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a thoughtful approach to the reboot problem — recovering the registry transparently during list so the user doesn't have to know about it.

One behavioral shift worth calling out: list was previously a pure read; now it can write to sandboxes.json as a side effect. Two questions come to mind:

  1. If someone runs nemoclaw list while nemoclaw onboard is in progress on another terminal, could both be writing to sandboxes.json concurrently? How does the registry lock handle that?
  2. Would it be useful to mention the write-on-read behavior in the help text or a doc note, so users who script around list aren't surprised?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the registry implementation before answering this. The write path is serialized through the existing advisory lock in bin/lib/registry.js (mkdir lock dir + owner PID + stale-lock handling) and the file write itself is tmp-file-plus-rename, so list and onboard should serialize rather than clobber each other. I also updated the PR body to call out explicitly that list now repairs local inventory on read, since that side effect is worth documenting.

const recovery = await recoverRegistryEntries();
const { sandboxes, defaultSandbox } = recovery;
if (sandboxes.length === 0) {
console.log("");
console.log(" No sandboxes registered. Run `nemoclaw onboard` to get started.");
const session = onboardSession.loadSession();
if (session?.sandboxName) {
console.log(` No sandboxes registered locally, but the last onboarded sandbox was '${session.sandboxName}'.`);
console.log(" Retry `nemoclaw <name> connect` or `nemoclaw <name> status` once the gateway/runtime is healthy.");
} else {
console.log(" No sandboxes registered. Run `nemoclaw onboard` to get started.");
}
console.log("");
return;
}
Expand All @@ -566,6 +736,14 @@ function listSandboxes() {
);

console.log("");
if (recovery.recoveredFromSession) {
console.log(" Recovered sandbox inventory from the last onboard session.");
console.log("");
}
if (recovery.recoveredFromGateway > 0) {
console.log(` Recovered ${recovery.recoveredFromGateway} sandbox entr${recovery.recoveredFromGateway === 1 ? "y" : "ies"} from the live OpenShell gateway.`);
console.log("");
}
console.log(" Sandboxes:");
for (const sb of sandboxes) {
const def = sb.name === defaultSandbox ? " *" : "";
Expand Down Expand Up @@ -664,9 +842,44 @@ async function sandboxStatus(sandboxName) {
}

function sandboxLogs(sandboxName, follow) {
const installedVersion = getInstalledOpenshellVersion();
if (installedVersion && !versionGte(installedVersion, MIN_LOGS_OPENSHELL_VERSION)) {
printOldLogsCompatibilityGuidance(installedVersion);
process.exit(1);
}

const args = ["logs", sandboxName];
if (follow) args.push("--follow");
runOpenshell(args);
if (follow) args.push("--tail");
const result = spawnSync(getOpenshellBinary(), args, {
cwd: ROOT,
env: process.env,
encoding: "utf-8",
stdio: follow ? ["ignore", "inherit", "pipe"] : ["ignore", "pipe", "pipe"],
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch on the --follow--tail translation — this clearly fixes the regression.

One thing I wanted to think through with you on the stdio configuration: in follow mode, stderr is piped (["ignore", "inherit", "pipe"]), which means any errors openshell writes during a long-running streaming session won't be visible to the user until after they Ctrl-C. Could that mask a mid-stream error in a way that would be confusing? Would it be worth inheriting stderr too and handling the version-compatibility detection a different way (e.g., checking the version upfront only, which you already do)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I kept stderr piped in follow mode for this PR because the compatibility path still needs to inspect parser failures from older OpenShell builds, and those show up on stderr before a stream is established. Since we now check the installed version up front as well, I agree the remaining tradeoff is mid-stream stderr visibility versus preserving that fallback detection. I chose to keep the existing detection path stable here and would treat inheriting stderr in follow mode as a follow-up cleanup if we want to simplify around the version gate alone.

});
const stdout = String(result.stdout || "");
const stderr = String(result.stderr || "");
const combined = `${stdout}${stderr}`;
if (!follow && stdout) {
process.stdout.write(stdout);
}
if (result.status === 0) {
return;
}
if (stderr) {
process.stderr.write(stderr);
}
if (
/unrecognized subcommand 'logs'|unexpected argument '--tail'|unexpected argument '--follow'/i.test(combined) ||
(installedVersion && !versionGte(installedVersion, MIN_LOGS_OPENSHELL_VERSION))
) {
printOldLogsCompatibilityGuidance(installedVersion);
process.exit(1);
}
if (result.status === null || result.signal) {
exitWithSpawnResult(result);
}
console.error(` Command failed (exit ${result.status}): openshell ${args.join(" ")}`);
exitWithSpawnResult(result);
}

async function sandboxPolicyAdd(sandboxName) {
Expand Down Expand Up @@ -802,7 +1015,7 @@ const [cmd, ...args] = process.argv.slice(2);
case "status": showStatus(); break;
case "debug": debug(args); break;
case "uninstall": uninstall(args); break;
case "list": listSandboxes(); break;
case "list": await listSandboxes(); break;
case "--version":
case "-v": {
const pkg = require(path.join(__dirname, "..", "package.json"));
Expand Down Expand Up @@ -836,6 +1049,15 @@ const [cmd, ...args] = process.argv.slice(2);
return;
}

if (args[0] === "connect") {
validateName(cmd, "sandbox name");
await recoverRegistryEntries({ requestedSandboxName: cmd });
if (registry.getSandbox(cmd)) {
await sandboxConnect(cmd);
return;
}
}

// Unknown command — suggest
console.error(` Unknown command: ${cmd}`);
console.error("");
Expand Down
Loading
Loading