From ffe01ebe121e26a0ea4535019670439d317443af Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Mar 2026 11:18:21 +0000 Subject: [PATCH 1/6] =?UTF-8?q?feat:=20mosh-inspired=20link=20resilience?= =?UTF-8?q?=20=E2=80=94=20heartbeat=20+=20network-aware=20reconnect?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the core weakness the Moshi creator identified: tmux handles server-side persistence, but the client-to-server WebSocket link dies silently on phone sleep and WiFi↔cellular switches — the same fragility mosh was designed to fix at the transport layer. Since we're bound to WebSocket (browser), we implement the equivalent resilience at the application layer: 1. Server-side heartbeat (routes.ts) - Server pings every 15 s; if no pong returns before the next ping the connection is considered dead and closed immediately. - Dead connections are now detected within 15–30 s (one to two ping intervals) instead of TCP's multi-minute timeout window. 2. Client-side ping watchdog (useTerminal.ts) - Client tracks time of last server ping; if silent for 35 s the socket is forcibly closed to trigger a fresh reconnect cycle. - Catches the mirror case: client is "alive" but server can't reach it (common after mobile sleep with NAT table expiry). 3. Immediate reconnect on network change (useTerminal.ts) - `window.online` event fires when WiFi↔cellular switch completes. - New `reconnectNow()` helper kills any pending backoff timer and tears down the current socket so the caller can open a fresh WebSocket immediately — no waiting for the backoff queue. 4. Improved visibility reconnect (useTerminal.ts) - Existing handler only checked CLOSED state; now also detects stuck CONNECTING sockets (common after wake) and force-restarts them. - Cancels any queued backoff retry before calling connect(). 
https://claude.ai/code/session_01EuCbuu1DNGduvLdMP11ykX --- backend/src/terminal/routes.ts | 28 +++++++++- frontend/src/hooks/useTerminal.ts | 87 ++++++++++++++++++++++++++++--- 2 files changed, 107 insertions(+), 8 deletions(-) diff --git a/backend/src/terminal/routes.ts b/backend/src/terminal/routes.ts index 660b59c..88fe943 100644 --- a/backend/src/terminal/routes.ts +++ b/backend/src/terminal/routes.ts @@ -132,7 +132,7 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { fastify.get('/ws/terminal', { websocket: true }, (connection: any, request) => { // In @fastify/websocket v11, connection might be the socket itself or contain a socket const ws = connection.socket || connection; - + if (!ws || typeof ws.send !== 'function') { fastify.log.error({ connection: !!connection }, 'Invalid WebSocket connection object'); return; @@ -142,6 +142,25 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { let attachPromise: Promise | null = null; let lastSize = { cols: 80, rows: 24 }; + // Heartbeat: detect silent/dead connections (e.g. phone sleep, network change). + // Server pings every 15s; if no pong arrives before the next ping, the connection + // is considered dead and closed. This mirrors mosh's approach of actively probing + // the client-to-server link rather than waiting for TCP to eventually time out. + const HEARTBEAT_INTERVAL_MS = 15_000; + let heartbeatAlive = true; + const heartbeatTimer = setInterval(() => { + if (!heartbeatAlive) { + ws.close(1001, 'Ping timeout'); + return; + } + heartbeatAlive = false; + if (ws.readyState === 1) { + ws.send(JSON.stringify({ type: 'ping', timestamp: Date.now() })); + } + }, HEARTBEAT_INTERVAL_MS); + + const cleanupHeartbeat = () => clearInterval(heartbeatTimer); + const cookieToken = request.cookies?.['session']; const queryToken = (request.query as Record)['token']; const token = cookieToken ?? 
queryToken; @@ -242,6 +261,11 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { break; } + case 'pong': { + heartbeatAlive = true; + break; + } + case 'terminal.input': { if (!ptySession && attachPromise) { await attachPromise; @@ -295,12 +319,14 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { }); ws.on('close', () => { + cleanupHeartbeat(); void ptySession?.close().catch(() => {}); ptySession = null; attachedSession = null; }); ws.on('error', () => { + cleanupHeartbeat(); void ptySession?.close().catch(() => {}); ptySession = null; attachedSession = null; diff --git a/frontend/src/hooks/useTerminal.ts b/frontend/src/hooks/useTerminal.ts index d0d07d8..892d211 100644 --- a/frontend/src/hooks/useTerminal.ts +++ b/frontend/src/hooks/useTerminal.ts @@ -32,6 +32,10 @@ function decodeBase64ToBytes(dataBase64: string): Uint8Array { const MAX_RETRIES = 10 const BASE_DELAY_MS = 500 +// If the server sends no ping for this long, assume the connection is silently dead +// (e.g. phone woke from sleep and the TCP socket wasn't cleaned up server-side yet). +const CLIENT_PING_TIMEOUT_MS = 35_000 + export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTerminalResult { const wsRef = useRef(null) const retryCountRef = useRef(0) @@ -42,6 +46,8 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer const lastSizeRef = useRef<{ cols: number; rows: number } | null>(null) const pendingMessagesRef = useRef([]) const hasRenderedContentRef = useRef(false) + // Tracks last server ping time so we can detect silent connection death + const lastPingRef = useRef(Date.now()) const markReady = useCallback(() => { hasRenderedContentRef.current = true @@ -83,6 +89,25 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer } }, [sessionId, terminal, markReady]) + // Force-reconnect immediately, bypassing backoff. 
Used when we get a strong signal + // that the connection is dead (network change, visibility restore, ping timeout). + const reconnectNow = useCallback(() => { + if (!mountedRef.current) return + if (retryTimeoutRef.current) { + clearTimeout(retryTimeoutRef.current) + retryTimeoutRef.current = null + } + const current = wsRef.current + if (current) { + current.onclose = null // suppress the normal close→backoff path + current.onerror = null + current.close() + wsRef.current = null + } + retryCountRef.current = 0 + setIsConnected(false) + }, []) + const connect = useCallback(() => { if (!mountedRef.current) return @@ -93,12 +118,25 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer const ws = new WebSocket(url) wsRef.current = ws + // Watchdog: if no ping arrives from the server within CLIENT_PING_TIMEOUT_MS, the + // connection is silently dead (common after phone sleep or a WiFi→cellular switch). + // Close it immediately so the onclose handler kicks off a fresh reconnect. + lastPingRef.current = Date.now() + const pingWatchdog = setInterval(() => { + if (Date.now() - lastPingRef.current > CLIENT_PING_TIMEOUT_MS) { + clearInterval(pingWatchdog) + ws.close(1001, 'Ping watchdog timeout') + } + }, 5_000) + ws.onopen = () => { if (!mountedRef.current) { + clearInterval(pingWatchdog) ws.close() return } retryCountRef.current = 0 + lastPingRef.current = Date.now() // reset watchdog on fresh connect setIsConnected(true) setBootState(hasRenderedContentRef.current ? 'ready' : 'waiting-for-output') @@ -127,9 +165,20 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer status?: string error?: string message?: string + timestamp?: number } switch (msg.type) { + case 'ping': + // Respond to server heartbeat and reset the client-side watchdog. 
+ // This is the mosh-inspired link health check: both ends actively verify + // the channel is alive so dead connections are detected in <20s rather + // than waiting for TCP timeout (which can take minutes). + lastPingRef.current = Date.now() + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ type: 'pong', timestamp: msg.timestamp })) + } + break case 'terminal.output': if (terminal && msg.dataBase64 !== undefined) { const bytes = decodeBase64ToBytes(msg.dataBase64) @@ -163,6 +212,7 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer } ws.onclose = () => { + clearInterval(pingWatchdog) if (!mountedRef.current) return setIsConnected(false) if (!hasRenderedContentRef.current) { @@ -188,23 +238,46 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer } ws.onerror = () => { + clearInterval(pingWatchdog) ws.close() } }, [sessionId, terminal]) - // Handle visibility + // Handle visibility: when the page becomes visible after a sleep/background period, + // reconnect immediately. Also handles stuck CONNECTING sockets (common after wake). useEffect(() => { const handleVisibilityChange = () => { - if (document.visibilityState === 'visible') { - if (!wsRef.current || wsRef.current.readyState === WebSocket.CLOSED) { - retryCountRef.current = 0 - connect() - } + if (document.visibilityState !== 'visible') return + const ws = wsRef.current + const isDead = !ws + || ws.readyState === WebSocket.CLOSED + || ws.readyState === WebSocket.CLOSING + // CONNECTING sockets may be stuck after a sleep; if the last ping was long ago + // it's safer to kill and restart than to wait for the backoff chain. 
+ const isStuckConnecting = ws?.readyState === WebSocket.CONNECTING + && Date.now() - lastPingRef.current > CLIENT_PING_TIMEOUT_MS + if (isDead || isStuckConnecting) { + reconnectNow() + connect() } } document.addEventListener('visibilitychange', handleVisibilityChange) return () => document.removeEventListener('visibilitychange', handleVisibilityChange) - }, [connect]) + }, [connect, reconnectNow]) + + // Handle network changes: when the browser comes back online (WiFi↔cellular switch, + // or reconnecting after airplane mode) immediately try to reconnect rather than + // waiting for the exponential backoff queue to drain. This is the browser-accessible + // analog to mosh's roaming — we can't change IP-layer transport, but we can react + // to the network change event as fast as possible. + useEffect(() => { + const handleOnline = () => { + reconnectNow() + connect() + } + window.addEventListener('online', handleOnline) + return () => window.removeEventListener('online', handleOnline) + }, [connect, reconnectNow]) useEffect(() => { mountedRef.current = true From c9b572a39c07efe3e57dc0660d36a9e6f401e762 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Mar 2026 11:25:40 +0000 Subject: [PATCH 2/6] docs: document mosh-inspired connection resilience improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates four docs to explain the heartbeat, ping watchdog, network-aware reconnect, and improved visibility handling added in the previous commit. 
- CHANGELOG.md — four new bullet points under [Unreleased] - README.md — new "Resilient connection" feature bullet + new FAQ entry covering the phone sleep / Wi-Fi↔cellular scenario explicitly - docs/how-it-works.md — new "Connection Resilience" section (§3) explaining each layer of the system; section numbers bumped; summary flow updated with a resilience step - docs/remote-control.md — new "Connection Resilience on Mobile" section with a scenario table covering sleep, network switch, signal loss, and page-visibility cases https://claude.ai/code/session_01EuCbuu1DNGduvLdMP11ykX --- CHANGELOG.md | 4 ++++ README.md | 4 ++++ docs/how-it-works.md | 25 +++++++++++++++++++++++-- docs/remote-control.md | 13 +++++++++++++ 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 048829d..476839d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Visible agent selection in the quick dispatch card so tasks can be routed to different agents intentionally - Full transcript reader with paginated history, scrollback loading, and timestamped session output - Session loading panel that appears immediately after launch so users see progress before terminal output arrives +- **Connection heartbeat:** server sends a `ping` every 15 seconds and closes the socket if no `pong` arrives before the next ping — dead connections are now detected within 30 seconds instead of waiting for TCP timeout +- **Client-side ping watchdog:** client force-closes and reconnects if no server ping is received for 35 seconds, catching the case where the TCP socket is silently stale (common after phone sleep with an expired NAT entry) +- **Network-aware reconnect:** listening on the browser `online` event immediately cancels any pending backoff timer and opens a fresh WebSocket when the device changes networks (Wi-Fi↔cellular switch, airplane mode off, etc.) 
+- **Improved visibility reconnect:** page-visibility handler now detects sockets stuck in `CONNECTING` state — a wake-from-sleep artifact — and replaces them immediately rather than waiting for the connection attempt to time out ### Changed diff --git a/README.md b/README.md index b6b822a..73102eb 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Start an agent on your laptop, walk away, and check in from your phone or tablet - **Transcript logs:** Shows the full session output in a scrollable, timestamped transcript view. - **QR code pairing:** Scan a QR code from your terminal to authenticate your phone. No passwords or SSH keys. - **Persistent sessions:** Sessions run inside `tmux`. Your agent keeps working if your laptop sleeps or your connection drops. Reconnect and pick up where you left off. +- **Resilient connection:** A server-side heartbeat (15-second pings) and a client-side watchdog (35-second timeout) detect dead connections in well under a minute. The browser's network-change event triggers an immediate reconnect when you switch between Wi-Fi and cellular — no waiting for backoff timers to drain. - **Flexible networking:** Works on local Wi-Fi, over Tailscale (private network), or via Cloudflare Tunnels (no port-forwarding needed). - **Git worktree isolation:** Run agents in isolated `git worktrees` to keep your working directory clean. @@ -158,6 +159,9 @@ CloudCode uses a small Go-based sidecar to interface with UNIX pseudo-terminals **What happens if my laptop goes to sleep while an agent is running?** The agent keeps running. Sessions are managed by `tmux`, which is independent of CloudCode's web server. Your agent's process continues as long as the machine is powered on. When you reconnect, CloudCode picks the session back up. +**What happens to my phone's connection when it sleeps or switches networks?** +CloudCode's connection layer is designed for exactly this. 
A server-side heartbeat ping detects that your phone's WebSocket is gone within 30 seconds (one to two 15-second ping intervals) rather than waiting for TCP's multi-minute timeout. On the client side, the browser's `online` event fires the moment a new network interface is ready (e.g. after a Wi-Fi→cellular switch), triggering an immediate reconnect without cycling through an exponential backoff queue. In practice, the terminal is back live within a few seconds of your phone waking up or changing networks. + **Can I run multiple agents at the same time?** Yes. Each session is an independent `tmux` window. You can run as many concurrent sessions as your machine can handle and manage them all from the dashboard. diff --git a/docs/how-it-works.md b/docs/how-it-works.md index 1fd8abd..353ca26 100644 --- a/docs/how-it-works.md +++ b/docs/how-it-works.md @@ -17,13 +17,33 @@ Standard mobile keyboards are missing critical developer keys (`Ctrl`, `Esc`, `T * **Haptic Feedback**: Every keypress provides a subtle vibration, making the virtual terminal feel tactile and responsive. * **Live PTY Stream**: CloudCode uses a dedicated PTY sidecar to attach to tmux and stream raw terminal bytes to the browser, preserving interactive terminal behavior with scrollback and fewer rendering artifacts. -### 3. Secure Remote Access +### 3. Connection Resilience + +tmux guarantees the *agent* survives any disruption — but the *connection* between your phone and the server is a separate problem. Standard WebSocket over TCP has the same fragility as SSH: a network change kills the socket silently, and neither side knows until TCP's own timeout fires (which can take minutes). + +CloudCode uses a layered approach to detect and recover from these failures as fast as possible: + +**Server-side heartbeat** +The server sends a `ping` message every 15 seconds. If a `pong` does not arrive before the next ping interval, the connection is declared dead and closed immediately. 
This bounds the detection window to roughly 15–30 seconds (one to two ping intervals) instead of waiting for TCP's multi-minute timeout. + +**Client-side ping watchdog** +The client tracks the timestamp of the last server ping. If no ping has been received for 35 seconds — a signal that the TCP socket is silently dead — the client force-closes the socket and starts a fresh reconnect. This catches the mirror case where the server is alive but the client's side of the connection has gone stale (common after a phone wake from sleep with NAT table entries already expired). + +**Network-aware reconnect** +The browser's `online` event fires when a network interface becomes available — including transitions between Wi-Fi and cellular. CloudCode listens for this event and immediately cancels any pending backoff retry and opens a new WebSocket. On a typical Wi-Fi↔cellular switch, the terminal is back live in under two seconds. + +**Improved wake-from-sleep recovery** +When the browser tab becomes visible again, CloudCode checks not only for closed sockets but also for sockets stuck in the `CONNECTING` state — a common artifact of waking a phone that had an in-flight connection attempt. Stuck sockets are terminated and replaced immediately rather than waiting for the connection attempt to time out. + +--- + +### 4. Secure Remote Access CloudCode is designed to be used over [Tailscale](https://tailscale.com). * **Private Networking**: Your workstation gets a private IP that is only accessible to your devices. * **Identity Validation**: When integrated with Tailscale, CloudCode can verify exactly *who* is accessing the server before they even see a login page. * **Zero-Trust**: No ports need to be opened to the public internet. -### 4. Safety & Auditing +### 5. Safety & Auditing Because agents are powerful, CloudCode prioritizes transparency: * **Path Sandboxing**: Agents are restricted to specific "Repository Roots" to prevent accidental directory traversal. 
* **Live Audit Logs**: Every session creation, stop command, and profile change is logged with a timestamp and user ID. @@ -35,3 +55,4 @@ Because agents are powerful, CloudCode prioritizes transparency: 1. **Workstation**: Runs the CloudCode backend, SQLite DB, and tmux. 2. **Tailscale**: Securely tunnels your phone to your workstation. 3. **Phone**: Accesses the CloudCode PWA to launch, monitor, and interact with agents via a live PTY stream backed by tmux sessions. +4. **Resilience layer**: Server heartbeat + client watchdog + network-event listener ensure the WebSocket reconnects within seconds of any network disruption — phone sleep, Wi-Fi↔cellular switch, or brief signal loss. diff --git a/docs/remote-control.md b/docs/remote-control.md index 1fa3294..2562b79 100644 --- a/docs/remote-control.md +++ b/docs/remote-control.md @@ -51,3 +51,16 @@ CloudCode uses a "Zero-Password" pairing system: 2. It embeds this token into a QR code. 3. When you scan the QR code, the remote device is instantly authenticated and granted a 30-day session cookie. 4. No need to type passwords or manage SSH keys on your mobile device. + +## Connection Resilience on Mobile + +Pairing gets you connected — but mobile networks are inherently unstable. CloudCode is designed to stay live through the disruptions that are normal on a phone: + +| Scenario | What happens | +|---|---| +| Phone screen locks / sleeps | Server detects the silent socket within 30 s via heartbeat (one to two 15 s ping intervals); client detects it via ping watchdog once 35 s pass without a server ping (checked every 5 s). Both sides clean up and the next wake triggers an instant reconnect. | +| Wi-Fi → cellular (or back) | Browser fires the `online` event the moment a new interface is ready. CloudCode immediately opens a fresh WebSocket — no waiting for the backoff queue. | +| Brief signal loss | Existing exponential backoff (up to 10 retries, capped at 30 s) handles transient drops. 
| +| Page becomes visible after background | Visibility handler checks for closed *and* stuck-CONNECTING sockets, terminates them, and reconnects before you can tap anything. | + +The agent itself is never affected by any of these events — it continues running in its `tmux` session regardless. The resilience work is entirely about getting your phone's view back to the live session as fast as possible. From 490ff7e4352f5427c18cef7bb82bf1a28007e041 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Mar 2026 17:39:39 +0000 Subject: [PATCH 3/6] fix: plug ping watchdog leak and guard stale WebSocket onopen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in useTerminal fixed: 1. Ping watchdog interval leak — reconnectNow() suppresses the onclose callback (by setting ws.onclose = null) so the backoff path is skipped on forced reconnects. As a side-effect, the clearInterval(pingWatchdog) call inside onclose was never reached, leaking one setInterval per forced reconnect. Fix: track the active watchdog in pingWatchdogRef and clear it explicitly in reconnectNow() and on unmount. 2. Stale-socket race in onopen — if reconnectNow fires while a CONNECTING socket's handshake is in-flight, the old socket's onopen can still fire after the new socket has taken wsRef.current. That would corrupt shared state (retryCountRef, pendingMessagesRef, isConnected) on behalf of the wrong socket. Fix: guard onopen and onclose with if (ws !== wsRef.current) and silently close the orphaned socket. 
https://claude.ai/code/session_01EuCbuu1DNGduvLdMP11ykX --- frontend/src/hooks/useTerminal.ts | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/frontend/src/hooks/useTerminal.ts b/frontend/src/hooks/useTerminal.ts index 892d211..c49c36b 100644 --- a/frontend/src/hooks/useTerminal.ts +++ b/frontend/src/hooks/useTerminal.ts @@ -40,6 +40,7 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer const wsRef = useRef(null) const retryCountRef = useRef(0) const retryTimeoutRef = useRef | null>(null) + const pingWatchdogRef = useRef | null>(null) const mountedRef = useRef(true) const [isConnected, setIsConnected] = useState(false) const [bootState, setBootState] = useState('loading-history') @@ -97,6 +98,13 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer clearTimeout(retryTimeoutRef.current) retryTimeoutRef.current = null } + // Clear the watchdog for the socket we're about to force-close. Without this, + // setting onclose=null (below) would prevent the normal onclose path from calling + // clearInterval, so the interval would leak and accumulate across reconnects. + if (pingWatchdogRef.current) { + clearInterval(pingWatchdogRef.current) + pingWatchdogRef.current = null + } const current = wsRef.current if (current) { current.onclose = null // suppress the normal close→backoff path @@ -125,11 +133,21 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer const pingWatchdog = setInterval(() => { if (Date.now() - lastPingRef.current > CLIENT_PING_TIMEOUT_MS) { clearInterval(pingWatchdog) + if (pingWatchdogRef.current === pingWatchdog) pingWatchdogRef.current = null ws.close(1001, 'Ping watchdog timeout') } }, 5_000) + pingWatchdogRef.current = pingWatchdog ws.onopen = () => { + // Guard against stale sockets: if reconnectNow fired while this socket's + // handshake was in-flight, a newer socket has already taken wsRef.current. 
+ // Silently close this one rather than corrupting shared state. + if (ws !== wsRef.current) { + clearInterval(pingWatchdog) + ws.close() + return + } if (!mountedRef.current) { clearInterval(pingWatchdog) ws.close() @@ -213,7 +231,10 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer ws.onclose = () => { clearInterval(pingWatchdog) + if (pingWatchdogRef.current === pingWatchdog) pingWatchdogRef.current = null if (!mountedRef.current) return + // Stale socket (displaced by reconnectNow + a new connect call): ignore. + if (ws !== wsRef.current) return setIsConnected(false) if (!hasRenderedContentRef.current) { setBootState('connecting') @@ -295,6 +316,10 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer return () => { mountedRef.current = false if (retryTimeoutRef.current) clearTimeout(retryTimeoutRef.current) + if (pingWatchdogRef.current) { + clearInterval(pingWatchdogRef.current) + pingWatchdogRef.current = null + } if (wsRef.current) { wsRef.current.close() wsRef.current = null From b326287a3f996f0583b918b9973c2e607fbf2c9a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 24 Mar 2026 17:40:03 +0000 Subject: [PATCH 4/6] chore: update package-lock.json after npm install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflects workspace version bumps (backend/frontend 0.1.5→0.1.6) and dev/peer flag corrections for optional rollup platform packages. 
https://claude.ai/code/session_01EuCbuu1DNGduvLdMP11ykX --- package-lock.json | 56 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index 2bf6129..8bb237d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,7 @@ }, "backend": { "name": "@humans-of-ai/cloudcode", - "version": "0.1.5", + "version": "0.1.6", "hasInstallScript": true, "license": "MIT", "dependencies": { @@ -67,7 +67,7 @@ }, "frontend": { "name": "cloudcode-frontend", - "version": "0.1.0", + "version": "0.1.6", "license": "MIT", "dependencies": { "@tailwindcss/typography": "^0.5.19", @@ -280,11 +280,13 @@ "cpu": [ "ppc64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "aix" ], + "peer": true, "engines": { "node": ">=18" } @@ -296,11 +298,13 @@ "cpu": [ "arm" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "android" ], + "peer": true, "engines": { "node": ">=18" } @@ -312,11 +316,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "android" ], + "peer": true, "engines": { "node": ">=18" } @@ -328,11 +334,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "android" ], + "peer": true, "engines": { "node": ">=18" } @@ -344,11 +352,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "darwin" ], + "peer": true, "engines": { "node": ">=18" } @@ -360,11 +370,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "darwin" ], + "peer": true, "engines": { "node": ">=18" } @@ -376,11 +388,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "freebsd" ], + "peer": true, "engines": { "node": ">=18" } @@ -392,11 +406,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "freebsd" ], + "peer": true, "engines": { "node": ">=18" } @@ -408,11 +424,13 @@ "cpu": [ "arm" ], + "dev": true, "license": 
"MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -424,11 +442,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -440,11 +460,13 @@ "cpu": [ "ia32" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -456,11 +478,13 @@ "cpu": [ "loong64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -472,11 +496,13 @@ "cpu": [ "mips64el" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -488,11 +514,13 @@ "cpu": [ "ppc64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -504,11 +532,13 @@ "cpu": [ "riscv64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -520,11 +550,13 @@ "cpu": [ "s390x" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -536,11 +568,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "linux" ], + "peer": true, "engines": { "node": ">=18" } @@ -552,11 +586,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "netbsd" ], + "peer": true, "engines": { "node": ">=18" } @@ -568,11 +604,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "netbsd" ], + "peer": true, "engines": { "node": ">=18" } @@ -584,11 +622,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "openbsd" ], + "peer": true, "engines": { "node": ">=18" } @@ -600,11 +640,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "openbsd" ], + "peer": true, "engines": { "node": ">=18" } @@ 
-616,11 +658,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "openharmony" ], + "peer": true, "engines": { "node": ">=18" } @@ -632,11 +676,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "sunos" ], + "peer": true, "engines": { "node": ">=18" } @@ -648,11 +694,13 @@ "cpu": [ "arm64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "win32" ], + "peer": true, "engines": { "node": ">=18" } @@ -664,11 +712,13 @@ "cpu": [ "ia32" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "win32" ], + "peer": true, "engines": { "node": ">=18" } @@ -680,11 +730,13 @@ "cpu": [ "x64" ], + "dev": true, "license": "MIT", "optional": true, "os": [ "win32" ], + "peer": true, "engines": { "node": ">=18" } From 9824f8d01f9724527b3e25dd75c8da0717e21c69 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Mar 2026 03:30:56 +0000 Subject: [PATCH 5/6] fix: offline feedback, online debounce/guard, session.status handling Three follow-up improvements to connection resilience: 1. offline event listener Listening to window 'offline' now calls setIsConnected(false) immediately, so the terminal header flips to "Syncing" the moment the network drops rather than waiting up to 35 s for the ping watchdog or 20 s for the server heartbeat to notice. 2. handleOnline OPEN guard + 200 ms debounce - Guard: if wsRef.current is already OPEN, skip the reconnect entirely. Some mobile browsers fire 'online' even when the existing socket is healthy (e.g. switching back to a known Wi-Fi while LTE stays up), which would needlessly tear down a working connection. - Debounce: some OS/browser combos emit multiple 'online' events during a single network transition. The 200 ms debounce collapses the burst into one reconnect attempt instead of spinning up multiple sockets that the stale-socket guard then has to clean up. 3. session.status stopped/error handling The 'session.status' case was a no-op. 
The backend sends {type:'session.status', status:'stopped'} when the PTY exits and {status:'error'} on failures. Now these set sessionEnded=true and isConnected=false so the terminal header correctly shows "Ended" (grey dot) instead of staying on "Live" or "Syncing" after the agent finishes. sessionEnded is reset to false when a new session mounts. Terminal.tsx updated to consume the new sessionEnded field. https://claude.ai/code/session_01EuCbuu1DNGduvLdMP11ykX --- frontend/src/components/Terminal.tsx | 8 +++--- frontend/src/hooks/useTerminal.ts | 43 +++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/frontend/src/components/Terminal.tsx b/frontend/src/components/Terminal.tsx index 68c986b..ea4e5a4 100644 --- a/frontend/src/components/Terminal.tsx +++ b/frontend/src/components/Terminal.tsx @@ -63,7 +63,7 @@ export function Terminal({ sessionId, sessionTitle, agentName }: TerminalProps) const fitAddonRef = useRef(null) const searchAddonRef = useRef(null) const [terminalInstance, setTerminalInstance] = useState(null) - const { isConnected, bootState, sendInput, resize } = useTerminal({ sessionId, terminal: terminalInstance }) + const { isConnected, bootState, sessionEnded, sendInput, resize } = useTerminal({ sessionId, terminal: terminalInstance }) const [ctrlMode, setCtrlMode] = useState(false) const [showSearch, setShowSearch] = useState(false) @@ -693,8 +693,8 @@ export function Terminal({ sessionId, sessionTitle, agentName }: TerminalProps) title="Scroll to bottom" >END - - {isConnected ? 'Live' : 'Syncing'} + + {sessionEnded ? 'Ended' : isConnected ? 'Live' : 'Syncing'} @@ -728,7 +728,7 @@ export function Terminal({ sessionId, sessionTitle, agentName }: TerminalProps)
- {isConnected ? 'Stream attached' : 'Connecting...'} + {sessionEnded ? 'Session ended' : isConnected ? 'Stream attached' : 'Connecting...'} Session {sessionId.slice(0, 8)}
diff --git a/frontend/src/hooks/useTerminal.ts b/frontend/src/hooks/useTerminal.ts index c49c36b..10e2918 100644 --- a/frontend/src/hooks/useTerminal.ts +++ b/frontend/src/hooks/useTerminal.ts @@ -10,6 +10,7 @@ export interface UseTerminalOptions { export interface UseTerminalResult { isConnected: boolean bootState: 'loading-history' | 'connecting' | 'waiting-for-output' | 'ready' + sessionEnded: boolean sendInput: (data: string) => void resize: (cols: number, rows: number) => void } @@ -44,6 +45,7 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer const mountedRef = useRef(true) const [isConnected, setIsConnected] = useState(false) const [bootState, setBootState] = useState('loading-history') + const [sessionEnded, setSessionEnded] = useState(false) const lastSizeRef = useRef<{ cols: number; rows: number } | null>(null) const pendingMessagesRef = useRef([]) const hasRenderedContentRef = useRef(false) @@ -212,6 +214,13 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer } break case 'session.status': + // The backend sends this when the PTY exits (status: 'stopped') or when + // a session transitions to an error state. Update UI immediately so the + // terminal header shows "Ended" rather than staying on "Live". + if (msg.status === 'stopped' || msg.status === 'error') { + setSessionEnded(true) + setIsConnected(false) + } break case 'session.error': if (terminal) { @@ -291,19 +300,45 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer // waiting for the exponential backoff queue to drain. This is the browser-accessible // analog to mosh's roaming — we can't change IP-layer transport, but we can react // to the network change event as fast as possible. + // + // Guards: + // • OPEN check — skip if we already have a healthy connection; some mobile browsers + // fire `online` even when the socket is still alive (e.g. 
switching back to a + // known Wi-Fi network while LTE stays up briefly). + // • Debounce (200 ms) — some OS/browser combos emit multiple `online` events in + // rapid succession during a single network transition. Without debouncing each + // event would tear down and re-create the socket, producing a burst of in-flight + // connections that the stale-socket guard would then have to clean up. useEffect(() => { + let onlineDebounceTimer: ReturnType | null = null const handleOnline = () => { - reconnectNow() - connect() + if (wsRef.current?.readyState === WebSocket.OPEN) return + if (onlineDebounceTimer) clearTimeout(onlineDebounceTimer) + onlineDebounceTimer = setTimeout(() => { + onlineDebounceTimer = null + reconnectNow() + connect() + }, 200) + } + // When the network goes away, update the UI immediately rather than waiting + // up to 35 s for the watchdog or 20 s for the server heartbeat to notice. + const handleOffline = () => { + setIsConnected(false) } window.addEventListener('online', handleOnline) - return () => window.removeEventListener('online', handleOnline) + window.addEventListener('offline', handleOffline) + return () => { + if (onlineDebounceTimer) clearTimeout(onlineDebounceTimer) + window.removeEventListener('online', handleOnline) + window.removeEventListener('offline', handleOffline) + } }, [connect, reconnectNow]) useEffect(() => { mountedRef.current = true hasRenderedContentRef.current = false setBootState('loading-history') + setSessionEnded(false) if (terminal) { terminal.write('\x1bc') void loadBootstrap().finally(() => { @@ -327,5 +362,5 @@ export function useTerminal({ sessionId, terminal }: UseTerminalOptions): UseTer } }, [sessionId, terminal, connect, loadBootstrap]) - return { isConnected, bootState, sendInput, resize } + return { isConnected, bootState, sessionEnded, sendInput, resize } } From d910d41ae6490027140c5f064ea2cc5693a600e5 Mon Sep 17 00:00:00 2001 From: Alex Chao Date: Wed, 25 Mar 2026 07:51:36 -0700 Subject: [PATCH 6/6] 
Fix heartbeat timer leak on unauthenticated WebSocket connections Two fixes to the heartbeat cleanup logic: 1. The auth failure early-return path now calls cleanupHeartbeat() before closing the socket. Previously the ws.on('close') handler (which calls cleanupHeartbeat) was registered after the auth check, so unauthenticated connections leaked the setInterval forever. 2. The heartbeat interval callback now clears itself before calling ws.close() when a ping timeout is detected, rather than relying solely on the onclose handler to stop it. This prevents the interval from firing additional times while the socket transitions through CLOSING state. --- backend/src/terminal/routes.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/src/terminal/routes.ts b/backend/src/terminal/routes.ts index 88fe943..13c1a76 100644 --- a/backend/src/terminal/routes.ts +++ b/backend/src/terminal/routes.ts @@ -150,6 +150,7 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { let heartbeatAlive = true; const heartbeatTimer = setInterval(() => { if (!heartbeatAlive) { + cleanupHeartbeat(); ws.close(1001, 'Ping timeout'); return; } @@ -178,6 +179,7 @@ const terminalRoutes: FastifyPluginAsync = async (fastify) => { type: 'session.error', message: 'Authentication required', })); + cleanupHeartbeat(); ws.close(1008, 'Unauthorized'); return; }