diff --git a/src/router/dangling-image-cleanup.ts b/src/router/dangling-image-cleanup.ts new file mode 100644 index 00000000..6ed21a53 --- /dev/null +++ b/src/router/dangling-image-cleanup.ts @@ -0,0 +1,150 @@ +/** + * Periodic dangling-image cleanup for cascade-managed Docker images. + * + * Closes the leak class where `commitContainerToSnapshot` re-tags the + * `cascade-snapshot--:latest` slot for repeated runs of the + * same work item: the previous image's digest becomes dangling (untagged) + * and is dropped from the in-memory snapshot registry, so the registry-driven + * `runSnapshotCleanup` (`snapshot-cleanup.ts`) never sees it again. Without + * this loop, those orphaned digests accumulate at ~5 GB each. Production was + * measured at 102 GB reclaimable across 136 dangling images on 2026-05-01. + * + * Safety: the scan filter is `dangling=true AND label=cascade.managed=true`, + * AND-ed by Docker's filter API. The label clause is the only thing + * protecting unrelated host workloads (ucho-dev/prod, MySQL, Loki, etc.) + * from being reaped — see the regression test of the same name in + * `tests/unit/router/dangling-image-cleanup.test.ts`. Never widen the scope. + * + * The 5-min snapshot eviction loop and the 5-min orphan-container cleanup + * loop are unaffected; this loop runs at 30 min because dangling + * accumulation is gradual and `force: false` rmi is cheap. + */ + +import Docker from 'dockerode'; +import { captureException } from '../sentry.js'; +import { logger } from '../utils/logging.js'; + +const docker = new Docker(); + +const DANGLING_CLEANUP_INTERVAL_MS = 30 * 60 * 1000; // 30 minutes + +let danglingCleanupTimer: NodeJS.Timeout | null = null; + +/** + * Start the periodic dangling-image cleanup scan. + * No-op + warn on double-start. Mirrors `startOrphanCleanup`. + */ +export function startDanglingImageCleanup(): void { + if (danglingCleanupTimer) { + logger.warn('[DanglingImageCleanup] Cleanup already started'); + return; + } + + danglingCleanupTimer = setInterval(() => { + scanAndCleanupDanglingImages().catch((err) => { + logger.error('[DanglingImageCleanup] Error during cleanup scan:', err); + captureException(err, { + tags: { source: 'dangling_image_cleanup_scan' }, + level: 'error', + }); + }); + }, DANGLING_CLEANUP_INTERVAL_MS); + + logger.info('[DanglingImageCleanup] Started cleanup scan (every 30 minutes)'); +} + +/** Stop the periodic dangling-image cleanup scan. No-op when not started. */ +export function stopDanglingImageCleanup(): void { + if (danglingCleanupTimer) { + clearInterval(danglingCleanupTimer); + danglingCleanupTimer = null; + logger.info('[DanglingImageCleanup] Stopped cleanup scan'); + } +} + +interface DockerErrorShape { + statusCode?: number; +} + +function dockerStatusCode(err: unknown): number | undefined { + if (err && typeof err === 'object' && 'statusCode' in err) { + const code = (err as DockerErrorShape).statusCode; + return typeof code === 'number' ? code : undefined; + } + return undefined; +} + +/** + * Scan for dangling cascade-managed images and remove them. + * + * Filter is locked to `dangling=true AND label=cascade.managed=true` — + * widening this is a regression covered by the test of the same name. + * + * Per-image error handling mirrors `removeSnapshotImage` in + * `snapshot-cleanup.ts:80-110`: 409 (in use) and 404 (already gone) are + * silently swallowed; any other error is logged at warn and Sentry-captured + * under tag `dangling_image_remove`. The loop continues regardless. + * + * Failures of `listImages` itself are logged at error and Sentry-captured + * under tag `dangling_image_cleanup_scan`. The function never throws. + * + * @internal Exported for testing. + */ +export async function scanAndCleanupDanglingImages(): Promise { + let images: Array<{ Id: string; Size: number }>; + try { + images = (await docker.listImages({ + filters: { dangling: ['true'], label: ['cascade.managed=true'] }, + })) as Array<{ Id: string; Size: number }>; + } catch (err) { + logger.error('[DanglingImageCleanup] Failed to list dangling images:', err); + captureException(err, { + tags: { source: 'dangling_image_cleanup_scan' }, + level: 'error', + }); + return; + } + + if (images.length === 0) return; + + let removedCount = 0; + let reclaimedBytes = 0; + + for (const img of images) { + try { + await docker.getImage(img.Id).remove({ force: false }); + removedCount += 1; + reclaimedBytes += img.Size; + } catch (err: unknown) { + const status = dockerStatusCode(err); + if (status === 409) { + logger.debug('[DanglingImageCleanup] Dangling image in use, deferring:', { + imageId: img.Id, + }); + continue; + } + if (status === 404) { + logger.debug('[DanglingImageCleanup] Dangling image already gone:', { + imageId: img.Id, + }); + continue; + } + logger.warn('[DanglingImageCleanup] Failed to remove dangling image:', { + imageId: img.Id, + error: String(err), + }); + captureException(err, { + tags: { source: 'dangling_image_remove' }, + extra: { imageId: img.Id }, + level: 'warning', + }); + } + } + + if (removedCount > 0) { + logger.info('[DanglingImageCleanup] Cleanup pass complete:', { + removedCount, + reclaimedBytes, + }); + } +} diff --git a/src/router/worker-manager.ts b/src/router/worker-manager.ts index f031c76f..38511593 100644 --- a/src/router/worker-manager.ts +++ b/src/router/worker-manager.ts @@ -19,6 +19,7 @@ import { startOrphanCleanup, stopOrphanCleanup, } from './container-manager.js'; +import { startDanglingImageCleanup, stopDanglingImageCleanup } from './dangling-image-cleanup.js'; import { classifyDispatchError } from './dispatch-error-classifier.js'; import type { CascadeJob } from './queue.js'; import { acquireSlot, clearAllWaiters } from './slot-waiter.js'; @@ -101,6 +102,12 @@ export function startWorkerProcessor(): void { // Start periodic snapshot eviction alongside orphan cleanup startSnapshotCleanup(); + // Start periodic dangling-image cleanup. Closes the leak class where + // `commitContainerToSnapshot` re-tags `cascade-snapshot-*:latest` and + // orphans the prior digest outside the snapshot registry. See + // dangling-image-cleanup.ts for the safety scope. + startDanglingImageCleanup(); + // Reconcile pre-existing snapshot images on disk so the eviction loop can // apply TTL/max-count/max-size policies to them. Best-effort — Docker // outage at boot must not block the worker manager from starting. @@ -118,6 +125,7 @@ export async function stopWorkerProcessor(): Promise { // Stop orphan cleanup and snapshot cleanup first stopOrphanCleanup(); stopSnapshotCleanup(); + stopDanglingImageCleanup(); if (dashboardWorker) { await dashboardWorker.close(); diff --git a/tests/unit/router/dangling-image-cleanup.test.ts b/tests/unit/router/dangling-image-cleanup.test.ts new file mode 100644 index 00000000..d607aa46 --- /dev/null +++ b/tests/unit/router/dangling-image-cleanup.test.ts @@ -0,0 +1,280 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +// --------------------------------------------------------------------------- +// Hoisted mock state — created before vi.mock factories run +// --------------------------------------------------------------------------- + +const { + mockDockerListImages, + mockDockerGetImage, + mockImageRemove, + mockCaptureException, + mockLogger, +} = vi.hoisted(() => { + const mockImageRemove = vi.fn().mockResolvedValue(undefined); + return { + mockDockerListImages: vi.fn(), + mockDockerGetImage: vi.fn().mockReturnValue({ remove: mockImageRemove }), + mockImageRemove, + mockCaptureException: vi.fn(), + mockLogger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, + }; +}); + +// --------------------------------------------------------------------------- +// Module-level mocks +// --------------------------------------------------------------------------- + +vi.mock('dockerode', () => ({ + default: vi.fn().mockImplementation(() => ({ + getImage: mockDockerGetImage, + listImages: mockDockerListImages, + })), +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: mockLogger, +})); + +vi.mock('../../../src/sentry.js', () => ({ + captureException: mockCaptureException, +})); + +// --------------------------------------------------------------------------- +// Imports (after mocks) +// --------------------------------------------------------------------------- + +import { + scanAndCleanupDanglingImages, + startDanglingImageCleanup, + stopDanglingImageCleanup, +} from '../../../src/router/dangling-image-cleanup.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +interface DockerErrorShape { + statusCode?: number; + message?: string; +} + +function makeDockerError(statusCode: number, message: string): Error & DockerErrorShape { + const err = new Error(message) as Error & DockerErrorShape; + err.statusCode = statusCode; + return err; +} + +function makeImageSummary(id: string, size: number) { + return { Id: id, Size: size } as never; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('dangling-image-cleanup', () => { + beforeEach(() => { + mockDockerListImages.mockReset().mockResolvedValue([]); + mockImageRemove.mockReset().mockResolvedValue(undefined); + mockDockerGetImage.mockReset().mockReturnValue({ remove: mockImageRemove }); + mockCaptureException.mockReset(); + mockLogger.info.mockReset(); + mockLogger.warn.mockReset(); + mockLogger.error.mockReset(); + mockLogger.debug.mockReset(); + }); + + afterEach(() => { + stopDanglingImageCleanup(); + }); + + describe('scanAndCleanupDanglingImages — scan filter', () => { + it('lists images with EXACTLY {dangling=true, label=cascade.managed=true} (host-scope safety)', async () => { + // Regression guard: this filter is the only thing protecting unrelated + // images on the host (ucho-dev/prod, MySQL, Loki, etc.) from being + // reaped by a runaway prune. If anyone widens the scope by dropping + // either clause, this test must fail loudly. + await scanAndCleanupDanglingImages(); + + expect(mockDockerListImages).toHaveBeenCalledTimes(1); + expect(mockDockerListImages).toHaveBeenCalledWith({ + filters: { dangling: ['true'], label: ['cascade.managed=true'] }, + }); + }); + + it('passes the cascade.managed=true label clause (regression guard against scope expansion)', async () => { + await scanAndCleanupDanglingImages(); + + const callArg = mockDockerListImages.mock.calls[0]?.[0] as { + filters: { dangling?: string[]; label?: string[] }; + }; + expect(callArg.filters.label).toEqual(['cascade.managed=true']); + expect(callArg.filters.dangling).toEqual(['true']); + }); + }); + + describe('scanAndCleanupDanglingImages — happy path', () => { + it('removes each returned image with force=false', async () => { + mockDockerListImages.mockResolvedValue([ + makeImageSummary('sha256:aaa', 5_000_000_000), + makeImageSummary('sha256:bbb', 4_000_000_000), + ]); + + await scanAndCleanupDanglingImages(); + + expect(mockDockerGetImage).toHaveBeenCalledWith('sha256:aaa'); + expect(mockDockerGetImage).toHaveBeenCalledWith('sha256:bbb'); + expect(mockImageRemove).toHaveBeenCalledTimes(2); + expect(mockImageRemove).toHaveBeenNthCalledWith(1, { force: false }); + expect(mockImageRemove).toHaveBeenNthCalledWith(2, { force: false }); + }); + + it('logs the cleanup summary with removedCount and reclaimedBytes', async () => { + mockDockerListImages.mockResolvedValue([ + makeImageSummary('sha256:aaa', 5_000_000_000), + makeImageSummary('sha256:bbb', 4_000_000_000), + ]); + + await scanAndCleanupDanglingImages(); + + expect(mockLogger.info).toHaveBeenCalledWith( + '[DanglingImageCleanup] Cleanup pass complete:', + expect.objectContaining({ + removedCount: 2, + reclaimedBytes: 9_000_000_000, + }), + ); + }); + + it('does NOT log a summary when no images were found (zero noise)', async () => { + mockDockerListImages.mockResolvedValue([]); + + await scanAndCleanupDanglingImages(); + + expect(mockLogger.info).not.toHaveBeenCalledWith( + '[DanglingImageCleanup] Cleanup pass complete:', + expect.anything(), + ); + }); + }); + + describe('scanAndCleanupDanglingImages — Docker error swallowing', () => { + it('swallows 409 (image still in use) and continues with the next image', async () => { + mockDockerListImages.mockResolvedValue([ + makeImageSummary('sha256:in-use', 1_000), + makeImageSummary('sha256:ok', 2_000), + ]); + mockImageRemove + .mockRejectedValueOnce(makeDockerError(409, 'image is being used')) + .mockResolvedValueOnce(undefined); + + await expect(scanAndCleanupDanglingImages()).resolves.toBeUndefined(); + expect(mockImageRemove).toHaveBeenCalledTimes(2); + // 409 is debug-logged, not warn — to keep the noise floor low. + expect(mockLogger.debug).toHaveBeenCalledWith( + '[DanglingImageCleanup] Dangling image in use, deferring:', + expect.objectContaining({ imageId: 'sha256:in-use' }), + ); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + + it('swallows 404 (image already gone) and continues', async () => { + mockDockerListImages.mockResolvedValue([ + makeImageSummary('sha256:gone', 1_000), + makeImageSummary('sha256:ok', 2_000), + ]); + mockImageRemove + .mockRejectedValueOnce(makeDockerError(404, 'no such image')) + .mockResolvedValueOnce(undefined); + + await expect(scanAndCleanupDanglingImages()).resolves.toBeUndefined(); + expect(mockImageRemove).toHaveBeenCalledTimes(2); + expect(mockLogger.debug).toHaveBeenCalledWith( + '[DanglingImageCleanup] Dangling image already gone:', + expect.objectContaining({ imageId: 'sha256:gone' }), + ); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + + it('captures unexpected errors to Sentry under tag dangling_image_remove and continues', async () => { + mockDockerListImages.mockResolvedValue([ + makeImageSummary('sha256:bad', 1_000), + makeImageSummary('sha256:ok', 2_000), + ]); + mockImageRemove + .mockRejectedValueOnce(makeDockerError(500, 'internal docker error')) + .mockResolvedValueOnce(undefined); + + await expect(scanAndCleanupDanglingImages()).resolves.toBeUndefined(); + + expect(mockLogger.warn).toHaveBeenCalledWith( + '[DanglingImageCleanup] Failed to remove dangling image:', + expect.objectContaining({ imageId: 'sha256:bad' }), + ); + expect(mockCaptureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ + tags: expect.objectContaining({ source: 'dangling_image_remove' }), + }), + ); + // Loop must continue after the failed image. + expect(mockImageRemove).toHaveBeenCalledTimes(2); + }); + + it('handles Docker listImages errors by logging at error level', async () => { + mockDockerListImages.mockRejectedValue(new Error('docker daemon down')); + + await expect(scanAndCleanupDanglingImages()).resolves.toBeUndefined(); + expect(mockLogger.error).toHaveBeenCalledWith( + '[DanglingImageCleanup] Failed to list dangling images:', + expect.any(Error), + ); + expect(mockCaptureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ + tags: expect.objectContaining({ source: 'dangling_image_cleanup_scan' }), + }), + ); + }); + }); + + describe('startDanglingImageCleanup / stopDanglingImageCleanup', () => { + it('starts a periodic cleanup scan', () => { + expect(() => startDanglingImageCleanup()).not.toThrow(); + }); + + it('logs a startup message naming the scan interval', () => { + startDanglingImageCleanup(); + expect(mockLogger.info).toHaveBeenCalledWith( + expect.stringContaining('[DanglingImageCleanup] Started'), + ); + }); + + it('is idempotent on multiple starts — second call warns and creates no second timer', () => { + startDanglingImageCleanup(); + startDanglingImageCleanup(); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('[DanglingImageCleanup] Cleanup already started'), + ); + }); + + it('stop is a no-op when not started', () => { + expect(() => stopDanglingImageCleanup()).not.toThrow(); + }); + + it('allows multiple start/stop cycles', () => { + expect(() => { + startDanglingImageCleanup(); + stopDanglingImageCleanup(); + startDanglingImageCleanup(); + stopDanglingImageCleanup(); + }).not.toThrow(); + }); + }); +});