diff --git a/.changeset/eight-colts-agree.md b/.changeset/eight-colts-agree.md new file mode 100644 index 0000000000..0484c8d6da --- /dev/null +++ b/.changeset/eight-colts-agree.md @@ -0,0 +1,28 @@ +--- +"@workflow/swc-playground-wasm": patch +"@workflow/swc-plugin": patch +"@workflow/world-postgres": patch +"@workflow/world-testing": patch +"@workflow/world-vercel": patch +"@workflow/world-local": patch +"@workflow/web-shared": patch +"@workflow/sveltekit": patch +"@workflow/builders": patch +"workflow": patch +"@workflow/errors": patch +"@workflow/rollup": patch +"@workflow/vitest": patch +"@workflow/astro": patch +"@workflow/nitro": patch +"@workflow/world": patch +"@workflow/core": patch +"@workflow/nest": patch +"@workflow/next": patch +"@workflow/nuxt": patch +"@workflow/vite": patch +"@workflow/cli": patch +"@workflow/web": patch +"@workflow/ai": patch +--- + +Add experimental rate limiting and flow concurrency control diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 768e857a2f..4f6d697db7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -565,6 +565,19 @@ jobs: DEPLOYMENT_URL: "http://localhost:${{ matrix.app.name == 'sveltekit' && '4173' || (matrix.app.name == 'astro' && '4321' || '3000') }}" NEXT_CANARY: ${{ matrix.app.canary && '1' || '' }} + - name: Run Low-Concurrency Worker-Slot Test + if: ${{ !matrix.app.canary && matrix.app.name == 'nextjs-turbopack' }} + run: | + cd "${{ steps.prepare-workbench.outputs.workbench_app_path }}" && PORT=3001 WORKFLOW_POSTGRES_WORKER_CONCURRENCY=1 pnpm start & + echo "starting low-concurrency tests in 10 seconds" && sleep 10 + pnpm vitest run packages/core/e2e/e2e.test.ts -t "frees worker slots for unrelated workflows while a waiter is blocked" + env: + NODE_OPTIONS: "--enable-source-maps" + APP_NAME: ${{ matrix.app.name }} + WORKBENCH_APP_PATH: ${{ steps.prepare-workbench.outputs.workbench_app_path }} + DEPLOYMENT_URL: "http://localhost:3001" + 
WORKFLOW_LIMITS_LOW_CONCURRENCY: "1" + - name: Generate E2E summary if: always() run: node .github/scripts/aggregate-e2e-results.js . --job-name "E2E Local Postgres (${{ matrix.app.name }})" >> $GITHUB_STEP_SUMMARY || true diff --git a/docs/lib/ai-agent-detection.ts b/docs/lib/ai-agent-detection.ts index be0a02a2e4..e4184e8ecf 100644 --- a/docs/lib/ai-agent-detection.ts +++ b/docs/lib/ai-agent-detection.ts @@ -18,84 +18,84 @@ // Layer 1: Known AI agent UA substrings (lowercase). const AI_AGENT_UA_PATTERNS = [ // Anthropic — https://support.claude.com/en/articles/8896518 - "claudebot", - "claude-searchbot", - "claude-user", - "anthropic-ai", - "claude-web", + 'claudebot', + 'claude-searchbot', + 'claude-user', + 'anthropic-ai', + 'claude-web', // OpenAI — https://platform.openai.com/docs/bots - "chatgpt", - "gptbot", - "oai-searchbot", - "openai", + 'chatgpt', + 'gptbot', + 'oai-searchbot', + 'openai', // Google AI - "gemini", - "bard", - "google-cloudvertexbot", - "google-extended", + 'gemini', + 'bard', + 'google-cloudvertexbot', + 'google-extended', // Meta - "meta-externalagent", - "meta-externalfetcher", - "meta-webindexer", + 'meta-externalagent', + 'meta-externalfetcher', + 'meta-webindexer', // Search/Research AI - "perplexity", - "youbot", - "you.com", - "deepseekbot", + 'perplexity', + 'youbot', + 'you.com', + 'deepseekbot', // Coding assistants - "cursor", - "github-copilot", - "codeium", - "tabnine", - "sourcegraph", + 'cursor', + 'github-copilot', + 'codeium', + 'tabnine', + 'sourcegraph', // Other AI agents / data scrapers (low-harm to serve markdown) - "cohere-ai", - "bytespider", - "amazonbot", - "ai2bot", - "diffbot", - "omgili", - "omgilibot", + 'cohere-ai', + 'bytespider', + 'amazonbot', + 'ai2bot', + 'diffbot', + 'omgili', + 'omgilibot', ]; // Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421). 
-const SIGNATURE_AGENT_DOMAINS = ["chatgpt.com"]; +const SIGNATURE_AGENT_DOMAINS = ['chatgpt.com']; // Layer 3: Traditional bot exclusion list — bots that should NOT trigger // the heuristic layer (they're search engine crawlers, social previews, or // monitoring tools, not AI agents). const TRADITIONAL_BOT_PATTERNS = [ - "googlebot", - "bingbot", - "yandexbot", - "baiduspider", - "duckduckbot", - "slurp", - "msnbot", - "facebot", - "twitterbot", - "linkedinbot", - "whatsapp", - "telegrambot", - "pingdom", - "uptimerobot", - "newrelic", - "datadog", - "statuspage", - "site24x7", - "applebot", + 'googlebot', + 'bingbot', + 'yandexbot', + 'baiduspider', + 'duckduckbot', + 'slurp', + 'msnbot', + 'facebot', + 'twitterbot', + 'linkedinbot', + 'whatsapp', + 'telegrambot', + 'pingdom', + 'uptimerobot', + 'newrelic', + 'datadog', + 'statuspage', + 'site24x7', + 'applebot', ]; // Broad regex for bot-like UA strings (used only in Layer 3 heuristic). const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i; -export type DetectionMethod = "ua-match" | "signature-agent" | "heuristic"; +export type DetectionMethod = 'ua-match' | 'signature-agent' | 'heuristic'; export interface DetectionResult { detected: boolean; @@ -111,36 +111,36 @@ export interface DetectionResult { export function isAIAgent(request: { headers: { get(name: string): string | null }; }): DetectionResult { - const userAgent = request.headers.get("user-agent"); + const userAgent = request.headers.get('user-agent'); // Layer 1: Known UA pattern match if (userAgent) { const lowerUA = userAgent.toLowerCase(); if (AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) { - return { detected: true, method: "ua-match" }; + return { detected: true, method: 'ua-match' }; } } // Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent) - const signatureAgent = request.headers.get("signature-agent"); + const signatureAgent = request.headers.get('signature-agent'); if (signatureAgent) { const 
lowerSig = signatureAgent.toLowerCase(); if (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) { - return { detected: true, method: "signature-agent" }; + return { detected: true, method: 'signature-agent' }; } } // Layer 3: Missing browser fingerprint heuristic // Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode // on navigation requests. Its absence signals a programmatic client. - const secFetchMode = request.headers.get("sec-fetch-mode"); + const secFetchMode = request.headers.get('sec-fetch-mode'); if (!secFetchMode && userAgent && BOT_LIKE_REGEX.test(userAgent)) { const lowerUA = userAgent.toLowerCase(); const isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) => lowerUA.includes(pattern) ); if (!isTraditionalBot) { - return { detected: true, method: "heuristic" }; + return { detected: true, method: 'heuristic' }; } } diff --git a/docs/proxy.ts b/docs/proxy.ts index 683a1f307c..02b2327970 100644 --- a/docs/proxy.ts +++ b/docs/proxy.ts @@ -59,13 +59,13 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { // AI agent detection — rewrite docs pages to markdown for agents // so they always get structured content without needing .md URLs or Accept headers if ( - (pathname === "/docs" || pathname.startsWith("/docs/")) && - !pathname.includes("/llms.mdx/") + (pathname === '/docs' || pathname.startsWith('/docs/')) && + !pathname.includes('/llms.mdx/') ) { const agentResult = isAIAgent(request); if (agentResult.detected && !isMarkdownPreferred(request)) { const result = - pathname === "/docs" + pathname === '/docs' ? 
`/${i18n.defaultLanguage}/llms.mdx` : rewriteLLM(pathname); @@ -73,10 +73,10 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { context.waitUntil( trackMdRequest({ path: pathname, - userAgent: request.headers.get("user-agent"), - referer: request.headers.get("referer"), - acceptHeader: request.headers.get("accept"), - requestType: "agent-rewrite", + userAgent: request.headers.get('user-agent'), + referer: request.headers.get('referer'), + acceptHeader: request.headers.get('accept'), + requestType: 'agent-rewrite', detectionMethod: agentResult.method, }) ); diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 618ded95b1..fd7fc12479 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -14,15 +14,17 @@ import { expect, test, } from 'vitest'; -import type { Run } from '../src/runtime'; +import { createLimitsRuntimeSuite } from '../../world-testing/src/limits-runtime.mts'; +import type { Run, StartOptions } from '../src/runtime.js'; import { + cancelRun, getHookByToken, getRun, getWorld, healthCheck, start as rawStart, resumeHook, -} from '../src/runtime'; +} from '../src/runtime.js'; import { cliCancel, cliHealthJson, @@ -49,10 +51,16 @@ if (!deploymentUrl) { * Tracked wrapper around start() that automatically registers runs * for diagnostics on test failure and observability metadata collection. */ -async function start( - ...args: Parameters> -): Promise> { - const run = await rawStart(...args); +type E2EWorkflowMetadata = Awaited>; + +async function start( + workflow: E2EWorkflowMetadata, + argsOrOptions?: unknown[] | StartOptions, + options?: StartOptions +): Promise> { + const run = Array.isArray(argsOrOptions) + ? 
await rawStart(workflow, argsOrOptions, options) + : await rawStart(workflow, argsOrOptions); trackRun(run); return run; } @@ -88,6 +96,20 @@ function writeE2EMetadata() { const e2e = (fn: string) => getWorkflowMetadata(deploymentUrl, 'workflows/99_e2e.ts', fn); +async function waitForRunLockAttempt(runId: string, timeoutMs = 10_000) { + const deadline = Date.now() + timeoutMs; + + while (Date.now() < deadline) { + const { data: events } = await getWorld().events.list({ runId }); + if (events.some((event) => event.eventType === 'lock_created')) { + return; + } + await sleep(50); + } + + throw new Error(`Timed out waiting for lock attempt on run ${runId}`); +} + /** * Triggers a workflow via HTTP POST. Used only for Pages Router tests * that specifically need to validate the HTTP trigger endpoint. @@ -220,11 +242,195 @@ describe('e2e', () => { const isNext = process.env.APP_NAME?.includes('nextjs'); const isLocal = deploymentUrl.includes('localhost'); + const isPostgresWorld = + process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres'; + const isLocalWorld = isLocalDeployment() && !isPostgresWorld; // only works with framework that transpiles react and // doesn't work on Vercel due to eval hack so react isn't // bundled in function const shouldSkipReactRenderTest = !(isNext && isLocal); + if (isLocalWorld || isPostgresWorld) { + createLimitsRuntimeSuite( + `limits runtime (${isPostgresWorld ? 
'postgres' : 'local'})`, + async () => ({ + async runWorkflowWithScopedLocks(userId) { + const run = await start(await e2e('workflowWithScopedLocks'), [ + userId, + ]); + return await run.returnValue; + }, + async runWorkflowLockContention(userId, holdMs) { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs]); + await sleep(100); + const runB = await start(workflow, [userId, holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runLockedStepCallContention( + key, + holdMs, + labelA = 'A', + labelB = 'B' + ) { + const workflow = await e2e('lockedStepCallContentionWorkflow'); + const runA = await start(workflow, [key, holdMs, labelA]); + await sleep(100); + const runB = await start(workflow, [key, holdMs, labelB]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowLockAcrossSuspension(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('workflowLeakedLockWorkflow'); + const waiterWorkflow = await e2e( + 'workflowOnlyLockContentionWorkflow' + ); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [userId, 0, 'B']); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runWorkflowTerminalHolderRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('workflowLeakedLockWorkflow'); + const waiterWorkflow = await e2e( + 'workflowOnlyLockContentionWorkflow' + ); + const leakedRun = await 
start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [userId, 0, 'B']); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runLeakedKeyExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('leakedKeyLockWorkflow'); + const waiterWorkflow = await e2e('lockedStepCallContentionWorkflow'); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [ + leakedResult.key, + 0, + 'B', + ]); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runWorkflowMixedLimitContention(userId, holdMs, periodMs) { + const workflow = await e2e('workflowMixedLimitContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, periodMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runReleasedRateLimitReplay(userId, periodMs, sleepMs) { + const workflow = await e2e('releasedRateLimitReplayWorkflow'); + const run = await start(workflow, [userId, periodMs, sleepMs]); + return await run.returnValue; + }, + async runWorkflowFifoThreeWaiters(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await waitForRunLockAttempt(runB.runId); + const runC = await start(workflow, [userId, holdMs, 'C']); + return await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + }, + async runCancelledWorkflowWaiter(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA 
= await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await waitForRunLockAttempt(runB.runId); + await cancelRun(getWorld(), runB.runId); + const runC = await start(workflow, [userId, holdMs, 'C']); + const [cancelledError, resultA, resultC] = await Promise.all([ + runB.returnValue.catch((error) => error), + runA.returnValue, + runC.returnValue, + ]); + return { cancelledError, resultA, resultC }; + }, + async runIndependentWorkflowKeys(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, ['user-a', holdMs]); + await sleep(100); + const runB = await start(workflow, ['user-b', holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runIndependentStepKeys(holdMs) { + const workflow = await e2e('lockedStepCallContentionWorkflow'); + const runA = await start(workflow, [ + 'step:db:isolation:a', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'step:db:isolation:b', + holdMs, + 'B', + ]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runBlockedWaiterWithUnrelatedWorkflow(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'B', + ]); + await sleep(100); + const runC = await start(workflow, [ + 'worker-slot-unrelated', + Math.max(100, Math.floor(holdMs / 4)), + 'C', + ]); + + const [holder, waiter, unrelated] = await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + return { holder, waiter, unrelated }; + }, + async runWorkflowSingleLockAcrossMultipleSteps(holdMs) { + const workflow = await e2e('singleLockAcrossMultipleStepsWorkflow'); + const run = await start(workflow, ['step:db:batch', holdMs]); + 
return await run.returnValue; + }, + }) + ); + } + test.skipIf(shouldSkipReactRenderTest)( 'should work with react rendering in step', async () => { @@ -1969,7 +2175,7 @@ describe('e2e', () => { // Cancel the run using the core runtime cancelRun function. // This exercises the same cancelRun code path that the CLI uses // (the CLI delegates directly to this function). - const { cancelRun } = await import('../src/runtime'); + const { cancelRun } = await import('../src/runtime.js'); await cancelRun(getWorld(), run.runId); // Verify the run was cancelled - returnValue should throw WorkflowRunCancelledError diff --git a/packages/core/src/async-deserialization-ordering.test.ts b/packages/core/src/async-deserialization-ordering.test.ts index 0774b7d9d8..463a661ec0 100644 --- a/packages/core/src/async-deserialization-ordering.test.ts +++ b/packages/core/src/async-deserialization-ordering.test.ts @@ -36,6 +36,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/global.ts b/packages/core/src/global.ts index 3dd5c52ac8..6891e0a761 100644 --- a/packages/core/src/global.ts +++ b/packages/core/src/global.ts @@ -28,10 +28,17 @@ export interface WaitInvocationQueueItem { hasCreatedEvent?: boolean; } +export interface LimitWaitInvocationQueueItem { + type: 'limit_wait'; + correlationId: string; + resumeAt: Date; +} + export type QueueItem = | StepInvocationQueueItem | HookInvocationQueueItem - | WaitInvocationQueueItem; + | WaitInvocationQueueItem + | LimitWaitInvocationQueueItem; /** * An error that is thrown when one or more operations (steps/hooks/etc.) 
are called but do @@ -61,7 +68,9 @@ export class WorkflowSuspension extends Error { else if (item.type === 'hook') { if (item.disposed) hookDisposedCount++; else hookCount++; - } else if (item.type === 'wait') waitCount++; + } else if (item.type === 'wait' || item.type === 'limit_wait') { + waitCount++; + } } // Build description parts diff --git a/packages/core/src/hook-sleep-interaction.test.ts b/packages/core/src/hook-sleep-interaction.test.ts index a706628b81..9ec1bca88d 100644 --- a/packages/core/src/hook-sleep-interaction.test.ts +++ b/packages/core/src/hook-sleep-interaction.test.ts @@ -42,6 +42,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => promiseQueueHolder.current, }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 1d969aeaa6..413f87fa74 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -25,6 +25,12 @@ export { type WebhookOptions, } from './create-hook.js'; export { defineHook, type TypedHook } from './define-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from './lock.js'; export { sleep } from './sleep.js'; export { getStepMetadata, diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts new file mode 100644 index 0000000000..0b63b96fc5 --- /dev/null +++ b/packages/core/src/lock.test.ts @@ -0,0 +1,66 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { + lock, + LIMITS_NOT_IMPLEMENTED_MESSAGE, + LOCK_WORKFLOW_ONLY_MESSAGE, +} from './lock.js'; +import { contextStorage } from './step/context-storage.js'; +import { WORKFLOW_HAS_STEP_CONTEXT, WORKFLOW_LOCK } from './symbols.js'; + +afterEach(() => { + delete (globalThis as any)[WORKFLOW_LOCK]; + 
(globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] = () => + contextStorage.getStore() !== undefined; +}); + +describe('lock', () => { + it('throws when called outside workflow or step execution context', async () => { + await expect( + lock({ + key: 'workflow:user:test', + concurrency: { max: 1 }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + }); + + it('prefers the workflow runtime lock when both runtimes are present', async () => { + const workflowHandle = { leaseId: 'lease_workflow' }; + const workflowLock = vi.fn().mockResolvedValue(workflowHandle); + (globalThis as any)[WORKFLOW_LOCK] = workflowLock; + const options = { + key: 'workflow:user:test', + concurrency: { max: 1 }, + }; + + await expect(lock(options)).resolves.toBe(workflowHandle); + expect(workflowLock).toHaveBeenCalledWith(options); + }); + + it('throws a workflow-only error when called inside a step context', async () => { + const options = { + key: 'step:db:cheap', + concurrency: { max: 2 }, + }; + + await expect( + contextStorage.run( + { + stepMetadata: { + stepId: 'step_test', + stepName: 'testStep', + stepStartedAt: new Date(), + attempt: 1, + }, + workflowMetadata: { + workflowName: 'testWorkflow', + workflowRunId: 'wrun_test', + workflowStartedAt: new Date(), + url: 'http://localhost:3000', + }, + ops: [], + }, + () => lock(options) + ) + ).rejects.toThrow(LOCK_WORKFLOW_ONLY_MESSAGE); + }); +}); diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts new file mode 100644 index 0000000000..9419bfb475 --- /dev/null +++ b/packages/core/src/lock.ts @@ -0,0 +1,56 @@ +import { + createLimitsNotImplementedError, + type LimitDefinition, + type LimitKey, + type LimitLease, +} from '@workflow/world'; +import { WORKFLOW_HAS_STEP_CONTEXT, WORKFLOW_LOCK } from './symbols.js'; + +export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; + +export const LOCK_WORKFLOW_ONLY_MESSAGE = + '`lock()` is only supported in workflow functions. 
Wrap the step call with `await using` in workflow code.'; + +/** + * Reserved first-pass user-facing API for future flow concurrency and rate + * limiting inside workflow functions. + */ +export interface LockOptions extends LimitDefinition { + key: LimitKey; + leaseTtlMs?: number; +} + +/** + * Reserved handle shape for future lock acquisition. + */ +export interface LockHandle + extends Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' + > { + dispose(): Promise; + heartbeat(ttlMs?: number): Promise; + [Symbol.asyncDispose](): Promise; +} + +/** + * Reserved workflow API for future concurrency and rate limiting. + */ +export async function lock(options: LockOptions): Promise { + const workflowLock = (globalThis as any)[WORKFLOW_LOCK] as + | ((options: LockOptions) => Promise) + | undefined; + + if (workflowLock) { + return workflowLock(options); + } + + const hasStepContext = (globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] as + | (() => boolean) + | undefined; + if (hasStepContext?.()) { + throw new Error(LOCK_WORKFLOW_ONLY_MESSAGE); + } + + throw createLimitsNotImplementedError(); +} diff --git a/packages/core/src/private.ts b/packages/core/src/private.ts index 97b028b018..7633ec7e9c 100644 --- a/packages/core/src/private.ts +++ b/packages/core/src/private.ts @@ -114,9 +114,12 @@ export { __private_getClosureVars } from './step/get-closure-vars.js'; export interface WorkflowOrchestratorContext { runId: string; + lockPreApproval?: string; encryptionKey: CryptoKey | undefined; globalThis: typeof globalThis; + advanceTimestamp: (timestamp: number) => void; eventsConsumer: EventsConsumer; + nextLockIndex: number; /** * Map of pending invocations keyed by correlationId. * Using Map instead of Array for O(1) lookup/delete operations. 
diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index da7a407bd3..7005404781 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -103,6 +103,7 @@ export function workflowEntrypoint( const { runId, + lockPreApproval, traceCarrier: traceContext, requestedAt, } = WorkflowInvokePayloadSchema.parse(message_); @@ -366,7 +367,8 @@ export function workflowEntrypoint( workflowCode, workflowRun, events, - encryptionKey + encryptionKey, + lockPreApproval ); } ); diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index aefac8aee3..411a6acc8e 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -5,9 +5,14 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; const { capturedHandlerRef, mockEventsCreate, + mockEventsListByCorrelationId, + mockLimitsAcquire, + mockLimitsHeartbeat, + mockLimitsRelease, mockQueue, mockRuntimeLogger, mockStepLogger, + mockStepGet, mockQueueMessage, mockStepFn, } = vi.hoisted(() => { @@ -19,6 +24,14 @@ const { current: null as null | ((...args: unknown[]) => Promise), }, mockEventsCreate: vi.fn(), + mockEventsListByCorrelationId: vi.fn().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }), + mockLimitsAcquire: vi.fn(), + mockLimitsHeartbeat: vi.fn(), + mockLimitsRelease: vi.fn().mockResolvedValue(undefined), mockQueue: vi.fn().mockResolvedValue({ messageId: 'msg_test' }), mockRuntimeLogger: { warn: vi.fn(), @@ -33,6 +46,16 @@ const { error: vi.fn(), }, mockQueueMessage: vi.fn().mockResolvedValue(undefined), + mockStepGet: vi.fn().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }), mockStepFn, }; }); @@ -48,7 +71,18 @@ vi.mock('@vercel/functions', () => ({ // Mock the world module - createQueueHandler 
captures the handler vi.mock('./world.js', () => ({ getWorld: vi.fn(() => ({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + }, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), })), @@ -204,9 +238,38 @@ describe('step-handler 409 handling', () => { mockStepFn.mockReset().mockResolvedValue('step-result'); mockStepFn.maxRetries = 3; mockQueueMessage.mockResolvedValue(undefined); + mockEventsListByCorrelationId.mockReset().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }); + mockLimitsAcquire.mockReset(); + mockLimitsHeartbeat.mockReset(); + mockLimitsRelease.mockReset().mockResolvedValue(undefined); + mockStepGet.mockReset().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }); // Re-set getWorld mock since clearAllMocks resets it vi.mocked(getWorld).mockReturnValue({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + }, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), } as any); @@ -228,6 +291,14 @@ describe('step-handler 409 handling', () => { vi.restoreAllMocks(); }); + it('does not call limits for ordinary step execution without lock()', async () => { + await capturedHandler(createMessage(), createMetadata('myStep')); + + expect(mockLimitsAcquire).not.toHaveBeenCalled(); + expect(mockLimitsHeartbeat).not.toHaveBeenCalled(); + expect(mockLimitsRelease).not.toHaveBeenCalled(); + }); + 
describe('step_completed 409', () => { it('should warn and return when step_completed gets a 409', async () => { // step_started succeeds, step function succeeds, step_completed returns 409 diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index 822f632a92..9795395316 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -176,7 +176,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // - Step not in terminal state (returns 409) // - retryAfter timestamp reached (returns 425 with Retry-After header) // - Workflow still active (returns 410 if completed) - let step; + let step: Awaited>; try { const startResult = await world.events.create( workflowRunId, diff --git a/packages/core/src/runtime/suspension-handler.ts b/packages/core/src/runtime/suspension-handler.ts index 20b6a815fe..3ff1028c53 100644 --- a/packages/core/src/runtime/suspension-handler.ts +++ b/packages/core/src/runtime/suspension-handler.ts @@ -15,6 +15,7 @@ import { import { importKey } from '../encryption.js'; import type { HookInvocationQueueItem, + LimitWaitInvocationQueueItem, StepInvocationQueueItem, WaitInvocationQueueItem, WorkflowSuspension, @@ -83,6 +84,9 @@ export async function handleSuspension({ const waitItems = suspension.steps.filter( (item): item is WaitInvocationQueueItem => item.type === 'wait' ); + const limitWaitItems = suspension.steps.filter( + (item): item is LimitWaitInvocationQueueItem => item.type === 'limit_wait' + ); // Split hooks by what actions they need const hooksNeedingCreation = allHookItems.filter( @@ -313,6 +317,38 @@ export async function handleSuspension({ } } + // Lock waits: schedule a delayed workflow replay keyed by correlationId so a + // later immediate wake-up can replace it. + for (const queueItem of limitWaitItems) { + ops.push( + (async () => { + /* + Lock waits are runtime control flow, not user-visible wait events. 
+ We only enqueue a fallback replay here; promoted waiters can replace it. + */ + const delayMs = Math.max( + 1000, + queueItem.resumeAt.getTime() - Date.now() + ); + const traceCarrier = await serializeTraceCarrier(); + await queueMessage( + world, + `__wkf_workflow_${workflowName}`, + { + runId, + traceCarrier, + requestedAt: new Date(), + }, + { + delaySeconds: Math.ceil(delayMs / 1000), + idempotencyKey: queueItem.correlationId, + headers: extractTraceHeaders(traceCarrier), + } + ); + })() + ); + } + // Wait for all step and wait operations to complete waitUntil( Promise.all(ops).catch((opErr) => { diff --git a/packages/core/src/serialization.ts b/packages/core/src/serialization.ts index 3ea6939c76..4f011f069d 100644 --- a/packages/core/src/serialization.ts +++ b/packages/core/src/serialization.ts @@ -570,8 +570,7 @@ export class WorkflowServerWritableStream extends WritableStream { // unsettled promise because the cleared timer will never fire. const waiters = flushWaiters; flushWaiters = []; - const abortError = - reason ?? new Error("Stream aborted"); + const abortError = reason ?? 
new Error('Stream aborted'); for (const w of waiters) w.reject(abortError); }, }); diff --git a/packages/core/src/step.test.ts b/packages/core/src/step.test.ts index d6f7f0fb43..b8216faa89 100644 --- a/packages/core/src/step.test.ts +++ b/packages/core/src/step.test.ts @@ -27,6 +27,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), // All generated ulids use the workflow's started at time generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/step.ts b/packages/core/src/step.ts index bd45c3008c..33e544d19e 100644 --- a/packages/core/src/step.ts +++ b/packages/core/src/step.ts @@ -96,7 +96,7 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Finished; } queueItem.hasCreatedEvent = true; - // Continue waiting for step_started/step_completed/step_failed events + // Continue waiting for later step lifecycle events. 
return EventConsumerResult.Consumed; } diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index 2a9aa8b7e1..6b4ae846d3 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,5 +1,6 @@ import { AsyncLocalStorage } from 'node:async_hooks'; import type { CryptoKey } from '../encryption.js'; +import { WORKFLOW_HAS_STEP_CONTEXT } from '../symbols.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,3 +11,6 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ closureVars?: Record; encryptionKey?: CryptoKey; }>(); + +(globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] = () => + contextStorage.getStore() !== undefined; diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index 92df4058db..c9842d22e4 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -1,7 +1,11 @@ export const WORKFLOW_USE_STEP = Symbol.for('WORKFLOW_USE_STEP'); export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); +export const WORKFLOW_LOCK = Symbol.for('WORKFLOW_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); +export const WORKFLOW_HAS_STEP_CONTEXT = Symbol.for( + 'WORKFLOW_HAS_STEP_CONTEXT' +); export const WORKFLOW_GET_STREAM_ID = Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); export const STREAM_NAME_SYMBOL = Symbol.for('WORKFLOW_STREAM_NAME'); diff --git a/packages/core/src/workflow.test.ts b/packages/core/src/workflow.test.ts index 2d9c9a65d0..0253d95b04 100644 --- a/packages/core/src/workflow.test.ts +++ b/packages/core/src/workflow.test.ts @@ -1,6 +1,10 @@ import { types } from 'node:util'; import { HookConflictError, WorkflowRuntimeError } from '@workflow/errors'; -import type { Event, 
WorkflowRun } from '@workflow/world'; +import { + LIMITS_NOT_IMPLEMENTED_MESSAGE, + type Event, + type WorkflowRun, +} from '@workflow/world'; import { assert, describe, expect, it, vi } from 'vitest'; import type { WorkflowSuspension } from './global.js'; import { @@ -147,6 +151,46 @@ describe('runWorkflow', () => { }); }); + it('keeps lock() unsupported in the workflow vm on Vercel', async () => { + vi.stubEnv('VERCEL_URL', 'workflow.vercel.app'); + + try { + const ops: Promise[] = []; + const workflowCode = ` + const lock = globalThis[Symbol.for("WORKFLOW_LOCK")]; + async function workflow() { + await lock({ + key: 'workflow:user:test', + concurrency: { max: 1 }, + }); + } + ${getWorkflowTransformCode('workflow')} + `; + + const workflowRun: WorkflowRun = { + runId: 'wrun_123', + workflowName: 'workflow', + status: 'running', + input: await dehydrateWorkflowArguments( + [], + 'wrun_123', + noEncryptionKey, + ops + ), + createdAt: new Date('2024-01-01T00:00:00.000Z'), + updatedAt: new Date('2024-01-01T00:00:00.000Z'), + startedAt: new Date('2024-01-01T00:00:00.000Z'), + deploymentId: 'test-deployment', + }; + + await expect( + runWorkflow(workflowCode, workflowRun, [], noEncryptionKey) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + } finally { + vi.unstubAllEnvs(); + } + }); + it('should resolve a step that has a `step_completed` event', async () => { const ops: Promise[] = []; const workflowRunId = 'wrun_123'; diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index fdbe341b29..725c27f668 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -7,7 +7,11 @@ import { import { withResolvers } from '@workflow/utils'; import { getPort } from '@workflow/utils/get-port'; import { parseWorkflowName } from '@workflow/utils/parse-name'; -import type { Event, WorkflowRun } from '@workflow/world'; +import { + createLimitsNotImplementedError, + type Event, + type WorkflowRun, +} from '@workflow/world'; import * as 
nanoid from 'nanoid'; import { monotonicFactory } from 'ulid'; import type { CryptoKey } from './encryption.js'; @@ -26,6 +30,7 @@ import { STABLE_ULID, WORKFLOW_CREATE_HOOK, WORKFLOW_GET_STREAM_ID, + WORKFLOW_LOCK, WORKFLOW_SLEEP, WORKFLOW_USE_STEP, } from './symbols.js'; @@ -36,6 +41,7 @@ import { createContext } from './vm/index.js'; import type { WorkflowMetadata } from './workflow/get-workflow-metadata.js'; import { WORKFLOW_CONTEXT_SYMBOL } from './workflow/get-workflow-metadata.js'; import { createCreateHook } from './workflow/hook.js'; +import { createLock } from './workflow/lock.js'; import { createSleep } from './workflow/sleep.js'; /** @@ -80,7 +86,8 @@ export async function runWorkflow( workflowCode: string, workflowRun: WorkflowRun, events: Event[], - encryptionKey: CryptoKey | undefined + encryptionKey: CryptoKey | undefined, + lockPreApproval?: string ): Promise { return trace(`workflow.run ${workflowRun.workflowName}`, async (span) => { span?.setAttributes({ @@ -137,10 +144,15 @@ export async function runWorkflow( const workflowContext: WorkflowOrchestratorContext = { runId: workflowRun.runId, + lockPreApproval, encryptionKey, globalThis: vmGlobalThis, + advanceTimestamp: (timestamp) => { + updateTimestamp(Math.max(timestamp, vmGlobalThis.Date.now())); + }, onWorkflowError: workflowDiscontinuation.reject, eventsConsumer, + nextLockIndex: 0, generateUlid: () => ulid(+startedAt), generateNanoid, invocationsQueue: new Map(), @@ -188,6 +200,11 @@ export async function runWorkflow( const useStep = createUseStep(workflowContext); const createHook = createCreateHook(workflowContext); + const lock = isVercel + ? 
async () => { + throw createLimitsNotImplementedError(); + } + : createLock(workflowContext); const sleep = createSleep(workflowContext); // @ts-expect-error - `@types/node` says symbol is not valid, but it does work @@ -195,6 +212,8 @@ export async function runWorkflow( // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_CREATE_HOOK] = createHook; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work + vmGlobalThis[WORKFLOW_LOCK] = lock; + // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_SLEEP] = sleep; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_GET_STREAM_ID] = (namespace?: string) => diff --git a/packages/core/src/workflow/hook.test.ts b/packages/core/src/workflow/hook.test.ts index afea9b35d1..0dfaeb6458 100644 --- a/packages/core/src/workflow/hook.test.ts +++ b/packages/core/src/workflow/hook.test.ts @@ -28,6 +28,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/workflow/index.ts b/packages/core/src/workflow/index.ts index 61cc317491..86807ed04b 100644 --- a/packages/core/src/workflow/index.ts +++ b/packages/core/src/workflow/index.ts @@ -6,6 +6,12 @@ export { type RetryableErrorOptions, } from '@workflow/errors'; export type { Hook, HookOptions } from '../create-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from '../lock.js'; export { sleep } from '../sleep.js'; export { createHook, createWebhook } from './create-hook.js'; export { defineHook } from './define-hook.js'; diff --git a/packages/core/src/workflow/lock.test.ts 
b/packages/core/src/workflow/lock.test.ts new file mode 100644 index 0000000000..0142944ac6 --- /dev/null +++ b/packages/core/src/workflow/lock.test.ts @@ -0,0 +1,441 @@ +import { TooEarlyError, WorkflowRuntimeError } from '@workflow/errors'; +import { withResolvers } from '@workflow/utils'; +import type { Event, EventResult, LimitLease } from '@workflow/world'; +import { + createLockCorrelationId, + createLockWakeCorrelationId, +} from '@workflow/world'; +import * as nanoid from 'nanoid'; +import { monotonicFactory } from 'ulid'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { EventsConsumer } from '../events-consumer.js'; +import { WorkflowSuspension } from '../global.js'; +import type { WorkflowOrchestratorContext } from '../private.js'; +import { setWorld } from '../runtime/world.js'; +import { createContext } from '../vm/index.js'; +import { createLock } from './lock.js'; +import { createSleep } from './sleep.js'; + +function createLease(): LimitLease { + return { + leaseId: 'lmt_lease', + key: 'workflow:user:test', + lockId: 'wrun_test:0', + runId: 'wrun_test', + lockIndex: 0, + acquiredAt: new Date('2025-01-01T00:00:00.000Z'), + expiresAt: new Date('2027-01-01T00:00:00.000Z'), + definition: { + concurrency: { max: 1 }, + }, + }; +} + +function setupWorkflowContext( + events: Event[], + options?: { onUnconsumedEvent?: (event: Event) => void } +): WorkflowOrchestratorContext { + const context = createContext({ + seed: 'test', + fixedTimestamp: 1753481739458, + }); + const ulid = monotonicFactory(() => context.globalThis.Math.random()); + const workflowStartedAt = context.globalThis.Date.now(); + const promiseQueueHolder = { current: Promise.resolve() }; + const workflowContext: WorkflowOrchestratorContext = { + runId: 'wrun_test', + lockPreApproval: undefined, + encryptionKey: undefined, + globalThis: context.globalThis, + advanceTimestamp: vi.fn(), + eventsConsumer: new EventsConsumer(events, { + onUnconsumedEvent: 
options?.onUnconsumedEvent ?? (() => {}), + getPromiseQueue: () => promiseQueueHolder.current, + }), + nextLockIndex: 0, + invocationsQueue: new Map(), + generateUlid: () => ulid(workflowStartedAt), + generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => + new Uint8Array(size).map(() => 256 * context.globalThis.Math.random()) + ), + onWorkflowError: vi.fn(), + pendingDeliveries: 0, + }; + Object.defineProperty(workflowContext, 'promiseQueue', { + get() { + return promiseQueueHolder.current; + }, + set(value: Promise) { + promiseQueueHolder.current = value; + }, + enumerable: true, + configurable: true, + }); + workflowContext.promiseQueue = Promise.resolve(); + return workflowContext; +} + +function asEventResult(event: Event): EventResult { + return { event }; +} + +afterEach(() => { + setWorld(undefined as any); + vi.restoreAllMocks(); +}); + +describe('createLock', () => { + it('creates and immediately acquires a fresh lock via world events', async () => { + const lease = createLease(); + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease }, + createdAt: new Date(), + }) + ) + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_release', + runId: 'wrun_test', + eventType: 'lock_release', + correlationId: createLockCorrelationId('wrun_test', 0), + createdAt: new Date(), + }) + ); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const ctx = setupWorkflowContext([]); + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + expect(createEvent).toHaveBeenNthCalledWith( + 1, + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_created', + correlationId: 
createLockCorrelationId('wrun_test', 0), + }) + ); + expect(handle.leaseId).toBe(lease.leaseId); + + await handle.dispose(); + + expect(createEvent).toHaveBeenNthCalledWith( + 2, + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_release', + correlationId: createLockCorrelationId('wrun_test', 0), + }) + ); + }); + + it('replays a rate-only lock from lock_acquired without creating new events', async () => { + const lease = { + ...createLease(), + key: 'workflow:rate:test', + definition: { + rate: { count: 1, periodMs: 60_000 }, + }, + }; + const createEvent = vi.fn(); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + rate: { count: 1, periodMs: 60_000 }, + }); + + expect(createEvent).not.toHaveBeenCalled(); + expect(handle.leaseId).toBe(lease.leaseId); + }); + + it('ignores an expired lock_acquired event and reacquires the lease', async () => { + const expiredLease = { + ...createLease(), + expiresAt: new Date('2025-01-01T00:00:00.000Z'), + }; + const freshLease = { + ...createLease(), + leaseId: 'lmt_fresh', + expiresAt: new Date('2027-06-01T00:00:00.000Z'), + }; + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired_fresh', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease: freshLease }, + createdAt: new Date(), + }) + ); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = 
createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_created', + runId: 'wrun_test', + eventType: 'lock_created', + correlationId, + eventData: { + key: expiredLease.key, + definition: expiredLease.definition, + leaseTtlMs: 1_000, + }, + createdAt: new Date(), + }, + { + eventId: 'evnt_lock_acquired_expired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease: expiredLease }, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: expiredLease.key, + concurrency: { max: 1 }, + }); + + expect(createEvent).toHaveBeenCalledTimes(1); + expect(createEvent).toHaveBeenCalledWith( + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_acquired', + correlationId, + }) + ); + expect(handle.leaseId).toBe(freshLease.leaseId); + }); + + it('replays a released scope as a no-op without double-releasing', async () => { + const lease = createLease(); + const createEvent = vi.fn(); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new Date(), + }, + { + eventId: 'evnt_lock_release', + runId: 'wrun_test', + eventType: 'lock_release', + correlationId, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + await handle.dispose(); + + expect(createEvent).not.toHaveBeenCalled(); + }); + + it('re-suspends when a stale lock wake-up becomes too early again', async () => { + const now = Date.parse('2026-03-31T03:50:29.624Z'); + const retryAfterSeconds = 30; + const retryAfter = new Date(now + retryAfterSeconds * 
1000); + vi.spyOn(Date, 'now').mockReturnValue(now); + const createEvent = vi + .fn<() => Promise>() + .mockRejectedValueOnce( + new TooEarlyError('not ready yet', { retryAfter: retryAfterSeconds }) + ); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_created', + runId: 'wrun_test', + eventType: 'lock_created', + correlationId, + eventData: { + key: 'workflow:rate:test', + definition: { rate: { count: 1, periodMs: 60_000 } }, + acquireAt: new Date(Date.now() - 1_000), + }, + createdAt: new Date(), + }, + ]); + const errorReceived = withResolvers(); + ctx.onWorkflowError = errorReceived.resolve; + + const lock = createLock(ctx); + void lock({ + key: 'workflow:rate:test', + rate: { count: 1, periodMs: 60_000 }, + }); + + const workflowError = await errorReceived.promise; + expect(workflowError).toBeInstanceOf(WorkflowSuspension); + expect(createEvent).toHaveBeenCalledTimes(1); + const waitItem = ctx.invocationsQueue.get( + createLockWakeCorrelationId('wrun_test', 0) + ); + expect(waitItem).toMatchObject({ + type: 'limit_wait', + correlationId: createLockWakeCorrelationId('wrun_test', 0), + resumeAt: retryAfter, + }); + }); + + it('does not orphan wait_created when a replayed lock is immediately followed by sleep', async () => { + const lease = createLease(); + const createEvent = vi.fn(); + const tempCtx = setupWorkflowContext([]); + const waitCorrelationId = `wait_${tempCtx.generateUlid()}`; + const onUnconsumedEvent = vi.fn(); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext( + [ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new 
Date('2025-01-01T00:00:00.000Z'), + }, + { + eventId: 'evnt_wait_created', + runId: 'wrun_test', + eventType: 'wait_created', + correlationId: waitCorrelationId, + eventData: { + resumeAt: new Date('2025-01-01T00:00:01.000Z'), + }, + createdAt: new Date('2025-01-01T00:00:00.010Z'), + }, + { + eventId: 'evnt_wait_completed', + runId: 'wrun_test', + eventType: 'wait_completed', + correlationId: waitCorrelationId, + createdAt: new Date('2025-01-01T00:00:01.000Z'), + }, + ], + { onUnconsumedEvent } + ); + const lock = createLock(ctx); + const sleep = createSleep(ctx); + + await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + await sleep(1_000); + await new Promise((resolve) => setTimeout(resolve, 150)); + + expect(createEvent).not.toHaveBeenCalled(); + expect(onUnconsumedEvent).not.toHaveBeenCalled(); + expect(ctx.onWorkflowError).not.toHaveBeenCalled(); + }); + + it('rejects heartbeat in workflow scope to preserve replay determinism', async () => { + const lease = createLease(); + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease }, + createdAt: new Date(), + }) + ); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const ctx = setupWorkflowContext([]); + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + await expect(handle.heartbeat()).rejects.toBeInstanceOf( + WorkflowRuntimeError + ); + await expect(handle.heartbeat()).rejects.toThrow( + 'Lock heartbeat is not supported in workflow functions yet' + ); + expect(heartbeat).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts new file mode 100644 index 0000000000..f18ead94ad --- 
/dev/null
+++ b/packages/core/src/workflow/lock.ts
@@ -0,0 +1,463 @@
+import {
+  EntityConflictError,
+  TooEarlyError,
+  WorkflowRuntimeError,
+} from '@workflow/errors';
+import { withResolvers } from '@workflow/utils';
+import {
+  type CreateEventRequest,
+  createLockCorrelationId,
+  createLockWakeCorrelationId,
+  type LimitDefinition,
+  type LimitLease,
+  SPEC_VERSION_CURRENT,
+} from '@workflow/world';
+import { EventConsumerResult } from '../events-consumer.js';
+import { WorkflowSuspension } from '../global.js';
+import type { LockHandle, LockOptions } from '../lock.js';
+import {
+  scheduleWhenIdle,
+  type WorkflowOrchestratorContext,
+} from '../private.js';
+import { getWorld } from '../runtime/world.js';
+
+const DEFAULT_LOCK_LEASE_TTL_MS = 24 * 60 * 60 * 1000;
+const LOCK_HEARTBEAT_UNSUPPORTED_MESSAGE =
+  'Lock heartbeat is not supported in workflow functions yet because it cannot be replayed deterministically.';
+
+type LockLeaseView = Pick<
+  LimitLease,
+  'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt'
+>;
+
+interface LockState {
+  correlationId: string;
+  wakeCorrelationId: string;
+  key: string;
+  leaseTtlMs: number;
+  definition: LimitDefinition;
+  acquireAt?: Date;
+  lease?: LockLeaseView;
+  hasCreatedEvent: boolean;
+  hasAcquiredEvent: boolean;
+  hasReleaseEvent: boolean;
+}
+
+function createSuspension(ctx: WorkflowOrchestratorContext) {
+  scheduleWhenIdle(ctx, () => {
+    ctx.onWorkflowError(
+      new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis)
+    );
+  });
+}
+
+function isLeaseLive(lease: Pick<LimitLease, 'expiresAt'>): boolean {
+  return (
+    lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now()
+  );
+}
+
+function getReleasedLeaseView(
+  ctx: WorkflowOrchestratorContext,
+  event: Extract<Event, { eventType: 'lock_release' }> | any
+): LockLeaseView | undefined {
+  const data = event.eventData;
+  if (!data?.leaseId || !data?.key || !data?.lockId) {
+    return undefined;
+  }
+
+  return {
+    leaseId: data.leaseId,
+    key: data.key,
+    lockId: data.lockId,
+    runId:
ctx.runId, + lockIndex: Number.parseInt(data.lockId.split(':').at(-1) ?? '0', 10), + expiresAt: undefined, + }; +} + +function createLockHandle( + state: LockState, + ctx: WorkflowOrchestratorContext +): LockHandle { + let disposed = false; + + const getLease = () => { + if (!state.lease) { + throw new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} is missing lease metadata` + ); + } + return state.lease; + }; + + const dispose = async () => { + if (disposed || state.hasReleaseEvent) { + return; + } + + disposed = true; + let eventCreatedAt: Date | undefined; + try { + const result = await getWorld().events.create(ctx.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: state.correlationId, + }); + eventCreatedAt = result.event?.createdAt; + } catch (error) { + if (EntityConflictError.is(error)) { + state.hasReleaseEvent = true; + return; + } + throw error; + } + + state.hasReleaseEvent = true; + if (eventCreatedAt) { + ctx.advanceTimestamp(+eventCreatedAt); + } + }; + + const heartbeat = async (ttlMs?: number) => { + if (state.hasReleaseEvent) return; + void ttlMs; + getLease(); + throw new WorkflowRuntimeError(LOCK_HEARTBEAT_UNSUPPORTED_MESSAGE); + }; + + const handle: LockHandle = { + get leaseId() { + return getLease().leaseId; + }, + get key() { + return getLease().key; + }, + get lockId() { + return getLease().lockId; + }, + get runId() { + return getLease().runId; + }, + get lockIndex() { + return getLease().lockIndex; + }, + get expiresAt() { + return getLease().expiresAt; + }, + dispose, + heartbeat, + [Symbol.asyncDispose]: dispose, + }; + + const vmAsyncDispose = ctx.globalThis.Symbol.asyncDispose; + if (vmAsyncDispose && vmAsyncDispose !== Symbol.asyncDispose) { + (handle as any)[vmAsyncDispose] = dispose; + } + + return handle; +} + +function createLockCreatedEvent( + state: LockState +): Extract { + return { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + 
correlationId: state.correlationId,
+    eventData: {
+      key: state.key,
+      definition: state.definition,
+      leaseTtlMs: state.leaseTtlMs,
+    },
+  };
+}
+
+function createLockAcquiredEvent(
+  state: LockState
+): Extract<CreateEventRequest, { eventType: 'lock_acquired' }> {
+  return {
+    eventType: 'lock_acquired',
+    specVersion: SPEC_VERSION_CURRENT,
+    correlationId: state.correlationId,
+  };
+}
+
+export function createLock(ctx: WorkflowOrchestratorContext) {
+  return async function lockImpl(options: LockOptions): Promise<LockHandle> {
+    const lockIndex = ctx.nextLockIndex++;
+    const state: LockState = {
+      correlationId: createLockCorrelationId(ctx.runId, lockIndex),
+      wakeCorrelationId: createLockWakeCorrelationId(ctx.runId, lockIndex),
+      key: options.key,
+      leaseTtlMs: options.leaseTtlMs ?? DEFAULT_LOCK_LEASE_TTL_MS,
+      definition: {
+        concurrency: options.concurrency,
+        rate: options.rate,
+      },
+      hasCreatedEvent: false,
+      hasAcquiredEvent: false,
+      hasReleaseEvent: false,
+    };
+
+    const { promise, resolve, reject } = withResolvers<LockHandle>();
+    let resolved = false;
+    let pendingRuntimeRequest = false;
+    let suspensionScheduled = false;
+
+    const resolveHandle = () => {
+      if (resolved) return;
+      resolved = true;
+      ctx.invocationsQueue.delete(state.wakeCorrelationId);
+      ctx.promiseQueue = ctx.promiseQueue.then(() => {
+        resolve(createLockHandle(state, ctx));
+      });
+    };
+
+    const suspendWorkflow = () => {
+      if (suspensionScheduled || resolved) {
+        return;
+      }
+      suspensionScheduled = true;
+      createSuspension(ctx);
+    };
+
+    const scheduleRateRetry = (acquireAt: Date) => {
+      ctx.invocationsQueue.set(state.wakeCorrelationId, {
+        type: 'limit_wait',
+        correlationId: state.wakeCorrelationId,
+        resumeAt: acquireAt,
+      });
+      suspendWorkflow();
+    };
+
+    const shouldAttemptAcquire = (acquireAt?: Date) => {
+      if (ctx.lockPreApproval === state.correlationId) {
+        return true;
+      }
+      if (!acquireAt) {
+        return false;
+      }
+      return acquireAt.getTime() <= Date.now();
+    };
+
+    const requestLockCreated = async () => {
+      try {
+        const result = await
getWorld().events.create( + ctx.runId, + createLockCreatedEvent(state) + ); + const event = result.event; + if (!event) { + throw new WorkflowRuntimeError( + `World did not return an event for lock ${state.correlationId}` + ); + } + + if (event.eventType === 'lock_acquired') { + if (!event.eventData?.lease) { + throw new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} acquisition is missing lease metadata` + ); + } + if (!isLeaseLive(event.eventData.lease)) { + state.hasCreatedEvent = true; + state.acquireAt = new Date(0); + suspendWorkflow(); + return; + } + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (event.eventType === 'lock_release') { + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + state.lease ??= getReleasedLeaseView(ctx, event); + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (event.eventType !== 'lock_created') { + throw new WorkflowRuntimeError( + `Unexpected event type for lock ${state.correlationId}: ${event.eventType}` + ); + } + + state.hasCreatedEvent = true; + ctx.advanceTimestamp(+event.createdAt); + if (event.eventData.acquireAt) { + state.acquireAt = event.eventData.acquireAt; + scheduleRateRetry(event.eventData.acquireAt); + return; + } + + state.acquireAt = undefined; + suspendWorkflow(); + } catch (error) { + reject(error); + } + }; + + const requestLockAcquired = async () => { + try { + const result = await getWorld().events.create( + ctx.runId, + createLockAcquiredEvent(state) + ); + const event = result.event; + if ( + !event || + (event.eventType !== 'lock_acquired' && + event.eventType !== 'lock_release') + ) { + throw new WorkflowRuntimeError( + `World did not acquire lock ${state.correlationId}` + ); + } + + if (event.eventType === 'lock_release') { + state.hasCreatedEvent = true; + 
state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + state.lease ??= getReleasedLeaseView(ctx, event); + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (!event.eventData?.lease) { + throw new WorkflowRuntimeError( + `World did not acquire lock ${state.correlationId}` + ); + } + if (!isLeaseLive(event.eventData.lease)) { + state.acquireAt = new Date(0); + suspendWorkflow(); + return; + } + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + } catch (error) { + if (TooEarlyError.is(error)) { + if (error.retryAfter) { + const acquireAt = new Date(Date.now() + error.retryAfter * 1000); + state.acquireAt = acquireAt; + scheduleRateRetry(acquireAt); + } else { + state.acquireAt = undefined; + suspendWorkflow(); + } + return; + } + reject(error); + } + }; + + const ensureRuntimeProgress = (acquireAt?: Date) => { + if (resolved || pendingRuntimeRequest) { + return; + } + + if (!state.hasCreatedEvent) { + pendingRuntimeRequest = true; + void requestLockCreated().finally(() => { + pendingRuntimeRequest = false; + }); + return; + } + + if (state.hasAcquiredEvent) { + resolveHandle(); + return; + } + + if (!shouldAttemptAcquire(acquireAt)) { + if (acquireAt) { + scheduleRateRetry(acquireAt); + } else { + suspendWorkflow(); + } + return; + } + + pendingRuntimeRequest = true; + void requestLockAcquired().finally(() => { + pendingRuntimeRequest = false; + }); + }; + + ctx.eventsConsumer.subscribe((event) => { + if (!event) { + ensureRuntimeProgress(state.acquireAt); + return EventConsumerResult.NotConsumed; + } + + if (event.correlationId !== state.correlationId) { + return EventConsumerResult.NotConsumed; + } + + if (event.eventType === 'lock_created') { + state.hasCreatedEvent = true; + state.acquireAt = event.eventData.acquireAt; + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'lock_acquired') { + if (!event.eventData?.lease) { + 
ctx.promiseQueue = ctx.promiseQueue.then(() => { + ctx.onWorkflowError( + new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} acquisition is missing lease metadata` + ) + ); + }); + return EventConsumerResult.Finished; + } + if (!isLeaseLive(event.eventData.lease)) { + state.hasCreatedEvent = true; + state.acquireAt = new Date(0); + return EventConsumerResult.Consumed; + } + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + resolveHandle(); + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'lock_release') { + state.lease ??= getReleasedLeaseView(ctx, event); + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + ctx.invocationsQueue.delete(state.wakeCorrelationId); + resolveHandle(); + return EventConsumerResult.Finished; + } + + if (event.eventType === 'lock_waiter_queued') { + return EventConsumerResult.Consumed; + } + + ctx.promiseQueue = ctx.promiseQueue.then(() => { + ctx.onWorkflowError( + new WorkflowRuntimeError( + `Unexpected event type for lock ${state.correlationId} "${event.eventType}"` + ) + ); + }); + return EventConsumerResult.Finished; + }); + + return promise; + }; +} diff --git a/packages/core/src/workflow/sleep.test.ts b/packages/core/src/workflow/sleep.test.ts index fe5387a134..54624332b4 100644 --- a/packages/core/src/workflow/sleep.test.ts +++ b/packages/core/src/workflow/sleep.test.ts @@ -33,6 +33,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { }, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/errors/src/index.ts b/packages/errors/src/index.ts index 1bde17381c..dad32d44c5 100644 --- a/packages/errors/src/index.ts +++ b/packages/errors/src/index.ts @@ -396,6 +396,30 @@ export class 
EntityConflictError extends WorkflowWorldError { } } +export class LimitDefinitionConflictError extends WorkflowError { + key: string; + existingDefinition: unknown; + requestedDefinition: unknown; + + constructor( + key: string, + existingDefinition: unknown, + requestedDefinition: unknown + ) { + super( + `Limit key "${key}" is already configured with a different definition` + ); + this.name = 'LimitDefinitionConflictError'; + this.key = key; + this.existingDefinition = existingDefinition; + this.requestedDefinition = requestedDefinition; + } + + static is(value: unknown): value is LimitDefinitionConflictError { + return isError(value) && value.name === 'LimitDefinitionConflictError'; + } +} + /** * Thrown when a run is no longer available — either because it has been * cleaned up, expired, or already reached a terminal state (completed/failed). diff --git a/packages/swc-playground-wasm/build.js b/packages/swc-playground-wasm/build.js index 2ac8b99a25..d9a41b1ee6 100644 --- a/packages/swc-playground-wasm/build.js +++ b/packages/swc-playground-wasm/build.js @@ -1,10 +1,16 @@ import { execSync } from 'node:child_process'; -import { existsSync } from 'node:fs'; +import { existsSync, mkdirSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; import { fileURLToPath } from 'node:url'; -function runCommand(command) { +function execCommand(command, options = {}) { + return execSync(command, { stdio: 'inherit', shell: true, ...options }); +} + +function runCommand(command, options = {}) { try { - execSync(command, { stdio: 'inherit', shell: true }); + execCommand(command, options); } catch (error) { console.error(`Command failed: ${command}: ${error}`); process.exit(1); @@ -52,31 +58,95 @@ function ensureRustup() { } } -console.log('Building swc-playground-wasm...'); - -ensureRustup(); +function sleepMs(ms) { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms); +} -// Check if wasm32-unknown-unknown target exists 
and install if needed -console.log('Checking wasm32-unknown-unknown target...'); -try { +function isRustTargetInstalled(target) { const installedTargets = execSync('rustup target list --installed', { stdio: 'pipe', shell: true, }).toString(); - if (!installedTargets.includes('wasm32-unknown-unknown')) { - console.log('wasm32-unknown-unknown target not found, installing...'); - runCommand('rustup target add wasm32-unknown-unknown'); - } else { - console.log('wasm32-unknown-unknown target already installed'); - } -} catch (error) { - console.error( - 'Failed to check/install wasm32-unknown-unknown target:', - error.message + return installedTargets.includes(target); +} + +function withTargetInstallLock(target, callback) { + const lockDir = path.join( + tmpdir(), + `workflow-rustup-target-${target.replaceAll(/[^a-z0-9_-]/gi, '-')}.lock` ); - process.exit(1); + const timeoutMs = 2 * 60 * 1000; + const startedAt = Date.now(); + + while (true) { + try { + mkdirSync(lockDir); + break; + } catch (error) { + if (error?.code !== 'EEXIST') { + throw error; + } + + if (Date.now() - startedAt > timeoutMs) { + throw new Error( + `Timed out waiting for rustup target install lock for ${target}` + ); + } + + console.log( + `Another process is installing ${target}; waiting for the lock...` + ); + sleepMs(1000); + } + } + + try { + return callback(); + } finally { + rmSync(lockDir, { recursive: true, force: true }); + } } +function ensureRustTarget(target) { + console.log(`Checking ${target} target...`); + + try { + if (isRustTargetInstalled(target)) { + console.log(`${target} target already installed`); + return; + } + + withTargetInstallLock(target, () => { + if (isRustTargetInstalled(target)) { + console.log(`${target} target was installed by another process`); + return; + } + + console.log(`${target} target not found, installing...`); + try { + execCommand(`rustup target add ${target}`); + } catch (error) { + if (isRustTargetInstalled(target)) { + console.warn( + `${target} 
target appears installed after a rustup error; continuing` + ); + return; + } + throw error; + } + }); + } catch (error) { + console.error(`Failed to check/install ${target} target:`, error.message); + process.exit(1); + } +} + +console.log('Building swc-playground-wasm...'); + +ensureRustup(); + +ensureRustTarget('wasm32-unknown-unknown'); + // Check if wasm-pack is installed if (!commandExists('wasm-pack')) { console.log('Installing wasm-pack...'); diff --git a/packages/swc-plugin-workflow/build.js b/packages/swc-plugin-workflow/build.js index 95c5d430ac..17b691ec67 100644 --- a/packages/swc-plugin-workflow/build.js +++ b/packages/swc-plugin-workflow/build.js @@ -3,14 +3,22 @@ import { execSync } from 'node:child_process'; import { copyFileSync, existsSync, + mkdirSync, readdirSync, readFileSync, + rmSync, writeFileSync, } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; -function runCommand(command) { +function execCommand(command, options = {}) { + return execSync(command, { stdio: 'inherit', shell: true, ...options }); +} + +function runCommand(command, options = {}) { try { - execSync(command, { stdio: 'inherit', shell: true }); + execCommand(command, options); } catch (error) { console.error(`Command failed: ${command}: ${error}`); process.exit(1); @@ -26,6 +34,89 @@ function commandExists(command) { } } +function sleepMs(ms) { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms); +} + +function isRustTargetInstalled(target) { + const installedTargets = execSync('rustup target list --installed', { + stdio: 'pipe', + shell: true, + }).toString(); + return installedTargets.includes(target); +} + +function withTargetInstallLock(target, callback) { + const lockDir = path.join( + tmpdir(), + `workflow-rustup-target-${target.replaceAll(/[^a-z0-9_-]/gi, '-')}.lock` + ); + const timeoutMs = 2 * 60 * 1000; + const startedAt = Date.now(); + + while (true) { + try { + mkdirSync(lockDir); + break; + } catch (error) { + if 
(error?.code !== 'EEXIST') { + throw error; + } + + if (Date.now() - startedAt > timeoutMs) { + throw new Error( + `Timed out waiting for rustup target install lock for ${target}` + ); + } + + console.log( + `Another process is installing ${target}; waiting for the lock...` + ); + sleepMs(1000); + } + } + + try { + return callback(); + } finally { + rmSync(lockDir, { recursive: true, force: true }); + } +} + +function ensureRustTarget(target) { + console.log(`Checking ${target} target...`); + + try { + if (isRustTargetInstalled(target)) { + console.log(`${target} target already installed`); + return; + } + + withTargetInstallLock(target, () => { + if (isRustTargetInstalled(target)) { + console.log(`${target} target was installed by another process`); + return; + } + + console.log(`${target} target not found, installing...`); + try { + execCommand(`rustup target add ${target}`); + } catch (error) { + if (isRustTargetInstalled(target)) { + console.warn( + `${target} target appears installed after a rustup error; continuing` + ); + return; + } + throw error; + } + }); + } catch (error) { + console.error(`Failed to check/install ${target} target:`, error.message); + process.exit(1); + } +} + console.log('Building swc-plugin-workflow WASM...'); // Check if cargo is installed @@ -57,26 +148,7 @@ if (!commandExists('cargo')) { } } -// Check if wasm32-unknown-unknown target exists and install if needed -console.log('Checking wasm32-unknown-unknown target...'); -try { - const installedTargets = execSync('rustup target list --installed', { - stdio: 'pipe', - shell: true, - }).toString(); - if (!installedTargets.includes('wasm32-unknown-unknown')) { - console.log('wasm32-unknown-unknown target not found, installing...'); - runCommand('rustup target add wasm32-unknown-unknown'); - } else { - console.log('wasm32-unknown-unknown target already installed'); - } -} catch (error) { - console.error( - 'Failed to check/install wasm32-unknown-unknown target:', - error.message - ); - 
process.exit(1); -} +ensureRustTarget('wasm32-unknown-unknown'); // Build the WASM plugin console.log('Running cargo build...'); diff --git a/packages/workflow/src/internal/builtins.ts b/packages/workflow/src/internal/builtins.ts index 886686e50e..624ebbaebd 100644 --- a/packages/workflow/src/internal/builtins.ts +++ b/packages/workflow/src/internal/builtins.ts @@ -2,6 +2,9 @@ * These are the built-in steps that are "automatically available" in the workflow scope. They are * similar to "stdlib" except that are not meant to be imported by users, but are instead "just available" * alongside user defined steps. They are used internally by the runtime + * + * These helpers intentionally rely on the method receiver (`this`) so workflow + * objects like `Request` and `Response` can round-trip through step execution. */ export async function __builtin_response_array_buffer( diff --git a/packages/world-local/README.md b/packages/world-local/README.md index 9e3f0d95cc..fccc554eac 100644 --- a/packages/world-local/README.md +++ b/packages/world-local/README.md @@ -4,5 +4,13 @@ Filesystem-based workflow backend for local development and testing. Stores workflow data as JSON files on disk and provides in-memory queuing. Automatically detects development server port for queue transport. -Used by default on `next dev` and `next start`. +The `limits` namespace implements the shared flow-limits contract for local development: + +- keyed concurrency and rate limits +- FIFO waiter promotion per key +- cancelled workflow / failed step waiter pruning +- prompt wake-ups with delayed fallback retries +Limit state is persisted on disk, but queue delivery is still in-memory. This means the local world provides the same live-process lock semantics as the other implemented worlds, while crash survival and durable backlog behavior remain a PostgreSQL-only advantage today. + +Used by default on `next dev` and `next start`. 
diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 9e0d242222..4d6a33d3d2 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -13,10 +13,11 @@ import { readJSON, } from './fs.js'; import { initDataDir } from './init.js'; -import { instrumentObject } from './instrumentObject.js'; +import { createLimits } from './limits.js'; import { createQueue, type DirectHandler } from './queue.js'; import { hashToken } from './storage/helpers.js'; import { createStorage } from './storage.js'; +import { instrumentObject } from './instrumentObject.js'; import { createStreamer } from './streamer.js'; // Re-export init types and utilities for consumers @@ -29,7 +30,7 @@ export { parseVersion, } from './init.js'; -export { type DirectHandler } from './queue.js'; +export type { DirectHandler } from './queue.js'; export type LocalWorld = World & { /** Register a direct in-process handler for a queue prefix, bypassing HTTP. */ @@ -62,8 +63,17 @@ export function createLocalWorld(args?: Partial): LocalWorld { const mergedConfig = { ...config.value, ...definedArgs }; const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); - const storage = createStorage(mergedConfig.dataDir, tag); + let limits: World['limits'] | undefined; + const storage = createStorage(mergedConfig.dataDir, tag, { + getLimits: () => limits, + queue, + }); + limits = createLimits(mergedConfig.dataDir, { + tag, + storage, + }); return { + limits, ...queue, ...storage, ...instrumentObject( @@ -109,6 +119,7 @@ export function createLocalWorld(args?: Partial): LocalWorld { 'steps', 'events', 'hooks', + 'limits', 'waits', 'streams/runs', ]; diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts new file mode 100644 index 0000000000..6243de2290 --- /dev/null +++ b/packages/world-local/src/limits.test.ts @@ -0,0 +1,276 @@ +import { mkdtemp, readFile, rm } from 'node:fs/promises'; +import os from 
'node:os'; +import path from 'node:path'; +import { LimitDefinitionConflictError } from '@workflow/errors'; +import { describe, expect, it } from 'vitest'; +import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.mts'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; + +createLimitsContractSuite('local world limits', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const world = createLocalWorld({ dataDir: dir }); + world.registerHandler('__wkf_step_', async () => Response.json({ ok: true })); + world.registerHandler('__wkf_workflow_', async () => + Response.json({ ok: true }) + ); + + return { + limits: world.limits, + storage: world, + inspectKeyState: async (key) => { + const statePath = path.join(dir, 'limits', 'state.json'); + let raw: { + keys?: Record< + string, + { + leases?: { lockId: string }[]; + waiters?: { lockId: string }[]; + tokens?: { lockId: string }[]; + } + >; + }; + try { + raw = JSON.parse(await readFile(statePath, 'utf8')); + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + if (code === 'ENOENT') { + return { + leaseHolderIds: [], + waiterHolderIds: [], + tokenHolderIds: [], + }; + } + throw error; + } + + const keyState = raw.keys?.[key]; + return { + leaseHolderIds: keyState?.leases?.map((lease) => lease.lockId) ?? [], + waiterHolderIds: + keyState?.waiters?.map((waiter) => waiter.lockId) ?? [], + tokenHolderIds: keyState?.tokens?.map((token) => token.lockId) ?? 
[], + }; + }, + close: async () => { + await world.close?.(); + await rm(dir, { recursive: true, force: true }); + }, + }; +}); + +describe('local world limit retry timing', () => { + it('persists nextWaiter metadata and emits lock_waiter_queued on release', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const world = createLocalWorld({ dataDir: dir }); + world.registerHandler('__wkf_workflow_', async () => + Response.json({ ok: true }) + ); + + try { + const runA = ( + await world.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName: 'holder-a', + input: [], + }, + }) + ).run; + const runB = ( + await world.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName: 'holder-b', + input: [], + }, + }) + ).run; + if (!runA || !runB) { + throw new Error('expected runs'); + } + const correlationA = createLockCorrelationId(runA.runId, 0); + const correlationB = createLockCorrelationId(runB.runId, 0); + + const first = await world.events.create(runA.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 10_000, + }, + }); + const second = await world.events.create(runB.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationB, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 10_000, + }, + }); + + expect(first.event?.eventType).toBe('lock_acquired'); + expect(second.event?.eventType).toBe('lock_created'); + + const released = await world.events.create(runA.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + }); + + 
expect(released.event?.eventType).toBe('lock_release'); + if (!released.event || released.event.eventType !== 'lock_release') { + throw new Error('expected lock_release event'); + } + expect(released.event.eventData?.nextWaiter).toMatchObject({ + runId: runB.runId, + lockIndex: 0, + lockCorrelationId: correlationB, + }); + + const correlated = await world.events.listByCorrelationId({ + correlationId: correlationB, + }); + expect( + correlated.data.some( + (event) => event.eventType === 'lock_waiter_queued' + ) + ).toBe(true); + } finally { + await world.close?.(); + await rm(dir, { recursive: true, force: true }); + } + }); + + it('throws when the same key is acquired with a conflicting definition', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 60_000, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 }, + }, + leaseTtlMs: 60_000, + }) + ).rejects.toBeInstanceOf(LimitDefinitionConflictError); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it('allows a key definition to be reseeded after the key fully drains', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 200, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await new Promise((resolve) => setTimeout(resolve, 400)); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 
}, + }, + leaseTtlMs: 200, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it('uses the head waiter retryAfter for backlog-only waiters', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + const key = 'shared-key'; + const periodMs = 5_000; + + const acquired = await limits.acquire({ + key, + runId: 'run-a', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(acquired.status).toBe('acquired'); + + await new Promise((resolve) => setTimeout(resolve, 25)); + + const headWaiter = await limits.acquire({ + key, + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(headWaiter.status).toBe('blocked'); + if (headWaiter.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(headWaiter.retryAfterMs).toBeGreaterThan(0); + + const backlogOnlyWaiter = await limits.acquire({ + key, + runId: 'run-c', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(backlogOnlyWaiter.status).toBe('blocked'); + if (backlogOnlyWaiter.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(backlogOnlyWaiter.retryAfterMs).toBeGreaterThan(0); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts new file mode 100644 index 0000000000..be765ca3c0 --- /dev/null +++ b/packages/world-local/src/limits.ts @@ -0,0 +1,672 @@ +import path from 'node:path'; +import { + LimitDefinitionConflictError, + WorkflowWorldError, +} from '@workflow/errors'; +import type { Storage, WorkflowRunWithoutData } from '@workflow/world'; +import { + createLockCorrelationId, + createLockId, + createLockWakeCorrelationId, + type LimitDefinition, + 
LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + LimitLeaseSchema, + type LimitNextWaiter, + type LimitReleaseResult, + LimitReleaseRequestSchema, + type Limits, + parseLockId, +} from '@workflow/world'; +import { z } from 'zod'; +import { readJSON, writeJSON } from './fs.js'; +import { monotonicUlid } from './storage/helpers.js'; + +const LimitTokenSchema = z.object({ + tokenId: z.string(), + lockId: z.string(), + acquiredAt: z.coerce.date(), + expiresAt: z.coerce.date(), +}); + +const LimitWaiterSchema = z.object({ + waiterId: z.string(), + lockId: z.string(), + runId: z.string(), + lockIndex: z.number().int().nonnegative(), + createdAt: z.coerce.date(), + leaseTtlMs: z.number().int().positive().optional(), + concurrencyMax: z.number().int().positive().nullable(), + rateCount: z.number().int().positive().nullable(), + ratePeriodMs: z.number().int().positive().nullable(), +}); + +const KeyStateSchema = z.object({ + key: z.string(), + definition: z + .object({ + concurrency: z.object({ max: z.number().int().positive() }).optional(), + rate: z + .object({ + count: z.number().int().positive(), + periodMs: z.number().int().positive(), + }) + .optional(), + }) + .optional(), + leases: z.array(LimitLeaseSchema), + tokens: z.array(LimitTokenSchema), + waiters: z.array(LimitWaiterSchema), +}); + +const LimitsStateSchema = z.object({ + version: z.union([z.literal(2), z.literal(3)]), + keys: z.record(z.string(), KeyStateSchema), +}); + +type LimitToken = z.infer; +type LimitWaiter = z.infer; +type KeyState = z.infer; +type LimitsState = z.infer; + +type HolderTarget = + | { + kind: 'lock'; + runId: string; + wakeCorrelationId: string; + lockCorrelationId: string; + } + | { + kind: 'opaque'; + }; + +export interface LocalLimitsOptions { + tag?: string; + storage?: Pick; +} + +const EMPTY_STATE: LimitsState = { + version: 3, + keys: {}, +}; + +function getStatePath(dataDir: string, tag?: string): string { + return 
path.join(dataDir, 'limits', tag ? `state.${tag}.json` : 'state.json'); +} + +function cloneToken(token: LimitToken): LimitToken { + return { ...token }; +} + +function cloneWaiter(waiter: LimitWaiter): LimitWaiter { + return { ...waiter }; +} + +function normalizeKeyState(keyState: KeyState): KeyState { + return { + key: keyState.key, + definition: keyState.definition, + leases: keyState.leases.map((lease) => ({ ...lease })), + tokens: keyState.tokens.map(cloneToken), + waiters: keyState.waiters.map(cloneWaiter), + }; +} + +function cloneState(state: LimitsState): LimitsState { + return { + version: 3, + keys: Object.fromEntries( + Object.entries(state.keys).map(([key, keyState]) => [ + key, + normalizeKeyState(keyState), + ]) + ), + }; +} + +function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { + return { + key: keyState.key, + definition: keyState.definition, + leases: keyState.leases.filter( + (lease) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > now + ), + tokens: keyState.tokens.filter((token) => token.expiresAt.getTime() > now), + waiters: keyState.waiters.map(cloneWaiter), + }; +} + +function areLimitDefinitionsEqual( + left: LimitDefinition | undefined, + right: LimitDefinition +): boolean { + return ( + left?.concurrency?.max === right.concurrency?.max && + left?.rate?.count === right.rate?.count && + left?.rate?.periodMs === right.rate?.periodMs + ); +} + +function assertCanonicalDefinition( + key: string, + keyState: KeyState, + requested: LimitDefinition +) { + if (!keyState.definition) { + keyState.definition = requested; + return; + } + + if (!areLimitDefinitionsEqual(keyState.definition, requested)) { + throw new LimitDefinitionConflictError(key, keyState.definition, requested); + } +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if 
(concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +function getRetryAfterMs( + keyState: KeyState, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of keyState.leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, lease.expiresAt.getTime() - now)); + } + } + } + + if (rateBlocked) { + for (const token of keyState.tokens) { + candidates.push(Math.max(0, token.expiresAt.getTime() - now)); + } + } + + if (candidates.length === 0) { + return undefined; + } + + return Math.min(...candidates); +} + +function getWaiterRetryAfterMs( + keyState: KeyState, + now: number, + waiter: Pick +): number | undefined { + return getRetryAfterMs( + keyState, + now, + waiter.concurrencyMax !== null && + keyState.leases.length >= waiter.concurrencyMax, + waiter.rateCount !== null && keyState.tokens.length >= waiter.rateCount + ); +} + +function getBlockedRetryAfterMs( + keyState: KeyState, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const headWaiter = keyState.waiters[0]; + return ( + (headWaiter + ? getWaiterRetryAfterMs(keyState, now, headWaiter) + : undefined) ?? + getRetryAfterMs(keyState, now, concurrencyBlocked, rateBlocked) + ); +} + +function createLease( + key: string, + runId: string, + lockIndex: number, + definition: LimitLease['definition'], + acquiredAt: Date, + leaseTtlMs?: number +): LimitLease { + return { + leaseId: `lmt_${monotonicUlid()}`, + key, + lockId: createLockId(runId, lockIndex), + runId, + lockIndex, + acquiredAt, + expiresAt: + leaseTtlMs !== undefined + ? 
new Date(acquiredAt.getTime() + leaseTtlMs) + : undefined, + definition, + }; +} + +function insertToken( + keyState: KeyState, + lockId: string, + acquiredAt: Date, + periodMs: number +) { + keyState.tokens.push({ + tokenId: `lmttok_${monotonicUlid()}`, + lockId, + acquiredAt, + expiresAt: new Date(acquiredAt.getTime() + periodMs), + }); +} + +function parseHolderId(lockId: string): HolderTarget { + const parsedLockId = parseLockId(lockId); + if (parsedLockId) { + return { + kind: 'lock', + runId: parsedLockId.runId, + wakeCorrelationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + lockCorrelationId: createLockCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; + } + + return { kind: 'opaque' }; +} + +function toNextWaiter(holderId: string): LimitNextWaiter | undefined { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { + return undefined; + } + + return { + runId: parsedLockId.runId, + lockIndex: parsedLockId.lockIndex, + wakeCorrelationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + lockCorrelationId: createLockCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; +} + +function isTerminalRun(run: WorkflowRunWithoutData | undefined) { + return !!run && ['completed', 'failed', 'cancelled'].includes(run.status); +} + +function deleteEmptyKey(state: LimitsState, key: string) { + const keyState = state.keys[key]; + if (!keyState) return; + if ( + keyState.leases.length === 0 && + keyState.tokens.length === 0 && + keyState.waiters.length === 0 + ) { + delete state.keys[key]; + } +} + +export function createLimits( + dataDir: string, + tagOrOptions?: string | LocalLimitsOptions +): Limits { + const options = + typeof tagOrOptions === 'string' ? 
{ tag: tagOrOptions } : tagOrOptions; + const statePath = getStatePath(dataDir, options?.tag); + let stateOp = Promise.resolve(); + + const withStateLock = async (fn: () => Promise): Promise => { + const run = stateOp.then(fn, fn); + stateOp = run.then( + () => undefined, + () => undefined + ); + return run; + }; + + const readState = async (): Promise => { + const raw = + (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE); + + return cloneState(raw); + }; + + const writeState = async (state: LimitsState): Promise => { + await writeJSON(statePath, state, { overwrite: true }); + }; + + const getRun = async ( + runId: string + ): Promise => { + try { + return await options?.storage?.runs.get(runId, { resolveData: 'none' }); + } catch { + return undefined; + } + }; + + const isHolderLive = async (holderId: string): Promise => { + const target = parseHolderId(holderId); + if (target.kind === 'opaque' || !options?.storage) { + return true; + } + + const run = await getRun(target.runId); + return !isTerminalRun(run); + }; + + const pruneDeadHoldersAndWaiters = async ( + keyState: KeyState + ): Promise => { + const prunedKeyState = pruneKeyState(keyState); + const leases: LimitLease[] = []; + const waiters: LimitWaiter[] = []; + + for (const lease of prunedKeyState.leases) { + if (await isHolderLive(lease.lockId)) { + leases.push(lease); + } + } + + for (const waiter of prunedKeyState.waiters) { + if (await isHolderLive(waiter.lockId)) { + waiters.push(waiter); + } + } + + prunedKeyState.leases = leases; + prunedKeyState.waiters = waiters; + return prunedKeyState; + }; + + const promoteWaiter = ( + key: string, + keyState: KeyState, + waiter: LimitWaiter + ): { + keyState: KeyState; + lease: LimitLease; + nextWaiter?: LimitNextWaiter; + } => { + const acquiredAt = new Date(); + const definition = { + concurrency: + waiter.concurrencyMax !== null + ? 
{ max: waiter.concurrencyMax } + : undefined, + rate: + waiter.rateCount !== null && waiter.ratePeriodMs !== null + ? { + count: waiter.rateCount, + periodMs: waiter.ratePeriodMs, + } + : undefined, + } satisfies LimitDefinition; + + const lease = createLease( + key, + waiter.runId, + waiter.lockIndex, + definition, + acquiredAt, + waiter.leaseTtlMs + ); + + keyState.waiters = keyState.waiters.filter( + (candidate) => candidate.waiterId !== waiter.waiterId + ); + keyState.leases.push(lease); + + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + insertToken(keyState, waiter.lockId, acquiredAt, waiter.ratePeriodMs); + } + + return { + keyState, + lease, + nextWaiter: toNextWaiter(waiter.lockId), + }; + }; + + return { + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + const lockId = createLockId(parsed.runId, parsed.lockIndex); + + return withStateLock(async (): Promise => { + const state = cloneState(await readState()); + const keyState = await pruneDeadHoldersAndWaiters( + state.keys[parsed.key] ?? 
{ + key: parsed.key, + definition: undefined, + leases: [], + tokens: [], + waiters: [], + } + ); + if ( + keyState.leases.length === 0 && + keyState.tokens.length === 0 && + keyState.waiters.length === 0 + ) { + keyState.definition = undefined; + } + assertCanonicalDefinition(parsed.key, keyState, parsed.definition); + state.keys[parsed.key] = keyState; + + const existingLease = keyState.leases.find( + (lease) => lease.lockId === lockId + ); + if (existingLease) { + await writeState(state); + return { + status: 'acquired', + lease: existingLease, + }; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + keyState.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + keyState.tokens.length >= parsed.definition.rate.count; + const existingWaiter = keyState.waiters.find( + (waiter) => waiter.lockId === lockId + ); + + if ( + existingWaiter && + keyState.waiters[0]?.waiterId === existingWaiter.waiterId + ) { + if (!concurrencyBlocked && !rateBlocked) { + const promoted = promoteWaiter( + parsed.key, + keyState, + existingWaiter + ); + state.keys[parsed.key] = promoted.keyState; + await writeState(state); + return { + status: 'acquired', + lease: promoted.lease, + }; + } + } + + if ( + existingWaiter || + concurrencyBlocked || + rateBlocked || + keyState.waiters.length > 0 + ) { + if (!existingWaiter) { + keyState.waiters.push({ + waiterId: `lmtwait_${monotonicUlid()}`, + lockId, + runId: parsed.runId, + lockIndex: parsed.lockIndex, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? 
null, + }); + } + + state.keys[parsed.key] = keyState; + await writeState(state); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: getBlockedRetryAfterMs( + keyState, + Date.now(), + concurrencyBlocked, + rateBlocked + ), + }; + } + + const acquiredAt = new Date(); + const lease = createLease( + parsed.key, + parsed.runId, + parsed.lockIndex, + parsed.definition, + acquiredAt, + parsed.leaseTtlMs + ); + + keyState.leases.push(lease); + + if (parsed.definition.rate) { + insertToken( + keyState, + lockId, + acquiredAt, + parsed.definition.rate.periodMs + ); + } + + state.keys[parsed.key] = keyState; + await writeState(state); + + return { + status: 'acquired', + lease, + }; + }); + }, + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + return withStateLock(async (): Promise => { + const state = cloneState(await readState()); + let nextWaiter: LimitNextWaiter | undefined; + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const beforeLeases = keyStateValue.leases.length; + const keyState = await pruneDeadHoldersAndWaiters(keyStateValue); + let capacityFreed = keyState.leases.length !== beforeLeases; + const beforeExplicitRelease = keyState.leases.length; + keyState.leases = keyState.leases.filter((lease) => { + if (lease.leaseId !== parsed.leaseId) return true; + if (parsed.key && lease.key !== parsed.key) return true; + if (parsed.lockId && lease.lockId !== parsed.lockId) { + return true; + } + return false; + }); + capacityFreed ||= keyState.leases.length !== beforeExplicitRelease; + + if (capacityFreed) { + const headWaiter = keyState.waiters[0]; + if (headWaiter) { + const concurrencyBlocked = + headWaiter.concurrencyMax !== null && + keyState.leases.length >= headWaiter.concurrencyMax; + const rateBlocked = + headWaiter.rateCount !== null && + keyState.tokens.length >= headWaiter.rateCount; + + if (!concurrencyBlocked && !rateBlocked) { + const 
promoted = promoteWaiter(key, keyState, headWaiter); + nextWaiter = promoted.nextWaiter; + state.keys[key] = promoted.keyState; + } else { + state.keys[key] = keyState; + } + } else { + state.keys[key] = keyState; + } + } else { + state.keys[key] = keyState; + } + + deleteEmptyKey(state, key); + } + + await writeState(state); + return { nextWaiter }; + }); + }, + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + return withStateLock(async () => { + const state = cloneState(await readState()); + const now = Date.now(); + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const keyState = pruneKeyState(keyStateValue, now); + const leaseIndex = keyState.leases.findIndex( + (lease) => lease.leaseId === parsed.leaseId + ); + + if (leaseIndex === -1) { + state.keys[key] = keyState; + continue; + } + + const lease = keyState.leases[leaseIndex]; + const currentExpiry = lease.expiresAt?.getTime(); + const ttlMs = + parsed.ttlMs ?? (currentExpiry ? 
currentExpiry - now : 30_000); + const updatedLease: LimitLease = { + ...lease, + expiresAt: new Date(now + Math.max(1, ttlMs)), + }; + + keyState.leases[leaseIndex] = updatedLease; + state.keys[key] = keyState; + await writeState(state); + return updatedLease; + } + + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + }); + }, + }; +} diff --git a/packages/world-local/src/queue.test.ts b/packages/world-local/src/queue.test.ts index 32c8d1f834..e96ed16695 100644 --- a/packages/world-local/src/queue.test.ts +++ b/packages/world-local/src/queue.test.ts @@ -2,11 +2,6 @@ import type { StepInvokePayload } from '@workflow/world'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { createQueue } from './queue'; -// Mock node:timers/promises so setTimeout resolves immediately -vi.mock('node:timers/promises', () => ({ - setTimeout: vi.fn().mockResolvedValue(undefined), -})); - const stepPayload: StepInvokePayload = { workflowName: 'test-workflow', workflowRunId: 'run_01ABC', @@ -15,14 +10,17 @@ const stepPayload: StepInvokePayload = { }; describe('queue timeout re-enqueue', () => { + const maxSetTimeoutDelayMs = 2_147_483_647; let localQueue: ReturnType; beforeEach(() => { + vi.useFakeTimers(); localQueue = createQueue({ baseUrl: 'http://localhost:3000' }); }); afterEach(async () => { await localQueue.close(); + vi.useRealTimers(); }); it('createQueueHandler returns 200 with timeoutSeconds in the body', async () => { @@ -72,29 +70,6 @@ describe('queue timeout re-enqueue', () => { expect(body).toEqual({ ok: true }); }); - it('createQueueHandler returns 200 with timeoutSeconds: 0', async () => { - const handler = localQueue.createQueueHandler('__wkf_step_', async () => ({ - timeoutSeconds: 0, - })); - - const req = new Request('http://localhost/step', { - method: 'POST', - headers: { - 'content-type': 'application/json', - 'x-vqs-queue-name': '__wkf_step_test', - 'x-vqs-message-id': 'msg_01ABC', - 'x-vqs-message-attempt': 
'1', - }, - body: JSON.stringify(stepPayload), - }); - - const response = await handler(req); - expect(response.status).toBe(200); - - const body = await response.json(); - expect(body).toEqual({ timeoutSeconds: 0 }); - }); - it('queue retries when handler returns timeoutSeconds > 0', async () => { let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { @@ -102,25 +77,18 @@ describe('queue timeout re-enqueue', () => { if (callCount < 3) { return { timeoutSeconds: 5 }; } - // Third call succeeds normally return undefined; }); localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); - // Wait for the async queue processing to complete - // The queue fires off processing asynchronously, so we need to wait - await vi.waitFor(() => { - expect(callCount).toBe(3); - }); + expect(callCount).toBe(3); }); it('queue retries immediately when handler returns timeoutSeconds: 0', async () => { - const { setTimeout: mockSetTimeout } = await import('node:timers/promises'); - vi.mocked(mockSetTimeout).mockClear(); - let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { callCount++; @@ -133,12 +101,88 @@ describe('queue timeout re-enqueue', () => { localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); + + expect(callCount).toBe(3); + }); + + it('replaces delayed idempotent deliveries with an immediate wake-up', async () => { + const seenStepIds: string[] = []; + const handler = localQueue.createQueueHandler( + '__wkf_step_', + async (body) => { + seenStepIds.push((body as StepInvokePayload).stepId); + return undefined; + } + ); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + idempotencyKey: 'step_01ABC', + delaySeconds: 30, + }); + await 
localQueue.queue( + '__wkf_step_test' as any, + { ...stepPayload, stepId: 'step_replacement' }, + { + idempotencyKey: 'step_01ABC', + } + ); + + await vi.runAllTimersAsync(); + + expect(seenStepIds).toEqual(['step_replacement']); + }); + + it('does not fire long delayed messages before the setTimeout max delay elapses', async () => { + let callCount = 0; + const delaySeconds = Math.ceil((maxSetTimeoutDelayMs + 5_000) / 1000); + const remainingDelayMs = delaySeconds * 1000 - maxSetTimeoutDelayMs; + const handler = localQueue.createQueueHandler('__wkf_step_', async () => { + callCount++; + return undefined; + }); + + localQueue.registerHandler('__wkf_step_', handler); - await vi.waitFor(() => { - expect(callCount).toBe(3); + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + delaySeconds, }); - // setTimeout should NOT have been called for timeoutSeconds: 0 - expect(mockSetTimeout).not.toHaveBeenCalled(); + await vi.advanceTimersByTimeAsync(maxSetTimeoutDelayMs); + expect(callCount).toBe(0); + + await vi.advanceTimersByTimeAsync(remainingDelayMs); + expect(callCount).toBe(1); + }); + + it('replaces chunked long-delay deliveries with an immediate idempotent wake-up', async () => { + const seenStepIds: string[] = []; + const handler = localQueue.createQueueHandler( + '__wkf_step_', + async (body) => { + seenStepIds.push((body as StepInvokePayload).stepId); + return undefined; + } + ); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + idempotencyKey: 'step_very_delayed', + delaySeconds: Math.ceil((maxSetTimeoutDelayMs + 5_000) / 1000), + }); + await localQueue.queue( + '__wkf_step_test' as any, + { ...stepPayload, stepId: 'step_immediate_replacement' }, + { + idempotencyKey: 'step_very_delayed', + } + ); + + await vi.runAllTimersAsync(); + + expect(seenStepIds).toEqual(['step_immediate_replacement']); }); }); diff --git a/packages/world-local/src/queue.ts 
b/packages/world-local/src/queue.ts index e49300dec7..c45e80e986 100644 --- a/packages/world-local/src/queue.ts +++ b/packages/world-local/src/queue.ts @@ -1,4 +1,3 @@ -import { setTimeout } from 'node:timers/promises'; import { JsonTransport } from '@vercel/queue'; import { MessageId, type Queue, ValidQueueName } from '@workflow/world'; import { Sema } from 'async-sema'; @@ -9,37 +8,40 @@ import type { Config } from './config.js'; import { resolveBaseUrl } from './config.js'; import { getPackageInfo } from './init.js'; -// For local queue, there is no technical limit on the message visibility lifespan, -// but the environment variable can be used for testing purposes to set a max visibility limit. const LOCAL_QUEUE_MAX_VISIBILITY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_MAX_VISIBILITY ?? '0', 10) || Infinity; -// Maximum safe delay for setTimeout in Node.js (2^31 - 1 milliseconds ≈ 24.85 days) -// Larger values cause "TimeoutOverflowWarning: X does not fit into a 32-bit signed integer" -// When the clamped timeout fires, the handler will recalculate remaining time from -// persistent state and return another timeoutSeconds if needed. -const MAX_SAFE_TIMEOUT_MS = 2147483647; - -// The local workers share the same Node.js process and event loop, -// so we need to limit concurrency to avoid overwhelming the system. const DEFAULT_CONCURRENCY_LIMIT = 1000; const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_CONCURRENCY ?? '0', 10) || DEFAULT_CONCURRENCY_LIMIT; +const MAX_SET_TIMEOUT_DELAY_MS = 2_147_483_647; export type DirectHandler = (req: Request) => Promise; export type LocalQueue = Queue & { - /** Close the HTTP agent and release resources. */ close(): Promise; - /** Register a direct in-process handler for a queue prefix, bypassing HTTP. 
*/ registerHandler( prefix: '__wkf_step_' | '__wkf_workflow_', handler: DirectHandler ): void; }; +type ScheduledMessage = { + attempt: number; + body: Uint8Array; + headers?: Record; + idempotencyKey?: string; + messageId: MessageId; + pendingExecution: boolean; + queueName: ValidQueueName; + remainingServerRetries: number; + running: boolean; + timer?: ReturnType; + version: number; +}; + function getQueueRoute(queueName: ValidQueueName): { pathname: 'flow' | 'step'; prefix: '__wkf_step_' | '__wkf_workflow_'; @@ -54,11 +56,6 @@ function getQueueRoute(queueName: ValidQueueName): { } export function createQueue(config: Partial): LocalQueue { - // Create a custom agent optimized for high-concurrency local workflows: - // - headersTimeout: 0 allows long-running steps - // - connections: 1000 allows many parallel connections to the same host - // - pipelining: 1 (default) for HTTP/1.1 compatibility - // - keepAliveTimeout: 30s keeps connections warm for rapid step execution const httpAgent = new Agent({ headersTimeout: 0, connections: 1000, @@ -67,163 +64,249 @@ export function createQueue(config: Partial): LocalQueue { const transport = new JsonTransport(); const generateId = monotonicFactory(); const semaphore = new Sema(WORKFLOW_LOCAL_QUEUE_CONCURRENCY); - - /** - * holds inflight messages by idempotency key to ensure - * that we don't queue the same message multiple times - */ - const inflightMessages = new Map(); - /** Direct in-process handlers by queue prefix, bypassing HTTP when set. 
*/ + const scheduledMessages = new Map(); const directHandlers = new Map(); + let closed = false; - const queue: Queue['queue'] = async (queueName, message, opts) => { - const cleanup = [] as (() => void)[]; + const cleanupMessage = (message: ScheduledMessage) => { + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } + if (message.idempotencyKey) { + scheduledMessages.delete(message.idempotencyKey); + } + }; - if (opts?.idempotencyKey) { - const existing = inflightMessages.get(opts.idempotencyKey); - if (existing) { - return { messageId: existing }; - } + const scheduleExecution = (message: ScheduledMessage, delayMs: number) => { + if (closed) { + cleanupMessage(message); + return; } - const body = transport.serialize(message); - const { pathname, prefix } = getQueueRoute(queueName); - const messageId = MessageId.parse(`msg_${generateId()}`); + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } - // Extract identifiers from the message for structured logging. - // Workflow messages have `runId`, step messages have `workflowRunId` + `stepId`. - const msg = message as Record; - const runId = (msg.runId ?? msg.workflowRunId ?? undefined) as - | string - | undefined; - const stepId = (msg.stepId ?? 
undefined) as string | undefined; + const version = ++message.version; + const enqueueRun = () => { + message.pendingExecution = true; + if (!message.running) { + void executeMessage(message); + } + }; - if (opts?.idempotencyKey) { - const key = opts.idempotencyKey; - inflightMessages.set(key, messageId); - cleanup.push(() => { - inflightMessages.delete(key); - }); + if (delayMs <= 0) { + enqueueRun(); + return; } - (async () => { - const token = semaphore.tryAcquire(); - if (!token) { - console.warn( - `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` - ); - await semaphore.acquire(); + const timeoutMs = Math.min(delayMs, MAX_SET_TIMEOUT_DELAY_MS); + message.timer = globalThis.setTimeout(() => { + if (message.version !== version || closed) { + return; } - // Safety limit to prevent infinite loops in the local queue. - // The actual max delivery enforcement happens in the workflow/step handlers - // (at MAX_QUEUE_DELIVERIES = 48), so this just needs to be comfortably higher. 
- const MAX_LOCAL_SAFETY_LIMIT = 256; + message.timer = undefined; + if (delayMs > MAX_SET_TIMEOUT_DELAY_MS) { + scheduleExecution(message, delayMs - MAX_SET_TIMEOUT_DELAY_MS); + return; + } + enqueueRun(); + }, timeoutMs); + }; + + const deliverMessage = async ( + message: ScheduledMessage + ): Promise< + | { kind: 'success' } + | { kind: 'timeout'; delayMs: number } + | { kind: 'server_error'; status: number; text: string } + > => { + const { pathname, prefix } = getQueueRoute(message.queueName); + const headers: Record = { + ...message.headers, + 'content-type': 'application/json', + 'x-vqs-queue-name': message.queueName, + 'x-vqs-message-id': message.messageId, + 'x-vqs-message-attempt': String(message.attempt + 1), + }; + const directHandler = directHandlers.get(prefix); + let response: Response; + + if (directHandler) { + const req = new Request( + `http://localhost/.well-known/workflow/v1/${pathname}`, + { + method: 'POST', + headers, + body: message.body, + } + ); + response = await directHandler(req); + } else { + const baseUrl = await resolveBaseUrl(config); + response = await fetch(`${baseUrl}/.well-known/workflow/v1/${pathname}`, { + method: 'POST', + duplex: 'half', + dispatcher: httpAgent, + headers, + body: message.body, + } as any); + } + + const text = await response.text(); + + if (response.ok) { try { - for (let attempt = 0; attempt < MAX_LOCAL_SAFETY_LIMIT; attempt++) { - const headers: Record = { - ...opts?.headers, - 'content-type': 'application/json', - 'x-vqs-queue-name': queueName, - 'x-vqs-message-id': messageId, - 'x-vqs-message-attempt': String(attempt + 1), + const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); + if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { + return { + kind: 'timeout', + delayMs: timeoutSeconds > 0 ? 
timeoutSeconds * 1000 : 0, }; - const directHandler = directHandlers.get(prefix); - let response: Response; - - if (directHandler) { - const req = new Request( - `http://localhost/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - headers, - body, - } - ); - response = await directHandler(req); - } else { - const baseUrl = await resolveBaseUrl(config); - // eslint-disable-next-line @typescript-eslint/no-explicit-any -- undici v7 dispatcher types don't match @types/node's RequestInit - response = await fetch( - `${baseUrl}/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - duplex: 'half', - dispatcher: httpAgent, - headers, - body, - } as any - ); + } + } catch {} + + return { kind: 'success' }; + } + + return { + kind: 'server_error', + status: response.status, + text, + }; + }; + + const executeMessage = async (message: ScheduledMessage): Promise => { + if (closed || message.running) { + return; + } + + message.running = true; + + try { + while (message.pendingExecution && !closed) { + message.pendingExecution = false; + const version = message.version; + const token = semaphore.tryAcquire(); + if (!token) { + console.warn( + `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` + ); + await semaphore.acquire(); + } + + try { + if (closed) { + cleanupMessage(message); + return; } - const text = await response.text(); - - if (response.ok) { - try { - const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); - if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { - // Clamp to MAX_SAFE_TIMEOUT_MS to avoid Node.js setTimeout overflow warning. - // When this fires early, the handler recalculates remaining time from - // persistent state and returns another timeoutSeconds if needed. 
- if (timeoutSeconds > 0) { - const timeoutMs = Math.min( - timeoutSeconds * 1000, - MAX_SAFE_TIMEOUT_MS - ); - await setTimeout(timeoutMs); - } - continue; - } - } catch {} + if (version !== message.version) { + continue; + } + + const result = await deliverMessage(message); + + if (result.kind === 'success') { + cleanupMessage(message); return; } + if (result.kind === 'timeout') { + message.attempt += 1; + scheduleExecution( + message, + result.delayMs === 0 + ? 0 + : Math.min(result.delayMs, LOCAL_QUEUE_MAX_VISIBILITY * 1000) + ); + continue; + } + console.error( - `[world-local] Queue message failed (attempt ${attempt + 1}, HTTP ${response.status})`, - { - queueName, - messageId, - ...(runId && { runId }), - ...(stepId && { stepId }), - handlerError: text, - } + `[world-local] Queue message failed (attempt ${ + message.attempt + 1 + }/3, status ${result.status}): ${result.text}`, + { queueName: message.queueName, messageId: message.messageId } ); - // 5s linear backoff to approximate VQS retry timing in local dev. - // VQS uses 5s linear for attempts 1–32, then exponential, but for - // local dev linear 5s is sufficient — the handler enforces the real - // cap at MAX_QUEUE_DELIVERIES (48) which keeps total time under ~4min. 
- await setTimeout(5000); - } - - console.error( - `[world-local] Queue message exhausted safety limit (${MAX_LOCAL_SAFETY_LIMIT} attempts)`, - { - queueName, - messageId, - ...(runId && { runId }), - ...(stepId && { stepId }), + message.attempt += 1; + message.remainingServerRetries -= 1; + if (message.remainingServerRetries > 0) { + scheduleExecution(message, 0); + continue; } - ); - } finally { - semaphore.release(); - } - })() - .catch((err) => { - // Silently ignore client disconnect errors (e.g., browser refresh during streaming) - // These are expected and should not cause unhandled rejection warnings - const isAbortError = - err?.name === 'AbortError' || err?.name === 'ResponseAborted'; - if (!isAbortError) { - console.error('[local world] Queue operation failed:', err); + + console.error(`[world-local] Queue message exhausted all retries`, { + queueName: message.queueName, + messageId: message.messageId, + }); + cleanupMessage(message); + return; + } finally { + semaphore.release(); } - }) - .finally(() => { - for (const fn of cleanup) { - fn(); + } + } catch (err) { + const queueError = err as { name?: string }; + const isAbortError = + queueError.name === 'AbortError' || + queueError.name === 'ResponseAborted'; + if (!isAbortError) { + console.error('[local world] Queue operation failed:', err); + } + cleanupMessage(message); + } finally { + message.running = false; + if (message.pendingExecution && !closed) { + void executeMessage(message); + } + } + }; + + const queue: Queue['queue'] = async (queueName, message, opts) => { + const body = transport.serialize(message); + const delayMs = + typeof opts?.delaySeconds === 'number' && opts.delaySeconds > 0 + ? 
opts.delaySeconds * 1000 + : 0; + + if (opts?.idempotencyKey) { + const existing = scheduledMessages.get(opts.idempotencyKey); + if (existing) { + if (existing.running) { + return { messageId: existing.messageId }; } - }); - return { messageId }; + existing.queueName = queueName; + existing.body = body; + existing.headers = opts.headers; + scheduleExecution(existing, delayMs); + return { messageId: existing.messageId }; + } + } + + const scheduledMessage: ScheduledMessage = { + attempt: 0, + body, + headers: opts?.headers, + idempotencyKey: opts?.idempotencyKey, + messageId: MessageId.parse(`msg_${generateId()}`), + pendingExecution: false, + queueName, + remainingServerRetries: 3, + running: false, + version: 0, + }; + + if (opts?.idempotencyKey) { + scheduledMessages.set(opts.idempotencyKey, scheduledMessage); + } + + scheduleExecution(scheduledMessage, delayMs); + return { messageId: scheduledMessage.messageId }; }; const HeaderParser = z.object({ @@ -294,6 +377,11 @@ export function createQueue(config: Partial): LocalQueue { directHandlers.set(prefix, handler); }, async close() { + closed = true; + for (const message of scheduledMessages.values()) { + cleanupMessage(message); + } + scheduledMessages.clear(); await httpAgent.close(); }, }; diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index efa031afcf..e798064c4f 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -12,6 +12,8 @@ import type { Event, EventResult, Hook, + Limits, + Queue, SerializedData, Step, Storage, @@ -65,14 +67,38 @@ async function deleteAllWaitsForRun( } } +async function listEventsByCorrelationId( + basedir: string, + correlationId: string +): Promise { + const result = await paginatedFileSystemQuery({ + directory: path.join(basedir, 'events'), + schema: EventSchema, + filter: (event) => event.correlationId === correlationId, + sortOrder: 'asc', + 
getCreatedAt: getObjectCreatedAt('evnt'), + getId: (event) => event.eventId, + }); + + return result.data; +} + /** * Creates the events storage implementation using the filesystem. * Implements the Storage['events'] interface with create, list, and listByCorrelationId operations. */ export function createEventsStorage( basedir: string, - tag?: string + tag?: string, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; + } ): Storage['events'] { + const isLeaseLive = (lease: { expiresAt?: Date }) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now(); + return { async create(runId, data, params): Promise { const eventId = `evnt_${monotonicUlid()}`; @@ -204,7 +230,11 @@ export function createEventsStorage( if ( data.eventType === 'step_created' || data.eventType === 'hook_created' || - data.eventType === 'wait_created' + data.eventType === 'wait_created' || + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' || + data.eventType === 'lock_waiter_queued' ) { throw new EntityConflictError( `Cannot create new entities on run in terminal state "${currentRun.status}"` @@ -273,7 +303,7 @@ export function createEventsStorage( throw new HookNotFoundError(data.correlationId); } } - const event: Event = { + let event: Event = { ...data, runId: effectiveRunId, eventId, @@ -287,6 +317,264 @@ export function createEventsStorage( let hook: Hook | undefined; let wait: Wait | undefined; + if ( + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' + ) { + const limits = options?.getLimits?.(); + if (!limits) { + throw new WorkflowWorldError( + `Flow limits are not configured for event type "${data.eventType}"` + ); + } + + const existingEvents = await listEventsByCorrelationId( + basedir, + data.correlationId + ); + const existingCreatedEvent = existingEvents.find( + (event) => event.eventType === 
'lock_created' + ); + const existingAcquiredEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_acquired'); + const existingReleaseEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_release'); + + if (data.eventType === 'lock_created') { + const existingEvent = + existingReleaseEvent ?? + (existingAcquiredEvent?.eventData?.lease && + isLeaseLive(existingAcquiredEvent.eventData.lease) + ? existingAcquiredEvent + : undefined) ?? + existingCreatedEvent; + if (existingEvent) { + const resolveData = + params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const result = await limits.acquire({ + key: data.eventData.key, + runId: effectiveRunId, + lockIndex: Number.parseInt( + data.correlationId.split(':').at(-1) ?? '0', + 10 + ), + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + }); + const eventCreatedAt = new Date(); + + event = + result.status === 'acquired' + ? EventSchema.parse({ + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }) + : EventSchema.parse({ + eventType: 'lock_created', + correlationId: data.correlationId, + eventData: { + key: data.eventData.key, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + acquireAt: + result.retryAfterMs !== undefined + ? new Date( + eventCreatedAt.getTime() + result.retryAfterMs + ) + : undefined, + }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } else if (data.eventType === 'lock_acquired') { + if (existingReleaseEvent) { + const resolveData = + params?.resolveData ?? 
DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingReleaseEvent, resolveData), + run, + step, + hook, + wait, + }; + } + if ( + existingAcquiredEvent?.eventData?.lease && + isLeaseLive(existingAcquiredEvent.eventData.lease) + ) { + const resolveData = + params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingAcquiredEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const createdEvent = existingCreatedEvent; + if (!createdEvent || !createdEvent.eventData) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be acquired before lock_created` + ); + } + + const result = await limits.acquire({ + key: createdEvent.eventData.key, + runId: effectiveRunId, + lockIndex: Number.parseInt( + data.correlationId.split(':').at(-1) ?? '0', + 10 + ), + definition: createdEvent.eventData.definition, + leaseTtlMs: createdEvent.eventData.leaseTtlMs, + }); + if (result.status !== 'acquired') { + const retryAfter = + result.retryAfterMs !== undefined + ? Math.ceil(result.retryAfterMs / 1000) + : undefined; + throw new TooEarlyError( + `Lock "${data.correlationId}" is not ready to acquire`, + { retryAfter } + ); + } + const eventCreatedAt = new Date(); + + event = EventSchema.parse({ + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } else { + if (existingReleaseEvent) { + const resolveData = + params?.resolveData ?? 
DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingReleaseEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const acquiredEvent = existingAcquiredEvent; + const lease = acquiredEvent?.eventData?.lease; + if (!lease) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be released before lock_acquired` + ); + } + + const releaseResult = await limits.release({ + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }); + const eventCreatedAt = new Date(); + + event = EventSchema.parse({ + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + nextWaiter: releaseResult.nextWaiter, + }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } + + const compositeKey = `${effectiveRunId}-${eventId}`; + await writeJSON( + taggedPath(basedir, 'events', compositeKey, tag), + event + ); + + if ( + event.eventType === 'lock_release' && + event.eventData?.nextWaiter && + options?.queue && + options?.runs + ) { + const nextRun = await options.runs.get( + event.eventData.nextWaiter.runId, + { + resolveData: 'none', + } + ); + if (!['completed', 'failed', 'cancelled'].includes(nextRun.status)) { + await options.queue.queue( + `__wkf_workflow_${nextRun.workflowName}`, + { + runId: event.eventData.nextWaiter.runId, + lockPreApproval: event.eventData.nextWaiter.lockCorrelationId, + requestedAt: new Date(), + }, + { + idempotencyKey: event.eventData.nextWaiter.wakeCorrelationId, + } + ); + + const waiterQueuedEvent = EventSchema.parse({ + eventType: 'lock_waiter_queued', + correlationId: event.eventData.nextWaiter.lockCorrelationId, + runId: event.eventData.nextWaiter.runId, + eventId: `evnt_${monotonicUlid()}`, + createdAt: new Date(), + specVersion: effectiveSpecVersion, + }); + await writeJSON( + taggedPath( + basedir, + 'events', + 
`${waiterQueuedEvent.runId}-${waiterQueuedEvent.eventId}`, + tag + ), + waiterQueuedEvent + ); + } + } + + const resolveData = params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(event, resolveData), + run, + step, + hook, + wait, + }; + } + // Create/update entity based on event type (event-sourced architecture) // Run lifecycle events if (data.eventType === 'run_created' && 'eventData' in data) { diff --git a/packages/world-local/src/storage/index.ts b/packages/world-local/src/storage/index.ts index e5304e0104..ac21a408ce 100644 --- a/packages/world-local/src/storage/index.ts +++ b/packages/world-local/src/storage/index.ts @@ -1,10 +1,16 @@ -import type { Storage } from '@workflow/world'; +import type { Limits, Queue, Storage } from '@workflow/world'; import { instrumentObject } from '../instrumentObject.js'; import { createEventsStorage } from './events-storage.js'; import { createHooksStorage } from './hooks-storage.js'; import { createRunsStorage } from './runs-storage.js'; import { createStepsStorage } from './steps-storage.js'; +export interface LocalStorageOptions { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; +} + /** * Creates a complete storage implementation using the filesystem. * This is the main entry point that composes all storage implementations. 
@@ -14,12 +20,19 @@ import { createStepsStorage } from './steps-storage.js'; * @param basedir - The base directory for storing workflow data * @returns A complete Storage implementation with tracing */ -export function createStorage(basedir: string, tag?: string): Storage { - // Create raw storage implementations +export function createStorage( + basedir: string, + tag?: string, + options?: LocalStorageOptions +): Storage { + const runs = createRunsStorage(basedir, tag); const storage: Storage = { - runs: createRunsStorage(basedir, tag), + runs, steps: createStepsStorage(basedir, tag), - events: createEventsStorage(basedir, tag), + events: createEventsStorage(basedir, tag, { + ...options, + runs, + }), hooks: createHooksStorage(basedir, tag), }; diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index 9363aff0a3..12280ace44 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -133,6 +133,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - **Durable Storage**: Stores workflow runs, events, steps, hooks, and webhooks in PostgreSQL - **Queue Processing**: Uses graphile-worker as the durable queue and executes jobs over the workflow HTTP routes - **Durable Delays**: Re-schedules waits and retries in PostgreSQL +- **Flow Limits**: Implements the shared concurrency/rate-limit contract with PostgreSQL-backed leases, rate tokens, FIFO waiters, and prompt wake-ups - **Streaming**: Real-time event streaming capabilities - **Health Checks**: Built-in connection health monitoring - **Configurable Concurrency**: Adjustable worker concurrency for queue processing @@ -143,8 +144,13 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Graphile jobs are acknowledged only after the workflow or step execution finishes, or after the worker durably schedules a delayed follow-up job - Backlog stays in PostgreSQL when all execution slots are busy - 
Retry and sleep-style delays use Graphile `runAt` scheduling +- Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO order per key +- Cancelled workflow waiters are pruned before promotion +- Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` +PostgreSQL's main advantage over the local world is durability of the queue/backlog itself across host or process loss. The flow-limit behavior is intended to match other implemented worlds while the process is alive. + ## Development For local development, you can use the included Docker Compose configuration: diff --git a/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql new file mode 100644 index 0000000000..dcb6198c4c --- /dev/null +++ b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql @@ -0,0 +1,39 @@ +CREATE TABLE "workflow"."workflow_limit_keys" ( + "limit_key" varchar PRIMARY KEY NOT NULL, + "concurrency_max" integer, + "rate_count" integer, + "rate_period_ms" integer +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_leases" ( + "lease_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_waiters" ( + "waiter_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "created_at" timestamp DEFAULT now() NOT NULL, + "lease_ttl_ms" integer +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_rate_limit_tokens" ( + "token_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + 
"expires_at" timestamp NOT NULL +); +--> statement-breakpoint +ALTER TABLE "workflow"."workflow_limit_leases" ADD CONSTRAINT "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "workflow"."workflow_limit_waiters" ADD CONSTRAINT "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "workflow"."workflow_rate_limit_tokens" ADD CONSTRAINT "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_leases_limit_key_holder_id_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_leases_limit_key_expires_at_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","expires_at");--> statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_waiters_limit_key_holder_id_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_waiters_limit_key_created_at_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","created_at");--> statement-breakpoint +CREATE INDEX "workflow_rate_limit_tokens_limit_key_expires_at_index" ON "workflow"."workflow_rate_limit_tokens" USING btree ("limit_key","expires_at");--> statement-breakpoint diff --git a/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json new file mode 100644 index 0000000000..e6be10d9f3 --- /dev/null +++ 
b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json @@ -0,0 +1,1007 @@ +{ + "id": "c3c21664-f021-4db5-be29-7c2991e325eb", + "prevId": "7adbbd35-ca90-4353-bb34-3d1b2435a027", + "version": "7", + "dialect": "postgresql", + "tables": { + "workflow.workflow_events": { + "name": "workflow_events", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "type": { + "name": "type", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "correlation_id": { + "name": "correlation_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "payload": { + "name": "payload", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "payload_cbor": { + "name": "payload_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_events_run_id_index": { + "name": "workflow_events_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_events_correlation_id_index": { + "name": "workflow_events_correlation_id_index", + "columns": [ + { + "expression": "correlation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_hooks": 
{ + "name": "workflow_hooks", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "hook_id": { + "name": "hook_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "token": { + "name": "token", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "owner_id": { + "name": "owner_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "project_id": { + "name": "project_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "environment": { + "name": "environment", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "metadata_cbor": { + "name": "metadata_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "is_webhook": { + "name": "is_webhook", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": true + } + }, + "indexes": { + "workflow_hooks_run_id_index": { + "name": "workflow_hooks_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_hooks_token_index": { + "name": "workflow_hooks_token_index", + "columns": [ + { + "expression": "token", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": 
false + }, + "workflow.workflow_limit_keys": { + "name": "workflow_limit_keys", + "schema": "workflow", + "columns": { + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_leases": { + "name": "workflow_limit_leases", + "schema": "workflow", + "columns": { + "lease_id": { + "name": "lease_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_leases_limit_key_holder_id_index": { + "name": "workflow_limit_leases_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_leases_limit_key_expires_at_index": { + "name": "workflow_limit_leases_limit_key_expires_at_index", + "columns": [ + { + 
"expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_limit_leases", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_waiters": { + "name": "workflow_limit_waiters", + "schema": "workflow", + "columns": { + "waiter_id": { + "name": "waiter_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "lease_ttl_ms": { + "name": "lease_ttl_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_waiters_limit_key_holder_id_index": { + "name": "workflow_limit_waiters_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_waiters_limit_key_created_at_index": { + "name": 
"workflow_limit_waiters_limit_key_created_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_limit_waiters", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_rate_limit_tokens": { + "name": "workflow_rate_limit_tokens", + "schema": "workflow", + "columns": { + "token_id": { + "name": "token_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_rate_limit_tokens_limit_key_expires_at_index": { + "name": "workflow_rate_limit_tokens_limit_key_expires_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + 
"with": {} + } + }, + "foreignKeys": { + "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_rate_limit_tokens", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_runs": { + "name": "workflow_runs", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "deployment_id": { + "name": "deployment_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "execution_context": { + "name": "execution_context", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "execution_context_cbor": { + "name": "execution_context_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "input": { + "name": "input", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": 
"error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "expired_at": { + "name": "expired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_runs_name_index": { + "name": "workflow_runs_name_index", + "columns": [ + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_runs_status_index": { + "name": "workflow_runs_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_steps": { + "name": "workflow_steps", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "step_id": { + "name": "step_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "step_name": { + "name": "step_name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "step_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "input": { + "name": "input", + "type": "jsonb", + 
"primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": "error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "attempt": { + "name": "attempt", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "retry_after": { + "name": "retry_after", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_steps_run_id_index": { + "name": "workflow_steps_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_steps_status_index": { + "name": "workflow_steps_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + 
"compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_stream_chunks": { + "name": "workflow_stream_chunks", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "stream_id": { + "name": "stream_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "data": { + "name": "data", + "type": "bytea", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "eof": { + "name": "eof", + "type": "boolean", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_stream_chunks_run_id_index": { + "name": "workflow_stream_chunks_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "workflow_stream_chunks_stream_id_id_pk": { + "name": "workflow_stream_chunks_stream_id_id_pk", + "columns": ["stream_id", "id"] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_waits": { + "name": "workflow_waits", + "schema": "workflow", + "columns": { + "wait_id": { + "name": "wait_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "wait_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "resume_at": { + "name": "resume_at", + "type": "timestamp", + "primaryKey": false, + "notNull": 
false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_waits_run_id_index": { + "name": "workflow_waits_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.step_status": { + "name": "step_status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + }, + "public.wait_status": { + "name": "wait_status", + "schema": "public", + "values": ["waiting", "completed"] + }, + "public.status": { + "name": "status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + } + }, + "schemas": { + "workflow": "workflow" + }, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json index f4956666fc..43f0daa548 100644 --- a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json +++ b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json @@ -71,6 +71,13 @@ "when": 1770500000000, "tag": "0009_add_is_webhook", "breakpoints": true + }, + { + "idx": 10, 
+ "version": "7", + "when": 1774917672940, + "tag": "0010_add_flow_limits", + "breakpoints": true } ] } diff --git a/packages/world-postgres/src/drizzle/schema.ts b/packages/world-postgres/src/drizzle/schema.ts index f353ef8ca1..8ff82f7d9b 100644 --- a/packages/world-postgres/src/drizzle/schema.ts +++ b/packages/world-postgres/src/drizzle/schema.ts @@ -21,6 +21,7 @@ import { primaryKey, text, timestamp, + uniqueIndex, varchar, } from 'drizzle-orm/pg-core'; import { Cbor, type Cborized } from './cbor.js'; @@ -192,6 +193,67 @@ export const waits = schema.table( (tb) => [index().on(tb.runId)] ); +export const limitKeys = schema.table('workflow_limit_keys', { + limitKey: varchar('limit_key').primaryKey(), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), +}); + +export const limitLeases = schema.table( + 'workflow_limit_leases', + { + leaseId: varchar('lease_id').primaryKey(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + onDelete: 'cascade', + }) + .notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.expiresAt), + ] +); + +export const rateLimitTokens = schema.table( + 'workflow_rate_limit_tokens', + { + tokenId: varchar('token_id').primaryKey(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + onDelete: 'cascade', + }) + .notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at').notNull(), + }, + (tb) => [index().on(tb.limitKey, tb.expiresAt)] +); + +export const limitWaiters = schema.table( + 'workflow_limit_waiters', + { + waiterId: varchar('waiter_id').primaryKey(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + 
onDelete: 'cascade', + }) + .notNull(), + holderId: varchar('holder_id').notNull(), + createdAt: timestamp('created_at').defaultNow().notNull(), + leaseTtlMs: integer('lease_ttl_ms'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.createdAt), + ] +); + const bytea = customType<{ data: Buffer; notNull: false; default: false }>({ dataType() { return 'bytea'; diff --git a/packages/world-postgres/src/index.ts b/packages/world-postgres/src/index.ts index bdc2eceeeb..91649e199e 100644 --- a/packages/world-postgres/src/index.ts +++ b/packages/world-postgres/src/index.ts @@ -1,8 +1,9 @@ -import type { Storage, World } from '@workflow/world'; +import type { Limits, Queue, Storage, World } from '@workflow/world'; import { reenqueueActiveRuns } from '@workflow/world'; import { Pool } from 'pg'; import type { PostgresWorldConfig } from './config.js'; import { createClient, type Drizzle } from './drizzle/index.js'; +import { createLimits } from './limits.js'; import { createQueue } from './queue.js'; import { createEventsStorage, @@ -12,10 +13,20 @@ import { } from './storage.js'; import { createStreamer } from './streamer.js'; -function createStorage(drizzle: Drizzle): Storage { +function createStorage( + drizzle: Drizzle, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + } +): Storage { + const runs = createRunsStorage(drizzle); return { - runs: createRunsStorage(drizzle), - events: createEventsStorage(drizzle), + runs, + events: createEventsStorage(drizzle, { + ...options, + runs, + }), hooks: createHooksStorage(drizzle), steps: createStepsStorage(drizzle), }; @@ -53,10 +64,16 @@ export function createWorld( const drizzle = createClient(pool); const queue = createQueue(config, pool); - const storage = createStorage(drizzle); const streamer = createStreamer(pool, drizzle); + let limits: Limits | undefined; + const storage = createStorage(drizzle, { + getLimits: () => limits, + queue, + }); + limits = 
createLimits(config, drizzle); return { + limits, ...storage, ...streamer, ...queue, diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts new file mode 100644 index 0000000000..7cf3b5bdb2 --- /dev/null +++ b/packages/world-postgres/src/limits.test.ts @@ -0,0 +1,310 @@ +import { afterAll, beforeAll, beforeEach, expect, test, vi } from 'vitest'; +import { LimitDefinitionConflictError } from '@workflow/errors'; +import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.mts'; +import { createLimits } from './limits.js'; +import { + createEventsStorage, + createRunsStorage, + createStepsStorage, +} from './storage.js'; +import { createQueue } from './queue.js'; + +if (process.platform === 'win32') { + test.skip('skipped on Windows since it relies on a docker container', () => {}); +} else { + let db: Awaited< + ReturnType + >; + + beforeAll(async () => { + const { createPostgresTestDb } = await import('../test/test-db.js'); + db = await createPostgresTestDb(); + const queue = createQueue( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.pool + ); + await queue.start(); + await queue.close(); + }, 120_000); + + beforeEach(async () => { + await db.truncateLimits(); + }); + + async function createLockOwner(workflowName: string, lockIndex = 0) { + const events = createEventsStorage(db.drizzle); + const result = await events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName, + input: [], + }, + }); + if (!result.run) { + throw new Error('expected run'); + } + return { + runId: result.run.runId, + lockIndex, + }; + } + + afterAll(async () => { + await db?.close(); + }); + + createLimitsContractSuite('postgres world limits', async () => { + return { + limits: createLimits( + { connectionString: 
db.connectionString, queueConcurrency: 1 }, + db.drizzle + ), + storage: { + runs: createRunsStorage(db.drizzle), + steps: createStepsStorage(db.drizzle), + events: createEventsStorage(db.drizzle), + }, + inspectKeyState: async (key) => { + const [leases, waiters, tokens] = await Promise.all([ + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from workflow.workflow_limit_leases + where limit_key = $1 + order by holder_id asc + `, + [key] + ), + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from workflow.workflow_limit_waiters + where limit_key = $1 + order by created_at asc, holder_id asc + `, + [key] + ), + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from workflow.workflow_rate_limit_tokens + where limit_key = $1 + order by acquired_at asc, holder_id asc + `, + [key] + ), + ]); + + return { + leaseHolderIds: leases.rows.map((row) => row.lockId), + waiterHolderIds: waiters.rows.map((row) => row.lockId), + tokenHolderIds: tokens.rows.map((row) => row.lockId), + }; + }, + }; + }); + + test('uses the head waiter retryAfter for waiters queued behind a long rate window', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + const key = 'workflow:fifo:head-waiter-rate'; + const periodMs = 60_000; + const ownerA = await createLockOwner('holder-a'); + const ownerB = await createLockOwner('holder-b'); + const ownerC = await createLockOwner('holder-c'); + + const first = await limits.acquire({ + key, + runId: ownerA.runId, + lockIndex: ownerA.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + lockId: first.lease.lockId, + }); + + const headWaiter = await limits.acquire({ + key, + runId: 
ownerB.runId, + lockIndex: ownerB.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(headWaiter.status).toBe('blocked'); + if (headWaiter.status !== 'blocked') throw new Error('expected blocked'); + + const behindHead = await limits.acquire({ + key, + runId: ownerC.runId, + lockIndex: ownerC.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(behindHead.status).toBe('blocked'); + if (behindHead.status !== 'blocked') throw new Error('expected blocked'); + expect(behindHead.retryAfterMs).toBeGreaterThan(5_000); + + const existingWaiterRetry = await limits.acquire({ + key, + runId: ownerC.runId, + lockIndex: ownerC.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(existingWaiterRetry.status).toBe('blocked'); + if (existingWaiterRetry.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(existingWaiterRetry.retryAfterMs).toBeGreaterThan(5_000); + }); + + test('persists nextWaiter metadata and emits lock_waiter_queued on release', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + const runs = createRunsStorage(db.drizzle); + const queue = { queue: vi.fn().mockResolvedValue(undefined) }; + const events = createEventsStorage(db.drizzle, { + getLimits: () => limits, + queue, + runs, + }); + const ownerA = await createLockOwner('holder-a'); + const ownerB = await createLockOwner('holder-b'); + const correlationA = createLockCorrelationId( + ownerA.runId, + ownerA.lockIndex + ); + const correlationB = createLockCorrelationId( + ownerB.runId, + ownerB.lockIndex + ); + + const first = await events.create(ownerA.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + const second 
= await events.create(ownerB.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationB, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + + expect(first.event?.eventType).toBe('lock_acquired'); + expect(second.event?.eventType).toBe('lock_created'); + + const released = await events.create(ownerA.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + }); + + if (!released.event || released.event.eventType !== 'lock_release') { + throw new Error('expected lock_release event'); + } + expect(released.event?.eventData?.nextWaiter).toMatchObject({ + runId: ownerB.runId, + lockIndex: ownerB.lockIndex, + lockCorrelationId: correlationB, + }); + expect(queue.queue).toHaveBeenCalledWith( + '__wkf_workflow_holder-b', + expect.objectContaining({ + runId: ownerB.runId, + lockPreApproval: correlationB, + }), + expect.objectContaining({ + idempotencyKey: expect.any(String), + }) + ); + + const correlated = await events.listByCorrelationId({ + correlationId: correlationB, + }); + expect( + correlated.data.some((event) => event.eventType === 'lock_waiter_queued') + ).toBe(true); + }); + + test('throws when the same key is acquired with a conflicting definition', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 1_000, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 }, + }, + leaseTtlMs: 1_000, + }) + ).rejects.toBeInstanceOf(LimitDefinitionConflictError); + }); + + test('does not resurrect an expired lease when heartbeating after the 
key lock', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const first = await limits.acquire({ + key: 'workflow:user:heartbeat-expired', + runId: 'run-a', + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 50, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await new Promise((resolve) => setTimeout(resolve, 75)); + + await expect( + limits.heartbeat({ + leaseId: first.lease.leaseId, + }) + ).rejects.toMatchObject({ + name: 'WorkflowWorldError', + message: expect.stringContaining('not found'), + }); + }); +} diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts new file mode 100644 index 0000000000..4afd869a89 --- /dev/null +++ b/packages/world-postgres/src/limits.ts @@ -0,0 +1,703 @@ +import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; +import { + LimitDefinitionConflictError, + WorkflowWorldError, +} from '@workflow/errors'; +import { + createLockId, + type LimitDefinition, + LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + type LimitNextWaiter, + type LimitReleaseResult, + LimitReleaseRequestSchema, + type Limits, + parseLockId, +} from '@workflow/world'; +import { monotonicFactory } from 'ulid'; +import type { PostgresWorldConfig } from './config.js'; +import type { Drizzle } from './drizzle/index.js'; +import * as Schema from './drizzle/schema.js'; + +type LeaseRow = typeof Schema.limitLeases.$inferSelect; +type LimitKeyRow = typeof Schema.limitKeys.$inferSelect; +type TokenRow = typeof Schema.rateLimitTokens.$inferSelect; +type WaiterRow = typeof Schema.limitWaiters.$inferSelect; +type Tx = Parameters[0]>[0]; +type Db = Drizzle | Tx; +const generateId = monotonicFactory(); + +function nowPlus(ms?: number): Date | undefined { + if (ms === undefined) return undefined; + 
return new Date(Date.now() + ms); +} + +function toDate(value: Date | string | null | undefined): Date | undefined { + if (value === null || value === undefined) return undefined; + return value instanceof Date ? value : new Date(value); +} + +function toMillis(value: Date | string | null | undefined): number | undefined { + const date = toDate(value); + return date ? date.getTime() : undefined; +} + +function toLease(row: LeaseRow, definition: LimitDefinition): LimitLease { + const parsedLockId = parseLockId(row.holderId); + return { + leaseId: row.leaseId, + key: row.limitKey, + lockId: row.holderId, + runId: parsedLockId?.runId ?? row.holderId, + lockIndex: parsedLockId?.lockIndex ?? 0, + acquiredAt: toDate(row.acquiredAt)!, + expiresAt: toDate(row.expiresAt), + definition, + }; +} + +function definitionFromRow( + row: Pick +): LimitDefinition { + return { + concurrency: + row.concurrencyMax !== null ? { max: row.concurrencyMax } : undefined, + rate: + row.rateCount !== null && row.ratePeriodMs !== null + ? 
{ count: row.rateCount, periodMs: row.ratePeriodMs } + : undefined, + }; +} + +function areLimitDefinitionsEqual( + left: LimitDefinition | undefined, + right: LimitDefinition +): boolean { + return ( + left?.concurrency?.max === right.concurrency?.max && + left?.rate?.count === right.rate?.count && + left?.rate?.periodMs === right.rate?.periodMs + ); +} + +function toNextWaiter(holderId: string): LimitNextWaiter | undefined { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { + return undefined; + } + + return { + runId: parsedLockId.runId, + lockIndex: parsedLockId.lockIndex, + wakeCorrelationId: `wflock_wait_${parsedLockId.runId}:${parsedLockId.lockIndex}`, + lockCorrelationId: `wflock_${parsedLockId.runId}:${parsedLockId.lockIndex}`, + }; +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if (concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +/* +When a workflow or step is blocked, we need to calculate the retry after time. +We do this by finding the earliest expiration time for any leases or tokens. +*/ +function getRetryAfterMs( + leases: LeaseRow[], + tokens: TokenRow[], + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, toMillis(lease.expiresAt)! - now)); + } + } + } + + if (rateBlocked) { + for (const token of tokens) { + candidates.push(Math.max(0, toMillis(token.expiresAt)! 
- now)); + } + } + + if (candidates.length === 0) return undefined; + return Math.min(...candidates); +} + +function getWaiterRetryAfterMs( + leases: LeaseRow[], + tokens: TokenRow[], + now: number, + definition: LimitDefinition +): number | undefined { + return getRetryAfterMs( + leases, + tokens, + now, + definition.concurrency !== undefined && + leases.length >= definition.concurrency.max, + definition.rate !== undefined && tokens.length >= definition.rate.count + ); +} + +function getBlockedRetryAfterMs( + state: { + keyRow?: LimitKeyRow; + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; + }, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number { + const headWaiter = state.waiters[0]; + const definition = state.keyRow ? definitionFromRow(state.keyRow) : undefined; + return ( + (headWaiter && definition + ? getWaiterRetryAfterMs(state.leases, state.tokens, now, definition) + : undefined) ?? + getRetryAfterMs( + state.leases, + state.tokens, + now, + concurrencyBlocked, + rateBlocked + ) ?? + 1000 + ); +} + +async function pruneExpired(tx: Db, key: string): Promise { + /* + Capacity is reclaimed opportunistically whenever a key is touched. + This keeps v1 simple and avoids needing a separate cleanup worker. 
+ */ + const now = new Date(); + + await tx + .delete(Schema.rateLimitTokens) + .where( + and( + eq(Schema.rateLimitTokens.limitKey, key), + lte(Schema.rateLimitTokens.expiresAt, now) + ) + ); + + await tx + .delete(Schema.limitLeases) + .where( + and( + eq(Schema.limitLeases.limitKey, key), + isNotNull(Schema.limitLeases.expiresAt), + lte(Schema.limitLeases.expiresAt, now) + ) + ); +} + +async function getActiveState( + tx: Db, + key: string +): Promise<{ + keyRow?: LimitKeyRow; + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; +}> { + const [keyRow, leases, tokens, waiters] = await Promise.all([ + tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, key), + }), + tx + .select() + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, key)) + .orderBy( + asc(Schema.limitLeases.acquiredAt), + asc(Schema.limitLeases.leaseId) + ), + tx + .select() + .from(Schema.rateLimitTokens) + .where(eq(Schema.rateLimitTokens.limitKey, key)) + .orderBy(asc(Schema.rateLimitTokens.expiresAt)), + tx + .select() + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, key)) + .orderBy( + asc(Schema.limitWaiters.createdAt), + asc(Schema.limitWaiters.waiterId) + ), + ]); + + return { keyRow, leases, tokens, waiters }; +} + +/* +We serialize limit mutations per key inside the transaction so concurrent +acquire/release flows cannot both observe the same free capacity. 
+*/ +async function lockLimitKey(tx: Db, key: string): Promise { + await tx.execute( + sql`select pg_advisory_xact_lock(hashtextextended(${key}, 0))` + ); +} + +async function isHolderLive(tx: Db, holderId: string): Promise { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { + return true; + } + + const [run] = (await tx + .select({ + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, parsedLockId.runId)) + .limit(1)) as Pick[]; + + return !run || !['completed', 'failed', 'cancelled'].includes(run.status); +} + +async function pruneDeadWaiters(tx: Db, key: string): Promise { + const waiters = await tx + .select({ + waiterId: Schema.limitWaiters.waiterId, + holderId: Schema.limitWaiters.holderId, + }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, key)); + + for (const waiter of waiters) { + if (!(await isHolderLive(tx, waiter.holderId))) { + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + } + } +} + +async function pruneDeadHolders(tx: Db, key: string): Promise { + const leases = await tx + .select({ + leaseId: Schema.limitLeases.leaseId, + holderId: Schema.limitLeases.holderId, + }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, key)); + + for (const lease of leases) { + if (!(await isHolderLive(tx, lease.holderId))) { + await tx + .delete(Schema.limitLeases) + .where(eq(Schema.limitLeases.leaseId, lease.leaseId)); + } + } +} + +async function ensureCanonicalDefinition( + tx: Db, + key: string, + requested: LimitDefinition, + state: { + keyRow?: LimitKeyRow; + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; + } +) { + const existing = state.keyRow; + + if ( + existing && + state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ) { + await tx.delete(Schema.limitKeys).where(eq(Schema.limitKeys.limitKey, key)); + } + + const current = + existing && + 
state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ? undefined + : (existing ?? + (await tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, key), + }))); + + if (!current) { + await tx.insert(Schema.limitKeys).values({ + limitKey: key, + concurrencyMax: requested.concurrency?.max ?? null, + rateCount: requested.rate?.count ?? null, + ratePeriodMs: requested.rate?.periodMs ?? null, + }); + return; + } + + const currentDefinition = definitionFromRow(current); + if (!areLimitDefinitionsEqual(currentDefinition, requested)) { + throw new LimitDefinitionConflictError(key, currentDefinition, requested); + } +} + +async function promoteWaiter( + tx: Db, + key: string, + waiter: WaiterRow, + definition: LimitDefinition +): Promise<{ lease: LimitLease; nextWaiter?: LimitNextWaiter }> { + const leaseId = `lmt_${generateId()}`; + const expiresAt = nowPlus(waiter.leaseTtlMs ?? undefined); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt, + }) + .onConflictDoNothing() + .returning(); + + const acquiredLease = + lease ?? 
+ (await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.limitKey, key), + eq(Schema.limitLeases.holderId, waiter.holderId) + ), + })); + + if (!acquiredLease) { + throw new WorkflowWorldError(`Failed to promote waiter for key "${key}"`); + } + + if (definition.rate) { + await tx.insert(Schema.rateLimitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + definition.rate.periodMs), + }); + } + + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + + return { + lease: toLease(acquiredLease, definition), + nextWaiter: toNextWaiter(waiter.holderId), + }; +} + +export function createLimits( + _config: PostgresWorldConfig, + drizzle: Drizzle +): Limits { + return { + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + + return drizzle.transaction(async (tx) => { + await lockLimitKey(tx, parsed.key); + await pruneExpired(tx, parsed.key); + await pruneDeadHolders(tx, parsed.key); + await pruneDeadWaiters(tx, parsed.key); + + const state = await getActiveState(tx, parsed.key); + await ensureCanonicalDefinition( + tx, + parsed.key, + parsed.definition, + state + ); + const currentState = await getActiveState(tx, parsed.key); + const definition = + currentState.keyRow && definitionFromRow(currentState.keyRow); + const lockId = createLockId(parsed.runId, parsed.lockIndex); + const existingLease = currentState.leases.find( + (lease) => lease.holderId === lockId + ); + if (existingLease) { + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${parsed.key}"` + ); + } + return { + status: 'acquired', + lease: toLease(existingLease, definition), + } satisfies LimitAcquireResult; + } + + const existingWaiter = currentState.waiters.find( + (waiter) => waiter.holderId === lockId + ); + if (existingWaiter) { + const concurrencyBlocked 
= + parsed.definition.concurrency !== undefined && + currentState.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + currentState.tokens.length >= parsed.definition.rate.count; + + if ( + currentState.waiters[0]?.waiterId === existingWaiter.waiterId && + !concurrencyBlocked && + !rateBlocked + ) { + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${parsed.key}"` + ); + } + const promoted = await promoteWaiter( + tx, + parsed.key, + existingWaiter, + definition + ); + return { + status: 'acquired', + lease: promoted.lease, + } satisfies LimitAcquireResult; + } + + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: getBlockedRetryAfterMs( + currentState, + now, + concurrencyBlocked, + rateBlocked + ), + } satisfies LimitAcquireResult; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + currentState.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + currentState.tokens.length >= parsed.definition.rate.count; + + if ( + !concurrencyBlocked && + !rateBlocked && + currentState.waiters.length === 0 + ) { + const expiresAt = nowPlus(parsed.leaseTtlMs); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId: `lmt_${generateId()}`, + limitKey: parsed.key, + holderId: lockId, + acquiredAt: new Date(), + expiresAt, + }) + .returning(); + + if (parsed.definition.rate) { + await tx.insert(Schema.rateLimitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: parsed.key, + holderId: lockId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + parsed.definition.rate.periodMs), + }); + } + + return { + status: 'acquired', + lease: toLease(lease, definition ?? 
parsed.definition), + } satisfies LimitAcquireResult; + } + + await tx + .insert(Schema.limitWaiters) + .values({ + waiterId: `lmtwait_${generateId()}`, + limitKey: parsed.key, + holderId: lockId, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs ?? null, + }) + .onConflictDoNothing(); + + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: getBlockedRetryAfterMs( + currentState, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ), + } satisfies LimitAcquireResult; + }); + }, + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + return drizzle.transaction(async (tx): Promise => { + const key = + parsed.key ?? + ( + await tx.query.limitLeases.findFirst({ + columns: { limitKey: true }, + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }) + )?.limitKey; + + if (key) { + await lockLimitKey(tx, key); + await pruneExpired(tx, key); + } + + const beforeState = key ? await getActiveState(tx, key) : undefined; + + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); + if (parsed.key) { + where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; + } + if (parsed.lockId) { + where = and(where, eq(Schema.limitLeases.holderId, parsed.lockId))!; + } + + await tx.delete(Schema.limitLeases).where(where).returning({ + limitKey: Schema.limitLeases.limitKey, + holderId: Schema.limitLeases.holderId, + }); + + if (key) { + await pruneDeadHolders(tx, key); + await pruneDeadWaiters(tx, key); + const state = await getActiveState(tx, key); + const headWaiter = state.waiters[0]; + const capacityFreed = + (beforeState?.leases.length ?? 
0) > state.leases.length; + + if (headWaiter && capacityFreed) { + const definition = state.keyRow && definitionFromRow(state.keyRow); + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${key}"` + ); + } + const concurrencyBlocked = + definition.concurrency !== undefined && + state.leases.length >= definition.concurrency.max; + const rateBlocked = + definition.rate !== undefined && + state.tokens.length >= definition.rate.count; + + if (!concurrencyBlocked && !rateBlocked) { + const promoted = await promoteWaiter( + tx, + key, + headWaiter, + definition + ); + return { nextWaiter: promoted.nextWaiter }; + } + } + + if ( + state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ) { + await tx + .delete(Schema.limitKeys) + .where(eq(Schema.limitKeys.limitKey, key)); + } + } + + return {}; + }); + }, + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + // Heartbeat a lease to extend its expiry. + return drizzle.transaction(async (tx) => { + const existing = await tx.query.limitLeases.findFirst({ + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }); + + if (!existing) { + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + } + + await lockLimitKey(tx, existing.limitKey); + await pruneExpired(tx, existing.limitKey); + + const current = await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.leaseId, parsed.leaseId), + eq(Schema.limitLeases.limitKey, existing.limitKey) + ), + }); + + if (!current) { + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + } + + const now = Date.now(); + const currentExpiry = toMillis(current.expiresAt); + const ttlMs = + parsed.ttlMs ?? (currentExpiry ? 
currentExpiry - now : 30_000); + const expiresAt = new Date(now + Math.max(1, ttlMs)); + + const [updated] = await tx + .update(Schema.limitLeases) + .set({ expiresAt }) + .where(eq(Schema.limitLeases.leaseId, parsed.leaseId)) + .returning(); + + if (!updated) { + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + } + + const keyRow = await tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, current.limitKey), + }); + + if (!keyRow) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${current.limitKey}"` + ); + } + + return toLease(updated, definitionFromRow(keyRow)); + }); + }, + }; +} diff --git a/packages/world-postgres/src/queue.ts b/packages/world-postgres/src/queue.ts index 6a3fd53f94..8b79e6f033 100644 --- a/packages/world-postgres/src/queue.ts +++ b/packages/world-postgres/src/queue.ts @@ -65,9 +65,10 @@ type HttpExecutionResult = * - `step` for step jobs * * When a message is queued, it is sent to graphile-worker with the appropriate job type. - * When a job is processed, it is deserialized and then re-queued into the _local world_, showing that - * we can reuse the local world, mix and match worlds to build - * hybrid architectures, and even migrate between worlds. + * When a job is processed, the worker POSTs the payload directly to the + * workflow HTTP endpoints. We reuse `world-local` only for its + * `createQueueHandler()` HTTP adapter so the request/response contract stays + * consistent across worlds; execution is not re-enqueued into the local queue. 
*/ export type PostgresQueue = Queue & { start(): Promise; diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index d65dfa5ab9..4aafaddded 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -4,17 +4,20 @@ import { RunExpiredError, RunNotSupportedError, TooEarlyError, - WorkflowWorldError, WorkflowRunNotFoundError, + WorkflowWorldError, } from '@workflow/errors'; import type { Event, EventResult, GetEventParams, Hook, + LimitLease, + Limits, ListEventsParams, ListHooksParams, PaginatedResponse, + Queue, ResolveData, Step, StepWithoutData, @@ -41,6 +44,26 @@ import { type Drizzle, Schema } from './drizzle/index.js'; import type { SerializedContent } from './drizzle/schema.js'; import { compact } from './util.js'; +function getAcquiredLease( + event: + | { + eventType: string; + eventData?: unknown; + } + | undefined +): LimitLease | undefined { + if (!event || event.eventType !== 'lock_acquired') { + return undefined; + } + + const data = event.eventData; + if (!data || typeof data !== 'object' || !('lease' in data)) { + return undefined; + } + + return (data as { lease?: LimitLease }).lease; +} + /** * Parse legacy errorJson (text column with JSON-stringified StructuredError). * Used for backwards compatibility when reading from deprecated error column. 
@@ -260,9 +283,18 @@ async function handleLegacyEventPostgres( } } -export function createEventsStorage(drizzle: Drizzle): Storage['events'] { +export function createEventsStorage( + drizzle: Drizzle, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; + } +): Storage['events'] { const ulid = monotonicFactory(); const { events } = Schema; + const isLeaseLive = (lease: { expiresAt?: Date }) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now(); // Prepared statements for validation queries (performance optimization) const getRunForValidation = drizzle @@ -458,7 +490,11 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { if ( data.eventType === 'step_created' || data.eventType === 'hook_created' || - data.eventType === 'wait_created' + data.eventType === 'wait_created' || + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' || + data.eventType === 'lock_waiter_queued' ) { throw new EntityConflictError( `Cannot create new entities on run in terminal state "${currentRun.status}"` @@ -528,6 +564,300 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } + if ( + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' + ) { + const limits = options?.getLimits?.(); + if (!limits) { + throw new WorkflowWorldError( + `Flow limits are not configured for event type "${data.eventType}"` + ); + } + + const lockIndex = Number.parseInt( + data.correlationId.split(':').at(-1) ?? 
'0', + 10 + ); + const existingEvents = await drizzle + .select({ + eventType: Schema.events.eventType, + eventData: Schema.events.eventData, + createdAt: Schema.events.createdAt, + eventId: Schema.events.eventId, + }) + .from(Schema.events) + .where( + and( + eq(Schema.events.runId, effectiveRunId), + eq(Schema.events.correlationId, data.correlationId) + ) + ) + .orderBy(asc(Schema.events.createdAt), asc(Schema.events.eventId)); + const existingCreatedEvent = existingEvents.find( + (event) => event.eventType === 'lock_created' + ); + const existingAcquiredEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_acquired'); + const existingReleaseEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_release'); + let eventToStore: + | { + eventType: 'lock_created' | 'lock_acquired' | 'lock_release'; + correlationId: string; + eventData?: unknown; + } + | undefined; + let eventToReturn: + | { + eventType: 'lock_created' | 'lock_acquired' | 'lock_release'; + correlationId: string; + eventData?: unknown; + createdAt: Date; + eventId: string; + } + | undefined; + + if (data.eventType === 'lock_created') { + const existingLeaseData = getAcquiredLease(existingAcquiredEvent); + const existingEvent = + existingReleaseEvent ?? + (existingLeaseData && isLeaseLive(existingLeaseData) + ? existingAcquiredEvent + : undefined) ?? + existingCreatedEvent; + if (existingEvent) { + eventToReturn = { + eventType: existingEvent.eventType as + | 'lock_created' + | 'lock_acquired' + | 'lock_release', + correlationId: data.correlationId, + eventData: existingEvent.eventData ?? 
undefined, + createdAt: existingEvent.createdAt, + eventId: existingEvent.eventId, + }; + } else { + const result = await limits.acquire({ + key: data.eventData.key, + runId: effectiveRunId, + lockIndex, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + }); + const eventCreatedAt = new Date(); + + eventToStore = + result.status === 'acquired' + ? { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + } + : { + eventType: 'lock_created', + correlationId: data.correlationId, + eventData: { + key: data.eventData.key, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + acquireAt: + result.retryAfterMs !== undefined + ? new Date( + eventCreatedAt.getTime() + result.retryAfterMs + ) + : undefined, + }, + }; + } + } else if (data.eventType === 'lock_acquired') { + const existingLeaseData = getAcquiredLease(existingAcquiredEvent); + if (existingReleaseEvent) { + eventToReturn = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: existingReleaseEvent.eventData ?? undefined, + createdAt: existingReleaseEvent.createdAt, + eventId: existingReleaseEvent.eventId, + }; + } else if ( + existingAcquiredEvent && + existingLeaseData && + isLeaseLive(existingLeaseData) + ) { + eventToReturn = { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: existingAcquiredEvent.eventData ?? 
undefined, + createdAt: existingAcquiredEvent.createdAt, + eventId: existingAcquiredEvent.eventId, + }; + } else { + const createdEvent = existingCreatedEvent; + const createdData = createdEvent?.eventData as + | { + key: string; + definition: any; + leaseTtlMs?: number; + } + | undefined; + if (!createdData) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be acquired before lock_created` + ); + } + + const result = await limits.acquire({ + key: createdData.key, + runId: effectiveRunId, + lockIndex, + definition: createdData.definition, + leaseTtlMs: createdData.leaseTtlMs, + }); + if (result.status !== 'acquired') { + const retryAfter = + result.retryAfterMs !== undefined + ? Math.ceil(result.retryAfterMs / 1000) + : undefined; + throw new TooEarlyError( + `Lock "${data.correlationId}" is not ready to acquire`, + { retryAfter } + ); + } + + eventToStore = { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + }; + } + } else { + if (existingReleaseEvent) { + eventToReturn = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: existingReleaseEvent.eventData ?? 
undefined, + createdAt: existingReleaseEvent.createdAt, + eventId: existingReleaseEvent.eventId, + }; + } else { + const acquiredEvent = existingAcquiredEvent; + const lease = getAcquiredLease(acquiredEvent); + if (!lease) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be released before lock_acquired` + ); + } + + const releaseResult = await limits.release({ + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }); + + eventToStore = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + nextWaiter: releaseResult.nextWaiter, + }, + }; + } + } + + if (eventToReturn) { + const parsed = EventSchema.parse({ + eventType: eventToReturn.eventType, + correlationId: eventToReturn.correlationId, + eventData: eventToReturn.eventData, + createdAt: eventToReturn.createdAt, + runId: effectiveRunId, + eventId: eventToReturn.eventId, + specVersion: effectiveSpecVersion, + }); + const resolveData = params?.resolveData ?? 'all'; + return { + event: stripEventDataRefs(parsed, resolveData), + run, + step, + hook, + wait, + }; + } + if (!eventToStore) { + throw new WorkflowWorldError( + `Lock event "${data.eventType}" did not resolve for "${data.correlationId}"` + ); + } + + const [value] = await drizzle + .insert(Schema.events) + .values({ + runId: effectiveRunId, + eventId, + correlationId: eventToStore.correlationId, + eventType: eventToStore.eventType, + eventData: eventToStore.eventData as SerializedContent | undefined, + specVersion: effectiveSpecVersion, + }) + .returning({ createdAt: Schema.events.createdAt }); + + const parsed = EventSchema.parse({ + ...eventToStore, + ...value, + runId: effectiveRunId, + eventId, + }); + const resolveData = params?.resolveData ?? 
'all'; + if ( + parsed.eventType === 'lock_release' && + parsed.eventData?.nextWaiter && + options?.queue && + options?.runs + ) { + const nextRun = await options.runs.get( + parsed.eventData.nextWaiter.runId, + { + resolveData: 'none', + } + ); + if (!['completed', 'failed', 'cancelled'].includes(nextRun.status)) { + await options.queue.queue( + `__wkf_workflow_${nextRun.workflowName}`, + { + runId: parsed.eventData.nextWaiter.runId, + lockPreApproval: parsed.eventData.nextWaiter.lockCorrelationId, + requestedAt: new Date(), + }, + { + idempotencyKey: parsed.eventData.nextWaiter.wakeCorrelationId, + } + ); + + await drizzle.insert(Schema.events).values({ + runId: parsed.eventData.nextWaiter.runId, + eventId: `wevt_${ulid()}`, + correlationId: parsed.eventData.nextWaiter.lockCorrelationId, + eventType: 'lock_waiter_queued', + specVersion: effectiveSpecVersion, + }); + } + } + return { + event: stripEventDataRefs(parsed, resolveData), + run, + step, + hook, + wait, + }; + } + // ============================================================ // Entity creation/updates based on event type // ============================================================ diff --git a/packages/world-postgres/test/spec.test.ts b/packages/world-postgres/test/spec.test.ts index 1be4cb2636..b22e22b35e 100644 --- a/packages/world-postgres/test/spec.test.ts +++ b/packages/world-postgres/test/spec.test.ts @@ -1,8 +1,15 @@ -import { execSync } from 'node:child_process'; +import { execFileSync, execSync } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; -import { createTestSuite } from '@workflow/world-testing'; +import { createTestSuite } from '../../world-testing/dist/src/index.mjs'; import { afterAll, beforeAll, test } from 'vitest'; +const packageDir = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + '..' 
+); + // Skip these tests on Windows since it relies on a docker container if (process.platform === 'win32') { test.skip('skipped on Windows since it relies on a docker container', () => {}); @@ -15,9 +22,15 @@ if (process.platform === 'win32') { process.env.WORKFLOW_POSTGRES_URL = dbUrl; process.env.DATABASE_URL = dbUrl; - execSync('pnpm db:push', { + execSync('pnpm build', { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); + + execFileSync('node', ['dist/cli.js'], { stdio: 'inherit', - cwd: process.cwd(), + cwd: packageDir, env: process.env, }); }, 120_000); diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index 424c4b14c9..ca19c3a905 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1,5 +1,6 @@ import { execSync } from 'node:child_process'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; import type { Hook, Step, WorkflowRun } from '@workflow/world'; import { encode } from 'cbor-x'; import { Pool } from 'pg'; diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts new file mode 100644 index 0000000000..400337db74 --- /dev/null +++ b/packages/world-postgres/test/test-db.ts @@ -0,0 +1,66 @@ +import { execFileSync } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import { Pool } from 'pg'; +import { createClient } from '../src/drizzle/index.js'; + +const packageDir = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + '..' 
+);
+
+export interface PostgresTestDb {
+  container: Awaited<ReturnType<PostgreSqlContainer['start']>>;
+  pool: Pool;
+  drizzle: ReturnType<typeof createClient>;
+  connectionString: string;
+  truncateLimits(): Promise<void>;
+  close(): Promise<void>;
+}
+
+export async function createPostgresTestDb(): Promise<PostgresTestDb> {
+  const container = await new PostgreSqlContainer('postgres:15-alpine').start();
+  const connectionString = container.getConnectionUri();
+  process.env.DATABASE_URL = connectionString;
+  process.env.WORKFLOW_POSTGRES_URL = connectionString;
+
+  execFileSync('pnpm', ['build'], {
+    stdio: 'inherit',
+    cwd: packageDir,
+    env: process.env,
+  });
+
+  execFileSync('node', ['dist/cli.js'], {
+    stdio: 'inherit',
+    cwd: packageDir,
+    env: process.env,
+  });
+
+  const pool = new Pool({ connectionString, max: 10 });
+  const drizzle = createClient(pool);
+
+  return {
+    container,
+    pool,
+    drizzle,
+    connectionString,
+    async truncateLimits() {
+      await pool.query(`
+        truncate table
+          workflow.workflow_limit_keys,
+          workflow.workflow_limit_waiters,
+          workflow.workflow_rate_limit_tokens,
+          workflow.workflow_limit_leases,
+          workflow.workflow_steps,
+          workflow.workflow_events,
+          workflow.workflow_runs
+        restart identity cascade
+      `);
+    },
+    async close() {
+      await pool.end();
+      await container.stop();
+    },
+  };
+}
diff --git a/packages/world-testing/src/index.mts b/packages/world-testing/src/index.mts
index 4b59e15267..b65248f5c7 100644
--- a/packages/world-testing/src/index.mts
+++ b/packages/world-testing/src/index.mts
@@ -2,6 +2,8 @@ import { addition } from './addition.mjs';
 import { errors } from './errors.mjs';
 import { hooks } from './hooks.mjs';
 import { idempotency } from './idempotency.mjs';
+export { createLimitsContractSuite } from './limits-contract.mjs';
+export { createLimitsRuntimeSuite } from './limits-runtime.mjs';
 import { nullByte } from './null-byte.mjs';
 export function createTestSuite(pkgName: string) {
diff --git a/packages/world-testing/src/limits-contract.mts b/packages/world-testing/src/limits-contract.mts
new file
mode 100644
index 0000000000..cef3d7a8d6
--- /dev/null
+++ b/packages/world-testing/src/limits-contract.mts
@@ -0,0 +1,936 @@
+import { setTimeout as sleep } from 'node:timers/promises';
+import {
+  SPEC_VERSION_CURRENT,
+  type LimitDefinition,
+  type LimitLease,
+  type Limits,
+  type Storage,
+} from '@workflow/world';
+import { describe, expect, it } from 'vitest';
+
+export interface LimitsHarness {
+  limits: Limits;
+  storage?: Pick<Storage, 'events'>;
+  inspectKeyState: (key: string) => Promise<{
+    leaseHolderIds: string[];
+    waiterHolderIds: string[];
+    tokenHolderIds: string[];
+  }>;
+  close?: () => Promise<void>;
+}
+
+interface LockOwner {
+  lockId: string;
+  runId: string;
+  lockIndex: number;
+}
+
+function createTestLockId(runId: string, lockIndex: number) {
+  return `${runId}:${lockIndex}`;
+}
+
+async function createRun(
+  storage: Pick<Storage, 'events'>,
+  workflowName: string
+) {
+  const result = await storage.events.create(null, {
+    eventType: 'run_created',
+    specVersion: SPEC_VERSION_CURRENT,
+    eventData: {
+      deploymentId: 'deployment-123',
+      workflowName,
+      input: [],
+    },
+  });
+  if (!result.run) {
+    throw new Error('expected run');
+  }
+  return result.run;
+}
+
+function requireEventsStorage(
+  storage: LimitsHarness['storage']
+): Pick<Storage, 'events'> {
+  if (!storage) {
+    throw new Error('storage.events is required for limits tests');
+  }
+  return storage;
+}
+
+async function createLockOwner(
+  storage: LimitsHarness['storage'],
+  workflowName: string,
+  lockIndex = 0
+): Promise<LockOwner> {
+  const run = await createRun(requireEventsStorage(storage), workflowName);
+  return {
+    lockId: createTestLockId(run.runId, lockIndex),
+    runId: run.runId,
+    lockIndex,
+  };
+}
+
+function acquireRequest(
+  owner: LockOwner,
+  key: string,
+  definition: LimitDefinition,
+  leaseTtlMs?: number
+) {
+  return {
+    key,
+    runId: owner.runId,
+    lockIndex: owner.lockIndex,
+    definition,
+    ...(leaseTtlMs !== undefined ?
{ leaseTtlMs } : {}),
+  };
+}
+
+function releaseRequest(lease: LimitLease) {
+  return {
+    leaseId: lease.leaseId,
+    key: lease.key,
+    lockId: lease.lockId,
+  };
+}
+
+export function createLimitsContractSuite(
+  name: string,
+  createHarness: () => Promise<LimitsHarness>
+) {
+  describe(name, () => {
+    it('throws a workflow world error when heartbeating a missing lease', async () => {
+      const harness = await createHarness();
+      try {
+        await expect(
+          harness.limits.heartbeat({
+            leaseId: 'lmt_missing',
+          })
+        ).rejects.toMatchObject({
+          name: 'WorkflowWorldError',
+          message: expect.stringContaining('not found'),
+        });
+      } finally {
+        await harness.close?.();
+      }
+    });
+
+    it('enforces per-key concurrency limits', async () => {
+      const harness = await createHarness();
+      try {
+        const ownerA = await createLockOwner(harness.storage, 'holder-a');
+        const ownerB = await createLockOwner(harness.storage, 'holder-b');
+        const first = await harness.limits.acquire(
+          acquireRequest(
+            ownerA,
+            'step:db:cheap',
+            { concurrency: { max: 1 } },
+            1_000
+          )
+        );
+        expect(first.status).toBe('acquired');
+        if (first.status !== 'acquired')
+          throw new Error('expected acquisition');
+
+        const second = await harness.limits.acquire(
+          acquireRequest(
+            ownerB,
+            'step:db:cheap',
+            { concurrency: { max: 1 } },
+            1_000
+          )
+        );
+        expect(second).toMatchObject({
+          status: 'blocked',
+          reason: 'concurrency',
+        });
+
+        await harness.limits.release(releaseRequest(first.lease));
+
+        const third = await harness.limits.acquire(
+          acquireRequest(
+            ownerB,
+            'step:db:cheap',
+            { concurrency: { max: 1 } },
+            1_000
+          )
+        );
+        expect(third.status).toBe('acquired');
+      } finally {
+        await harness.close?.();
+      }
+    });
+
+    it('isolates unrelated keys at the raw limits layer', async () => {
+      const harness = await createHarness();
+      try {
+        const ownerA = await createLockOwner(harness.storage, 'holder-a');
+        const ownerB = await createLockOwner(harness.storage, 'holder-b');
+        const [first, second] = await
Promise.all([ + harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:a', + { concurrency: { max: 1 } }, + 1_000 + ) + ), + harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:b', + { concurrency: { max: 1 } }, + 1_000 + ) + ), + ]); + + expect(first.status).toBe('acquired'); + expect(second.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('serializes concurrent acquires for the same key', async () => { + const harness = await createHarness(); + try { + const owners = await Promise.all( + Array.from({ length: 12 }, (_, index) => + createLockOwner(harness.storage, `holder-${index}`) + ) + ); + const results = await Promise.all( + owners.map((owner) => + harness.limits.acquire( + acquireRequest( + owner, + 'workflow:user:concurrent', + { concurrency: { max: 1 } }, + 10_000 + ) + ) + ) + ); + + const acquired = results.filter( + (result) => result.status === 'acquired' + ); + const blocked = results.filter((result) => result.status === 'blocked'); + + expect(acquired).toHaveLength(1); + expect(blocked).toHaveLength(11); + } finally { + await harness.close?.(); + } + }); + + it('keeps rate capacity consumed until the window expires', async () => { + const harness = await createHarness(); + try { + const periodMs = 3_000; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 5_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.release(releaseRequest(first.lease)); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 5_000 + ) + 
); + expect(second.status).toBe('blocked'); + if (second.status !== 'blocked') throw new Error('expected blocked'); + expect(second.reason).toBe('rate'); + expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + + let secondRetry = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 5_000 + ) + ); + const deadline = Date.now() + periodMs + 1_000; + while (secondRetry.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, secondRetry.retryAfterMs ?? 0) + 50); + secondRetry = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); + } + + expect(secondRetry.status).toBe('acquired'); + if (secondRetry.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.release(releaseRequest(secondRetry.lease)); + + let third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 5_000 + ) + ); + const thirdDeadline = Date.now() + periodMs + 1_000; + while (third.status === 'blocked' && Date.now() < thirdDeadline) { + await sleep(Math.max(25, third.retryAfterMs ?? 
0) + 50); + third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); + } + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('returns a combined blocked reason when both limits are saturated', async () => { + const harness = await createHarness(); + try { + const periodMs = 3_000; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 5_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 5_000 + ) + ); + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency_and_rate', + }); + if (second.status !== 'blocked') throw new Error('expected blocked'); + + await harness.limits.release(releaseRequest(first.lease)); + + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); + expect(third).toMatchObject({ + status: 'blocked', + reason: 'rate', + }); + + let fourth = third; + const deadline = Date.now() + periodMs + 1_000; + while (fourth.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, fourth.retryAfterMs ?? 
0) + 50); + fourth = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 5_000 + ) + ); + } + + expect(fourth.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('restores capacity immediately when a lease is released', async () => { + const harness = await createHarness(); + try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(second.status).toBe('blocked'); + + await harness.limits.release(releaseRequest(first.lease)); + + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('extends lease expiry when heartbeated', async () => { + const harness = await createHarness(); + try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const heartbeat = await harness.limits.heartbeat({ + leaseId: first.lease.leaseId, + ttlMs: 5_000, + }); + + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( + 
first.lease.expiresAt?.getTime() ?? 0 + ); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + expect(second.status).toBe('blocked'); + } finally { + await harness.close?.(); + } + }); + + it('reclaims expired leases without manual cleanup', async () => { + const harness = await createHarness(); + try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + expect(second.status).toBe('blocked'); + + await sleep(1_500); + + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('reuses an existing lease for the same holder', async () => { + const harness = await createHarness(); + try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + expect(second).toMatchObject({ + status: 'acquired', + lease: { + leaseId: first.lease.leaseId, + lockId: 
first.lease.lockId, + }, + }); + + if (!harness.inspectKeyState) { + throw new Error( + 'inspectKeyState is required for duplicate lease checks' + ); + } + const keyState = await harness.inspectKeyState( + 'workflow:user:reacquire' + ); + expect( + keyState.leaseHolderIds.filter((lockId) => lockId === ownerA.lockId) + ).toHaveLength(1); + expect( + keyState.waiterHolderIds.filter((lockId) => lockId === ownerA.lockId) + ).toHaveLength(0); + } finally { + await harness.close?.(); + } + }); + + it('promotes waiters in FIFO order per key', async () => { + const harness = await createHarness(); + try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + const third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + + expect(second.status).toBe('blocked'); + expect(third.status).toBe('blocked'); + + await harness.limits.release(releaseRequest(first.lease)); + + const promoted = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + const stillWaiting = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + + expect(promoted.status).toBe('acquired'); + expect(stillWaiting.status).toBe('blocked'); + if (promoted.status !== 'acquired') + throw new 
Error('expected waiter-b promotion'); + + await harness.limits.release(releaseRequest(promoted.lease)); + + const thirdPromoted = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 10_000 + ) + ); + + expect(thirdPromoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('skips cancelled workflow waiters before promotion', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow waiter liveness'); + } + + const deadRun = await createRun(harness.storage, 'dead-workflow'); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_cancelled', + specVersion: SPEC_VERSION_CURRENT, + }); + + const liveRun = await createRun(harness.storage, 'live-workflow'); + await harness.storage.events.create(liveRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const liveOwner = { + lockId: createTestLockId(liveRun.runId, 0), + runId: liveRun.runId, + lockIndex: 0, + }; + const deadOwner = { + lockId: createTestLockId(deadRun.runId, 0), + runId: deadRun.runId, + lockIndex: 0, + }; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.acquire( + acquireRequest( + deadOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + + await 
harness.limits.release(releaseRequest(first.lease)); + + const promoted = await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('reclaims a terminal workflow holder lease before its ttl expires', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow holder liveness'); + } + + const terminalRun = await createRun(harness.storage, 'terminal-holder'); + await harness.storage.events.create(terminalRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const waiterRun = await createRun(harness.storage, 'waiter-holder'); + await harness.storage.events.create(waiterRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + + const acquired = await harness.limits.acquire({ + key: 'workflow:user:terminal-holder', + runId: terminalRun.runId, + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 60_000, + }); + expect(acquired.status).toBe('acquired'); + if (acquired.status !== 'acquired') { + throw new Error('expected acquisition'); + } + + await harness.storage.events.create(terminalRun.runId, { + eventType: 'run_completed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { output: null }, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:terminal-holder', + runId: waiterRun.runId, + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('prunes terminal holders during release before promoting the next waiter', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow holder 
liveness'); + } + + const liveRun = await createRun(harness.storage, 'live-holder'); + const deadRunA = await createRun(harness.storage, 'dead-holder-a'); + const deadRunB = await createRun(harness.storage, 'dead-holder-b'); + const waiterRun = await createRun(harness.storage, 'waiter-holder'); + + for (const run of [liveRun, deadRunA, deadRunB, waiterRun]) { + await harness.storage.events.create(run.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + } + + const key = 'workflow:user:terminal-holder-release'; + const definition = { concurrency: { max: 3 } } as const; + const acquiredLive = await harness.limits.acquire({ + key, + runId: liveRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + const acquiredDeadA = await harness.limits.acquire({ + key, + runId: deadRunA.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + const acquiredDeadB = await harness.limits.acquire({ + key, + runId: deadRunB.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + + expect(acquiredLive.status).toBe('acquired'); + expect(acquiredDeadA.status).toBe('acquired'); + expect(acquiredDeadB.status).toBe('acquired'); + if ( + acquiredLive.status !== 'acquired' || + acquiredDeadA.status !== 'acquired' || + acquiredDeadB.status !== 'acquired' + ) { + throw new Error('expected acquisition'); + } + + const blockedWaiter = await harness.limits.acquire({ + key, + runId: waiterRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 5_000, + }); + expect(blockedWaiter.status).toBe('blocked'); + + for (const run of [deadRunA, deadRunB]) { + await harness.storage.events.create(run.runId, { + eventType: 'run_completed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { output: null }, + }); + } + + const released = await harness.limits.release( + releaseRequest(acquiredLive.lease) + ); + expect(released.nextWaiter).toMatchObject({ + runId: waiterRun.runId, + lockIndex: 0, + }); + + const promoted = await 
harness.limits.acquire({ + key, + runId: waiterRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 5_000, + }); + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('does not duplicate a replayed blocked holder waiter or lease', async () => { + const harness = await createHarness(); + try { + const key = 'workflow:user:replay'; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const replayOwner = await createLockOwner( + harness.storage, + 'holder-replay' + ); + const blockedLockId = replayOwner.lockId; + + const first = await harness.limits.acquire( + acquireRequest(ownerA, key, { concurrency: { max: 1 } }, 1_000) + ); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const blockedA = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); + const blockedB = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); + + expect(blockedA.status).toBe('blocked'); + expect(blockedB.status).toBe('blocked'); + + const blockedState = await harness.inspectKeyState(key); + expect( + blockedState.waiterHolderIds.filter( + (lockId) => lockId === blockedLockId + ) + ).toHaveLength(1); + expect( + blockedState.leaseHolderIds.filter( + (lockId) => lockId === blockedLockId + ) + ).toHaveLength(0); + + await harness.limits.release(releaseRequest(first.lease)); + + const acquired = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); + expect(acquired.status).toBe('acquired'); + if (acquired.status !== 'acquired') + throw new Error('expected replayed holder acquisition'); + + const acquiredState = await harness.inspectKeyState(key); + expect( + acquiredState.waiterHolderIds.filter( + (lockId) => lockId === blockedLockId + ) + ).toHaveLength(0); + expect( + 
acquiredState.leaseHolderIds.filter(
+            (lockId) => lockId === blockedLockId
+          )
+        ).toHaveLength(1);
+      } finally {
+        await harness.close?.();
+      }
+    });
+  });
+}
diff --git a/packages/world-testing/src/limits-runtime.mts b/packages/world-testing/src/limits-runtime.mts
new file mode 100644
index 0000000000..d9a75c95b6
--- /dev/null
+++ b/packages/world-testing/src/limits-runtime.mts
@@ -0,0 +1,347 @@
+import { describe, expect, it } from 'vitest';
+
+type WorkflowLockContentionResult = {
+  workflowLockAcquiredAt: number;
+  workflowLockReleasedAt: number;
+  stepCallLockAcquiredAt: number;
+  stepCallLockReleasedAt: number;
+};
+
+type LockedStepCallResult = {
+  label: string;
+  key?: string;
+  attempt: number;
+  acquiredAt: number;
+  releasedAt: number;
+};
+
+type WorkflowOnlyLockResult = {
+  label: string;
+  workflowLockAcquiredAt: number;
+  workflowLockReleasedAt: number;
+};
+
+type WorkflowRateLimitResult = {
+  label: string;
+  workflowRateAcquiredAt: number;
+  workflowRateReleasedAt: number;
+  periodMs: number;
+};
+
+type ReleasedRateLimitReplayResult = {
+  elapsedMs: number;
+  periodMs: number;
+  sleepMs: number;
+};
+
+type LeakedLockResult = {
+  label: string;
+  key: string;
+  leaseTtlMs: number;
+  lockAcquiredAt: number;
+  workflowCompletedAt: number;
+};
+
+type WorkflowMultiStepScopeResult = {
+  key: string;
+  workflowLockAcquiredAt: number;
+  firstStepCompletedAt: number;
+  secondStepCompletedAt: number;
+  workflowLockReleasedAt: number;
+};
+
+function sortContentionResults<T extends { workflowLockAcquiredAt: number }>(
+  results: [T, T]
+): [T, T] {
+  return [...results].sort(
+    (a, b) => a.workflowLockAcquiredAt - b.workflowLockAcquiredAt
+  ) as [T, T];
+}
+
+export interface LimitsRuntimeHarness {
+  runWorkflowWithScopedLocks(userId: string): Promise<{
+    workflowKey: string;
+    dbKey: string;
+    aiKey: string;
+    summary: string;
+  }>;
+  runWorkflowLockContention(
+    userId: string,
+    holdMs: number
+  ): Promise<[WorkflowLockContentionResult, WorkflowLockContentionResult]>;
+
runLockedStepCallContention(
+    key: string,
+    holdMs: number,
+    labelA?: string,
+    labelB?: string
+  ): Promise<[LockedStepCallResult, LockedStepCallResult]>;
+  runWorkflowLockAcrossSuspension(
+    userId: string,
+    holdMs: number
+  ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>;
+  runWorkflowExpiredLeaseRecovery(
+    userId: string,
+    leaseTtlMs: number
+  ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>;
+  runWorkflowTerminalHolderRecovery(
+    userId: string,
+    leaseTtlMs: number
+  ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>;
+  runLeakedKeyExpiredLeaseRecovery(
+    userId: string,
+    leaseTtlMs: number
+  ): Promise<[LeakedLockResult, LockedStepCallResult]>;
+  runWorkflowMixedLimitContention(
+    userId: string,
+    holdMs: number,
+    periodMs: number
+  ): Promise<[WorkflowRateLimitResult, WorkflowRateLimitResult]>;
+  runReleasedRateLimitReplay(
+    userId: string,
+    periodMs: number,
+    sleepMs: number
+  ): Promise<ReleasedRateLimitReplayResult>;
+  runWorkflowFifoThreeWaiters(
+    userId: string,
+    holdMs: number
+  ): Promise<
+    [WorkflowOnlyLockResult, WorkflowOnlyLockResult, WorkflowOnlyLockResult]
+  >;
+  runCancelledWorkflowWaiter(
+    userId: string,
+    holdMs: number
+  ): Promise<{
+    cancelledError: unknown;
+    resultA: WorkflowOnlyLockResult;
+    resultC: WorkflowOnlyLockResult;
+  }>;
+  runIndependentWorkflowKeys(
+    holdMs: number
+  ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>;
+  runIndependentStepKeys(
+    holdMs: number
+  ): Promise<[LockedStepCallResult, LockedStepCallResult]>;
+  runBlockedWaiterWithUnrelatedWorkflow(holdMs: number): Promise<{
+    holder: WorkflowOnlyLockResult;
+    waiter: WorkflowOnlyLockResult;
+    unrelated: WorkflowOnlyLockResult;
+  }>;
+  runWorkflowSingleLockAcrossMultipleSteps(
+    holdMs: number
+  ): Promise<WorkflowMultiStepScopeResult>;
+}
+
+export function createLimitsRuntimeSuite(
+  name: string,
+  createHarness: () => Promise<LimitsRuntimeHarness>
+) {
+  describe(name, () => {
+    it('runs locks around individual step calls end-to-end', async () => {
+      const harness = await createHarness();
+
const userId = 'shared-user'; + const result = await harness.runWorkflowWithScopedLocks(userId); + + expect(result).toMatchObject({ + workflowKey: `workflow:user:${userId}`, + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: `summary:profile:${userId}`, + }); + }); + + it('serializes workflow locks and locks around step calls under contention', async () => { + const harness = await createHarness(); + const [resultA, resultB] = sortContentionResults( + await harness.runWorkflowLockContention('shared-user', 750) + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultB.stepCallLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepCallLockReleasedAt + ); + }); + + it('wakes promoted workflow and step-call lock waiters promptly', async () => { + const harness = await createHarness(); + const [resultA, resultB] = sortContentionResults( + await harness.runWorkflowLockContention('shared-user', 1_500) + ); + + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + expect( + resultB.stepCallLockAcquiredAt - resultA.stepCallLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('can hold one workflow lock across multiple steps in the same scope', async () => { + const harness = await createHarness(); + const result = + await harness.runWorkflowSingleLockAcrossMultipleSteps(400); + + expect(result.firstStepCompletedAt).toBeGreaterThanOrEqual( + result.workflowLockAcquiredAt + ); + expect(result.secondStepCompletedAt).toBeGreaterThanOrEqual( + result.firstStepCompletedAt + ); + expect(result.workflowLockReleasedAt).toBeGreaterThanOrEqual( + result.secondStepCompletedAt + ); + }); + + it('keeps workflow locks held across suspension until the workflow finishes', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runWorkflowLockAcrossSuspension( + 'shared-user', + 1_500 + ); + + 
expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('reclaims terminal workflow-held locks on workflow keys', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runWorkflowExpiredLeaseRecovery( + 'expired-workflow-user', + leaseTtlMs + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + }); + + it('reclaims terminal workflow holder leases promptly before ttl expiry', async () => { + const harness = await createHarness(); + const leaseTtlMs = 30_000; + const [resultA, resultB] = + await harness.runWorkflowTerminalHolderRecovery( + 'terminal-holder-user', + leaseTtlMs + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.lockAcquiredAt + ).toBeLessThan(leaseTtlMs - 5_000); + }); + + it('reclaims terminal workflow-held locks on arbitrary keys', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runLeakedKeyExpiredLeaseRecovery( + 'expired-key-user', + leaseTtlMs + ); + + expect(resultB.acquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + }); + + it('keeps mixed concurrency and rate waiters blocked until the rate window expires', async () => { + const harness = await createHarness(); + const holdMs = 250; + const periodMs = 1_500; + const [resultA, resultB] = await harness.runWorkflowMixedLimitContention( + 'shared-user', + holdMs, + periodMs + ); + + expect( + resultB.workflowRateAcquiredAt - resultA.workflowRateAcquiredAt + ).toBeGreaterThanOrEqual(periodMs - 100); + + const remainingWindowAfterRelease = + periodMs - + (resultA.workflowRateReleasedAt - 
resultA.workflowRateAcquiredAt); + expect( + resultB.workflowRateAcquiredAt - resultA.workflowRateReleasedAt + ).toBeGreaterThanOrEqual(Math.max(0, remainingWindowAfterRelease - 100)); + }); + + it('does not reacquire a released rate-only lock on later replay', async () => { + const harness = await createHarness(); + const result = await harness.runReleasedRateLimitReplay( + 'replay-user', + 6_000, + 100 + ); + + expect(result.elapsedMs).toBeLessThan(4_000); + }); + + it('promotes 3 workflow waiters in FIFO order', async () => { + const harness = await createHarness(); + const [resultA, resultB, resultC] = + await harness.runWorkflowFifoThreeWaiters('shared-user', 750); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultB.workflowLockReleasedAt + ); + }); + + it('skips cancelled workflow waiters before promoting the next run', async () => { + const harness = await createHarness(); + const { cancelledError, resultA, resultC } = + await harness.runCancelledWorkflowWaiter('shared-user', 1_500); + + expect(cancelledError).toBeTruthy(); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(6_000); + }); + + it('does not block unrelated workflow keys', async () => { + const harness = await createHarness(); + const [resultA, resultB] = + await harness.runIndependentWorkflowKeys(3_000); + + expect(resultB.workflowLockAcquiredAt).toBeLessThan( + resultA.workflowLockReleasedAt + ); + }); + + it('does not block unrelated step-like keys', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runIndependentStepKeys(3_000); + + expect(resultB.acquiredAt).toBeLessThan(resultA.releasedAt); + }); + + it.skipIf(process.env.WORKFLOW_LIMITS_LOW_CONCURRENCY !== '1')( + 'frees 
worker slots for unrelated workflows while a waiter is blocked', + async () => { + const harness = await createHarness(); + const { holder, waiter, unrelated } = + await harness.runBlockedWaiterWithUnrelatedWorkflow(1_500); + + expect(waiter.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + holder.workflowLockReleasedAt + ); + expect(unrelated.workflowLockReleasedAt).toBeLessThan( + waiter.workflowLockAcquiredAt + ); + } + ); + }); +} diff --git a/packages/world-vercel/src/index.ts b/packages/world-vercel/src/index.ts index 896fede377..5d5cda63c2 100644 --- a/packages/world-vercel/src/index.ts +++ b/packages/world-vercel/src/index.ts @@ -1,5 +1,6 @@ import type { World } from '@workflow/world'; import { createGetEncryptionKeyForRun } from './encryption.js'; +import { createLimits } from './limits.js'; import { instrumentObject } from './instrumentObject.js'; import { createQueue } from './queue.js'; import { createResolveLatestDeploymentId } from './resolve-latest-deployment.js'; @@ -24,6 +25,7 @@ export function createVercelWorld(config?: APIConfig): World { config?.projectConfig?.projectId || process.env.VERCEL_PROJECT_ID; return { + limits: createLimits(config), ...createQueue(config), ...createStorage(config), ...instrumentObject('world.streams', createStreamer(config)), diff --git a/packages/world-vercel/src/limits.test.ts b/packages/world-vercel/src/limits.test.ts new file mode 100644 index 0000000000..ff6bf0151a --- /dev/null +++ b/packages/world-vercel/src/limits.test.ts @@ -0,0 +1,41 @@ +import { describe, expect, it } from 'vitest'; +import { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; +import { createVercelWorld } from './index.js'; +import { createLimits } from './limits.js'; + +describe('vercel world limits', () => { + it('exposes the required limits namespace', () => { + const limits = createLimits(); + + expect(limits).toMatchObject({ + acquire: expect.any(Function), + release: expect.any(Function), + heartbeat: expect.any(Function), + 
}); + }); + + it('keeps limits unimplemented until lock support exists', async () => { + const world = createVercelWorld(); + + await expect( + world.limits.acquire({ + key: 'workflow:user:test', + runId: 'wrun_test', + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + + await expect( + world.limits.release({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + + await expect( + world.limits.heartbeat({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + }); +}); diff --git a/packages/world-vercel/src/limits.ts b/packages/world-vercel/src/limits.ts new file mode 100644 index 0000000000..785fa4886e --- /dev/null +++ b/packages/world-vercel/src/limits.ts @@ -0,0 +1,16 @@ +import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import type { APIConfig } from './utils.js'; + +export function createLimits(_config?: APIConfig): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md new file mode 100644 index 0000000000..20aa5e227b --- /dev/null +++ b/packages/world/FLOW_LIMITS.md @@ -0,0 +1,376 @@ +# Flow Limits Design Notes + +This note summarizes the implemented direction for flow concurrency and rate +limiting across `@workflow/core`, `@workflow/world`, and concrete world +implementations. + +## Status + +- The shared `limits` interface and `lock()` API surface now exist. +- Local world now implements the shared live-process limits semantics with + leases, rate tokens, FIFO waiters, and prompt wake-up with delayed fallback. +- Postgres implements the same limits semantics with PostgreSQL-backed leases, + rate tokens, durable waiters, and durable queue wake-up. 
+- Vercel still exposes `limits` as a stub. +- The Next.js Turbopack workbench has shared E2E coverage for `lock()` used + with `await using`, including locks that wrap individual step calls or + groups of steps. + +## Goals + +- Support keyed concurrency limits. +- Support keyed rate limits. +- Allow concurrency and rate to be colocated in one interface. +- Support locks whose lifetime follows normal `await using` lexical scope. +- Make crash recovery possible through leases with TTL/expiry. +- Keep worker throughput controls separate from business-level flow limits. + +## Core Terms + +- `worker concurrency`: backend throughput setting for queue/job processing. +- `workflow limit`: admission control for workflow runs that share a key. +- `scoped resource key`: any user-defined key acquired from workflow scope to + protect one step call, multiple step calls, or a whole workflow section. +- `lease`: durable record that a workflow currently occupies capacity for a + key. + +## Shared Contract vs World-Specific Behavior + +The limits contract is intended to describe one shared set of observable +semantics across implemented worlds. 
That shared contract includes: + +- `acquire()`, `release()`, and `heartbeat()` surface behavior +- `WorkflowWorldError` when heartbeating a missing lease +- per-key concurrency and rate limiting outcomes +- same-holder lease reuse +- serialization of concurrent acquires for a single key +- FIFO waiter promotion per key +- pruning terminal workflow holders and waiters +- blocked acquisitions not consuming execution concurrency +- prompt wake-up with delayed fallback replay + +World-specific behavior should be limited to implementation mechanics and +durability characteristics, for example: + +- how waiter state is stored internally +- how per-key mutations are serialized internally +- how prompt wake-up is delivered +- whether queued wake-ups survive process or host loss +- backend-specific observability or debugging surfaces + +That means SQL row layout, advisory locks, and Graphile jobs are PostgreSQL +implementation details, while FIFO fairness and waiter skipping are contract +behavior that local and Postgres should both exhibit. + +## Decisions So Far + +### 1. Use one shared limits model + +The shared world interface uses a single `limits` namespace and a single limit +definition shape that can contain either or both: + +- `concurrency` +- `rate` + +This allows one key to express: + +- concurrency only +- rate only +- both together + +### 2. Use leases, not plain mutexes + +Limits are modeled as leases with TTL/expiry so capacity can be recovered after: + +- worker crashes +- process death +- machine shutdown +- lost retries + +Normal completion should dispose/release the lease explicitly. Crash recovery +comes from lease expiry plus future reclaim logic. + +The default workflow lock TTL should be high enough to cover normal suspended +execution without making users tune it eagerly. The current runtime default is +24 hours unless the caller overrides `leaseTtlMs`. + +### 3. 
Keep worker concurrency separate from flow limits + +Current world-level concurrency settings are infrastructure controls, not +business-level locking: + +- local world: `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` +- postgres world: `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` + +These control how many queue jobs can be processed at once. They should remain +independent from flow limits like: + +- `workflow:user:123` +- `step:db:cheap` +- `step:provider:openai` + +### 4. Rate-limited waits are scheduled with `acquireAt` + +For a rate limit like: + +- `rate: { count: 10, periodMs: 60_000 }` + +the observable contract is: + +- blocked acquires receive an `acquireAt` time through `lock_created` +- a workflow retries `lock_acquired` only once that `acquireAt` has arrived, or + sooner if it is explicitly re-queued with lock pre-approval +- a historical `lock_acquired` is only valid while its lease is still live +- once the lease has expired, replay must ignore that old acquisition and + acquire again + +The important distinction in the event log is: + +- `lock_created`: reservation / retry scheduling information +- `lock_acquired`: proof that a live lease was actually granted +- `lock_release`: disposal of the granted lease, optionally with a nominated + next waiter to wake + +### 5. Use one `lock()` API from workflow scope + +We want one user-facing primitive: + +```ts +await using lease = await lock({ ... }); +``` + +`lock()` means workflow code acquires ownership of a keyed lease. + +If placed at the top of a workflow, it should hold the lease across the logical +workflow scope, even though the workflow may suspend and resume many times. + +Steps themselves do not acquire locks directly. To limit one step category or a +group of steps, the workflow acquires the lock and then calls those steps while +the lease is held. + +### 6. `await using` is the preferred user-facing shape + +The preferred API is explicit resource management: + +```ts +await using lease = await lock({ ... 
}); +``` + +This gives automatic cleanup on scope exit and reads well for critical sections +that may include one or many step calls. + +For manual early cleanup, the user-facing `LockHandle` should expose: + +- `dispose()` +- `[Symbol.asyncDispose]()` + +The backend-facing world contract can continue to use `release(...)` internally. + +### 7. Locks follow logical scope, not request lifetime + +For workflows, `await using` must be tied to the logical workflow scope across: + +- step round trips +- queue turns +- sleeps +- hooks +- replay/resume + +The lease must not be disposed merely because one host process invocation ends. + +### 8. Keep admission decisions in workflow code + +Current preferred model: + +- workflow code acquires and releases limits +- steps execute inside whatever critical section the workflow establishes +- step code never waits on a separate lock of its own + +This keeps the dependency direction simple: + +- workflow admission / critical section -> step execution + +That avoids needing separate workflow-lock and step-lock runtime semantics. + +### 9. Waiters are FIFO per key + +Implemented worlds use a waiter queue and promote waiters in FIFO order for a +single limit key. + +Important details: + +- FIFO is per key, not global across all limit keys +- promotion order is based on waiter creation order +- terminal holders are pruned before capacity decisions +- dead or terminal waiters are pruned before promotion +- a live waiter may still be skipped if it is no longer eligible when promotion runs +- releasing a lease or reclaiming an expired lease can both trigger promotion +- rate-window expiry can also make the head waiter eligible again + +Implemented worlds currently reclaim terminal holders opportunistically when a +key is touched, so completed, failed, or cancelled workflows do not hold +concurrency capacity until lease TTL expiry. + +This gives deterministic and inspectable fairness for a key without requiring a +global scheduler. 
+ +### 9.5. First writer wins for key configuration + +Each limit key has one canonical definition while it is live. + +- the first acquire for a key seeds that definition +- later acquires for the same key must match it exactly +- a mismatched definition is a hard error +- once a key fully drains, the canonical definition is forgotten and the next + acquire may seed a new one + +### 10. Blocked limits do not consume worker concurrency + +Blocked flow limits and worker concurrency are intentionally separate. + +For implemented worlds: + +- blocked workflows are suspended and re-queued, not left running on a worker +- worker slots are free to service unrelated work while the blocked execution is + waiting to be retried or promoted + +PostgreSQL additionally keeps that backlog durable in the database. The local +world keeps queue delivery in-memory, so cross-process crash recovery for the +backlog is explicitly outside the shared limits contract today. + +### 11. Wake-up is prompt, with a delayed fallback + +Implemented worlds use the world-owned limit state as the source of truth and +try to resume promoted waiters promptly, with a delayed fallback still in place +so progress is possible if an immediate wake-up is missed. + +Current behavior: + +- leases, rate tokens, and waiters live in world-owned limit state +- promotion decisions are made from that limit state +- `lock_release` may nominate the next waiter to wake +- event storage is responsible for enqueuing that waiter with lock pre-approval + and then appending `lock_waiter_queued` for the waiter correlation +- workflows also keep a delayed replay fallback so progress is still possible if + an immediate wake-up is missed + +PostgreSQL uses Graphile jobs for that wake-up path and keeps the backlog +durable across host/process failure. The local world uses an in-memory queue, so +prompt wake behavior matches while the process is alive, but durable backlog +survival is not guaranteed after process loss. 
+ +### 12. V1 semantics are intentionally opinionated + +For v1, the intended semantics are: + +- workflow locks count admitted, in-flight workflows for a key +- workflow-held keys may be used to serialize or rate-limit specific step categories +- worker concurrency remains a separate infrastructure throttle + +More concretely: + +- if a workflow acquires a lock and then sleeps for 10 minutes, + it still counts as active for that workflow key during the sleep +- if a workflow acquires a lock for a step-like key such as `step:db:cheap`, + that key remains occupied until the workflow releases it, even if the + protected work is just one step call or a small group of step calls +- rate-limited step-like keys still consume rate capacity when the workflow + acquires that key, and that usage remains counted until the window expires + even if the workflow releases the lease quickly + +For the current local implementation specifically: + +- workflow locks now follow the same live-process waiter/fairness semantics as + Postgres +- the queue remains in-memory, so queued wake-ups are not durable across process + loss + +This means the current v1 interpretation of a workflow lock is: + +- "How many workflows for this key are admitted and in flight at all?" + +not: + +- "How many workflows are actively burning CPU right this instant?" 
+
+## Current Example Shape
+
+The current placeholder E2E example models:
+
+- workflow-level user concurrency:
+  - `workflow:user:${userId}`
+- step-level DB concurrency:
+  - `step:db:cheap`
+- step-level AI rate limit:
+  - `step:provider:openai`
+
+With intended usage like:
+
+```ts
+async function cheapDbStep(userId: string) {
+  'use step';
+  return { userId, prompt: `profile:${userId}` };
+}
+
+async function expensiveAIStep(prompt: string) {
+  'use step';
+  return `summary:${prompt}`;
+}
+
+export async function workflowWithScopedLocks(userId: string) {
+  'use workflow';
+  await using userLimit = await lock({
+    key: `workflow:user:${userId}`,
+    concurrency: { max: 2 },
+  });
+
+  let row: Awaited<ReturnType<typeof cheapDbStep>>;
+  {
+    await using _dbLimit = await lock({
+      key: 'step:db:cheap',
+      concurrency: { max: 20 },
+    });
+    row = await cheapDbStep(userId);
+  }
+
+  let summary: Awaited<ReturnType<typeof expensiveAIStep>>;
+  {
+    await using _aiLimit = await lock({
+      key: 'step:provider:openai',
+      rate: { count: 10, periodMs: 60_000 },
+    });
+    summary = await expensiveAIStep(row.prompt);
+  }
+  return { row, summary };
+}
+```
+
+## Important Clarification
+
+Flow limits and worker concurrency are different layers.
+
+For example:
+
+- a cheap DB step may continue making progress even while an expensive AI step
+  is rate-limited
+- the main shared coupling between them is the worker pool
+- if workers are available, unrelated step categories should continue
+
+So overall system throughput is not one simple global minimum. Different
+workflow paths may be bottlenecked by different limits at different times.
+ +Two more practical clarifications: + +- a blocked workflow lock should not monopolize + `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` or + `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` just because it is waiting +- a released lease may nominate one waiter for prompt wake-up, but delayed retry + remains in place as the fallback path + +## Open Questions + +- Whether workflow-level locks should always be whole-run admission locks or + also support narrower lexical scopes within workflow code. +- Whether `heartbeat()` should remain user-visible or become mostly internal. +- Whether `lock()` should eventually grow optional metadata or + config sugar for common per-step resource keys. diff --git a/packages/world/package.json b/packages/world/package.json index d473cdb588..aec9ae88dc 100644 --- a/packages/world/package.json +++ b/packages/world/package.json @@ -20,7 +20,8 @@ "scripts": { "build": "tsc", "dev": "tsc --watch", - "clean": "tsc --build --clean && rm -rf dist" + "clean": "tsc --build --clean && rm -rf dist", + "test": "vitest run src" }, "dependencies": { "ulid": "catalog:" @@ -30,6 +31,7 @@ }, "devDependencies": { "@types/node": "catalog:", + "vitest": "catalog:", "zod": "catalog:", "@workflow/tsconfig": "workspace:*" }, diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index 2965906f7b..9fc6675e5a 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,4 +1,9 @@ import { z } from 'zod'; +import { + LimitDefinitionSchema, + LimitLeaseSchema, + LimitNextWaiterSchema, +} from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -74,6 +79,11 @@ export const EventTypeSchema = z.enum([ // Wait lifecycle events 'wait_created', 'wait_completed', + // Lock lifecycle events + 'lock_created', + 'lock_acquired', + 'lock_release', + 'lock_waiter_queued', ]); // Base event schema with common properties @@ -202,6 +212,45 @@ const 
WaitCompletedEventSchema = BaseEventSchema.extend({ correlationId: z.string(), }); +const LockCreatedEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_created'), + correlationId: z.string(), + eventData: z.object({ + key: z.string(), + definition: LimitDefinitionSchema, + leaseTtlMs: z.number().int().positive().optional(), + acquireAt: z.coerce.date().optional(), + }), +}); + +const LockAcquiredEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_acquired'), + correlationId: z.string(), + eventData: z + .object({ + lease: LimitLeaseSchema, + }) + .optional(), +}); + +const LockReleaseEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_release'), + correlationId: z.string(), + eventData: z + .object({ + leaseId: z.string().min(1), + key: z.string(), + lockId: z.string(), + nextWaiter: LimitNextWaiterSchema.optional(), + }) + .optional(), +}); + +const LockWaiterQueuedEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_waiter_queued'), + correlationId: z.string(), +}); + // ============================================================================= // Run lifecycle events // ============================================================================= @@ -281,6 +330,11 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ // Wait lifecycle events WaitCreatedEventSchema, WaitCompletedEventSchema, + // Lock lifecycle events + LockCreatedEventSchema, + LockAcquiredEventSchema, + LockReleaseEventSchema, + LockWaiterQueuedEventSchema, ]); // Discriminated union for ALL events (includes World-only events like hook_conflict) @@ -306,6 +360,11 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ // Wait lifecycle events WaitCreatedEventSchema, WaitCompletedEventSchema, + // Lock lifecycle events + LockCreatedEventSchema, + LockAcquiredEventSchema, + LockReleaseEventSchema, + LockWaiterQueuedEventSchema, ]); // Server response includes runId, eventId, and createdAt diff --git 
a/packages/world/src/index.ts b/packages/world/src/index.ts index baa62a1480..d6d7746f96 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -10,6 +10,31 @@ export { export type * from './hooks.js'; export { HookSchema } from './hooks.js'; export type * from './interfaces.js'; +export type * from './limits.js'; +export { + createLockId, + createLockCorrelationId, + createLockWakeCorrelationId, + createLimitsNotImplementedError, + LimitAcquireAcquiredResultSchema, + LimitAcquireBlockedResultSchema, + LimitAcquireRequestSchema, + LimitAcquireResultSchema, + LimitAcquireStatusSchema, + LimitBlockedReasonSchema, + LimitConcurrencySchema, + LimitDefinitionSchema, + LimitHeartbeatRequestSchema, + LimitKeySchema, + LimitLeaseSchema, + LimitLockIdSchema, + LimitNextWaiterSchema, + LimitRateSchema, + LimitReleaseResultSchema, + LimitReleaseRequestSchema, + LIMITS_NOT_IMPLEMENTED_MESSAGE, + parseLockId, +} from './limits.js'; export type * from './queue.js'; export { HealthCheckPayloadSchema, diff --git a/packages/world/src/interfaces.ts b/packages/world/src/interfaces.ts index 77b8cd04e6..bd1d622850 100644 --- a/packages/world/src/interfaces.ts +++ b/packages/world/src/interfaces.ts @@ -9,6 +9,7 @@ import type { RunCreatedEventRequest, } from './events.js'; import type { GetHookParams, Hook, ListHooksParams } from './hooks.js'; +import type { Limits } from './limits.js'; import type { Queue } from './queue.js'; import type { GetWorkflowRunParams, @@ -220,6 +221,8 @@ export interface Storage { * The "World" interface represents how Workflows are able to communicate with the outside world. */ export interface World extends Queue, Storage, Streamer { + limits: Limits; + /** * A function that will be called to start any background tasks needed by the World implementation. * For example, in the case of a queue backed World, this would start the queue processing. 
diff --git a/packages/world/src/limits.test.ts b/packages/world/src/limits.test.ts
new file mode 100644
index 0000000000..8796d636ad
--- /dev/null
+++ b/packages/world/src/limits.test.ts
@@ -0,0 +1,19 @@
+import { describe, it } from 'vitest';
+
+describe('limits schemas', () => {
+  it.fails('accepts concurrency-only, rate-only, and combined limit definitions', () => {
+    throw new Error('TODO: implement');
+  });
+
+  it.fails('rejects invalid or empty limit definitions', () => {
+    throw new Error('TODO: implement');
+  });
+
+  it.fails('discriminates acquired and blocked acquire results', () => {
+    throw new Error('TODO: implement');
+  });
+
+  it.fails('keeps lease, release, and heartbeat request shapes stable', () => {
+    throw new Error('TODO: implement');
+  });
+});
diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts
new file mode 100644
index 0000000000..785fa4886e
--- /dev/null
+++ b/packages/world/src/limits.ts
@@ -0,0 +1,160 @@
+import { z } from 'zod';
+
+export const LIMITS_NOT_IMPLEMENTED_MESSAGE =
+  'Flow limits are reserved for future support and are not implemented yet.';
+
+export function createLimitsNotImplementedError(): Error {
+  return new Error(LIMITS_NOT_IMPLEMENTED_MESSAGE);
+}
+
+export const LimitKeySchema = z.string().min(1);
+export type LimitKey = z.infer<typeof LimitKeySchema>;
+
+export const LimitConcurrencySchema = z.object({
+  max: z.number().int().positive(),
+});
+export type LimitConcurrency = z.infer<typeof LimitConcurrencySchema>;
+
+export const LimitRateSchema = z.object({
+  count: z.number().int().positive(),
+  periodMs: z.number().int().positive(),
+});
+export type LimitRate = z.infer<typeof LimitRateSchema>;
+
+export const LimitDefinitionSchema = z
+  .object({
+    concurrency: LimitConcurrencySchema.optional(),
+    rate: LimitRateSchema.optional(),
+  })
+  .refine(
+    (value) => value.concurrency !== undefined || value.rate !== undefined,
+    {
+      message: 'At least one limit must be configured',
+    }
+  );
+export type LimitDefinition = z.infer<typeof LimitDefinitionSchema>;
+
+export const LimitLockIdSchema = 
z.string().min(1);
+export type LimitLockId = z.infer<typeof LimitLockIdSchema>;
+
+export function createLockId(runId: string, lockIndex: number): LimitLockId {
+  return `${runId}:${lockIndex}`;
+}
+
+export function parseLockId(
+  lockId: string
+): { runId: string; lockIndex: number } | null {
+  const separatorIndex = lockId.lastIndexOf(':');
+  if (separatorIndex <= 0 || separatorIndex === lockId.length - 1) {
+    return null;
+  }
+
+  const runId = lockId.slice(0, separatorIndex);
+  const rawLockIndex = lockId.slice(separatorIndex + 1);
+  const lockIndex = Number.parseInt(rawLockIndex, 10);
+  if (!Number.isInteger(lockIndex) || lockIndex < 0) {
+    return null;
+  }
+
+  return { runId, lockIndex };
+}
+
+export function createLockWakeCorrelationId(
+  runId: string,
+  lockIndex: number
+): string {
+  return `wflock_wait_${runId}:${lockIndex}`;
+}
+
+export function createLockCorrelationId(
+  runId: string,
+  lockIndex: number
+): string {
+  return `wflock_${runId}:${lockIndex}`;
+}
+
+export const LimitLeaseSchema = z.object({
+  leaseId: z.string().min(1),
+  key: LimitKeySchema,
+  lockId: LimitLockIdSchema,
+  runId: z.string().min(1),
+  lockIndex: z.number().int().nonnegative(),
+  acquiredAt: z.coerce.date(),
+  expiresAt: z.coerce.date().optional(),
+  definition: LimitDefinitionSchema,
+});
+export type LimitLease = z.infer<typeof LimitLeaseSchema>;
+
+export const LimitAcquireRequestSchema = z.object({
+  key: LimitKeySchema,
+  runId: z.string().min(1),
+  lockIndex: z.number().int().nonnegative(),
+  definition: LimitDefinitionSchema,
+  leaseTtlMs: z.number().int().positive().optional(),
+});
+export type LimitAcquireRequest = z.infer<typeof LimitAcquireRequestSchema>;
+
+export const LimitBlockedReasonSchema = z.enum([
+  'concurrency',
+  'rate',
+  'concurrency_and_rate',
+]);
+export type LimitBlockedReason = z.infer<typeof LimitBlockedReasonSchema>;
+
+export const LimitAcquireStatusSchema = z.enum(['acquired', 'blocked']);
+export type LimitAcquireStatus = z.infer<typeof LimitAcquireStatusSchema>;
+
+export const LimitAcquireAcquiredResultSchema = z.object({
+  status: 
z.literal(LimitAcquireStatusSchema.enum.acquired),
+  lease: LimitLeaseSchema,
+});
+export type LimitAcquireAcquiredResult = z.infer<
+  typeof LimitAcquireAcquiredResultSchema
+>;
+
+export const LimitAcquireBlockedResultSchema = z.object({
+  status: z.literal(LimitAcquireStatusSchema.enum.blocked),
+  reason: LimitBlockedReasonSchema,
+  retryAfterMs: z.number().int().nonnegative().optional(),
+});
+export type LimitAcquireBlockedResult = z.infer<
+  typeof LimitAcquireBlockedResultSchema
+>;
+
+export const LimitAcquireResultSchema = z.discriminatedUnion('status', [
+  LimitAcquireAcquiredResultSchema,
+  LimitAcquireBlockedResultSchema,
+]);
+export type LimitAcquireResult = z.infer<typeof LimitAcquireResultSchema>;
+
+export const LimitReleaseRequestSchema = z.object({
+  leaseId: z.string().min(1),
+  key: LimitKeySchema.optional(),
+  lockId: LimitLockIdSchema.optional(),
+});
+export type LimitReleaseRequest = z.infer<typeof LimitReleaseRequestSchema>;
+
+export const LimitNextWaiterSchema = z.object({
+  runId: z.string().min(1),
+  lockIndex: z.number().int().nonnegative(),
+  wakeCorrelationId: z.string().min(1),
+  lockCorrelationId: z.string().min(1),
+});
+export type LimitNextWaiter = z.infer<typeof LimitNextWaiterSchema>;
+
+export const LimitReleaseResultSchema = z.object({
+  nextWaiter: LimitNextWaiterSchema.optional(),
+});
+export type LimitReleaseResult = z.infer<typeof LimitReleaseResultSchema>;
+
+export const LimitHeartbeatRequestSchema = z.object({
+  leaseId: z.string().min(1),
+  ttlMs: z.number().int().positive().optional(),
+});
+export type LimitHeartbeatRequest = z.infer<typeof LimitHeartbeatRequestSchema>;
+
+export interface Limits {
+  acquire(request: LimitAcquireRequest): Promise<LimitAcquireResult>;
+  release(request: LimitReleaseRequest): Promise<LimitReleaseResult>;
+  heartbeat(request: LimitHeartbeatRequest): Promise<void>;
+}
diff --git a/packages/world/src/queue.ts b/packages/world/src/queue.ts
index 5093b62dd3..78eb23bbe8 100644
--- a/packages/world/src/queue.ts
+++ b/packages/world/src/queue.ts
@@ -23,6 +23,7 @@ export type TraceCarrier = z.infer<typeof TraceCarrierSchema>;
 export const WorkflowInvokePayloadSchema = z.object({
   runId: z.string(),
+  lockPreApproval: 
z.string().optional(), traceCarrier: TraceCarrierSchema.optional(), requestedAt: z.coerce.date().optional(), /** Number of times this message has been re-enqueued due to server errors (5xx) */ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 70311f2e81..cc6346e3ce 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1289,6 +1289,9 @@ importers: '@workflow/tsconfig': specifier: workspace:* version: link:../tsconfig + vitest: + specifier: 'catalog:' + version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3) zod: specifier: 'catalog:' version: 4.3.6 diff --git a/workbench/example/tsconfig.json b/workbench/example/tsconfig.json index 39c2f1ea68..58fb97394f 100644 --- a/workbench/example/tsconfig.json +++ b/workbench/example/tsconfig.json @@ -1,14 +1,15 @@ { "compilerOptions": { "target": "es2022", - "module": "NodeNext", + "module": "esnext", "lib": ["dom", "dom.iterable", "esnext"], + "baseUrl": ".", "allowJs": true, "skipLibCheck": true, "strict": true, "noEmit": true, "esModuleInterop": true, - "moduleResolution": "NodeNext", + "moduleResolution": "bundler", "resolveJsonModule": true, "isolatedModules": true, "jsx": "preserve", diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 7a0a7b77d6..bf95d34fe8 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -9,6 +9,7 @@ import { getStepMetadata, getWorkflowMetadata, getWritable, + lock, type RequestWithResponse, RetryableError, sleep, @@ -213,6 +214,330 @@ export async function parallelSleepWorkflow() { return { startTime, endTime }; } +async function cheapDbStep(userId: string) { + 'use step'; + return { + userId, + prompt: `profile:${userId}`, + }; +} + +async function expensiveAIStep(prompt: string) { + 'use step'; + return `summary:${prompt}`; +} + +export async function workflowWithScopedLocks(userId = 'user-123') { + 'use workflow'; 
+
+  await using userLimit = await lock({
+    key: `workflow:user:${userId}`,
+    concurrency: { max: 2 },
+    leaseTtlMs: 30_000,
+  });
+
+  let row: Awaited<ReturnType<typeof cheapDbStep>>;
+  {
+    await using _dbLimit = await lock({
+      key: 'step:db:cheap',
+      concurrency: { max: 20 },
+      leaseTtlMs: 30_000,
+    });
+    row = await cheapDbStep(userId);
+  }
+
+  let summary: Awaited<ReturnType<typeof expensiveAIStep>>;
+  {
+    await using _aiLimit = await lock({
+      key: 'step:provider:openai',
+      rate: { count: 10, periodMs: 60_000 },
+      leaseTtlMs: 30_000,
+    });
+    summary = await expensiveAIStep(row.prompt);
+  }
+
+  return {
+    workflowKey: userLimit.key,
+    dbKey: 'step:db:cheap',
+    aiKey: 'step:provider:openai',
+    summary,
+  };
+}
+
+async function serializedLimitStep(
+  label: string,
+  holdMs: number,
+  key = 'step:db:serialized'
+) {
+  'use step';
+
+  const metadata = getStepMetadata();
+  const acquiredAt = Date.now();
+  await new Promise((resolve) => setTimeout(resolve, holdMs));
+  const releasedAt = Date.now();
+
+  return {
+    label,
+    key,
+    attempt: metadata.attempt,
+    acquiredAt,
+    releasedAt,
+  };
+}
+
+export async function workflowLockContentionWorkflow(
+  userId = 'user-123',
+  holdMs = 750
+) {
+  'use workflow';
+
+  const workflowLock = await lock({
+    key: `workflow:user:${userId}`,
+    concurrency: { max: 1 },
+    leaseTtlMs: holdMs + 5_000,
+  });
+
+  const workflowLockAcquiredAt = Date.now();
+  let step: Awaited<ReturnType<typeof serializedLimitStep>>;
+  {
+    await using _nestedLock = await lock({
+      key: 'step:db:serialized',
+      concurrency: { max: 1 },
+      leaseTtlMs: holdMs + 5_000,
+    });
+    step = await serializedLimitStep(userId, holdMs);
+  }
+  const stepCallLockReleasedAt = Date.now();
+  await workflowLock.dispose();
+  const workflowLockReleasedAt = Date.now();
+
+  return {
+    userId,
+    workflowLockAcquiredAt,
+    workflowLockReleasedAt,
+    stepCallLockAcquiredAt: step.acquiredAt,
+    stepCallLockReleasedAt,
+  };
+}
+
+export async function lockedStepCallContentionWorkflow(
+  key = 'step:db:key-contention',
+  holdMs = 750,
+  label = key
+) {
+  'use workflow';
+
+  {
+    await using
_lock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + return await serializedLimitStep(label, holdMs, key); + } +} + +////////////////////////////////////////////////////////// + +export async function workflowOnlyLockContentionWorkflow( + userId = 'user-123', + holdMs = 750, + label = userId +) { + 'use workflow'; + + await using _workflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const workflowLockAcquiredAt = Date.now(); + await sleep(holdMs); + const workflowLockReleasedAt = Date.now(); + + return { + label, + userId, + workflowLockAcquiredAt, + workflowLockReleasedAt, + }; +} + +export async function workflowLeakedLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + const leakedWorkflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + const workflowLockAcquiredAt = Date.now(); + + return { + label, + userId, + key: leakedWorkflowLock.key, + leaseTtlMs, + leakedLeaseId: leakedWorkflowLock.leaseId, + lockAcquiredAt: workflowLockAcquiredAt, + workflowCompletedAt: Date.now(), + }; +} + +export async function leakedKeyLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + const leakedLock = await lock({ + key: `workflow:key:expired:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + return { + label, + key: leakedLock.key, + leaseTtlMs, + leakedLeaseId: leakedLock.leaseId, + lockAcquiredAt: Date.now(), + workflowCompletedAt: Date.now(), + }; +} + +export async function workflowRateLimitContentionWorkflow( + userId = 'user-123', + holdMs = 250, + periodMs = 1_500, + label = userId +) { + 'use workflow'; + + await using _workflowRateLimit = await lock({ + key: `workflow:rate:${userId}`, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + + const workflowRateAcquiredAt = 
Date.now();
+  await sleep(holdMs);
+  const workflowRateReleasedAt = Date.now();
+
+  return {
+    label,
+    userId,
+    periodMs,
+    workflowRateAcquiredAt,
+    workflowRateReleasedAt,
+  };
+}
+
+export async function releasedRateLimitReplayWorkflow(
+  userId = 'user-123',
+  periodMs = 6_000,
+  sleepMs = 100
+) {
+  'use workflow';
+
+  const startedAt = Date.now();
+  {
+    await using _releasedRateLimit = await lock({
+      key: `workflow:replay-rate:${userId}`,
+      rate: { count: 1, periodMs },
+      leaseTtlMs: periodMs + 5_000,
+    });
+  }
+
+  await sleep(sleepMs);
+
+  return {
+    elapsedMs: Date.now() - startedAt,
+    periodMs,
+    sleepMs,
+  };
+}
+
+export async function workflowMixedLimitContentionWorkflow(
+  userId = 'user-123',
+  holdMs = 250,
+  periodMs = 1_500,
+  label = userId
+) {
+  'use workflow';
+
+  await using _mixedLimit = await lock({
+    key: `workflow:mixed:${userId}`,
+    concurrency: { max: 1 },
+    rate: { count: 1, periodMs },
+    leaseTtlMs: periodMs + 5_000,
+  });
+
+  const workflowRateAcquiredAt = Date.now();
+  await sleep(holdMs);
+  const workflowRateReleasedAt = Date.now();
+
+  return {
+    label,
+    userId,
+    periodMs,
+    workflowRateAcquiredAt,
+    workflowRateReleasedAt,
+  };
+}
+
+async function scopedMultiStepStep(label: string, holdMs: number) {
+  'use step';
+
+  const metadata = getStepMetadata();
+  await new Promise((resolve) => setTimeout(resolve, holdMs));
+  return {
+    label,
+    attempt: metadata.attempt,
+    completedAt: Date.now(),
+  };
+}
+
+export async function singleLockAcrossMultipleStepsWorkflow(
+  key = 'step:db:batch',
+  holdMs = 400
+) {
+  'use workflow';
+
+  let workflowLockAcquiredAt: number;
+  let first: Awaited<ReturnType<typeof scopedMultiStepStep>>;
+  let second: Awaited<ReturnType<typeof scopedMultiStepStep>>;
+  let workflowLockReleasedAt: number;
+  {
+    await using _lock = await lock({
+      key,
+      concurrency: { max: 1 },
+      leaseTtlMs: holdMs * 2 + 5_000,
+    });
+
+    workflowLockAcquiredAt = Date.now();
+    first = await scopedMultiStepStep('first', holdMs);
+    second = await scopedMultiStepStep('second', holdMs);
+    
workflowLockReleasedAt = Date.now(); + } + + return { + key, + workflowLockAcquiredAt, + firstStepCompletedAt: first.completedAt, + secondStepCompletedAt: second.completedAt, + workflowLockReleasedAt, + }; +} + ////////////////////////////////////////////////////////// async function nullByteStep() { diff --git a/workbench/nextjs-turbopack/next.config.ts b/workbench/nextjs-turbopack/next.config.ts index 78df6b2090..5d1a204118 100644 --- a/workbench/nextjs-turbopack/next.config.ts +++ b/workbench/nextjs-turbopack/next.config.ts @@ -1,7 +1,9 @@ -import type { NextConfig } from 'next'; import path from 'node:path'; +import type { NextConfig } from 'next'; import { withWorkflow } from 'workflow/next'; +process.env.WORKFLOW_PUBLIC_MANIFEST ??= '1'; + const turbopackRoot = path.resolve(process.cwd(), '../..'); const nextConfig: NextConfig = {