From 5544e9cb965faf4fea06b38d77a84354abd34c10 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 15:22:07 -0400 Subject: [PATCH 01/34] initial local + pg stubs Signed-off-by: nathancolosimo --- packages/core/src/index.ts | 6 ++ packages/core/src/lock.ts | 36 +++++++++ packages/core/src/workflow/index.ts | 6 ++ packages/world-local/README.md | 3 +- packages/world-local/src/index.ts | 2 + packages/world-local/src/limits.ts | 15 ++++ packages/world-postgres/src/index.ts | 3 + packages/world-postgres/src/limits.ts | 20 +++++ packages/world-vercel/src/index.ts | 2 + packages/world-vercel/src/limits.ts | 16 ++++ packages/world/package.json | 4 +- packages/world/src/index.ts | 18 +++++ packages/world/src/interfaces.ts | 3 + packages/world/src/limits.ts | 105 ++++++++++++++++++++++++++ 14 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 packages/core/src/lock.ts create mode 100644 packages/world-local/src/limits.ts create mode 100644 packages/world-postgres/src/limits.ts create mode 100644 packages/world-vercel/src/limits.ts create mode 100644 packages/world/src/limits.ts diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 1d969aeaa6..413f87fa74 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -25,6 +25,12 @@ export { type WebhookOptions, } from './create-hook.js'; export { defineHook, type TypedHook } from './define-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from './lock.js'; export { sleep } from './sleep.js'; export { getStepMetadata, diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts new file mode 100644 index 0000000000..fc9a848875 --- /dev/null +++ b/packages/core/src/lock.ts @@ -0,0 +1,36 @@ +import { + createLimitsNotImplementedError, + type LimitDefinition, + type LimitKey, + type LimitLease, +} from '@workflow/world'; + +export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; + +/** + * 
Reserved first-pass user-facing API for future flow concurrency and rate + * limiting inside workflow functions. + */ +export interface LockOptions extends LimitDefinition { + key: LimitKey; + leaseTtlMs?: number; +} + +/** + * Reserved handle shape for future lock acquisition. + */ +export interface LockHandle + extends Pick { + release(): Promise; + heartbeat(ttlMs?: number): Promise; +} + +/** + * Reserved workflow API for future concurrency and rate limiting. + * + * This placeholder intentionally throws until the runtime and world + * implementations gain real support. + */ +export async function lock(_options: LockOptions): Promise { + throw createLimitsNotImplementedError(); +} diff --git a/packages/core/src/workflow/index.ts b/packages/core/src/workflow/index.ts index 61cc317491..86807ed04b 100644 --- a/packages/core/src/workflow/index.ts +++ b/packages/core/src/workflow/index.ts @@ -6,6 +6,12 @@ export { type RetryableErrorOptions, } from '@workflow/errors'; export type { Hook, HookOptions } from '../create-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from '../lock.js'; export { sleep } from '../sleep.js'; export { createHook, createWebhook } from './create-hook.js'; export { defineHook } from './define-hook.js'; diff --git a/packages/world-local/README.md b/packages/world-local/README.md index 9e3f0d95cc..cff6a3354a 100644 --- a/packages/world-local/README.md +++ b/packages/world-local/README.md @@ -4,5 +4,6 @@ Filesystem-based workflow backend for local development and testing. Stores workflow data as JSON files on disk and provides in-memory queuing. Automatically detects development server port for queue transport. -Used by default on `next dev` and `next start`. +The `limits` namespace is exposed as part of the shared world contract, but flow concurrency and rate limiting are not implemented in this package yet. +Used by default on `next dev` and `next start`. 
diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 6ec4800c8e..96f03efa57 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -12,6 +12,7 @@ import { readJSON, } from './fs.js'; import { initDataDir } from './init.js'; +import { createLimits } from './limits.js'; import { createQueue, type DirectHandler } from './queue.js'; import { createStorage } from './storage.js'; import { hashToken } from './storage/helpers.js'; @@ -61,6 +62,7 @@ export function createLocalWorld(args?: Partial): LocalWorld { const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); return { + limits: createLimits(mergedConfig.dataDir, tag), ...queue, ...createStorage(mergedConfig.dataDir, tag), ...createStreamer(mergedConfig.dataDir, tag), diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts new file mode 100644 index 0000000000..5e2f249449 --- /dev/null +++ b/packages/world-local/src/limits.ts @@ -0,0 +1,15 @@ +import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; + +export function createLimits(dataDir: string, tag?: string): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world-postgres/src/index.ts b/packages/world-postgres/src/index.ts index 6f2993e3db..ad1a4c0028 100644 --- a/packages/world-postgres/src/index.ts +++ b/packages/world-postgres/src/index.ts @@ -3,6 +3,7 @@ import type { Storage, World } from '@workflow/world'; import createPostgres from 'postgres'; import type { PostgresWorldConfig } from './config.js'; import { createClient, type Drizzle } from './drizzle/index.js'; +import { createLimits } from './limits.js'; import { createQueue } from './queue.js'; import { createEventsStorage, @@ -37,8 +38,10 @@ export function 
createWorld( const queue = createQueue(config, postgres); const storage = createStorage(drizzle); const streamer = createStreamer(postgres, drizzle); + const limits = createLimits(config, drizzle); return { + limits, ...storage, ...streamer, ...queue, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts new file mode 100644 index 0000000000..7294a90c3b --- /dev/null +++ b/packages/world-postgres/src/limits.ts @@ -0,0 +1,20 @@ +import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import type { PostgresWorldConfig } from './config.js'; +import type { Drizzle } from './drizzle/index.js'; + +export function createLimits( + config: PostgresWorldConfig, + drizzle: Drizzle +): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world-vercel/src/index.ts b/packages/world-vercel/src/index.ts index 975dc49863..ec7b9bdb1b 100644 --- a/packages/world-vercel/src/index.ts +++ b/packages/world-vercel/src/index.ts @@ -1,5 +1,6 @@ import type { World } from '@workflow/world'; import { createGetEncryptionKeyForRun } from './encryption.js'; +import { createLimits } from './limits.js'; import { createQueue } from './queue.js'; import { createResolveLatestDeploymentId } from './resolve-latest-deployment.js'; import { createStorage } from './storage.js'; @@ -23,6 +24,7 @@ export function createVercelWorld(config?: APIConfig): World { config?.projectConfig?.projectId || process.env.VERCEL_PROJECT_ID; return { + limits: createLimits(config), ...createQueue(config), ...createStorage(config), ...createStreamer(config), diff --git a/packages/world-vercel/src/limits.ts b/packages/world-vercel/src/limits.ts new file mode 100644 index 0000000000..bff6c07ac2 --- /dev/null +++ b/packages/world-vercel/src/limits.ts @@ -0,0 +1,16 @@ 
+import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import type { APIConfig } from './utils.js'; + +export function createLimits(config?: APIConfig): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world/package.json b/packages/world/package.json index e250e45412..57546d8b1c 100644 --- a/packages/world/package.json +++ b/packages/world/package.json @@ -20,7 +20,8 @@ "scripts": { "build": "tsc", "dev": "tsc --watch", - "clean": "tsc --build --clean && rm -rf dist" + "clean": "tsc --build --clean && rm -rf dist", + "test": "vitest run src" }, "dependencies": { "ulid": "catalog:" @@ -30,6 +31,7 @@ }, "devDependencies": { "@types/node": "catalog:", + "vitest": "catalog:", "zod": "catalog:", "@workflow/tsconfig": "workspace:*" }, diff --git a/packages/world/src/index.ts b/packages/world/src/index.ts index 3e7ed1c4fb..fd12d63d94 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -10,6 +10,24 @@ export { export type * from './hooks.js'; export { HookSchema } from './hooks.js'; export type * from './interfaces.js'; +export type * from './limits.js'; +export { + createLimitsNotImplementedError, + LimitAcquireAcquiredResultSchema, + LimitAcquireBlockedResultSchema, + LimitAcquireRequestSchema, + LimitAcquireResultSchema, + LimitAcquireStatusSchema, + LimitBlockedReasonSchema, + LimitConcurrencySchema, + LimitDefinitionSchema, + LimitHeartbeatRequestSchema, + LimitKeySchema, + LimitLeaseSchema, + LimitRateSchema, + LimitReleaseRequestSchema, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from './limits.js'; export type * from './queue.js'; export { HealthCheckPayloadSchema, diff --git a/packages/world/src/interfaces.ts b/packages/world/src/interfaces.ts index d53fd96d14..87c57c0c8f 100644 --- 
a/packages/world/src/interfaces.ts +++ b/packages/world/src/interfaces.ts @@ -9,6 +9,7 @@ import type { RunCreatedEventRequest, } from './events.js'; import type { GetHookParams, Hook, ListHooksParams } from './hooks.js'; +import type { Limits } from './limits.js'; import type { Queue } from './queue.js'; import type { GetWorkflowRunParams, @@ -179,6 +180,8 @@ export interface Storage { * The "World" interface represents how Workflows are able to communicate with the outside world. */ export interface World extends Queue, Storage, Streamer { + limits: Limits; + /** * A function that will be called to start any background tasks needed by the World implementation. * For example, in the case of a queue backed World, this would start the queue processing. diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts new file mode 100644 index 0000000000..ec155b2d8d --- /dev/null +++ b/packages/world/src/limits.ts @@ -0,0 +1,105 @@ +import { z } from 'zod'; + +export const LIMITS_NOT_IMPLEMENTED_MESSAGE = + 'Flow limits are reserved for future support and are not implemented yet.'; + +export function createLimitsNotImplementedError(): Error { + return new Error(LIMITS_NOT_IMPLEMENTED_MESSAGE); +} + +export const LimitKeySchema = z.string().min(1); +export type LimitKey = z.infer; + +export const LimitConcurrencySchema = z.object({ + max: z.number().int().positive(), +}); +export type LimitConcurrency = z.infer; + +export const LimitRateSchema = z.object({ + count: z.number().int().positive(), + periodMs: z.number().int().positive(), +}); +export type LimitRate = z.infer; + +export const LimitDefinitionSchema = z + .object({ + concurrency: LimitConcurrencySchema.optional(), + rate: LimitRateSchema.optional(), + }) + .refine( + (value) => value.concurrency !== undefined || value.rate !== undefined, + { + message: 'At least one limit must be configured', + } + ); +export type LimitDefinition = z.infer; + +export const LimitLeaseSchema = z.object({ + leaseId: 
z.string().min(1), + key: LimitKeySchema, + holderId: z.string().min(1), + acquiredAt: z.coerce.date(), + expiresAt: z.coerce.date().optional(), + definition: LimitDefinitionSchema, +}); +export type LimitLease = z.infer; + +export const LimitAcquireRequestSchema = z.object({ + key: LimitKeySchema, + holderId: z.string().min(1), + definition: LimitDefinitionSchema, + leaseTtlMs: z.number().int().positive().optional(), +}); +export type LimitAcquireRequest = z.infer; + +export const LimitBlockedReasonSchema = z.enum([ + 'concurrency', + 'rate', + 'concurrency_and_rate', +]); +export type LimitBlockedReason = z.infer; + +export const LimitAcquireStatusSchema = z.enum(['acquired', 'blocked']); +export type LimitAcquireStatus = z.infer; + +export const LimitAcquireAcquiredResultSchema = z.object({ + status: z.literal(LimitAcquireStatusSchema.enum.acquired), + lease: LimitLeaseSchema, +}); +export type LimitAcquireAcquiredResult = z.infer< + typeof LimitAcquireAcquiredResultSchema +>; + +export const LimitAcquireBlockedResultSchema = z.object({ + status: z.literal(LimitAcquireStatusSchema.enum.blocked), + reason: LimitBlockedReasonSchema, + retryAfterMs: z.number().int().nonnegative().optional(), +}); +export type LimitAcquireBlockedResult = z.infer< + typeof LimitAcquireBlockedResultSchema +>; + +export const LimitAcquireResultSchema = z.discriminatedUnion('status', [ + LimitAcquireAcquiredResultSchema, + LimitAcquireBlockedResultSchema, +]); +export type LimitAcquireResult = z.infer; + +export const LimitReleaseRequestSchema = z.object({ + leaseId: z.string().min(1), + key: LimitKeySchema.optional(), + holderId: z.string().min(1).optional(), +}); +export type LimitReleaseRequest = z.infer; + +export const LimitHeartbeatRequestSchema = z.object({ + leaseId: z.string().min(1), + ttlMs: z.number().int().positive().optional(), +}); +export type LimitHeartbeatRequest = z.infer; + +export interface Limits { + acquire(request: LimitAcquireRequest): Promise; + 
release(request: LimitReleaseRequest): Promise; + heartbeat(request: LimitHeartbeatRequest): Promise; +} From 03b016c77b67f21d3999ec08b37e1f36b86ed6f2 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 15:31:51 -0400 Subject: [PATCH 02/34] added test stubs Signed-off-by: nathancolosimo --- packages/core/src/lock.test.ts | 19 +++++++++++++++++++ packages/world-local/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-local/src/limits.ts | 2 +- packages/world-postgres/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-postgres/src/limits.ts | 4 ++-- packages/world-vercel/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-vercel/src/limits.ts | 2 +- packages/world/src/limits.test.ts | 19 +++++++++++++++++++ 8 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 packages/core/src/lock.test.ts create mode 100644 packages/world-local/src/limits.test.ts create mode 100644 packages/world-postgres/src/limits.test.ts create mode 100644 packages/world-vercel/src/limits.test.ts create mode 100644 packages/world/src/limits.test.ts diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts new file mode 100644 index 0000000000..3c1177b07c --- /dev/null +++ b/packages/core/src/lock.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('lock', () => { + it.fails('is only callable inside workflow execution context', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a handle with release and heartbeat behavior', () => { + throw new Error('TODO: implement'); + }); + + it.fails('allows multiple holders for one key up to the concurrency max', () => { + throw new Error('TODO: implement'); + }); + + it.fails('blocks rate-only locks until the rate window advances', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts new file mode 100644 index 0000000000..16ce754f8a --- 
/dev/null +++ b/packages/world-local/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('local world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('enforces per-key concurrency limits', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a retry path when rate limits block acquisition', () => { + throw new Error('TODO: implement'); + }); + + it.fails('restores capacity when a lease is released or expires', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 5e2f249449..68de99ccbb 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,6 +1,6 @@ import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; -export function createLimits(dataDir: string, tag?: string): Limits { +export function createLimits(_dataDir: string, _tag?: string): Limits { return { async acquire() { throw createLimitsNotImplementedError(); diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts new file mode 100644 index 0000000000..2c43f08584 --- /dev/null +++ b/packages/world-postgres/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('postgres world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('respects the concurrency cap across concurrent acquires', () => { + throw new Error('TODO: implement'); + }); + + it.fails('wakes waiters in deterministic order when a lease is released', () => { + throw new Error('TODO: implement'); + }); + + it.fails('reclaims stale leases after worker or process death', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-postgres/src/limits.ts 
b/packages/world-postgres/src/limits.ts index 7294a90c3b..01e8184c79 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -3,8 +3,8 @@ import type { PostgresWorldConfig } from './config.js'; import type { Drizzle } from './drizzle/index.js'; export function createLimits( - config: PostgresWorldConfig, - drizzle: Drizzle + _config: PostgresWorldConfig, + _drizzle: Drizzle ): Limits { return { async acquire() { diff --git a/packages/world-vercel/src/limits.test.ts b/packages/world-vercel/src/limits.test.ts new file mode 100644 index 0000000000..2afdf8af80 --- /dev/null +++ b/packages/world-vercel/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('vercel world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('enforces per-key concurrency limits', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a retry path when rate limits block acquisition', () => { + throw new Error('TODO: implement'); + }); + + it.fails('restores capacity when a lease is released or expires', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-vercel/src/limits.ts b/packages/world-vercel/src/limits.ts index bff6c07ac2..785fa4886e 100644 --- a/packages/world-vercel/src/limits.ts +++ b/packages/world-vercel/src/limits.ts @@ -1,7 +1,7 @@ import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; import type { APIConfig } from './utils.js'; -export function createLimits(config?: APIConfig): Limits { +export function createLimits(_config?: APIConfig): Limits { return { async acquire() { throw createLimitsNotImplementedError(); diff --git a/packages/world/src/limits.test.ts b/packages/world/src/limits.test.ts new file mode 100644 index 0000000000..8796d636ad --- /dev/null +++ b/packages/world/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 
'vitest'; + +describe('limits schemas', () => { + it.fails('accepts concurrency-only, rate-only, and combined limit definitions', () => { + throw new Error('TODO: implement'); + }); + + it.fails('rejects invalid or empty limit definitions', () => { + throw new Error('TODO: implement'); + }); + + it.fails('discriminates acquired and blocked acquire results', () => { + throw new Error('TODO: implement'); + }); + + it.fails('keeps lease, release, and heartbeat request shapes stable', () => { + throw new Error('TODO: implement'); + }); +}); From 4b918ca431dd22a7343e067f4b2e64f3b0442c1d Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 17:46:54 -0400 Subject: [PATCH 03/34] add e2e examples --- packages/core/e2e/e2e.test.ts | 43 ++++++++++++++++++++++++ packages/core/src/lock.test.ts | 2 +- packages/core/src/lock.ts | 3 +- workbench/example/workflows/99_e2e.ts | 48 +++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 2 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 1c9eeb8451..f24042d2a7 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -220,6 +220,9 @@ describe('e2e', () => { const isNext = process.env.APP_NAME?.includes('nextjs'); const isLocal = deploymentUrl.includes('localhost'); + const isPostgresWorld = + process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres'; + const isLocalWorld = isLocalDeployment() && !isPostgresWorld; // only works with framework that transpiles react and // doesn't work on Vercel due to eval hack so react isn't // bundled in function @@ -544,6 +547,46 @@ describe('e2e', () => { expect(elapsed).toBeLessThan(25_000); }); + if (isLocalWorld) { + test.fails( + 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', + { timeout: 60_000 }, + async () => { + const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ + 'local-world', + ]); + const returnValue = await run.returnValue; + + 
expect(returnValue).toMatchObject({ + workflowKey: 'workflow:user:local-world', + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: 'summary:profile:local-world', + }); + } + ); + } + + if (isPostgresWorld) { + test.fails( + 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', + { timeout: 60_000 }, + async () => { + const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ + 'postgres-world', + ]); + const returnValue = await run.returnValue; + + expect(returnValue).toMatchObject({ + workflowKey: 'workflow:user:postgres-world', + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: 'summary:profile:postgres-world', + }); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index 3c1177b07c..58419ae49e 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -5,7 +5,7 @@ describe('lock', () => { throw new Error('TODO: implement'); }); - it.fails('returns a handle with release and heartbeat behavior', () => { + it.fails('returns a handle with dispose and heartbeat behavior', () => { throw new Error('TODO: implement'); }); diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index fc9a848875..b21fc6834d 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -21,8 +21,9 @@ export interface LockOptions extends LimitDefinition { */ export interface LockHandle extends Pick { - release(): Promise; + dispose(): Promise; heartbeat(ttlMs?: number): Promise; + [Symbol.asyncDispose](): Promise; } /** diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index d6caf9dcc8..13dcd7cb0b 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -9,6 +9,7 @@ import 
{ getStepMetadata, getWorkflowMetadata, getWritable, + lock, type RequestWithResponse, RetryableError, sleep, @@ -213,6 +214,53 @@ export async function parallelSleepWorkflow() { return { startTime, endTime }; } +async function cheapDbStep(userId: string) { + 'use step'; + + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + leaseTtlMs: 30_000, + }); + + return { + userId, + prompt: `profile:${userId}`, + }; +} + +async function expensiveAIStep(prompt: string) { + 'use step'; + + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + leaseTtlMs: 30_000, + }); + + return `summary:${prompt}`; +} + +export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { + 'use workflow'; + + await using userLimit = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 2 }, + leaseTtlMs: 30_000, + }); + + const row = await cheapDbStep(userId); + const summary = await expensiveAIStep(row.prompt); + + return { + workflowKey: userLimit.key, + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary, + }; +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 4b9c8b675e3a31901a0ea271fa7f64b6f18b2300 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 22:34:23 -0400 Subject: [PATCH 04/34] DCO Remediation Commit for nathancolosimo I, nathancolosimo , hereby add my Signed-off-by to this commit: b0e2f2a37bc813ec244991353f58e5885a5e8540 Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 2 +- packages/core/src/lock.ts | 22 +- packages/core/src/runtime/step-handler.ts | 62 +++-- packages/core/src/step/context-storage.ts | 1 + packages/core/src/step/lock.ts | 78 ++++++ packages/core/src/symbols.ts | 2 + packages/core/src/workflow.ts | 5 + packages/core/src/workflow/lock.ts | 130 ++++++++++ packages/world-local/src/index.ts | 5 +- packages/world-local/src/limits.test.ts | 148 
++++++++++- packages/world-local/src/limits.ts | 301 +++++++++++++++++++++- packages/world/FLOW_LIMITS.md | 280 ++++++++++++++++++++ pnpm-lock.yaml | 13 +- workbench/nextjs-turbopack/next.config.ts | 4 +- 14 files changed, 995 insertions(+), 58 deletions(-) create mode 100644 packages/core/src/step/lock.ts create mode 100644 packages/core/src/workflow/lock.ts create mode 100644 packages/world/FLOW_LIMITS.md diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index f24042d2a7..86325b1c58 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -548,7 +548,7 @@ describe('e2e', () => { }); if (isLocalWorld) { - test.fails( + test( 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', { timeout: 60_000 }, async () => { diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index b21fc6834d..11829957d0 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -4,6 +4,7 @@ import { type LimitKey, type LimitLease, } from '@workflow/world'; +import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; @@ -28,10 +29,23 @@ export interface LockHandle /** * Reserved workflow API for future concurrency and rate limiting. - * - * This placeholder intentionally throws until the runtime and world - * implementations gain real support. 
*/ -export async function lock(_options: LockOptions): Promise { +export async function lock(options: LockOptions): Promise { + const workflowLock = (globalThis as any)[WORKFLOW_LOCK] as + | ((options: LockOptions) => Promise) + | undefined; + + if (workflowLock) { + return workflowLock(options); + } + + const stepLock = (globalThis as any)[STEP_LOCK] as + | ((options: LockOptions) => Promise) + | undefined; + + if (stepLock) { + return stepLock(options); + } + throw createLimitsNotImplementedError(); } diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index 8e7f01b983..44c585315a 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -16,6 +16,8 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; +import { createStepLock } from '../step/lock.js'; +import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { getSpanKind, @@ -117,7 +119,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // - Step not in terminal state (returns 409) // - retryAfter timestamp reached (returns 425 with Retry-After header) // - Workflow still active (returns 410 if completed) - let step; + let step: Awaited>; try { const startResult = await world.events.create( workflowRunId, @@ -384,31 +386,43 @@ const stepHandler = getWorldHandlers().createQueueHandler( const executionStartTime = Date.now(); try { + const previousStepLock = (globalThis as any)[STEP_LOCK]; + (globalThis as any)[STEP_LOCK] = createStepLock(world); + result = await trace('step.execute', {}, async () => { - return await contextStorage.run( - { - stepMetadata: { - stepName, - stepId, - stepStartedAt: new Date(+stepStartedAt), - attempt, - }, - workflowMetadata: { - workflowName, - workflowRunId, - workflowStartedAt: new Date(+workflowStartedAt), - // TODO: there should be a getUrl 
method on the world interface itself. This - // solution only works for vercel + local worlds. - url: isVercel - ? `https://${process.env.VERCEL_URL}` - : `http://localhost:${port ?? 3000}`, + try { + return await contextStorage.run( + { + stepMetadata: { + stepName, + stepId, + stepStartedAt: new Date(+stepStartedAt), + attempt, + }, + workflowMetadata: { + workflowName, + workflowRunId, + workflowStartedAt: new Date(+workflowStartedAt), + // TODO: there should be a getUrl method on the world interface itself. This + // solution only works for vercel + local worlds. + url: isVercel + ? `https://${process.env.VERCEL_URL}` + : `http://localhost:${port ?? 3000}`, + }, + ops, + closureVars: hydratedInput.closureVars, + encryptionKey, + lockCounter: 0, }, - ops, - closureVars: hydratedInput.closureVars, - encryptionKey, - }, - () => stepFn.apply(thisVal, args) - ); + () => stepFn.apply(thisVal, args) + ); + } finally { + if (previousStepLock === undefined) { + delete (globalThis as any)[STEP_LOCK]; + } else { + (globalThis as any)[STEP_LOCK] = previousStepLock; + } + } }); } catch (err) { userCodeError = err; diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index 2a9aa8b7e1..dadb25b132 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -9,4 +9,5 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ ops: Promise[]; closureVars?: Record; encryptionKey?: CryptoKey; + lockCounter: number; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts new file mode 100644 index 0000000000..7451b4e712 --- /dev/null +++ b/packages/core/src/step/lock.ts @@ -0,0 +1,78 @@ +import type { LimitLease, World } from '@workflow/world'; +import type { LockHandle, LockOptions } from '../lock.js'; +import { contextStorage } from './context-storage.js'; + +function createStepLockHandle(lease: LimitLease, world: World): LockHandle { + let 
currentLease = lease; + let disposed = false; + + const dispose = async () => { + if (disposed) return; + disposed = true; + await world.limits.release({ + leaseId: currentLease.leaseId, + key: currentLease.key, + holderId: currentLease.holderId, + }); + }; + + const heartbeat = async (ttlMs?: number) => { + currentLease = await world.limits.heartbeat({ + leaseId: currentLease.leaseId, + ttlMs, + }); + }; + + return { + get leaseId() { + return currentLease.leaseId; + }, + get key() { + return currentLease.key; + }, + get holderId() { + return currentLease.holderId; + }, + get expiresAt() { + return currentLease.expiresAt; + }, + dispose, + heartbeat, + [Symbol.asyncDispose]: dispose, + }; +} + +export function createStepLock(world: World) { + return async function lockInStep(options: LockOptions): Promise { + const store = contextStorage.getStore(); + if (!store) { + throw new Error( + '`lock()` can only be called inside a workflow or step function' + ); + } + + const lockIndex = store.lockCounter++; + const holderId = `stplock_${store.workflowMetadata.workflowRunId}:${store.stepMetadata.stepId}:${lockIndex}`; + const definition = { + concurrency: options.concurrency, + rate: options.rate, + }; + + while (true) { + const result = await world.limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); + + if (result.status === 'acquired') { + return createStepLockHandle(result.lease, world); + } + + await new Promise((resolve) => + setTimeout(resolve, result.retryAfterMs || 1000) + ); + } + }; +} diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index 92df4058db..cd9616b17e 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -1,6 +1,8 @@ export const WORKFLOW_USE_STEP = Symbol.for('WORKFLOW_USE_STEP'); export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); +export const WORKFLOW_LOCK = 
Symbol.for('WORKFLOW_LOCK'); +export const STEP_LOCK = Symbol.for('STEP_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); export const WORKFLOW_GET_STREAM_ID = Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index 5d18c085b4..ece1823196 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -22,6 +22,7 @@ import { STABLE_ULID, WORKFLOW_CREATE_HOOK, WORKFLOW_GET_STREAM_ID, + WORKFLOW_LOCK, WORKFLOW_SLEEP, WORKFLOW_USE_STEP, } from './symbols.js'; @@ -32,6 +33,7 @@ import { createContext } from './vm/index.js'; import type { WorkflowMetadata } from './workflow/get-workflow-metadata.js'; import { WORKFLOW_CONTEXT_SYMBOL } from './workflow/get-workflow-metadata.js'; import { createCreateHook } from './workflow/hook.js'; +import { createLock } from './workflow/lock.js'; import { createSleep } from './workflow/sleep.js'; /** @@ -184,6 +186,7 @@ export async function runWorkflow( const useStep = createUseStep(workflowContext); const createHook = createCreateHook(workflowContext); + const lock = createLock(workflowContext); const sleep = createSleep(workflowContext); // @ts-expect-error - `@types/node` says symbol is not valid, but it does work @@ -191,6 +194,8 @@ export async function runWorkflow( // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_CREATE_HOOK] = createHook; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work + vmGlobalThis[WORKFLOW_LOCK] = lock; + // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_SLEEP] = sleep; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_GET_STREAM_ID] = (namespace?: string) => diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts new file mode 100644 
index 0000000000..21db74d825 --- /dev/null +++ b/packages/core/src/workflow/lock.ts @@ -0,0 +1,130 @@ +import { EventConsumerResult } from '../events-consumer.js'; +import { WorkflowSuspension } from '../global.js'; +import type { LockHandle, LockOptions } from '../lock.js'; +import { + scheduleWhenIdle, + type WorkflowOrchestratorContext, +} from '../private.js'; +import { getWorld } from '../runtime/world.js'; + +function createLockHandle( + lease: { + leaseId: string; + key: string; + holderId: string; + expiresAt?: Date; + }, + ctx: WorkflowOrchestratorContext +): LockHandle { + let currentLease = lease; + let disposed = false; + + const dispose = async () => { + if (disposed) return; + disposed = true; + await getWorld().limits.release({ + leaseId: currentLease.leaseId, + key: currentLease.key, + holderId: currentLease.holderId, + }); + }; + + const heartbeat = async (ttlMs?: number) => { + currentLease = await getWorld().limits.heartbeat({ + leaseId: currentLease.leaseId, + ttlMs, + }); + }; + + const handle: LockHandle = { + get leaseId() { + return currentLease.leaseId; + }, + get key() { + return currentLease.key; + }, + get holderId() { + return currentLease.holderId; + }, + get expiresAt() { + return currentLease.expiresAt; + }, + dispose, + heartbeat, + [Symbol.asyncDispose]: dispose, + }; + + const vmAsyncDispose = ctx.globalThis.Symbol.asyncDispose; + if (vmAsyncDispose && vmAsyncDispose !== Symbol.asyncDispose) { + (handle as any)[vmAsyncDispose] = dispose; + } + + return handle; +} + +export function createLock(ctx: WorkflowOrchestratorContext) { + return async function lockImpl(options: LockOptions): Promise { + const holderId = `wflock_${ctx.generateUlid()}`; + const definition = { + concurrency: options.concurrency, + rate: options.rate, + }; + + while (true) { + const result = await getWorld().limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); + + if (result.status === 'acquired') { + return 
createLockHandle(result.lease, ctx); + } + + const correlationId = `wflock_wait_${ctx.generateUlid()}`; + const resumeAt = new Date(Date.now() + (result.retryAfterMs || 1000)); + ctx.invocationsQueue.set(correlationId, { + type: 'wait', + correlationId, + resumeAt, + }); + + await new Promise((resolve) => { + ctx.eventsConsumer.subscribe((event) => { + if (!event) { + scheduleWhenIdle(ctx, () => { + ctx.onWorkflowError( + new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + ); + }); + return EventConsumerResult.NotConsumed; + } + + if (event.correlationId !== correlationId) { + return EventConsumerResult.NotConsumed; + } + + if (event.eventType === 'wait_created') { + const queueItem = ctx.invocationsQueue.get(correlationId); + if (queueItem && queueItem.type === 'wait') { + queueItem.hasCreatedEvent = true; + queueItem.resumeAt = event.eventData.resumeAt; + } + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'wait_completed') { + ctx.invocationsQueue.delete(correlationId); + ctx.promiseQueue = ctx.promiseQueue.then(() => { + resolve(); + }); + return EventConsumerResult.Finished; + } + + return EventConsumerResult.NotConsumed; + }); + }); + } + }; +} diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 96f03efa57..029154649d 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -14,8 +14,8 @@ import { import { initDataDir } from './init.js'; import { createLimits } from './limits.js'; import { createQueue, type DirectHandler } from './queue.js'; -import { createStorage } from './storage.js'; import { hashToken } from './storage/helpers.js'; +import { createStorage } from './storage.js'; import { createStreamer } from './streamer.js'; // Re-export init types and utilities for consumers @@ -28,7 +28,7 @@ export { parseVersion, } from './init.js'; -export { type DirectHandler } from './queue.js'; +export type { DirectHandler } from './queue.js'; export type 
LocalWorld = World & { /** Register a direct in-process handler for a queue prefix, bypassing HTTP. */ @@ -104,6 +104,7 @@ export function createLocalWorld(args?: Partial): LocalWorld { 'steps', 'events', 'hooks', + 'limits', 'waits', 'streams/runs', ]; diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 16ce754f8a..1db72676af 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,19 +1,149 @@ -import { describe, it } from 'vitest'; +import { mkdtemp, rm } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import { setTimeout as sleep } from 'node:timers/promises'; +import { describe, expect, it } from 'vitest'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; + +async function withTempDir(fn: (dir: string) => Promise): Promise { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + try { + return await fn(dir); + } finally { + await rm(dir, { recursive: true, force: true }); + } +} describe('local world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); + it('exposes the required limits namespace', async () => { + await withTempDir(async (dir) => { + const world = createLocalWorld({ dataDir: dir }); + expect(world.limits).toBeDefined(); + expect(typeof world.limits.acquire).toBe('function'); + expect(typeof world.limits.release).toBe('function'); + expect(typeof world.limits.heartbeat).toBe('function'); + await world.close?.(); + }); }); - it.fails('enforces per-key concurrency limits', () => { - throw new Error('TODO: implement'); + it('enforces per-key concurrency limits', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 
1_000, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + const second = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency', + }); + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(third.status).toBe('acquired'); + }); }); - it.fails('returns a retry path when rate limits block acquisition', () => { - throw new Error('TODO: implement'); + it('returns a retry path when rate limits block acquisition', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-a', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const second = await limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-b', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + if (second.status !== 'blocked') { + throw new Error('expected second acquire to be blocked'); + } + expect(second.reason).toBe('rate'); + expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + }); }); - it.fails('restores capacity when a lease is released or expires', () => { - throw new Error('TODO: implement'); + it('restores 
capacity when a lease is released or expires', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 25, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + const heartbeat = await limits.heartbeat({ + leaseId: first.lease.leaseId, + ttlMs: 50, + }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( + first.lease.expiresAt?.getTime() ?? 0 + ); + + await sleep(60); + + const second = await limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('acquired'); + }); }); }); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 68de99ccbb..9dfac5d931 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,15 +1,300 @@ -import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import path from 'node:path'; +import { WorkflowAPIError } from '@workflow/errors'; +import { + LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + LimitLeaseSchema, + LimitReleaseRequestSchema, + type Limits, +} from '@workflow/world'; +import { z } from 'zod'; +import { readJSON, writeJSON } from './fs.js'; +import { monotonicUlid } from './storage/helpers.js'; + +const LimitTokenSchema = z.object({ + tokenId: z.string(), + holderId: z.string(), + acquiredAt: z.coerce.date(), + expiresAt: z.coerce.date(), +}); + +const KeyStateSchema = z.object({ + key: z.string(), + leases: z.array(LimitLeaseSchema), + tokens: z.array(LimitTokenSchema), +}); + +const LimitsStateSchema = z.object({ + version: z.literal(1), + keys: z.record(z.string(), 
KeyStateSchema), +}); + +type LimitToken = z.infer; +type KeyState = z.infer; +type LimitsState = z.infer; + +const EMPTY_STATE: LimitsState = { + version: 1, + keys: {}, +}; + +function getStatePath(dataDir: string, tag?: string): string { + return path.join(dataDir, 'limits', tag ? `state.${tag}.json` : 'state.json'); +} + +function cloneToken(token: LimitToken): LimitToken { + return { ...token }; +} + +function cloneState(state: LimitsState): LimitsState { + return { + version: 1, + keys: Object.fromEntries( + Object.entries(state.keys).map(([key, keyState]) => [ + key, + { + key: keyState.key, + leases: keyState.leases.map((lease) => ({ ...lease })), + tokens: keyState.tokens.map(cloneToken), + }, + ]) + ), + }; +} + +function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { + return { + key: keyState.key, + leases: keyState.leases.filter( + (lease) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > now + ), + tokens: keyState.tokens.filter((token) => token.expiresAt.getTime() > now), + }; +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if (concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +function getRetryAfterMs( + keyState: KeyState, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of keyState.leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, lease.expiresAt.getTime() - now)); + } + } + } + + if (rateBlocked) { + for (const token of keyState.tokens) { + candidates.push(Math.max(0, token.expiresAt.getTime() - now)); + } + } + + if (candidates.length === 0) { + return undefined; + } + + return Math.min(...candidates); +} + +export function createLimits(dataDir: string, tag?: string): Limits { + const statePath 
= getStatePath(dataDir, tag); + let stateOp = Promise.resolve(); + + // This block is an in-process async mutex / operation queue. + // stateOp starts as an already-resolved promise. + // Each call to withStateLock() chains a new operation onto the tail of that promise. + // Because every new operation waits for the previous one, reads/modifies/writes to the limits state file happen serially. + const withStateLock = async (fn: () => Promise): Promise => { + const run = stateOp.then(fn, fn); + stateOp = run.then( + () => undefined, + () => undefined + ); + return run; + }; + + const readState = async (): Promise => { + return ( + (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE) + ); + }; + + const writeState = async (state: LimitsState): Promise => { + await writeJSON(statePath, state, { overwrite: true }); + }; -export function createLimits(_dataDir: string, _tag?: string): Limits { return { - async acquire() { - throw createLimitsNotImplementedError(); + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + + return withStateLock(async (): Promise => { + const state = cloneState(await readState()); + const now = new Date(); + const nowMs = now.getTime(); + const keyState = pruneKeyState( + state.keys[parsed.key] ?? 
{ + key: parsed.key, + leases: [], + tokens: [], + }, + nowMs + ); + + const existingLease = keyState.leases.find( + (lease) => lease.holderId === parsed.holderId + ); + if (existingLease) { + state.keys[parsed.key] = keyState; + await writeState(state); + return { + status: 'acquired', + lease: existingLease, + }; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + keyState.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + keyState.tokens.length >= parsed.definition.rate.count; + + if (concurrencyBlocked || rateBlocked) { + state.keys[parsed.key] = keyState; + await writeState(state); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: getRetryAfterMs( + keyState, + nowMs, + concurrencyBlocked, + rateBlocked + ), + }; + } + + const lease: LimitLease = { + leaseId: `lmt_${monotonicUlid()}`, + key: parsed.key, + holderId: parsed.holderId, + acquiredAt: now, + expiresAt: + parsed.leaseTtlMs !== undefined + ? 
new Date(nowMs + parsed.leaseTtlMs) + : undefined, + definition: parsed.definition, + }; + + keyState.leases.push(lease); + + if (parsed.definition.rate) { + keyState.tokens.push({ + tokenId: `lmttok_${monotonicUlid()}`, + holderId: parsed.holderId, + acquiredAt: now, + expiresAt: new Date(nowMs + parsed.definition.rate.periodMs), + }); + } + + state.keys[parsed.key] = keyState; + await writeState(state); + + return { + status: 'acquired', + lease, + }; + }); }, - async release() { - throw createLimitsNotImplementedError(); + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + await withStateLock(async () => { + const state = cloneState(await readState()); + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const keyState = pruneKeyState(keyStateValue); + const nextLeases = keyState.leases.filter((lease) => { + if (lease.leaseId !== parsed.leaseId) return true; + if (parsed.key && lease.key !== parsed.key) return true; + if (parsed.holderId && lease.holderId !== parsed.holderId) { + return true; + } + return false; + }); + + state.keys[key] = { + ...keyState, + leases: nextLeases, + }; + + if ( + state.keys[key].leases.length === 0 && + state.keys[key].tokens.length === 0 + ) { + delete state.keys[key]; + } + } + + await writeState(state); + }); }, - async heartbeat() { - throw createLimitsNotImplementedError(); + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + return withStateLock(async () => { + const state = cloneState(await readState()); + const now = Date.now(); + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const keyState = pruneKeyState(keyStateValue, now); + const leaseIndex = keyState.leases.findIndex( + (lease) => lease.leaseId === parsed.leaseId + ); + + if (leaseIndex === -1) { + state.keys[key] = keyState; + continue; + } + + const lease = keyState.leases[leaseIndex]; + const currentExpiry = lease.expiresAt?.getTime(); + const 
ttlMs = + parsed.ttlMs ?? (currentExpiry ? currentExpiry - now : 30_000); + const updatedLease: LimitLease = { + ...lease, + expiresAt: new Date(now + Math.max(1, ttlMs)), + }; + + keyState.leases[leaseIndex] = updatedLease; + state.keys[key] = keyState; + await writeState(state); + return updatedLease; + } + + throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { + status: 404, + }); + }); }, }; } diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md new file mode 100644 index 0000000000..df5d3275c8 --- /dev/null +++ b/packages/world/FLOW_LIMITS.md @@ -0,0 +1,280 @@ +# Planning to delete after PR is implemented / ready to merge + +# Flow Limits Design Notes + +This note summarizes the current direction for flow concurrency and rate limiting +across `@workflow/core`, `@workflow/world`, and concrete world implementations. + +## Status + +- The shared `limits` interface and `lock()` API surface now exist. +- Local world has an initial working implementation for acquire/release/heartbeat. +- Postgres and Vercel worlds still expose `limits` as stubs. +- There is a real local E2E example for workflow and step locks in the Next.js Turbopack workbench. + +## Goals + +- Support keyed concurrency limits. +- Support keyed rate limits. +- Allow concurrency and rate to be colocated in one interface. +- Support workflow-scoped limits and step-scoped limits. +- Make crash recovery possible through leases with TTL/expiry. +- Keep worker throughput controls separate from business-level flow limits. + +## Core Terms + +- `worker concurrency`: backend throughput setting for queue/job processing. +- `workflow limit`: admission control for workflow runs that share a key. +- `step limit`: execution control for a specific step/resource key. +- `lease`: durable record that a workflow or step currently occupies capacity for a key. + +## Decisions So Far + +### 1. 
Use one shared limits model + +The shared world interface uses a single `limits` namespace and a single limit +definition shape that can contain either or both: + +- `concurrency` +- `rate` + +This allows one key to express: + +- concurrency only +- rate only +- both together + +### 2. Use leases, not plain mutexes + +Limits are modeled as leases with TTL/expiry so capacity can be recovered after: + +- worker crashes +- process death +- machine shutdown +- lost retries + +Normal completion should dispose/release the lease explicitly. Crash recovery +comes from lease expiry plus future reclaim logic. + +### 3. Keep worker concurrency separate from flow limits + +Current world-level concurrency settings are infrastructure controls, not +business-level locking: + +- local world: `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` +- postgres world: `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` + +These control how many queue jobs can be processed at once. They should remain +independent from flow limits like: + +- `workflow:user:123` +- `step:db:cheap` +- `step:provider:openai` + +### 4. Use a sliding-window model for rate limits in v1 + +The current rate-limit model is a sliding-window log model, not a token bucket. + +For a limit like: + +- `rate: { count: 10, periodMs: 60_000 }` + +the intended semantics are: + +- allow at most 10 successful acquires in the last 60 seconds +- each successful acquire records a timestamped rate usage entry +- rate capacity returns only when that entry ages out of the window + +This is simpler than a token bucket and matches the current local-world +implementation direction well. + +Important distinction: + +- `lease`: active occupancy / ownership for a holder +- `token`: internal rate-usage record that remains until the rate window expires + +Releasing a lease should free concurrency capacity immediately, but it should +not restore rate capacity until the associated rate usage entry expires. + +### 5. 
Use one `lock()` API in both workflows and steps + +We want one user-facing primitive: + +```ts +await using lease = await lock({ ... }); +``` + +But the runtime meaning differs by context. + +#### In workflows + +`lock()` means workflow admission / workflow-scope ownership. + +If placed at the top of a workflow, it should hold the lease across the logical +workflow scope, even though the workflow may suspend and resume many times. + +#### In steps + +`lock()` should act like a step gate. + +The intended long-term behavior is: + +- declare the limit at the top of the step +- runtime/compiler hoists or interprets it as a pre-step requirement +- the step should not occupy a worker just waiting for capacity +- lease is disposed automatically when the step attempt completes + +This means step `lock()` is conceptually the same API, but not a literal +"block inside already-running user step code" implementation. + +### 6. `await using` is the preferred user-facing shape + +The preferred API is explicit resource management: + +```ts +await using lease = await lock({ ... }); +``` + +This gives automatic cleanup on scope exit and reads well for both workflow +scopes and step scopes. + +For manual early cleanup, the user-facing `LockHandle` should expose: + +- `dispose()` +- `[Symbol.asyncDispose]()` + +The backend-facing world contract can continue to use `release(...)` internally. + +### 7. Workflow-scoped locks are logical-scope locks, not request-lifetime locks + +For workflows, `await using` must be tied to the logical workflow scope across: + +- step round trips +- queue turns +- sleeps +- hooks +- replay/resume + +The lease must not be disposed merely because one host process invocation ends. + +### 8. 
Prefer Option B for deadlock avoidance + +Current preferred model: + +- workflow-level limits may be held by a run +- step-level limits are acquired only at step boundaries +- step-level limits are short-lived +- step code should not acquire additional locks dynamically +- step execution should not wait on workflow-level locks + +This keeps the dependency direction one-way: + +- workflow admission -> step admission -> step execution + +That avoids the classic cycle where one workflow holds a workflow lock and +another holds a step lock and each waits on the other. + +### 9. V1 semantics are intentionally opinionated + +For v1, the intended semantics are: + +- workflow locks count admitted, in-flight workflows for a key +- step locks count or rate-limit specific step execution categories +- worker concurrency remains a separate infrastructure throttle + +More concretely: + +- if a workflow acquires a workflow-scoped lock and then sleeps for 10 minutes, + it still counts as active for that workflow key during the sleep +- if a workflow is parked waiting for a step-level limit, it still counts as + active for its workflow-level lock +- a step-level lock should conceptually be an admission gate for the step + attempt, not a second workflow-level lock +- step-level rate limits should consume rate capacity when the step starts, and + that rate usage should remain counted until the window expires even if the + step releases its lease quickly + +For the current local implementation specifically: + +- workflow locks already behave like durable logical-scope leases +- step locks currently use in-process retry polling once the step is already + executing, which is acceptable for local v1 but not the ideal long-term + admission model + +This means the current v1 interpretation of a workflow lock is: + +- "How many workflows for this key are admitted and in flight at all?" + +not: + +- "How many workflows are actively burning CPU right this instant?" 
+ +## Current Example Shape + +The current placeholder E2E example models: + +- workflow-level user concurrency: + - `workflow:user:${userId}` +- step-level DB concurrency: + - `step:db:cheap` +- step-level AI rate limit: + - `step:provider:openai` + +With intended usage like: + +```ts +async function cheapDbStep(userId: string) { + 'use step'; + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + }); + return { userId, prompt: `profile:${userId}` }; +} + +async function expensiveAIStep(prompt: string) { + 'use step'; + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + }); + return `summary:${prompt}`; +} + +export async function workflowWithWorkflowAndStepLocks(userId: string) { + 'use workflow'; + await using userLimit = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 2 }, + }); + + const row = await cheapDbStep(userId); + const summary = await expensiveAIStep(row.prompt); + return { row, summary }; +} +``` + +## Important Clarification + +Flow limits and worker concurrency are different layers. + +For example: + +- a cheap DB step may continue making progress even while an expensive AI step + is rate-limited +- the main shared coupling between them is the worker pool +- if workers are available, unrelated step categories should continue + +So overall system throughput is not one simple global minimum. Different +workflow paths may be bottlenecked by different limits at different times. + +## Open Questions + +- Exact runtime/compiler behavior for step-scoped `lock()` hoisting. +- Whether workflow-level locks should always be whole-run admission locks or + also support narrower workflow-scoped blocks. +- Whether `heartbeat()` should remain user-visible or become mostly internal. +- Whether step limits should only be expressed through `lock()` or also through + step metadata/config sugar. 
+- Fairness/wake-up policy for waiters per key in local and Postgres worlds. +- Exact event-log representation for acquire/block/dispose transitions. diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1e644f7416..5ec582352c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1284,6 +1284,9 @@ importers: '@workflow/tsconfig': specifier: workspace:* version: link:../tsconfig + vitest: + specifier: 'catalog:' + version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) zod: specifier: 'catalog:' version: 4.3.6 @@ -23456,14 +23459,6 @@ snapshots: optionalDependencies: vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/mocker@4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1))': - dependencies: - '@vitest/spy': 4.0.18 - estree-walker: 3.0.3 - magic-string: 0.30.21 - optionalDependencies: - vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/mocker@4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1))': dependencies: '@vitest/spy': 4.0.18 @@ -32855,7 +32850,7 @@ snapshots: vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1): dependencies: '@vitest/expect': 4.0.18 - '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1)) '@vitest/pretty-format': 4.0.18 '@vitest/runner': 4.0.18 '@vitest/snapshot': 4.0.18 diff --git a/workbench/nextjs-turbopack/next.config.ts b/workbench/nextjs-turbopack/next.config.ts index 78df6b2090..5d1a204118 100644 --- 
a/workbench/nextjs-turbopack/next.config.ts +++ b/workbench/nextjs-turbopack/next.config.ts @@ -1,7 +1,9 @@ -import type { NextConfig } from 'next'; import path from 'node:path'; +import type { NextConfig } from 'next'; import { withWorkflow } from 'workflow/next'; +process.env.WORKFLOW_PUBLIC_MANIFEST ??= '1'; + const turbopackRoot = path.resolve(process.cwd(), '../..'); const nextConfig: NextConfig = { From 49ae775d9acf5b42069b39c9d8faafa271f16dd5 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 15:51:31 -0400 Subject: [PATCH 05/34] add pg limit tests, lock tests, schema, migration Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 27 +- packages/core/src/global.ts | 13 +- packages/core/src/lock.test.ts | 49 +- .../core/src/runtime/step-handler.test.ts | 23 + packages/core/src/runtime/step-handler.ts | 18 +- .../core/src/runtime/suspension-handler.ts | 32 + packages/core/src/step/lock.ts | 38 +- packages/core/src/workflow/lock.ts | 50 +- packages/workflow/src/internal/builtins.ts | 3 + packages/world-local/src/limits.test.ts | 152 +-- packages/world-postgres/README.md | 3 + .../migrations/0010_add_flow_limits.sql | 35 + .../migrations/meta/0010_snapshot.json | 973 ++++++++++++++++++ .../src/drizzle/migrations/meta/_journal.json | 7 + packages/world-postgres/src/drizzle/schema.ts | 49 + packages/world-postgres/src/limits.test.ts | 114 +- packages/world-postgres/test/test-db.ts | 59 ++ packages/world-testing/src/limits-contract.ts | 191 ++++ packages/world/FLOW_LIMITS.md | 97 +- workbench/example/workflows/99_e2e.ts | 47 + 20 files changed, 1740 insertions(+), 240 deletions(-) create mode 100644 packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql create mode 100644 packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json create mode 100644 packages/world-postgres/test/test-db.ts create mode 100644 packages/world-testing/src/limits-contract.ts diff --git a/packages/core/e2e/e2e.test.ts 
b/packages/core/e2e/e2e.test.ts index 86325b1c58..e4d4379259 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -568,7 +568,7 @@ describe('e2e', () => { } if (isPostgresWorld) { - test.fails( + test( 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', { timeout: 60_000 }, async () => { @@ -587,6 +587,31 @@ describe('e2e', () => { ); } + if (isPostgresWorld) { + test( + 'workflowLockContentionWorkflow serializes workflow and step locks under contention', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 750]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 750]); + + const [resultA, resultB] = await Promise.all([ + runA.returnValue, + runB.returnValue, + ]); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepLockReleasedAt + ); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/global.ts b/packages/core/src/global.ts index 3dd5c52ac8..6891e0a761 100644 --- a/packages/core/src/global.ts +++ b/packages/core/src/global.ts @@ -28,10 +28,17 @@ export interface WaitInvocationQueueItem { hasCreatedEvent?: boolean; } +export interface LimitWaitInvocationQueueItem { + type: 'limit_wait'; + correlationId: string; + resumeAt: Date; +} + export type QueueItem = | StepInvocationQueueItem | HookInvocationQueueItem - | WaitInvocationQueueItem; + | WaitInvocationQueueItem + | LimitWaitInvocationQueueItem; /** * An error that is thrown when one or more operations (steps/hooks/etc.) 
are called but do @@ -61,7 +68,9 @@ export class WorkflowSuspension extends Error { else if (item.type === 'hook') { if (item.disposed) hookDisposedCount++; else hookCount++; - } else if (item.type === 'wait') waitCount++; + } else if (item.type === 'wait' || item.type === 'limit_wait') { + waitCount++; + } } // Build description parts diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index 58419ae49e..c9237066e3 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -1,19 +1,48 @@ -import { describe, it } from 'vitest'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { lock, LIMITS_NOT_IMPLEMENTED_MESSAGE } from './lock.js'; +import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; + +afterEach(() => { + delete (globalThis as any)[WORKFLOW_LOCK]; + delete (globalThis as any)[STEP_LOCK]; +}); describe('lock', () => { - it.fails('is only callable inside workflow execution context', () => { - throw new Error('TODO: implement'); + it('throws when called outside workflow or step execution context', async () => { + await expect( + lock({ + key: 'workflow:user:test', + concurrency: { max: 1 }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); }); - it.fails('returns a handle with dispose and heartbeat behavior', () => { - throw new Error('TODO: implement'); - }); + it('prefers the workflow runtime lock when both runtimes are present', async () => { + const workflowHandle = { leaseId: 'lease_workflow' }; + const workflowLock = vi.fn().mockResolvedValue(workflowHandle); + const stepLock = vi.fn().mockResolvedValue({ leaseId: 'lease_step' }); + (globalThis as any)[WORKFLOW_LOCK] = workflowLock; + (globalThis as any)[STEP_LOCK] = stepLock; + const options = { + key: 'workflow:user:test', + concurrency: { max: 1 }, + }; - it.fails('allows multiple holders for one key up to the concurrency max', () => { - throw new Error('TODO: implement'); + await 
expect(lock(options)).resolves.toBe(workflowHandle); + expect(workflowLock).toHaveBeenCalledWith(options); + expect(stepLock).not.toHaveBeenCalled(); }); - it.fails('blocks rate-only locks until the rate window advances', () => { - throw new Error('TODO: implement'); + it('falls back to the step runtime lock when no workflow runtime is present', async () => { + const handle = { leaseId: 'lease_step' }; + const stepLock = vi.fn().mockResolvedValue(handle); + (globalThis as any)[STEP_LOCK] = stepLock; + const options = { + key: 'step:db:cheap', + concurrency: { max: 2 }, + }; + + await expect(lock(options)).resolves.toBe(handle); + expect(stepLock).toHaveBeenCalledWith(options); }); }); diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index e99290661f..ee4df5ea88 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -1,5 +1,6 @@ import { WorkflowAPIError } from '@workflow/errors'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { StepLockBlockedError } from '../step/lock.js'; // Use vi.hoisted so these are available in mock factories const { @@ -223,6 +224,28 @@ describe('step-handler 409 handling', () => { mockStepFn.mockResolvedValue('step-result'); }); + it('returns a timeout when a step lock is blocked before user code can proceed', async () => { + mockEventsCreate.mockResolvedValue({ + step: { + stepId: 'step_abc', + status: 'running', + attempt: 1, + startedAt: new Date(), + input: [], + }, + }); + mockStepFn.mockRejectedValue(new StepLockBlockedError(2_500)); + + const result = await capturedHandler( + createMessage(), + createMetadata('myStep') + ); + + expect(result).toEqual({ timeoutSeconds: 3 }); + expect(mockQueueMessage).not.toHaveBeenCalled(); + expect(mockEventsCreate).toHaveBeenCalledTimes(1); + }); + afterEach(() => { vi.restoreAllMocks(); }); diff --git a/packages/core/src/runtime/step-handler.ts 
b/packages/core/src/runtime/step-handler.ts index 44c585315a..fd3c1292dc 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -16,7 +16,7 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; -import { createStepLock } from '../step/lock.js'; +import { createStepLock, StepLockBlockedError } from '../step/lock.js'; import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { @@ -438,6 +438,22 @@ const stepHandler = getWorldHandlers().createQueueHandler( if (userCodeFailed) { const err = userCodeError; + if (StepLockBlockedError.is(err)) { + const timeoutSeconds = Math.max( + 1, + Math.ceil((err.retryAfterMs ?? 1000) / 1000) + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + span?.addEvent?.('step.lock_blocked', { + 'retry.timeout_seconds': timeoutSeconds, + 'step.id': stepId, + 'step.name': stepName, + }); + return { timeoutSeconds }; + } + // Infrastructure errors that somehow surfaced through user code // should propagate to the queue handler for retry, not consume // step attempts. 
diff --git a/packages/core/src/runtime/suspension-handler.ts b/packages/core/src/runtime/suspension-handler.ts index caa45da753..dea2a50b5f 100644 --- a/packages/core/src/runtime/suspension-handler.ts +++ b/packages/core/src/runtime/suspension-handler.ts @@ -11,6 +11,7 @@ import { import { importKey } from '../encryption.js'; import type { HookInvocationQueueItem, + LimitWaitInvocationQueueItem, StepInvocationQueueItem, WaitInvocationQueueItem, WorkflowSuspension, @@ -79,6 +80,9 @@ export async function handleSuspension({ const waitItems = suspension.steps.filter( (item): item is WaitInvocationQueueItem => item.type === 'wait' ); + const limitWaitItems = suspension.steps.filter( + (item): item is LimitWaitInvocationQueueItem => item.type === 'limit_wait' + ); // Split hooks by what actions they need const hooksNeedingCreation = allHookItems.filter( @@ -307,6 +311,34 @@ export async function handleSuspension({ } } + // Lock waits: schedule a delayed workflow replay keyed by correlationId so a + // later immediate wake-up can replace it. 
+ for (const queueItem of limitWaitItems) { + ops.push( + (async () => { + const delayMs = Math.max( + 1000, + queueItem.resumeAt.getTime() - Date.now() + ); + const traceCarrier = await serializeTraceCarrier(); + await queueMessage( + world, + `__wkf_workflow_${workflowName}`, + { + runId, + traceCarrier, + requestedAt: new Date(), + }, + { + delaySeconds: Math.ceil(delayMs / 1000), + idempotencyKey: queueItem.correlationId, + headers: extractTraceHeaders(traceCarrier), + } + ); + })() + ); + } + // Wait for all step and wait operations to complete waitUntil( Promise.all(ops).catch((opErr) => { diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index 7451b4e712..fc3901f986 100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -2,6 +2,20 @@ import type { LimitLease, World } from '@workflow/world'; import type { LockHandle, LockOptions } from '../lock.js'; import { contextStorage } from './context-storage.js'; +export class StepLockBlockedError extends Error { + retryAfterMs?: number; + + constructor(retryAfterMs?: number) { + super('Step lock blocked'); + this.name = 'StepLockBlockedError'; + this.retryAfterMs = retryAfterMs; + } + + static is(value: unknown): value is StepLockBlockedError { + return value instanceof StepLockBlockedError; + } +} + function createStepLockHandle(lease: LimitLease, world: World): LockHandle { let currentLease = lease; let disposed = false; @@ -58,21 +72,17 @@ export function createStepLock(world: World) { rate: options.rate, }; - while (true) { - const result = await world.limits.acquire({ - key: options.key, - holderId, - definition, - leaseTtlMs: options.leaseTtlMs, - }); - - if (result.status === 'acquired') { - return createStepLockHandle(result.lease, world); - } + const result = await world.limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); - await new Promise((resolve) => - setTimeout(resolve, result.retryAfterMs || 1000) 
- ); + if (result.status === 'acquired') { + return createStepLockHandle(result.lease, world); } + + throw new StepLockBlockedError(result.retryAfterMs); }; } diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 21db74d825..8f284d1003 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -1,4 +1,3 @@ -import { EventConsumerResult } from '../events-consumer.js'; import { WorkflowSuspension } from '../global.js'; import type { LockHandle, LockOptions } from '../lock.js'; import { @@ -64,7 +63,8 @@ function createLockHandle( export function createLock(ctx: WorkflowOrchestratorContext) { return async function lockImpl(options: LockOptions): Promise { - const holderId = `wflock_${ctx.generateUlid()}`; + const correlationId = `wflock_wait_${ctx.generateUlid()}`; + const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; const definition = { concurrency: options.concurrency, rate: options.rate, @@ -82,49 +82,19 @@ export function createLock(ctx: WorkflowOrchestratorContext) { return createLockHandle(result.lease, ctx); } - const correlationId = `wflock_wait_${ctx.generateUlid()}`; - const resumeAt = new Date(Date.now() + (result.retryAfterMs || 1000)); ctx.invocationsQueue.set(correlationId, { - type: 'wait', + type: 'limit_wait', correlationId, - resumeAt, + resumeAt: new Date(Date.now() + (result.retryAfterMs || 1000)), }); - await new Promise((resolve) => { - ctx.eventsConsumer.subscribe((event) => { - if (!event) { - scheduleWhenIdle(ctx, () => { - ctx.onWorkflowError( - new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) - ); - }); - return EventConsumerResult.NotConsumed; - } - - if (event.correlationId !== correlationId) { - return EventConsumerResult.NotConsumed; - } - - if (event.eventType === 'wait_created') { - const queueItem = ctx.invocationsQueue.get(correlationId); - if (queueItem && queueItem.type === 'wait') { - queueItem.hasCreatedEvent = true; - 
queueItem.resumeAt = event.eventData.resumeAt; - } - return EventConsumerResult.Consumed; - } - - if (event.eventType === 'wait_completed') { - ctx.invocationsQueue.delete(correlationId); - ctx.promiseQueue = ctx.promiseQueue.then(() => { - resolve(); - }); - return EventConsumerResult.Finished; - } - - return EventConsumerResult.NotConsumed; - }); + scheduleWhenIdle(ctx, () => { + ctx.onWorkflowError( + new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + ); }); + + await new Promise(() => {}); } }; } diff --git a/packages/workflow/src/internal/builtins.ts b/packages/workflow/src/internal/builtins.ts index 886686e50e..624ebbaebd 100644 --- a/packages/workflow/src/internal/builtins.ts +++ b/packages/workflow/src/internal/builtins.ts @@ -2,6 +2,9 @@ * These are the built-in steps that are "automatically available" in the workflow scope. They are * similar to "stdlib" except that are not meant to be imported by users, but are instead "just available" * alongside user defined steps. They are used internally by the runtime + * + * These helpers intentionally rely on the method receiver (`this`) so workflow + * objects like `Request` and `Response` can round-trip through step execution. 
*/ export async function __builtin_response_array_buffer( diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 1db72676af..3f8351f99b 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,149 +1,19 @@ +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; -import { setTimeout as sleep } from 'node:timers/promises'; -import { describe, expect, it } from 'vitest'; -import { createLocalWorld } from './index.js'; -import { createLimits } from './limits.js'; -async function withTempDir(fn: (dir: string) => Promise): Promise { +createLimitsContractSuite('local world limits', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); - try { - return await fn(dir); - } finally { - await rm(dir, { recursive: true, force: true }); - } -} + const world = createLocalWorld({ dataDir: dir }); -describe('local world limits', () => { - it('exposes the required limits namespace', async () => { - await withTempDir(async (dir) => { - const world = createLocalWorld({ dataDir: dir }); - expect(world.limits).toBeDefined(); - expect(typeof world.limits.acquire).toBe('function'); - expect(typeof world.limits.release).toBe('function'); - expect(typeof world.limits.heartbeat).toBe('function'); + return { + limits: createLimits(dir), + close: async () => { await world.close?.(); - }); - }); - - it('enforces per-key concurrency limits', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(first.status).toBe('acquired'); - if (first.status !== 
'acquired') { - throw new Error('expected first lease to be acquired'); - } - - const second = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second).toMatchObject({ - status: 'blocked', - reason: 'concurrency', - }); - - await limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); - - const third = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(third.status).toBe('acquired'); - }); - }); - - it('returns a retry path when rate limits block acquisition', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-a', - definition: { rate: { count: 1, periodMs: 100 } }, - leaseTtlMs: 1_000, - }); - - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') { - throw new Error('expected first lease to be acquired'); - } - - await limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); - - const second = await limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-b', - definition: { rate: { count: 1, periodMs: 100 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('blocked'); - if (second.status !== 'blocked') { - throw new Error('expected second acquire to be blocked'); - } - expect(second.reason).toBe('rate'); - expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); - }); - }); - - it('restores capacity when a lease is released or expires', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 25, - }); - - 
expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') { - throw new Error('expected first lease to be acquired'); - } - - const heartbeat = await limits.heartbeat({ - leaseId: first.lease.leaseId, - ttlMs: 50, - }); - expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( - first.lease.expiresAt?.getTime() ?? 0 - ); - - await sleep(60); - - const second = await limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('acquired'); - }); - }); + await rm(dir, { recursive: true, force: true }); + }, + }; }); diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index bfb617c9b6..7e2888f69f 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -117,6 +117,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - **Durable Storage**: Stores workflow runs, events, steps, hooks, and webhooks in PostgreSQL - **Queue Processing**: Uses graphile-worker as the durable queue and executes jobs over the workflow HTTP routes - **Durable Delays**: Re-schedules waits and retries in PostgreSQL +- **Flow Limits**: Enforces durable concurrency/rate limits with PostgreSQL-backed leases, rate tokens, and waiter promotion - **Streaming**: Real-time event streaming capabilities - **Health Checks**: Built-in connection health monitoring - **Configurable Concurrency**: Adjustable worker concurrency for queue processing @@ -127,6 +128,8 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Graphile jobs are acknowledged only after the workflow or step execution finishes, or after the worker durably schedules a delayed follow-up job - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling +- Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO 
order per key +- Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` ## Development diff --git a/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql new file mode 100644 index 0000000000..01892d0bfe --- /dev/null +++ b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql @@ -0,0 +1,35 @@ +CREATE TABLE "workflow"."workflow_limit_leases" ( + "lease_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp, + "concurrency_max" integer, + "rate_count" integer, + "rate_period_ms" integer +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_tokens" ( + "token_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp NOT NULL +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_waiters" ( + "waiter_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "created_at" timestamp DEFAULT now() NOT NULL, + "lease_ttl_ms" integer, + "concurrency_max" integer, + "rate_count" integer, + "rate_period_ms" integer +); +--> statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_leases_limit_key_holder_id_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_leases_limit_key_expires_at_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","expires_at");--> statement-breakpoint +CREATE INDEX "workflow_limit_tokens_limit_key_expires_at_index" ON "workflow"."workflow_limit_tokens" USING btree ("limit_key","expires_at");--> 
statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_waiters_limit_key_holder_id_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_waiters_limit_key_created_at_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","created_at");--> statement-breakpoint diff --git a/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json new file mode 100644 index 0000000000..97ddba3774 --- /dev/null +++ b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json @@ -0,0 +1,973 @@ +{ + "id": "c4af56df-d588-4810-a8b4-f4eb68b270b2", + "prevId": "7adbbd35-ca90-4353-bb34-3d1b2435a027", + "version": "7", + "dialect": "postgresql", + "tables": { + "workflow.workflow_events": { + "name": "workflow_events", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "type": { + "name": "type", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "correlation_id": { + "name": "correlation_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "payload": { + "name": "payload", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "payload_cbor": { + "name": "payload_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_events_run_id_index": { + "name": "workflow_events_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + 
"nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_events_correlation_id_index": { + "name": "workflow_events_correlation_id_index", + "columns": [ + { + "expression": "correlation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_hooks": { + "name": "workflow_hooks", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "hook_id": { + "name": "hook_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "token": { + "name": "token", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "owner_id": { + "name": "owner_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "project_id": { + "name": "project_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "environment": { + "name": "environment", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "metadata_cbor": { + "name": "metadata_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "is_webhook": { + "name": "is_webhook", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": true + } + }, + "indexes": { + "workflow_hooks_run_id_index": { + "name": "workflow_hooks_run_id_index", + "columns": [ + { + 
"expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_hooks_token_index": { + "name": "workflow_hooks_token_index", + "columns": [ + { + "expression": "token", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_leases": { + "name": "workflow_limit_leases", + "schema": "workflow", + "columns": { + "lease_id": { + "name": "lease_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_leases_limit_key_holder_id_index": { + "name": "workflow_limit_leases_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + 
"concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_leases_limit_key_expires_at_index": { + "name": "workflow_limit_leases_limit_key_expires_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_tokens": { + "name": "workflow_limit_tokens", + "schema": "workflow", + "columns": { + "token_id": { + "name": "token_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_limit_tokens_limit_key_expires_at_index": { + "name": "workflow_limit_tokens_limit_key_expires_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_waiters": { + "name": "workflow_limit_waiters", + "schema": "workflow", + 
"columns": { + "waiter_id": { + "name": "waiter_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "lease_ttl_ms": { + "name": "lease_ttl_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_waiters_limit_key_holder_id_index": { + "name": "workflow_limit_waiters_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_waiters_limit_key_created_at_index": { + "name": "workflow_limit_waiters_limit_key_created_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_runs": { + "name": "workflow_runs", + 
"schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "deployment_id": { + "name": "deployment_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "execution_context": { + "name": "execution_context", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "execution_context_cbor": { + "name": "execution_context_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "input": { + "name": "input", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": "error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "expired_at": { + "name": "expired_at", + 
"type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_runs_name_index": { + "name": "workflow_runs_name_index", + "columns": [ + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_runs_status_index": { + "name": "workflow_runs_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_steps": { + "name": "workflow_steps", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "step_id": { + "name": "step_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "step_name": { + "name": "step_name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "step_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "input": { + "name": "input", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": "error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "attempt": { + "name": "attempt", + "type": "integer", + 
"primaryKey": false, + "notNull": true + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "retry_after": { + "name": "retry_after", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_steps_run_id_index": { + "name": "workflow_steps_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_steps_status_index": { + "name": "workflow_steps_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_stream_chunks": { + "name": "workflow_stream_chunks", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "stream_id": { + "name": "stream_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "data": { + "name": "data", + "type": "bytea", + "primaryKey": false, + "notNull": true + }, + 
"created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "eof": { + "name": "eof", + "type": "boolean", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_stream_chunks_run_id_index": { + "name": "workflow_stream_chunks_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "workflow_stream_chunks_stream_id_id_pk": { + "name": "workflow_stream_chunks_stream_id_id_pk", + "columns": ["stream_id", "id"] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_waits": { + "name": "workflow_waits", + "schema": "workflow", + "columns": { + "wait_id": { + "name": "wait_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "wait_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "resume_at": { + "name": "resume_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_waits_run_id_index": { + "name": "workflow_waits_run_id_index", + "columns": [ + { + "expression": 
"run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.step_status": { + "name": "step_status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + }, + "public.wait_status": { + "name": "wait_status", + "schema": "public", + "values": ["waiting", "completed"] + }, + "public.status": { + "name": "status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + } + }, + "schemas": { + "workflow": "workflow" + }, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json index f4956666fc..e98c400c01 100644 --- a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json +++ b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json @@ -71,6 +71,13 @@ "when": 1770500000000, "tag": "0009_add_is_webhook", "breakpoints": true + }, + { + "idx": 10, + "version": "7", + "when": 1773863098757, + "tag": "0010_add_flow_limits", + "breakpoints": true } ] } diff --git a/packages/world-postgres/src/drizzle/schema.ts b/packages/world-postgres/src/drizzle/schema.ts index f353ef8ca1..b6e8205237 100644 --- a/packages/world-postgres/src/drizzle/schema.ts +++ b/packages/world-postgres/src/drizzle/schema.ts @@ -21,6 +21,7 @@ import { primaryKey, text, timestamp, + uniqueIndex, varchar, } from 'drizzle-orm/pg-core'; import { Cbor, type Cborized } from './cbor.js'; @@ -192,6 +193,54 @@ export const waits = schema.table( (tb) => [index().on(tb.runId)] ); +export const limitLeases = 
schema.table( + 'workflow_limit_leases', + { + leaseId: varchar('lease_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at'), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.expiresAt), + ] +); + +export const limitTokens = schema.table( + 'workflow_limit_tokens', + { + tokenId: varchar('token_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at').notNull(), + }, + (tb) => [index().on(tb.limitKey, tb.expiresAt)] +); + +export const limitWaiters = schema.table( + 'workflow_limit_waiters', + { + waiterId: varchar('waiter_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + createdAt: timestamp('created_at').defaultNow().notNull(), + leaseTtlMs: integer('lease_ttl_ms'), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.createdAt), + ] +); + const bytea = customType<{ data: Buffer; notNull: false; default: false }>({ dataType() { return 'bytea'; diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 2c43f08584..bf6ae15e23 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,19 +1,111 @@ -import { describe, it } from 'vitest'; +import { asc, eq } from 'drizzle-orm'; +import { + afterAll, + beforeAll, + beforeEach, + describe, + expect, + it, + test, +} from 'vitest'; +import { 
createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import * as Schema from './drizzle/schema.js'; +import { createLimits } from './limits.js'; -describe('postgres world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); +if (process.platform === 'win32') { + test.skip('skipped on Windows since it relies on a docker container', () => {}); +} else { + let db: Awaited< + ReturnType + >; + + beforeAll(async () => { + const { createPostgresTestDb } = await import('../test/test-db.js'); + db = await createPostgresTestDb(); + }, 120_000); + + beforeEach(async () => { + await db.truncateLimits(); }); - it.fails('respects the concurrency cap across concurrent acquires', () => { - throw new Error('TODO: implement'); + afterAll(async () => { + await db.close(); }); - it.fails('wakes waiters in deterministic order when a lease is released', () => { - throw new Error('TODO: implement'); + createLimitsContractSuite('postgres world limits', async () => { + return { + limits: createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ), + }; }); - it.fails('reclaims stale leases after worker or process death', () => { - throw new Error('TODO: implement'); + describe('postgres waiter promotion', () => { + it('promotes the earliest waiter on release', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const first = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + const second = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const third = await 
limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + expect(third.status).toBe('blocked'); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy( + asc(Schema.limitLeases.acquiredAt), + asc(Schema.limitLeases.leaseId) + ); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy( + asc(Schema.limitWaiters.createdAt), + asc(Schema.limitWaiters.waiterId) + ); + + expect(leases).toEqual([{ holderId: 'holder-b' }]); + expect(waiters).toEqual([{ holderId: 'holder-c' }]); + + const stillWaiting = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(stillWaiting.status).toBe('blocked'); + }); }); -}); +} diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts new file mode 100644 index 0000000000..2bb21aa380 --- /dev/null +++ b/packages/world-postgres/test/test-db.ts @@ -0,0 +1,59 @@ +import { execSync } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import postgres from 'postgres'; +import { createClient } from '../src/drizzle/index.js'; + +const packageDir = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + '..' 
+); + +export interface PostgresTestDb { + container: Awaited>; + sql: ReturnType; + drizzle: ReturnType; + connectionString: string; + truncateLimits(): Promise; + close(): Promise; +} + +export async function createPostgresTestDb(): Promise { + const container = await new PostgreSqlContainer('postgres:15-alpine').start(); + const connectionString = container.getConnectionUri(); + process.env.DATABASE_URL = connectionString; + process.env.WORKFLOW_POSTGRES_URL = connectionString; + + execSync('pnpm db:push', { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); + + const sql = postgres(connectionString, { max: 1 }); + const drizzle = createClient(sql); + + return { + container, + sql, + drizzle, + connectionString, + async truncateLimits() { + await sql` + truncate table + workflow.workflow_limit_waiters, + workflow.workflow_limit_tokens, + workflow.workflow_limit_leases, + workflow.workflow_steps, + workflow.workflow_events, + workflow.workflow_runs + restart identity cascade + `; + }, + async close() { + await sql.end(); + await container.stop(); + }, + }; +} diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts new file mode 100644 index 0000000000..2acfbc1d72 --- /dev/null +++ b/packages/world-testing/src/limits-contract.ts @@ -0,0 +1,191 @@ +import { setTimeout as sleep } from 'node:timers/promises'; +import type { Limits } from '@workflow/world'; +import { describe, expect, it } from 'vitest'; + +export interface LimitsHarness { + limits: Limits; + close?: () => Promise; +} + +export function createLimitsContractSuite( + name: string, + createHarness: () => Promise +) { + describe(name, () => { + it('enforces per-key concurrency limits', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + 
expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency', + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('returns a retry path when rate limits block acquisition', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-a', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const second = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-b', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + if (second.status !== 'blocked') throw new Error('expected blocked'); + expect(second.reason).toBe('rate'); + expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + } finally { + await harness.close?.(); + } + }); + + it('returns a combined blocked reason when both limits are saturated', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-a', + 
definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs: 1_000 }, + }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs: 1_000 }, + }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency_and_rate', + }); + } finally { + await harness.close?.(); + } + }); + + it('restores capacity when a lease is released or expires', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 100, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const heartbeat = await harness.limits.heartbeat({ + leaseId: first.lease.leaseId, + ttlMs: 200, + }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( + first.lease.expiresAt?.getTime() ?? 
0 + ); + + await sleep(250); + + const second = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('reuses an existing lease for the same holder', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:reacquire', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:reacquire', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'acquired', + lease: { + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + }, + }); + } finally { + await harness.close?.(); + } + }); + }); +} diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index df5d3275c8..769f78ba74 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -1,16 +1,18 @@ -# Planning to delete after PR is implemented / ready to merge - # Flow Limits Design Notes -This note summarizes the current direction for flow concurrency and rate limiting -across `@workflow/core`, `@workflow/world`, and concrete world implementations. +This note summarizes the implemented direction for flow concurrency and rate +limiting across `@workflow/core`, `@workflow/world`, and concrete world +implementations. ## Status - The shared `limits` interface and `lock()` API surface now exist. -- Local world has an initial working implementation for acquire/release/heartbeat. -- Postgres and Vercel worlds still expose `limits` as stubs. 
-- There is a real local E2E example for workflow and step locks in the Next.js Turbopack workbench. +- Local world has a working lease-based implementation for + acquire/release/heartbeat. +- Postgres now has a PostgreSQL-backed implementation with leases, rate tokens, + and durable waiters. +- Vercel still exposes `limits` as a stub. +- The Next.js Turbopack workbench has E2E coverage for workflow and step locks. ## Goals @@ -115,17 +117,19 @@ workflow scope, even though the workflow may suspend and resume many times. #### In steps -`lock()` should act like a step gate. +`lock()` acts like a step gate. -The intended long-term behavior is: +The current behavior is: - declare the limit at the top of the step -- runtime/compiler hoists or interprets it as a pre-step requirement -- the step should not occupy a worker just waiting for capacity +- the runtime treats a blocked acquisition as step-boundary admission failure +- the step does not keep executing user code while waiting for capacity +- the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes -This means step `lock()` is conceptually the same API, but not a literal -"block inside already-running user step code" implementation. +This means step `lock()` is conceptually the same API, but it is not a literal +"spin inside already-running user step code until capacity appears" +implementation. ### 6. `await using` is the preferred user-facing shape @@ -157,7 +161,7 @@ For workflows, `await using` must be tied to the logical workflow scope across: The lease must not be disposed merely because one host process invocation ends. -### 8. Prefer Option B for deadlock avoidance +### 8. 
Prefer step-boundary admission for deadlock avoidance Current preferred model: @@ -174,7 +178,55 @@ This keeps the dependency direction one-way: That avoids the classic cycle where one workflow holds a workflow lock and another holds a step lock and each waits on the other. -### 9. V1 semantics are intentionally opinionated +### 9. Waiters are FIFO per key + +The PostgreSQL implementation uses a durable waiter queue and promotes waiters +in FIFO order for a single limit key. + +Important details: + +- FIFO is per key, not global across all limit keys +- promotion order is based on waiter creation order +- a waiter may be skipped if it is no longer eligible when promotion runs +- releasing a lease or reclaiming an expired lease can both trigger promotion +- rate-window expiry can also make the head waiter eligible again + +This gives deterministic and inspectable fairness for a key without requiring a +global scheduler. + +### 10. Blocked limits do not consume worker concurrency + +Blocked flow limits and worker concurrency are intentionally separate. + +In the PostgreSQL world: + +- blocked workflows are suspended and re-queued, not left running on a worker +- blocked steps exit the current attempt and are re-queued instead of polling in + a live worker slot +- backlog remains durable in PostgreSQL while worker slots are free to service + unrelated work + +This is the main practical difference between a durable waiter model and a pure +polling loop. + +### 11. Wake-up is prompt, with a delayed fallback + +The PostgreSQL world uses Graphile for wake-up delivery, but PostgreSQL tables +remain the source of truth for limit state. 
+ +Current behavior: + +- leases, rate tokens, and waiters live in PostgreSQL tables +- promotion decisions are made from SQL state +- when a waiter is promoted, the runtime is woken by enqueuing the appropriate + workflow or step job +- workflows also keep a delayed replay fallback so progress is still possible if + an immediate wake-up is missed + +This means Graphile is used to resume work quickly, not to decide fairness or +capacity ownership. + +### 12. V1 semantics are intentionally opinionated For v1, the intended semantics are: @@ -197,9 +249,8 @@ More concretely: For the current local implementation specifically: - workflow locks already behave like durable logical-scope leases -- step locks currently use in-process retry polling once the step is already - executing, which is acceptable for local v1 but not the ideal long-term - admission model +- step locks are still simpler than Postgres and do not provide the same durable + waiter/wake-up behavior This means the current v1 interpretation of a workflow lock is: @@ -268,13 +319,19 @@ For example: So overall system throughput is not one simple global minimum. Different workflow paths may be bottlenecked by different limits at different times. +Two more practical clarifications: + +- a blocked workflow lock should not monopolize + `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` or + `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` just because it is waiting +- a released concurrency lease frees concurrency immediately, but associated + rate usage still remains counted until its token ages out of the rate window + ## Open Questions -- Exact runtime/compiler behavior for step-scoped `lock()` hoisting. - Whether workflow-level locks should always be whole-run admission locks or also support narrower workflow-scoped blocks. - Whether `heartbeat()` should remain user-visible or become mostly internal. - Whether step limits should only be expressed through `lock()` or also through step metadata/config sugar. 
-- Fairness/wake-up policy for waiters per key in local and Postgres worlds. - Exact event-log representation for acquire/block/dispose transitions. diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 13dcd7cb0b..985e9331e4 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -261,6 +261,53 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } +async function serializedLimitStep(label: string, holdMs: number) { + 'use step'; + + const stepLock = await lock({ + key: 'step:db:serialized', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const acquiredAt = Date.now(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); + await stepLock.dispose(); + const releasedAt = Date.now(); + + return { + label, + acquiredAt, + releasedAt, + }; +} + +export async function workflowLockContentionWorkflow( + userId = 'user-123', + holdMs = 750 +) { + 'use workflow'; + + const workflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const workflowLockAcquiredAt = Date.now(); + const step = await serializedLimitStep(userId, holdMs); + await workflowLock.dispose(); + const workflowLockReleasedAt = Date.now(); + + return { + userId, + workflowLockAcquiredAt, + workflowLockReleasedAt, + stepLockAcquiredAt: step.acquiredAt, + stepLockReleasedAt: step.releasedAt, + }; +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 1677f3dcf229a6125455a19f307eaae9da81baf6 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 16:47:14 -0400 Subject: [PATCH 06/34] pg limits implementation Signed-off-by: nathancolosimo --- .../core/src/runtime/suspension-handler.ts | 4 + packages/core/src/step/lock.ts | 4 + packages/core/src/workflow/lock.ts | 5 + packages/world-postgres/src/limits.ts | 604 +++++++++++++++++- 4 files 
changed, 608 insertions(+), 9 deletions(-) diff --git a/packages/core/src/runtime/suspension-handler.ts b/packages/core/src/runtime/suspension-handler.ts index dea2a50b5f..eee6439556 100644 --- a/packages/core/src/runtime/suspension-handler.ts +++ b/packages/core/src/runtime/suspension-handler.ts @@ -316,6 +316,10 @@ export async function handleSuspension({ for (const queueItem of limitWaitItems) { ops.push( (async () => { + /* + Lock waits are runtime control flow, not user-visible wait events. + We only enqueue a fallback replay here; promoted waiters can replace it. + */ const delayMs = Math.max( 1000, queueItem.resumeAt.getTime() - Date.now() diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index fc3901f986..6aa59132a5 100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -83,6 +83,10 @@ export function createStepLock(world: World) { return createStepLockHandle(result.lease, world); } + /* + Steps do not sit inside user code polling for a lease. + The runtime catches this and re-queues the step attempt at the boundary. + */ throw new StepLockBlockedError(result.retryAfterMs); }; } diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 8f284d1003..f0905e06e9 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -63,6 +63,11 @@ function createLockHandle( export function createLock(ctx: WorkflowOrchestratorContext) { return async function lockImpl(options: LockOptions): Promise { + /* + Blocked workflow locks suspend the workflow turn instead of creating a real + wait event. Postgres can wake this correlation id early when the waiter is + promoted, and the delayed replay is just a fallback. 
+ */ const correlationId = `wflock_wait_${ctx.generateUlid()}`; const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; const definition = { diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 01e8184c79..037b57d66a 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -1,20 +1,606 @@ -import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import { JsonTransport } from '@vercel/queue'; +import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; +import { WorkflowAPIError } from '@workflow/errors'; +import { + LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + LimitReleaseRequestSchema, + type Limits, + MessageId, +} from '@workflow/world'; +import { monotonicFactory } from 'ulid'; import type { PostgresWorldConfig } from './config.js'; import type { Drizzle } from './drizzle/index.js'; +import * as Schema from './drizzle/schema.js'; +import { MessageData } from './message.js'; + +type LeaseRow = typeof Schema.limitLeases.$inferSelect; +type TokenRow = typeof Schema.limitTokens.$inferSelect; +type WaiterRow = typeof Schema.limitWaiters.$inferSelect; +type RunRow = Pick< + typeof Schema.runs.$inferSelect, + 'workflowName' | 'startedAt' | 'status' +>; +type StepRow = Pick; +type Tx = Parameters[0]>[0]; +type Db = Drizzle | Tx; + +type HolderTarget = + | { + kind: 'workflow'; + runId: string; + correlationId: string; + } + | { + kind: 'step'; + runId: string; + stepId: string; + } + | { + kind: 'opaque'; + }; + +const transport = new JsonTransport(); +const generateId = monotonicFactory(); + +function getQueues(config: PostgresWorldConfig) { + const prefix = config.jobPrefix || 'workflow_'; + return { + workflow: `${prefix}flows`, + step: `${prefix}steps`, + } as const; +} + +function nowPlus(ms?: number): Date | undefined { + if (ms === undefined) return undefined; + 
return new Date(Date.now() + ms); +} + +function toDate(value: Date | string | null | undefined): Date | undefined { + if (value === null || value === undefined) return undefined; + return value instanceof Date ? value : new Date(value); +} + +function toMillis(value: Date | string | null | undefined): number | undefined { + const date = toDate(value); + return date ? date.getTime() : undefined; +} + +/* +Holder ids double as wake-up hints. +When a waiter is promoted, we decode the holder id to decide which queue to poke. +*/ +function parseHolderId(holderId: string): HolderTarget { + if (holderId.startsWith('wflock_')) { + const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); + if (runId && correlationId) { + return { kind: 'workflow', runId, correlationId }; + } + } + + if (holderId.startsWith('stplock_')) { + const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); + if (runId && stepId) { + return { kind: 'step', runId, stepId }; + } + } + + return { kind: 'opaque' }; +} + +function toLease(row: LeaseRow): LimitLease { + return { + leaseId: row.leaseId, + key: row.limitKey, + holderId: row.holderId, + acquiredAt: toDate(row.acquiredAt)!, + expiresAt: toDate(row.expiresAt), + definition: { + concurrency: + row.concurrencyMax !== null ? { max: row.concurrencyMax } : undefined, + rate: + row.rateCount !== null && row.ratePeriodMs !== null + ? { + count: row.rateCount, + periodMs: row.ratePeriodMs, + } + : undefined, + }, + }; +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if (concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +/* +When a workflow or step is blocked, we need to calculate the retry after time. +We do this by finding the earliest expiration time for any leases or tokens. 
+*/ +function getRetryAfterMs( + leases: LeaseRow[], + tokens: TokenRow[], + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, toMillis(lease.expiresAt)! - now)); + } + } + } + + if (rateBlocked) { + for (const token of tokens) { + candidates.push(Math.max(0, toMillis(token.expiresAt)! - now)); + } + } + + if (candidates.length === 0) return undefined; + return Math.min(...candidates); +} + +async function queueWorkflowWake( + tx: Db, + config: PostgresWorldConfig, + runId: string, + workflowName: string, + idempotencyKey: string +) { + const messageId = MessageId.parse(`msg_${generateId()}`); + const payload = MessageData.encode({ + id: workflowName, + data: Buffer.from( + transport.serialize({ + runId, + requestedAt: new Date(), + }) + ), + attempt: 1, + idempotencyKey, + messageId, + }); + + await tx.execute(sql` + select graphile_worker.add_job( + ${getQueues(config).workflow}::text, + payload := ${JSON.stringify(payload)}::json, + max_attempts := 3, + job_key := ${idempotencyKey}::text, + job_key_mode := 'replace' + ) + `); +} + +async function queueStepWake( + tx: Db, + config: PostgresWorldConfig, + step: { + stepId: string; + stepName: string; + workflowName: string; + workflowStartedAt: number; + workflowRunId: string; + } +) { + const messageId = MessageId.parse(`msg_${generateId()}`); + const payload = MessageData.encode({ + id: step.stepName, + data: Buffer.from( + transport.serialize({ + workflowName: step.workflowName, + workflowRunId: step.workflowRunId, + workflowStartedAt: step.workflowStartedAt, + stepId: step.stepId, + requestedAt: new Date(), + }) + ), + attempt: 1, + idempotencyKey: step.stepId, + messageId, + }); + + await tx.execute(sql` + select graphile_worker.add_job( + ${getQueues(config).step}::text, + payload := ${JSON.stringify(payload)}::json, 
+ max_attempts := 3, + job_key := ${step.stepId}::text, + job_key_mode := 'replace' + ) + `); +} + +async function queueWakeForHolder( + tx: Db, + config: PostgresWorldConfig, + holderId: string +) { + /* + Limit state is durable in Postgres, but wake-ups still need a runtime target. + If the run or step is already terminal, there is nothing left to resume. + */ + const target = parseHolderId(holderId); + if (target.kind === 'opaque') { + return; + } + + if (target.kind === 'workflow') { + const [run] = (await tx + .select({ + workflowName: Schema.runs.workflowName, + startedAt: Schema.runs.startedAt, + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as RunRow[]; + + if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { + return; + } + + await queueWorkflowWake( + tx, + config, + target.runId, + run.workflowName, + target.correlationId + ); + return; + } + + const [step] = (await tx + .select({ + stepName: Schema.steps.stepName, + status: Schema.steps.status, + }) + .from(Schema.steps) + .where(eq(Schema.steps.stepId, target.stepId)) + .limit(1)) as StepRow[]; + if (!step || ['completed', 'failed'].includes(step.status)) { + return; + } + + const [run] = (await tx + .select({ + workflowName: Schema.runs.workflowName, + startedAt: Schema.runs.startedAt, + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as RunRow[]; + if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { + return; + } + + await queueStepWake(tx, config, { + stepId: target.stepId, + stepName: step.stepName, + workflowName: run.workflowName, + workflowStartedAt: toMillis(run.startedAt) ?? Date.now(), + workflowRunId: target.runId, + }); +} + +async function pruneExpired(tx: Db, key: string): Promise { + /* + Capacity is reclaimed opportunistically whenever a key is touched. 
+ This keeps v1 simple and avoids needing a separate cleanup worker. + */ + const now = new Date(); + + await tx + .delete(Schema.limitTokens) + .where( + and( + eq(Schema.limitTokens.limitKey, key), + lte(Schema.limitTokens.expiresAt, now) + ) + ); + + await tx + .delete(Schema.limitLeases) + .where( + and( + eq(Schema.limitLeases.limitKey, key), + isNotNull(Schema.limitLeases.expiresAt), + lte(Schema.limitLeases.expiresAt, now) + ) + ); +} + +async function getActiveState( + tx: Db, + key: string +): Promise<{ + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; +}> { + const [leases, tokens, waiters] = await Promise.all([ + tx + .select() + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, key)) + .orderBy( + asc(Schema.limitLeases.acquiredAt), + asc(Schema.limitLeases.leaseId) + ), + tx + .select() + .from(Schema.limitTokens) + .where(eq(Schema.limitTokens.limitKey, key)) + .orderBy(asc(Schema.limitTokens.expiresAt)), + tx + .select() + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, key)) + .orderBy( + asc(Schema.limitWaiters.createdAt), + asc(Schema.limitWaiters.waiterId) + ), + ]); + + return { leases, tokens, waiters }; +} + +async function promoteWaiters( + tx: Db, + config: PostgresWorldConfig, + key: string +): Promise { + /* + We walk waiters in FIFO order and stop at the first waiter that is still blocked. + Later waiters cannot jump ahead of an earlier waiter for the same key. 
(getActiveState returns waiters in FIFO order) + */ + const state = await getActiveState(tx, key); + let activeLeases = state.leases.length; + let activeTokens = state.tokens.length; + + for (const waiter of state.waiters) { + const concurrencyBlocked = + waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; + const rateBlocked = + waiter.rateCount !== null && activeTokens >= waiter.rateCount; + + if (concurrencyBlocked || rateBlocked) { + break; + } + + const leaseId = `lmt_${generateId()}`; + const expiresAt = nowPlus(waiter.leaseTtlMs ?? undefined); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt, + concurrencyMax: waiter.concurrencyMax, + rateCount: waiter.rateCount, + ratePeriodMs: waiter.ratePeriodMs, + }) + .onConflictDoNothing() + .returning(); + + const acquiredLease = + lease ?? + (await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.limitKey, key), + eq(Schema.limitLeases.holderId, waiter.holderId) + ), + })); + + if (!acquiredLease) { + continue; + } + + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + await tx.insert(Schema.limitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + waiter.ratePeriodMs), + }); + activeTokens += 1; + } + + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + + activeLeases += 1; + await queueWakeForHolder(tx, config, acquiredLease.holderId); + } +} export function createLimits( - _config: PostgresWorldConfig, - _drizzle: Drizzle + config: PostgresWorldConfig, + drizzle: Drizzle ): Limits { return { - async acquire() { - throw createLimitsNotImplementedError(); + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + + return drizzle.transaction(async (tx) => { + 
// Prune expired leases and tokens, promote pre-existing waiters before attempting to acquire a new lease or token. + await pruneExpired(tx, parsed.key); + await promoteWaiters(tx, config, parsed.key); + + const state = await getActiveState(tx, parsed.key); + const existingLease = state.leases.find( + (lease) => lease.holderId === parsed.holderId + ); + if (existingLease) { + return { + status: 'acquired', + lease: toLease(existingLease), + } satisfies LimitAcquireResult; + } + + const existingWaiter = state.waiters.find( + (waiter) => waiter.holderId === parsed.holderId + ); + // If there are already waiters for this key and holder no need to queue a new waiter. + if (existingWaiter) { + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason( + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ), + retryAfterMs: + getRetryAfterMs( + state.leases, + state.tokens, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ) ?? 1000, + } satisfies LimitAcquireResult; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + state.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + state.tokens.length >= parsed.definition.rate.count; + + // If we are not blocked, and there are no waiters for this key and holder, we can acquire a new lease or token. + if (!concurrencyBlocked && !rateBlocked && state.waiters.length === 0) { + const expiresAt = nowPlus(parsed.leaseTtlMs); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId: `lmt_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + acquiredAt: new Date(), + expiresAt, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? 
null, + }) + .returning(); + + if (parsed.definition.rate) { + await tx.insert(Schema.limitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + parsed.definition.rate.periodMs), + }); + } + + return { + status: 'acquired', + lease: toLease(lease), + } satisfies LimitAcquireResult; + } + + // If we are blocked, we need to queue a waiter. + await tx + .insert(Schema.limitWaiters) + .values({ + waiterId: `lmtwait_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs ?? null, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? null, + }) + .onConflictDoNothing(); + + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: + getRetryAfterMs( + state.leases, + state.tokens, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ) ?? 
1000, + } satisfies LimitAcquireResult; + }); }, - async release() { - throw createLimitsNotImplementedError(); + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + await drizzle.transaction(async (tx) => { + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); + if (parsed.key) { + where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; + } + if (parsed.holderId) { + where = and(where, eq(Schema.limitLeases.holderId, parsed.holderId))!; + } + + const [deleted] = await tx + .delete(Schema.limitLeases) + .where(where) + .returning({ limitKey: Schema.limitLeases.limitKey }); + + if (deleted?.limitKey) { + await pruneExpired(tx, deleted.limitKey); + await promoteWaiters(tx, config, deleted.limitKey); + } + }); }, - async heartbeat() { - throw createLimitsNotImplementedError(); + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + // Heartbeat a lease to extend its expiry. + return drizzle.transaction(async (tx) => { + const existing = await tx.query.limitLeases.findFirst({ + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }); + + if (!existing) { + throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { + status: 404, + }); + } + + const now = Date.now(); + const currentExpiry = toMillis(existing.expiresAt); + const ttlMs = + parsed.ttlMs ?? (currentExpiry ? 
currentExpiry - now : 30_000); + const expiresAt = new Date(now + Math.max(1, ttlMs)); + + const [updated] = await tx + .update(Schema.limitLeases) + .set({ expiresAt }) + .where(eq(Schema.limitLeases.leaseId, parsed.leaseId)) + .returning(); + + return toLease(updated); + }); }, }; } From 45cd62bbf49c9815ad89a46a322a01de0ef23ba6 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 17:56:57 -0400 Subject: [PATCH 07/34] DCO Remediation Commit for nathancolosimo I, nathancolosimo , hereby add my Signed-off-by to this commit: 4b918ca431dd22a7343e067f4b2e64f3b0442c1d Signed-off-by: nathancolosimo --- packages/world-postgres/src/limits.test.ts | 36 ++++++++++++++++++++++ packages/world-postgres/src/limits.ts | 26 ++++++++++++++++ packages/world-postgres/test/test-db.ts | 2 +- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index bf6ae15e23..01d0605c9b 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -42,6 +42,42 @@ if (process.platform === 'win32') { }); describe('postgres waiter promotion', () => { + it('serializes concurrent acquires for the same key', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const results = await Promise.all( + Array.from({ length: 12 }, (_, index) => + limits.acquire({ + key: 'workflow:user:concurrent', + holderId: `holder-${index}`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }) + ) + ); + + const acquired = results.filter((result) => result.status === 'acquired'); + const blocked = results.filter((result) => result.status === 'blocked'); + + expect(acquired).toHaveLength(1); + expect(blocked).toHaveLength(11); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + 
.where(eq(Schema.limitLeases.limitKey, 'workflow:user:concurrent')); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, 'workflow:user:concurrent')); + + expect(leases).toHaveLength(1); + expect(waiters).toHaveLength(11); + }); + it('promotes the earliest waiter on release', async () => { const limits = createLimits( { connectionString: db.connectionString, queueConcurrency: 1 }, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 037b57d66a..a3892f72aa 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -357,6 +357,16 @@ async function getActiveState( return { leases, tokens, waiters }; } +/* +We serialize limit mutations per key inside the transaction so concurrent +acquire/release flows cannot both observe the same free capacity. +*/ +async function lockLimitKey(tx: Db, key: string): Promise { + await tx.execute( + sql`select pg_advisory_xact_lock(hashtextextended(${key}, 0))` + ); +} + async function promoteWaiters( tx: Db, config: PostgresWorldConfig, @@ -439,6 +449,7 @@ export function createLimits( const parsed = LimitAcquireRequestSchema.parse(request); return drizzle.transaction(async (tx) => { + await lockLimitKey(tx, parsed.key); // Prune expired leases and tokens, promote pre-existing waiters before attempting to acquire a new lease or token. await pruneExpired(tx, parsed.key); await promoteWaiters(tx, config, parsed.key); @@ -552,6 +563,19 @@ export function createLimits( const parsed = LimitReleaseRequestSchema.parse(request); await drizzle.transaction(async (tx) => { + const key = + parsed.key ?? 
+ ( + await tx.query.limitLeases.findFirst({ + columns: { limitKey: true }, + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }) + )?.limitKey; + + if (key) { + await lockLimitKey(tx, key); + } + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); if (parsed.key) { where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; @@ -587,6 +611,8 @@ export function createLimits( }); } + await lockLimitKey(tx, existing.limitKey); + const now = Date.now(); const currentExpiry = toMillis(existing.expiresAt); const ttlMs = diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index 2bb21aa380..ef27f70052 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -31,7 +31,7 @@ export async function createPostgresTestDb(): Promise { env: process.env, }); - const sql = postgres(connectionString, { max: 1 }); + const sql = postgres(connectionString, { max: 10 }); const drizzle = createClient(sql); return { From dc85a46607b57a390d97ebb534cbb0d062320373 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 22:16:06 -0400 Subject: [PATCH 08/34] Add in-step locking support - doesn't hang the step though Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 55 ++++++ .../core/src/runtime/step-handler.test.ts | 155 ++++++++++++++++- packages/core/src/runtime/step-handler.ts | 159 +++++++++++++++++- packages/core/src/step.test.ts | 36 ++++ packages/core/src/step.ts | 7 +- packages/core/src/step/context-storage.ts | 2 + packages/core/src/step/lock.ts | 23 ++- packages/world-local/src/storage.test.ts | 77 +++++++++ .../world-local/src/storage/events-storage.ts | 32 +++- packages/world-postgres/src/limits.test.ts | 158 +++++++++++++++++ packages/world-postgres/src/limits.ts | 47 ++++++ packages/world-postgres/src/storage.ts | 61 ++++++- packages/world-postgres/test/storage.test.ts | 77 +++++++++ packages/world/FLOW_LIMITS.md | 8 +- 
packages/world/src/events.ts | 17 ++ workbench/example/workflows/99_e2e.ts | 32 ++++ 16 files changed, 926 insertions(+), 20 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index e4d4379259..a1979f1bc6 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -612,6 +612,61 @@ describe('e2e', () => { ); } + test( + 'stepLockNoRetriesContentionWorkflow does not consume retries while blocked on a step lock', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 750]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 750]); + + const [resultA, resultB] = await Promise.all([ + runA.returnValue, + runB.returnValue, + ]); + const [firstResult, secondResult] = [resultA, resultB].sort( + (left, right) => left.acquiredAt - right.acquiredAt + ); + + expect(resultA.attempt).toBe(1); + expect(resultB.attempt).toBe(1); + expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( + firstResult.releasedAt + ); + } + ); + + if (isPostgresWorld) { + test( + 'cancelled workflow waiters are skipped before the next waiter is promoted', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 500]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 500]); + await sleep(200); + await cliCancel(runB.runId); + const cancelledError = await runB.returnValue.catch((error) => error); + const runC = await start(workflow, ['shared-user', 500]); + + const [resultA, resultC] = await Promise.all([ + runA.returnValue, + runC.returnValue, + ]); + + expect(cancelledError).toBeInstanceOf(WorkflowRunCancelledError); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + 
).toBeLessThan(4_000); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index ee4df5ea88..78050bc819 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -6,9 +6,14 @@ import { StepLockBlockedError } from '../step/lock.js'; const { capturedHandlerRef, mockEventsCreate, + mockEventsListByCorrelationId, + mockLimitsAcquire, + mockLimitsHeartbeat, + mockLimitsRelease, mockQueue, mockRuntimeLogger, mockStepLogger, + mockStepGet, mockQueueMessage, mockStepFn, } = vi.hoisted(() => { @@ -20,6 +25,14 @@ const { current: null as null | ((...args: unknown[]) => Promise), }, mockEventsCreate: vi.fn(), + mockEventsListByCorrelationId: vi.fn().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }), + mockLimitsAcquire: vi.fn(), + mockLimitsHeartbeat: vi.fn(), + mockLimitsRelease: vi.fn().mockResolvedValue(undefined), mockQueue: vi.fn().mockResolvedValue({ messageId: 'msg_test' }), mockRuntimeLogger: { warn: vi.fn(), @@ -34,6 +47,16 @@ const { error: vi.fn(), }, mockQueueMessage: vi.fn().mockResolvedValue(undefined), + mockStepGet: vi.fn().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }), mockStepFn, }; }); @@ -49,7 +72,18 @@ vi.mock('@vercel/functions', () => ({ // Mock the world module - createQueueHandler captures the handler vi.mock('./world.js', () => ({ getWorld: vi.fn(() => ({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + 
}, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), })), @@ -204,9 +238,38 @@ describe('step-handler 409 handling', () => { mockStepFn.mockReset().mockResolvedValue('step-result'); mockStepFn.maxRetries = 3; mockQueueMessage.mockResolvedValue(undefined); + mockEventsListByCorrelationId.mockReset().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }); + mockLimitsAcquire.mockReset(); + mockLimitsHeartbeat.mockReset(); + mockLimitsRelease.mockReset().mockResolvedValue(undefined); + mockStepGet.mockReset().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }); // Re-set getWorld mock since clearAllMocks resets it vi.mocked(getWorld).mockReturnValue({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + }, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), } as any); @@ -234,7 +297,17 @@ describe('step-handler 409 handling', () => { input: [], }, }); - mockStepFn.mockRejectedValue(new StepLockBlockedError(2_500)); + mockStepFn.mockRejectedValue( + new StepLockBlockedError( + { + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }, + 2_500 + ) + ); const result = await capturedHandler( createMessage(), @@ -243,7 +316,83 @@ describe('step-handler 409 handling', () => { expect(result).toEqual({ timeoutSeconds: 3 }); expect(mockQueueMessage).not.toHaveBeenCalled(); + expect(mockEventsCreate).toHaveBeenCalledTimes(2); + expect(mockEventsCreate).toHaveBeenNthCalledWith( + 1, + 'wrun_test123', + 
expect.objectContaining({ + eventType: 'step_started', + }), + expect.anything() + ); + expect(mockEventsCreate).toHaveBeenNthCalledWith( + 2, + 'wrun_test123', + expect.objectContaining({ + eventType: 'step_deferred', + correlationId: 'step_abc', + eventData: { + retryAfter: expect.any(Date), + lockRequest: expect.objectContaining({ + key: expect.any(String), + holderId: 'stplock_wrun_test123:step_abc:0', + }), + }, + }), + expect.anything() + ); + }); + + it('rechecks a deferred lock before step_started and re-defers without running user code', async () => { + mockEventsListByCorrelationId.mockResolvedValue({ + data: [ + { + eventId: 'evnt_1', + runId: 'wrun_test123', + eventType: 'step_deferred', + correlationId: 'step_abc', + eventData: { + retryAfter: new Date(Date.now() - 1_000), + lockRequest: { + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }, + }, + createdAt: new Date(), + }, + ], + cursor: null, + hasMore: false, + }); + mockLimitsAcquire.mockResolvedValue({ + status: 'blocked', + reason: 'concurrency', + retryAfterMs: 2_500, + }); + + const result = await capturedHandler( + createMessage(), + createMetadata('myStep') + ); + + expect(result).toEqual({ timeoutSeconds: 3 }); + expect(mockStepFn).not.toHaveBeenCalled(); + expect(mockLimitsAcquire).toHaveBeenCalledWith({ + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); expect(mockEventsCreate).toHaveBeenCalledTimes(1); + expect(mockEventsCreate).toHaveBeenCalledWith( + 'wrun_test123', + expect.objectContaining({ + eventType: 'step_deferred', + }), + expect.anything() + ); }); afterEach(() => { diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index fd3c1292dc..1b2257f256 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts 
@@ -7,7 +7,12 @@ import { } from '@workflow/errors'; import { pluralize } from '@workflow/utils'; import { getPort } from '@workflow/utils/get-port'; -import { SPEC_VERSION_CURRENT, StepInvokePayloadSchema } from '@workflow/world'; +import { + LimitAcquireRequestSchema, + SPEC_VERSION_CURRENT, + StepInvokePayloadSchema, + type LimitLease, +} from '@workflow/world'; import { importKey } from '../encryption.js'; import { runtimeLogger, stepLogger } from '../logger.js'; import { getStepFunction } from '../private.js'; @@ -43,6 +48,65 @@ import { getWorld, getWorldHandlers } from './world.js'; const DEFAULT_STEP_MAX_RETRIES = 3; +async function getDeferredStepLock( + world: ReturnType, + workflowRunId: string, + stepId: string +) { + let step: Awaited>; + try { + step = await world.steps.get(workflowRunId, stepId); + } catch (error) { + if (WorkflowAPIError.is(error) && error.status === 404) { + return null; + } + throw error; + } + if (step.status !== 'pending') { + return null; + } + + const result = await world.events.listByCorrelationId({ + correlationId: stepId, + pagination: { + limit: 1, + sortOrder: 'desc', + }, + }); + const latestEvent = result.data[0]; + + if ( + !latestEvent || + latestEvent.runId !== workflowRunId || + latestEvent.eventType !== 'step_deferred' || + !latestEvent.eventData.lockRequest + ) { + return null; + } + + return { + step, + lockRequest: LimitAcquireRequestSchema.parse( + latestEvent.eventData.lockRequest + ), + }; +} + +async function releaseUnusedPreAcquiredLocks( + world: ReturnType, + preAcquiredLocks: Record +) { + await Promise.all( + Object.values(preAcquiredLocks).map((lease) => + world.limits.release({ + leaseId: lease.leaseId, + key: lease.key, + holderId: lease.holderId, + }) + ) + ); +} + const stepHandler = getWorldHandlers().createQueueHandler( '__wkf_step_', async (message_, metadata) => { @@ -114,6 +178,56 @@ const stepHandler = getWorldHandlers().createQueueHandler( ...Attribute.StepTracePropagated(!!traceContext), 
}); + const preAcquiredLocks: Record = {}; + const deferredStepLock = await getDeferredStepLock( + world, + workflowRunId, + stepId + ); + if (deferredStepLock) { + const retryAfter = deferredStepLock.step.retryAfter; + if (retryAfter && retryAfter.getTime() > Date.now()) { + const timeoutSeconds = Math.max( + 1, + Math.ceil((retryAfter.getTime() - Date.now()) / 1000) + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + return { timeoutSeconds }; + } + + const lockResult = await world.limits.acquire( + deferredStepLock.lockRequest + ); + if (lockResult.status === 'blocked') { + const retryAfterMs = Math.max(1, lockResult.retryAfterMs ?? 1000); + const timeoutSeconds = Math.max( + 1, + Math.ceil(retryAfterMs / 1000) + ); + await world.events.create( + workflowRunId, + { + eventType: 'step_deferred', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + retryAfter: new Date(Date.now() + retryAfterMs), + lockRequest: deferredStepLock.lockRequest, + }, + }, + { requestId } + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + return { timeoutSeconds }; + } + + preAcquiredLocks[lockResult.lease.holderId] = lockResult.lease; + } + // step_started validates state and returns the step entity, so no separate // world.steps.get() call is needed. The server checks: // - Step not in terminal state (returns 409) @@ -140,6 +254,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( } catch (err) { if (WorkflowAPIError.is(err)) { if (err.status === 429) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryRetryAfter = Math.max( 1, typeof err.retryAfter === 'number' ? 
err.retryAfter : 1 @@ -154,6 +269,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( } // 410 Gone: Workflow has already completed if (err.status === 410) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.info( `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` ); @@ -163,6 +279,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // 409 Conflict: Step in terminal state (completed/failed/cancelled) // Re-enqueue the workflow to continue processing if (err.status === 409) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.debug( 'Step in terminal state, re-enqueuing workflow', { @@ -194,6 +311,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // 425 Too Early: retryAfter timestamp not reached yet // Return timeout to queue so it retries later if (err.status === 425) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); // Parse retryAfter from error response meta const retryAfterStr = (err as any).meta?.retryAfter; const retryAfter = retryAfterStr @@ -413,6 +531,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( closureVars: hydratedInput.closureVars, encryptionKey, lockCounter: 0, + preAcquiredLocks, }, () => stepFn.apply(thisVal, args) ); @@ -427,6 +546,8 @@ const stepHandler = getWorldHandlers().createQueueHandler( } catch (err) { userCodeError = err; userCodeFailed = true; + } finally { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); } const executionTimeMs = Date.now() - executionStartTime; @@ -439,10 +560,12 @@ const stepHandler = getWorldHandlers().createQueueHandler( const err = userCodeError; if (StepLockBlockedError.is(err)) { + const retryAfterMs = Math.max(1, err.retryAfterMs ?? 1000); const timeoutSeconds = Math.max( 1, - Math.ceil((err.retryAfterMs ?? 
1000) / 1000) + Math.ceil(retryAfterMs / 1000) ); + const retryAfter = new Date(Date.now() + retryAfterMs); span?.setAttributes({ ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), }); @@ -451,6 +574,38 @@ const stepHandler = getWorldHandlers().createQueueHandler( 'step.id': stepId, 'step.name': stepName, }); + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_deferred', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + retryAfter, + lockRequest: err.request, + }, + }, + { requestId } + ); + } catch (stepDeferredErr) { + if ( + WorkflowAPIError.is(stepDeferredErr) && + stepDeferredErr.status === 409 + ) { + runtimeLogger.info( + 'Tried deferring step, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: stepDeferredErr.message, + } + ); + return; + } + throw stepDeferredErr; + } return { timeoutSeconds }; } diff --git a/packages/core/src/step.test.ts b/packages/core/src/step.test.ts index a8f080e0b7..5a0e47af56 100644 --- a/packages/core/src/step.test.ts +++ b/packages/core/src/step.test.ts @@ -412,6 +412,42 @@ describe('createUseStep', () => { expect(ctx.invocationsQueue.size).toBe(1); }); + it('should consume step_deferred event and continue waiting', async () => { + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_0', + runId: 'wrun_123', + eventType: 'step_deferred', + correlationId: 'step_01K11TFZ62YS0YYFDQ3E8B9YCV', + eventData: { + retryAfter: new Date(), + }, + createdAt: new Date(), + }, + ]); + + let workflowErrorReject: (err: Error) => void; + const workflowErrorPromise = new Promise((_, reject) => { + workflowErrorReject = reject; + }); + ctx.onWorkflowError = (err) => { + workflowErrorReject(err); + }; + + const useStep = createUseStep(ctx); + const add = useStep('add'); + + let error: Error | undefined; + try { + await Promise.race([add(1, 2), workflowErrorPromise]); + } catch (err_) { + error = err_ as Error; + } + + 
expect(error).toBeInstanceOf(WorkflowSuspension); + expect(ctx.invocationsQueue.size).toBe(1); + }); + it('should remove queue item when step_completed (terminal state)', async () => { const ctx = setupWorkflowContext([ { diff --git a/packages/core/src/step.ts b/packages/core/src/step.ts index bd45c3008c..3cc9e59ce4 100644 --- a/packages/core/src/step.ts +++ b/packages/core/src/step.ts @@ -96,7 +96,7 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Finished; } queueItem.hasCreatedEvent = true; - // Continue waiting for step_started/step_completed/step_failed events + // Continue waiting for later step lifecycle events. return EventConsumerResult.Consumed; } @@ -112,6 +112,11 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Consumed; } + if (event.eventType === 'step_deferred') { + // Admission was blocked before user work could proceed, so keep waiting. + return EventConsumerResult.Consumed; + } + if (event.eventType === 'step_failed') { // Terminal state - we can remove the invocationQueue item ctx.invocationsQueue.delete(event.correlationId); diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index dadb25b132..b63329dd20 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,4 +1,5 @@ import { AsyncLocalStorage } from 'node:async_hooks'; +import type { LimitLease } from '@workflow/world'; import type { CryptoKey } from '../encryption.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,4 +11,5 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ closureVars?: Record; encryptionKey?: CryptoKey; lockCounter: number; + preAcquiredLocks?: Record; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index 6aa59132a5..b537cc7503 
100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -1,14 +1,16 @@ -import type { LimitLease, World } from '@workflow/world'; +import type { LimitAcquireRequest, LimitLease, World } from '@workflow/world'; import type { LockHandle, LockOptions } from '../lock.js'; import { contextStorage } from './context-storage.js'; export class StepLockBlockedError extends Error { retryAfterMs?: number; + request: LimitAcquireRequest; - constructor(retryAfterMs?: number) { + constructor(request: LimitAcquireRequest, retryAfterMs?: number) { super('Step lock blocked'); this.name = 'StepLockBlockedError'; this.retryAfterMs = retryAfterMs; + this.request = request; } static is(value: unknown): value is StepLockBlockedError { @@ -71,13 +73,22 @@ export function createStepLock(world: World) { concurrency: options.concurrency, rate: options.rate, }; - - const result = await world.limits.acquire({ + const request = { key: options.key, holderId, definition, leaseTtlMs: options.leaseTtlMs, - }); + } satisfies LimitAcquireRequest; + + const preAcquiredLease = store.preAcquiredLocks?.[holderId]; + if (preAcquiredLease) { + if (store.preAcquiredLocks) { + delete store.preAcquiredLocks[holderId]; + } + return createStepLockHandle(preAcquiredLease, world); + } + + const result = await world.limits.acquire(request); if (result.status === 'acquired') { return createStepLockHandle(result.lease, world); @@ -87,6 +98,6 @@ export function createStepLock(world: World) { Steps do not sit inside user code polling for a lease. The runtime catches this and re-queues the step attempt at the boundary. 
*/ - throw new StepLockBlockedError(result.retryAfterMs); + throw new StepLockBlockedError(request, result.retryAfterMs); }; } diff --git a/packages/world-local/src/storage.test.ts b/packages/world-local/src/storage.test.ts index 89600b7fa3..b1abcdbce3 100644 --- a/packages/world-local/src/storage.test.ts +++ b/packages/world-local/src/storage.test.ts @@ -2452,6 +2452,83 @@ describe('Storage', () => { }); }); + describe('step_deferred event handling', () => { + let testRunId: string; + + beforeEach(async () => { + const run = await createRun(storage, { + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + input: new Uint8Array(), + }); + testRunId = run.runId; + }); + + it('should roll back the first blocked attempt without recording an error', async () => { + await createStep(storage, testRunId, { + stepId: 'step_deferred_1', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep(storage, testRunId, 'step_deferred_1', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await storage.events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_1', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 0, + startedAt: undefined, + retryAfter, + error: undefined, + }); + }); + + it('should preserve the original startedAt after a prior real attempt', async () => { + await createStep(storage, testRunId, { + stepId: 'step_deferred_2', + stepName: 'test-step', + input: new Uint8Array(), + }); + + const started1 = await updateStep( + storage, + testRunId, + 'step_deferred_2', + 'step_started' + ); + await storage.events.create(testRunId, { + eventType: 'step_retrying', + correlationId: 'step_deferred_2', + eventData: { error: 'Temporary failure' }, + }); + await updateStep(storage, testRunId, 'step_deferred_2', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await 
storage.events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_2', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 1, + retryAfter, + error: undefined, + }); + expect(result.step?.startedAt).toEqual(started1.startedAt); + }); + }); + describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(storage, { diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index 93e176030a..0f741e018b 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -101,11 +101,15 @@ export function createEventsStorage( ['completed', 'failed', 'cancelled'].includes(status); // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed and step_retrying - they only operate + // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves filesystem reads per step event. let currentRun: WorkflowRun | null = null; - const skipRunValidationEvents = ['step_completed', 'step_retrying']; + const skipRunValidationEvents = [ + 'step_completed', + 'step_deferred', + 'step_retrying', + ]; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -123,7 +127,7 @@ export function createEventsStorage( // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). 
+ // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -214,6 +218,7 @@ export function createEventsStorage( 'step_started', 'step_completed', 'step_failed', + 'step_deferred', 'step_retrying', ]; if (stepEvents.includes(data.eventType) && data.correlationId) { @@ -606,6 +611,27 @@ export function createEventsStorage( { overwrite: true } ); } + } else if (data.eventType === 'step_deferred' && 'eventData' in data) { + // step_deferred: returns the step to pending without recording a failure + if (validatedStep) { + const stepCompositeKey = `${effectiveRunId}-${data.correlationId}`; + const rolledBackAttempt = Math.max(0, validatedStep.attempt - 1); + step = { + ...validatedStep, + status: 'pending', + attempt: rolledBackAttempt, + startedAt: + rolledBackAttempt === 0 ? undefined : validatedStep.startedAt, + error: undefined, + retryAfter: data.eventData.retryAfter, + updatedAt: now, + }; + await writeJSON( + taggedPath(basedir, 'steps', stepCompositeKey, tag), + step, + { overwrite: true } + ); + } } else if (data.eventType === 'step_retrying' && 'eventData' in data) { // step_retrying: Sets status back to 'pending', records error // Reuse validatedStep from validation (already read above) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 01d0605c9b..e54b9e8010 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -143,5 +143,163 @@ if (process.platform === 'win32') { }); expect(stillWaiting.status).toBe('blocked'); }); + + it('skips cancelled workflow waiters before promotion', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await db.drizzle.insert(Schema.runs).values([ + { + runId: 
'wrun_dead_workflow', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'cancelled', + }, + ]); + + const first = await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-a', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'wflock_wrun_dead_workflow:limitwait_dead', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-live', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitLeases.acquiredAt)); + const tokens = await db.drizzle + .select({ holderId: Schema.limitTokens.holderId }) + .from(Schema.limitTokens) + .where(eq(Schema.limitTokens.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitTokens.acquiredAt)); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitWaiters.createdAt)); + + expect(leases).toEqual([{ holderId: 'holder-live' }]); + expect(tokens).toEqual([ + { holderId: first.lease.holderId }, + { holderId: 'holder-live' }, + ]); + expect(waiters).toEqual([]); + }); + + it('skips failed step waiters before promotion', async () => { + const limits = 
createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await db.drizzle.insert(Schema.runs).values([ + { + runId: 'wrun_dead_step', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'running', + startedAt: new Date(), + }, + { + runId: 'wrun_live_step', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'running', + startedAt: new Date(), + }, + ]); + await db.drizzle.insert(Schema.steps).values([ + { + runId: 'wrun_dead_step', + stepId: 'step_dead', + stepName: 'test-step', + status: 'failed', + attempt: 1, + }, + { + runId: 'wrun_live_step', + stepId: 'step_live', + stepName: 'test-step', + status: 'pending', + attempt: 0, + }, + ]); + + const first = await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'stplock_wrun_dead_step:step_dead:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'holder-live', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitLeases.acquiredAt)); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitWaiters.createdAt)); + + expect(leases).toEqual([{ holderId: 
'holder-live' }]); + expect(waiters).toEqual([]); + }); }); } diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index a3892f72aa..d36be6c695 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -367,6 +367,46 @@ async function lockLimitKey(tx: Db, key: string): Promise { ); } +async function isHolderLive(tx: Db, holderId: string): Promise { + const target = parseHolderId(holderId); + if (target.kind === 'opaque') { + return true; + } + + if (target.kind === 'workflow') { + const [run] = (await tx + .select({ + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as Pick[]; + + return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); + } + + const [step] = (await tx + .select({ + status: Schema.steps.status, + }) + .from(Schema.steps) + .where(eq(Schema.steps.stepId, target.stepId)) + .limit(1)) as Pick[]; + if (!step || ['completed', 'failed'].includes(step.status)) { + return false; + } + + const [run] = (await tx + .select({ + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as Pick[]; + + return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); +} + async function promoteWaiters( tx: Db, config: PostgresWorldConfig, @@ -381,6 +421,13 @@ async function promoteWaiters( let activeTokens = state.tokens.length; for (const waiter of state.waiters) { + if (!(await isHolderLive(tx, waiter.holderId))) { + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + continue; + } + const concurrencyBlocked = waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; const rateBlocked = diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 8d59f929af..cdffba87a9 100644 --- a/packages/world-postgres/src/storage.ts +++ 
b/packages/world-postgres/src/storage.ts @@ -354,12 +354,16 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // ============================================================ // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed and step_retrying - they only operate + // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves database queries per step event. let currentRun: { status: string; specVersion: number | null } | null = null; - const skipRunValidationEvents = ['step_completed', 'step_retrying']; + const skipRunValidationEvents = [ + 'step_completed', + 'step_deferred', + 'step_retrying', + ]; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -375,7 +379,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). 
if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -472,7 +476,11 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { startedAt: Date | null; retryAfter: Date | null; } | null = null; - const stepEventsNeedingValidation = ['step_started', 'step_retrying']; + const stepEventsNeedingValidation = [ + 'step_started', + 'step_deferred', + 'step_retrying', + ]; if ( stepEventsNeedingValidation.includes(data.eventType) && data.correlationId @@ -928,6 +936,51 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } + // Handle step_deferred event: returns the step to pending without recording a failure + if (data.eventType === 'step_deferred') { + const eventData = (data as any).eventData as { + retryAfter?: Date; + }; + + const [stepValue] = await drizzle + .update(Schema.steps) + .set({ + status: 'pending', + attempt: sql`GREATEST(${Schema.steps.attempt} - 1, 0)`, + startedAt: sql`CASE WHEN ${Schema.steps.attempt} <= 1 THEN NULL ELSE ${Schema.steps.startedAt} END`, + error: null, + retryAfter: eventData.retryAfter, + }) + .where( + and( + eq(Schema.steps.runId, effectiveRunId), + eq(Schema.steps.stepId, data.correlationId!), + notInArray(Schema.steps.status, terminalStepStatuses) + ) + ) + .returning(); + if (stepValue) { + step = deserializeStepError(compact(stepValue)); + } else { + const [existing] = await getStepForValidation.execute({ + runId: effectiveRunId, + stepId: data.correlationId!, + }); + if (!existing) { + throw new WorkflowAPIError( + `Step "${data.correlationId}" not found`, + { status: 404 } + ); + } + if (isStepTerminal(existing.status)) { + throw new WorkflowAPIError( + `Cannot modify step in terminal state "${existing.status}"`, + { status: 409 } + ); + } + } + } + // Handle step_retrying event: sets status back to 'pending', records error // Uses conditional UPDATE to prevent retrying an already-terminal step. 
if (data.eventType === 'step_retrying') { diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index 5a59b99cde..30d9e7cbb2 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1807,6 +1807,83 @@ describe('Storage (Postgres integration)', () => { }); }); + describe('step_deferred event handling', () => { + let testRunId: string; + + beforeEach(async () => { + const run = await createRun(events, { + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + input: new Uint8Array(), + }); + testRunId = run.runId; + }); + + it('should roll back the first blocked attempt without recording an error', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_1', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep(events, testRunId, 'step_deferred_1', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_1', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 0, + startedAt: undefined, + retryAfter, + error: undefined, + }); + }); + + it('should preserve the original startedAt after a prior real attempt', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_2', + stepName: 'test-step', + input: new Uint8Array(), + }); + + const started1 = await updateStep( + events, + testRunId, + 'step_deferred_2', + 'step_started' + ); + await events.create(testRunId, { + eventType: 'step_retrying', + correlationId: 'step_deferred_2', + eventData: { error: 'Temporary failure' }, + }); + await updateStep(events, testRunId, 'step_deferred_2', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 
'step_deferred_2', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 1, + retryAfter, + error: undefined, + }); + expect(result.step?.startedAt).toEqual(started1.startedAt); + }); + }); + describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(events, { diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 769f78ba74..8306576d8a 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -127,6 +127,11 @@ The current behavior is: - the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes +Important caveat: + +- zero-attempt semantics are only guaranteed when `lock()` is used as a top-of-step admission gate +- calling `lock()` after side effects or meaningful user work is unsupported/best-effort + This means step `lock()` is conceptually the same API, but it is not a literal "spin inside already-running user step code until capacity appears" implementation. 
@@ -187,7 +192,8 @@ Important details: - FIFO is per key, not global across all limit keys - promotion order is based on waiter creation order -- a waiter may be skipped if it is no longer eligible when promotion runs +- dead or terminal waiters are pruned before promotion +- a live waiter may still be skipped if it is no longer eligible when promotion runs - releasing a lease or reclaiming an expired lease can both trigger promotion - rate-window expiry can also make the head waiter eligible again diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index 2965906f7b..eac141c1f7 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,4 +1,5 @@ import { z } from 'zod'; +import { LimitAcquireRequestSchema } from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -64,6 +65,7 @@ export const EventTypeSchema = z.enum([ 'step_created', 'step_completed', 'step_failed', + 'step_deferred', 'step_retrying', 'step_started', // Hook lifecycle events @@ -109,6 +111,19 @@ const StepFailedEventSchema = BaseEventSchema.extend({ }), }); +/** + * Event created when a step is blocked on admission and should be retried + * without counting the blocked attempt against maxRetries. + */ +const StepDeferredEventSchema = BaseEventSchema.extend({ + eventType: z.literal('step_deferred'), + correlationId: z.string(), + eventData: z.object({ + retryAfter: z.coerce.date().optional(), + lockRequest: LimitAcquireRequestSchema.optional(), + }), +}); + /** * Event created when a step fails and will be retried. * Sets the step status back to 'pending' and records the error. 
@@ -272,6 +287,7 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, + StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events @@ -296,6 +312,7 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, + StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 985e9331e4..f7d43aab9f 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -308,6 +308,38 @@ export async function workflowLockContentionWorkflow( }; } +async function stepLockNoRetriesStep(label: string, holdMs: number) { + 'use step'; + + await using _stepLock = await lock({ + key: 'step:db:no-retries', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const metadata = getStepMetadata(); + const acquiredAt = Date.now(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); + const releasedAt = Date.now(); + + return { + label, + attempt: metadata.attempt, + acquiredAt, + releasedAt, + }; +} +stepLockNoRetriesStep.maxRetries = 0; + +export async function stepLockNoRetriesContentionWorkflow( + userId = 'user-123', + holdMs = 750 +) { + 'use workflow'; + + return await stepLockNoRetriesStep(userId, holdMs); +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 27486dc2539e72e07091a01b34690daf9d094c59 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 23:49:42 -0400 Subject: [PATCH 09/34] add new errors Signed-off-by: nathancolosimo --- packages/world-local/src/limits.test.ts | 19 ++++++++++ packages/world-local/src/limits.ts | 6 +-- packages/world-postgres/src/limits.test.ts | 14 +++++++ packages/world-postgres/src/limits.ts | 6 
+-- packages/world-postgres/src/storage.ts | 10 ++--- packages/world-postgres/test/storage.test.ts | 40 ++++++++++++++++++++ 6 files changed, 81 insertions(+), 14 deletions(-) diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 3f8351f99b..d6bde93f45 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,4 +1,6 @@ +import { WorkflowWorldError } from '@workflow/errors'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { describe, expect, it } from 'vitest'; import { createLocalWorld } from './index.js'; import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; @@ -17,3 +19,20 @@ createLimitsContractSuite('local world limits', async () => { }, }; }); + +describe('local limits', () => { + it('throws WorkflowWorldError when heartbeating a missing lease', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 9dfac5d931..e577e3bfea 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,5 +1,5 @@ import path from 'node:path'; -import { WorkflowAPIError } from '@workflow/errors'; +import { WorkflowWorldError } from '@workflow/errors'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -291,9 +291,7 @@ export function createLimits(dataDir: string, tag?: string): Limits { return updatedLease; } - throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { - status: 404, - }); + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); }); }, }; diff --git 
a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index e54b9e8010..35358b9f15 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,4 +1,5 @@ import { asc, eq } from 'drizzle-orm'; +import { WorkflowWorldError } from '@workflow/errors'; import { afterAll, beforeAll, @@ -42,6 +43,19 @@ if (process.platform === 'win32') { }); describe('postgres waiter promotion', () => { + it('throws WorkflowWorldError when heartbeating a missing lease', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await expect( + limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + }); + it('serializes concurrent acquires for the same key', async () => { const limits = createLimits( { connectionString: db.connectionString, queueConcurrency: 1 }, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index d36be6c695..7e58f682f6 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -1,6 +1,6 @@ import { JsonTransport } from '@vercel/queue'; import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; -import { WorkflowAPIError } from '@workflow/errors'; +import { WorkflowWorldError } from '@workflow/errors'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -653,9 +653,7 @@ export function createLimits( }); if (!existing) { - throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { - status: 404, - }); + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); } await lockLimitKey(tx, existing.limitKey); diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 1afcbc6c2f..5fa5adac83 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -944,15 +944,13 @@ export 
function createEventsStorage(drizzle: Drizzle): Storage['events'] { stepId: data.correlationId!, }); if (!existing) { - throw new WorkflowAPIError( - `Step "${data.correlationId}" not found`, - { status: 404 } + throw new WorkflowWorldError( + `Step "${data.correlationId}" not found` ); } if (isStepTerminal(existing.status)) { - throw new WorkflowAPIError( - `Cannot modify step in terminal state "${existing.status}"`, - { status: 409 } + throw new EntityConflictError( + `Cannot modify step in terminal state "${existing.status}"` ); } } diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index ded60138f0..8b2328c4c1 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1,5 +1,6 @@ import { execSync } from 'node:child_process'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; import type { Hook, Step, WorkflowRun } from '@workflow/world'; import { encode } from 'cbor-x'; import postgres from 'postgres'; @@ -1882,6 +1883,45 @@ describe('Storage (Postgres integration)', () => { }); expect(result.step?.startedAt).toEqual(started1.startedAt); }); + + it('throws WorkflowWorldError when step_deferred targets a missing step', async () => { + await expect( + events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_missing_deferred', + eventData: { + retryAfter: new Date(Date.now() + 5_000), + }, + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + }); + + it('throws EntityConflictError when step_deferred targets a terminal step', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_terminal', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep( + events, + testRunId, + 'step_deferred_terminal', + 'step_failed', + { + error: 'already failed', + } + ); + + await expect( + events.create(testRunId, { + 
eventType: 'step_deferred', + correlationId: 'step_deferred_terminal', + eventData: { + retryAfter: new Date(Date.now() + 5_000), + }, + }) + ).rejects.toBeInstanceOf(EntityConflictError); + }); }); describe('run cancellation with in-flight entities', () => { From 8c683a313abae7835d6b9a7a9fedf89b24891c3c Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 00:49:17 -0400 Subject: [PATCH 10/34] Increase ttl times for flaky tests on slow runners Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 2acfbc1d72..5037039e83 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -129,7 +129,7 @@ export function createLimitsContractSuite( key: 'workflow:user:123', holderId: 'holder-a', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 100, + leaseTtlMs: 500, }); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') @@ -137,13 +137,13 @@ export function createLimitsContractSuite( const heartbeat = await harness.limits.heartbeat({ leaseId: first.lease.leaseId, - ttlMs: 200, + ttlMs: 1_000, }); expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( first.lease.expiresAt?.getTime() ?? 
0 ); - await sleep(250); + await sleep(1_100); const second = await harness.limits.acquire({ key: 'workflow:user:123', From 71de1c584fa00f0af8e9e1e5f2c5d0447f71eac1 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 09:37:04 -0400 Subject: [PATCH 11/34] fix e2e test Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index a1979f1bc6..f6eb8d1d45 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -16,6 +16,7 @@ import { } from 'vitest'; import type { Run } from '../src/runtime'; import { + cancelRun, getHookByToken, getRun, getWorld, @@ -643,13 +644,13 @@ describe('e2e', () => { { timeout: 60_000 }, async () => { const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 500]); + const runA = await start(workflow, ['shared-user', 1_500]); await sleep(100); - const runB = await start(workflow, ['shared-user', 500]); - await sleep(200); - await cliCancel(runB.runId); + const runB = await start(workflow, ['shared-user', 1_500]); + await sleep(100); + await cancelRun(getWorld(), runB.runId); const cancelledError = await runB.returnValue.catch((error) => error); - const runC = await start(workflow, ['shared-user', 500]); + const runC = await start(workflow, ['shared-user', 1_500]); const [resultA, resultC] = await Promise.all([ runA.returnValue, From 4ed44fc9cfa93a222cd6b16f2823d97e08d9d519 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 13:45:15 -0400 Subject: [PATCH 12/34] Add FIFO to local and group e2e and contract tests Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 234 +++++----- packages/world-local/README.md | 9 +- packages/world-local/src/index.ts | 9 +- packages/world-local/src/limits.test.ts | 27 +- packages/world-local/src/limits.ts | 419 +++++++++++++++--- 
packages/world-local/src/queue.test.ts | 74 ++-- packages/world-local/src/queue.ts | 360 +++++++++------ packages/world-postgres/README.md | 5 +- packages/world-postgres/src/limits.test.ts | 308 +------------ packages/world-testing/src/index.mts | 2 + packages/world-testing/src/limits-contract.ts | 334 +++++++++++++- packages/world-testing/src/limits-runtime.ts | 218 +++++++++ packages/world/FLOW_LIMITS.md | 76 +++- workbench/example/tsconfig.json | 1 + workbench/example/workflows/99_e2e.ts | 67 ++- workbench/example/workflows/serde-steps.ts | 2 +- 16 files changed, 1450 insertions(+), 695 deletions(-) create mode 100644 packages/world-testing/src/limits-runtime.ts diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index f6eb8d1d45..5f7aefc97e 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -14,7 +14,8 @@ import { expect, test, } from 'vitest'; -import type { Run } from '../src/runtime'; +import { createLimitsRuntimeSuite } from '../../world-testing/src/limits-runtime.js'; +import type { Run, StartOptions } from '../src/runtime.js'; import { cancelRun, getHookByToken, @@ -23,7 +24,7 @@ import { healthCheck, start as rawStart, resumeHook, -} from '../src/runtime'; +} from '../src/runtime.js'; import { cliCancel, cliHealthJson, @@ -50,10 +51,25 @@ if (!deploymentUrl) { * Tracked wrapper around start() that automatically registers runs * for diagnostics on test failure and observability metadata collection. 
*/ -async function start( - ...args: Parameters> -): Promise> { - const run = await rawStart(...args); +type E2EWorkflowMetadata = Awaited>; + +async function start( + workflow: E2EWorkflowMetadata, + options?: StartOptions +): Promise>; +async function start( + workflow: E2EWorkflowMetadata, + args: TArgs, + options?: StartOptions +): Promise>; +async function start( + workflow: E2EWorkflowMetadata, + argsOrOptions?: unknown[] | StartOptions, + options?: StartOptions +): Promise> { + const run = Array.isArray(argsOrOptions) + ? await rawStart(workflow, argsOrOptions, options) + : await rawStart(workflow, argsOrOptions); trackRun(run); return run; } @@ -229,6 +245,90 @@ describe('e2e', () => { // bundled in function const shouldSkipReactRenderTest = !(isNext && isLocal); + if (isLocalWorld || isPostgresWorld) { + createLimitsRuntimeSuite( + `limits runtime (${isPostgresWorld ? 'postgres' : 'local'})`, + async () => ({ + async runWorkflowWithWorkflowAndStepLocks(userId) { + const run = await start( + await e2e('workflowWithWorkflowAndStepLocks'), + [userId] + ); + return await run.returnValue; + }, + async runWorkflowLockContention(userId, holdMs) { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs]); + await sleep(100); + const runB = await start(workflow, [userId, holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runStepLockNoRetriesContention(userId, holdMs) { + const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + const runC = await start(workflow, [userId, holdMs, 'C']); + return await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + }, + async runWorkflowLockAcrossSuspension(userId, holdMs) { + const workflow = await 
e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowRateLimitContention(userId, holdMs, periodMs) { + const workflow = await e2e('workflowRateLimitContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, periodMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowFifoThreeWaiters(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + const runC = await start(workflow, [userId, holdMs, 'C']); + return await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + }, + async runCancelledWorkflowWaiter(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + await cancelRun(getWorld(), runB.runId); + const cancelledError = await runB.returnValue.catch((error) => error); + const runC = await start(workflow, [userId, holdMs, 'C']); + const [resultA, resultC] = await Promise.all([ + runA.returnValue, + runC.returnValue, + ]); + return { cancelledError, resultA, resultC }; + }, + async runIndependentWorkflowKeys(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, ['user-a', holdMs]); + await sleep(100); + const runB = await start(workflow, ['user-b', holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + 
}) + ); + } + test.skipIf(shouldSkipReactRenderTest)( 'should work with react rendering in step', async () => { @@ -548,126 +648,6 @@ describe('e2e', () => { expect(elapsed).toBeLessThan(25_000); }); - if (isLocalWorld) { - test( - 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', - { timeout: 60_000 }, - async () => { - const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ - 'local-world', - ]); - const returnValue = await run.returnValue; - - expect(returnValue).toMatchObject({ - workflowKey: 'workflow:user:local-world', - dbKey: 'step:db:cheap', - aiKey: 'step:provider:openai', - summary: 'summary:profile:local-world', - }); - } - ); - } - - if (isPostgresWorld) { - test( - 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', - { timeout: 60_000 }, - async () => { - const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ - 'postgres-world', - ]); - const returnValue = await run.returnValue; - - expect(returnValue).toMatchObject({ - workflowKey: 'workflow:user:postgres-world', - dbKey: 'step:db:cheap', - aiKey: 'step:provider:openai', - summary: 'summary:profile:postgres-world', - }); - } - ); - } - - if (isPostgresWorld) { - test( - 'workflowLockContentionWorkflow serializes workflow and step locks under contention', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 750]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 750]); - - const [resultA, resultB] = await Promise.all([ - runA.returnValue, - runB.returnValue, - ]); - - expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.workflowLockReleasedAt - ); - expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.stepLockReleasedAt - ); - } - ); - } - - test( - 'stepLockNoRetriesContentionWorkflow does not consume retries while blocked on a 
step lock', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 750]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 750]); - - const [resultA, resultB] = await Promise.all([ - runA.returnValue, - runB.returnValue, - ]); - const [firstResult, secondResult] = [resultA, resultB].sort( - (left, right) => left.acquiredAt - right.acquiredAt - ); - - expect(resultA.attempt).toBe(1); - expect(resultB.attempt).toBe(1); - expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( - firstResult.releasedAt - ); - } - ); - - if (isPostgresWorld) { - test( - 'cancelled workflow waiters are skipped before the next waiter is promoted', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 1_500]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 1_500]); - await sleep(100); - await cancelRun(getWorld(), runB.runId); - const cancelledError = await runB.returnValue.catch((error) => error); - const runC = await start(workflow, ['shared-user', 1_500]); - - const [resultA, resultC] = await Promise.all([ - runA.returnValue, - runC.returnValue, - ]); - - expect(cancelledError).toBeInstanceOf(WorkflowRunCancelledError); - expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.workflowLockReleasedAt - ); - expect( - resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt - ).toBeLessThan(4_000); - } - ); - } - test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; @@ -1900,7 +1880,7 @@ describe('e2e', () => { // Cancel the run using the core runtime cancelRun function. // This exercises the same cancelRun code path that the CLI uses // (the CLI delegates directly to this function). 
- const { cancelRun } = await import('../src/runtime'); + const { cancelRun } = await import('../src/runtime.js'); await cancelRun(getWorld(), run.runId); // Verify the run was cancelled - returnValue should throw WorkflowRunCancelledError diff --git a/packages/world-local/README.md b/packages/world-local/README.md index cff6a3354a..fccc554eac 100644 --- a/packages/world-local/README.md +++ b/packages/world-local/README.md @@ -4,6 +4,13 @@ Filesystem-based workflow backend for local development and testing. Stores workflow data as JSON files on disk and provides in-memory queuing. Automatically detects development server port for queue transport. -The `limits` namespace is exposed as part of the shared world contract, but flow concurrency and rate limiting are not implemented in this package yet. +The `limits` namespace implements the shared flow-limits contract for local development: + +- keyed concurrency and rate limits +- FIFO waiter promotion per key +- cancelled workflow / failed step waiter pruning +- prompt wake-ups with delayed fallback retries + +Limit state is persisted on disk, but queue delivery is still in-memory. That means local world matches the same live-process lock semantics as other implemented worlds, while crash-survival and durable backlog behavior remain a PostgreSQL-only advantage today. Used by default on `next dev` and `next start`. 
diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 029154649d..142fe26ccf 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -61,10 +61,15 @@ export function createLocalWorld(args?: Partial): LocalWorld { const mergedConfig = { ...config.value, ...definedArgs }; const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); + const storage = createStorage(mergedConfig.dataDir, tag); return { - limits: createLimits(mergedConfig.dataDir, tag), + limits: createLimits(mergedConfig.dataDir, { + tag, + queue, + storage, + }), ...queue, - ...createStorage(mergedConfig.dataDir, tag), + ...storage, ...createStreamer(mergedConfig.dataDir, tag), async start() { await initDataDir(mergedConfig.dataDir); diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index d6bde93f45..2e248bb516 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,8 +1,5 @@ -import { WorkflowWorldError } from '@workflow/errors'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; -import { describe, expect, it } from 'vitest'; import { createLocalWorld } from './index.js'; -import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; @@ -10,29 +7,17 @@ import path from 'node:path'; createLimitsContractSuite('local world limits', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); const world = createLocalWorld({ dataDir: dir }); + world.registerHandler('__wkf_step_', async () => Response.json({ ok: true })); + world.registerHandler('__wkf_workflow_', async () => + Response.json({ ok: true }) + ); return { - limits: createLimits(dir), + limits: world.limits, + storage: world, close: async () => { await world.close?.(); await rm(dir, { recursive: true, force: true }); }, }; }); - 
-describe('local limits', () => { - it('throws WorkflowWorldError when heartbeating a missing lease', async () => { - const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); - const limits = createLimits(dir); - - try { - await expect( - limits.heartbeat({ - leaseId: 'lmt_missing', - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - } finally { - await rm(dir, { recursive: true, force: true }); - } - }); -}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index e577e3bfea..081b95f63e 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,5 +1,11 @@ import path from 'node:path'; import { WorkflowWorldError } from '@workflow/errors'; +import type { + Queue, + Storage, + WorkflowRunWithoutData, + StepWithoutData, +} from '@workflow/world'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -20,23 +26,56 @@ const LimitTokenSchema = z.object({ expiresAt: z.coerce.date(), }); +const LimitWaiterSchema = z.object({ + waiterId: z.string(), + holderId: z.string(), + createdAt: z.coerce.date(), + leaseTtlMs: z.number().int().positive().optional(), + concurrencyMax: z.number().int().positive().nullable(), + rateCount: z.number().int().positive().nullable(), + ratePeriodMs: z.number().int().positive().nullable(), +}); + const KeyStateSchema = z.object({ key: z.string(), leases: z.array(LimitLeaseSchema), tokens: z.array(LimitTokenSchema), + waiters: z.array(LimitWaiterSchema), }); const LimitsStateSchema = z.object({ - version: z.literal(1), + version: z.literal(2), keys: z.record(z.string(), KeyStateSchema), }); type LimitToken = z.infer; +type LimitWaiter = z.infer; type KeyState = z.infer; type LimitsState = z.infer; +type HolderTarget = + | { + kind: 'workflow'; + runId: string; + correlationId: string; + } + | { + kind: 'step'; + runId: string; + stepId: string; + } + | { + kind: 'opaque'; + }; + +export interface LocalLimitsOptions { + tag?: string; + queue?: Pick; 
+ storage?: Pick; +} + const EMPTY_STATE: LimitsState = { - version: 1, + version: 2, keys: {}, }; @@ -48,17 +87,26 @@ function cloneToken(token: LimitToken): LimitToken { return { ...token }; } +function cloneWaiter(waiter: LimitWaiter): LimitWaiter { + return { ...waiter }; +} + +function normalizeKeyState(keyState: KeyState): KeyState { + return { + key: keyState.key, + leases: keyState.leases.map((lease) => ({ ...lease })), + tokens: keyState.tokens.map(cloneToken), + waiters: keyState.waiters.map(cloneWaiter), + }; +} + function cloneState(state: LimitsState): LimitsState { return { - version: 1, + version: 2, keys: Object.fromEntries( Object.entries(state.keys).map(([key, keyState]) => [ key, - { - key: keyState.key, - leases: keyState.leases.map((lease) => ({ ...lease })), - tokens: keyState.tokens.map(cloneToken), - }, + normalizeKeyState(keyState), ]) ), }; @@ -72,6 +120,7 @@ function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { lease.expiresAt === undefined || lease.expiresAt.getTime() > now ), tokens: keyState.tokens.filter((token) => token.expiresAt.getTime() > now), + waiters: keyState.waiters.map(cloneWaiter), }; } @@ -113,14 +162,91 @@ function getRetryAfterMs( return Math.min(...candidates); } -export function createLimits(dataDir: string, tag?: string): Limits { - const statePath = getStatePath(dataDir, tag); +function createLease( + key: string, + holderId: string, + definition: LimitLease['definition'], + acquiredAt: Date, + leaseTtlMs?: number +): LimitLease { + return { + leaseId: `lmt_${monotonicUlid()}`, + key, + holderId, + acquiredAt, + expiresAt: + leaseTtlMs !== undefined + ? 
new Date(acquiredAt.getTime() + leaseTtlMs) + : undefined, + definition, + }; +} + +function insertToken( + keyState: KeyState, + holderId: string, + acquiredAt: Date, + periodMs: number +) { + keyState.tokens.push({ + tokenId: `lmttok_${monotonicUlid()}`, + holderId, + acquiredAt, + expiresAt: new Date(acquiredAt.getTime() + periodMs), + }); +} + +function parseHolderId(holderId: string): HolderTarget { + if (holderId.startsWith('wflock_')) { + const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); + if (runId && correlationId) { + return { kind: 'workflow', runId, correlationId }; + } + } + + if (holderId.startsWith('stplock_')) { + const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); + if (runId && stepId) { + return { kind: 'step', runId, stepId }; + } + } + + return { kind: 'opaque' }; +} + +function isTerminalRun(run: WorkflowRunWithoutData | undefined) { + return !run || ['completed', 'failed', 'cancelled'].includes(run.status); +} + +function isTerminalStep(step: StepWithoutData | undefined) { + return !step || ['completed', 'failed', 'cancelled'].includes(step.status); +} + +function toMillis(value: Date | undefined): number | undefined { + return value ? value.getTime() : undefined; +} + +function deleteEmptyKey(state: LimitsState, key: string) { + const keyState = state.keys[key]; + if (!keyState) return; + if ( + keyState.leases.length === 0 && + keyState.tokens.length === 0 && + keyState.waiters.length === 0 + ) { + delete state.keys[key]; + } +} + +export function createLimits( + dataDir: string, + tagOrOptions?: string | LocalLimitsOptions +): Limits { + const options = + typeof tagOrOptions === 'string' ? { tag: tagOrOptions } : tagOrOptions; + const statePath = getStatePath(dataDir, options?.tag); let stateOp = Promise.resolve(); - // This block is an in-process async mutex / operation queue. - // stateOp starts as an already-resolved promise. 
- // Each call to withStateLock() chains a new operation onto the tail of that promise. - // Because every new operation waits for the previous one, reads/modifies/writes to the limits state file happen serially. const withStateLock = async (fn: () => Promise): Promise => { const run = stateOp.then(fn, fn); stateOp = run.then( @@ -131,38 +257,205 @@ export function createLimits(dataDir: string, tag?: string): Limits { }; const readState = async (): Promise => { - return ( - (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE) - ); + const raw = + (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE); + + return cloneState(raw); }; const writeState = async (state: LimitsState): Promise => { await writeJSON(statePath, state, { overwrite: true }); }; + const getRun = async ( + runId: string + ): Promise => { + try { + return await options?.storage?.runs.get(runId, { resolveData: 'none' }); + } catch { + return undefined; + } + }; + + const getStep = async ( + runId: string, + stepId: string + ): Promise => { + try { + return await options?.storage?.steps.get(runId, stepId, { + resolveData: 'none', + }); + } catch { + return undefined; + } + }; + + const isHolderLive = async (holderId: string): Promise => { + const target = parseHolderId(holderId); + if (target.kind === 'opaque' || !options?.storage) { + return true; + } + + if (target.kind === 'workflow') { + const run = await getRun(target.runId); + return !isTerminalRun(run); + } + + const [run, step] = await Promise.all([ + getRun(target.runId), + getStep(target.runId, target.stepId), + ]); + return !isTerminalRun(run) && !isTerminalStep(step); + }; + + const queueWakeForHolder = async (holderId: string): Promise => { + const target = parseHolderId(holderId); + if (target.kind === 'opaque' || !options?.queue || !options?.storage) { + return; + } + + try { + if (target.kind === 'workflow') { + const run = await getRun(target.runId); + if (isTerminalRun(run) || !run) return; + 
+ await options.queue.queue( + `__wkf_workflow_${run.workflowName}`, + { + runId: target.runId, + requestedAt: new Date(), + }, + { + idempotencyKey: target.correlationId, + } + ); + return; + } + + const [run, step] = await Promise.all([ + getRun(target.runId), + getStep(target.runId, target.stepId), + ]); + if (isTerminalRun(run) || isTerminalStep(step) || !run || !step) return; + + await options.queue.queue( + `__wkf_step_${step.stepName}`, + { + workflowName: run.workflowName, + workflowRunId: target.runId, + workflowStartedAt: toMillis(run.startedAt) ?? Date.now(), + stepId: target.stepId, + requestedAt: new Date(), + }, + { + idempotencyKey: target.stepId, + } + ); + } catch (error) { + console.warn('[world-local] Failed to queue lock wake-up', error); + } + }; + + const promoteWaiters = async ( + key: string, + keyState: KeyState + ): Promise<{ keyState: KeyState; wakeHolders: string[] }> => { + const wakeHolders: string[] = []; + const promotedKeyState = pruneKeyState(keyState); + const remainingWaiters: LimitWaiter[] = []; + let activeLeases = promotedKeyState.leases.length; + let activeTokens = promotedKeyState.tokens.length; + + for (let index = 0; index < promotedKeyState.waiters.length; index++) { + const waiter = promotedKeyState.waiters[index]; + + if (!(await isHolderLive(waiter.holderId))) { + continue; + } + + const concurrencyBlocked = + waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; + const rateBlocked = + waiter.rateCount !== null && activeTokens >= waiter.rateCount; + + if (concurrencyBlocked || rateBlocked) { + remainingWaiters.push( + waiter, + ...promotedKeyState.waiters.slice(index + 1) + ); + promotedKeyState.waiters = remainingWaiters; + return { keyState: promotedKeyState, wakeHolders }; + } + + const acquiredAt = new Date(); + const definition = { + concurrency: + waiter.concurrencyMax !== null + ? 
{ max: waiter.concurrencyMax } + : undefined, + rate: + waiter.rateCount !== null && waiter.ratePeriodMs !== null + ? { + count: waiter.rateCount, + periodMs: waiter.ratePeriodMs, + } + : undefined, + }; + + promotedKeyState.leases.push( + createLease( + key, + waiter.holderId, + definition, + acquiredAt, + waiter.leaseTtlMs + ) + ); + activeLeases += 1; + + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + insertToken( + promotedKeyState, + waiter.holderId, + acquiredAt, + waiter.ratePeriodMs + ); + activeTokens += 1; + } + + wakeHolders.push(waiter.holderId); + } + + promotedKeyState.waiters = remainingWaiters; + return { keyState: promotedKeyState, wakeHolders }; + }; + return { async acquire(request) { const parsed = LimitAcquireRequestSchema.parse(request); return withStateLock(async (): Promise => { const state = cloneState(await readState()); - const now = new Date(); - const nowMs = now.getTime(); - const keyState = pruneKeyState( + const baseKeyState = pruneKeyState( state.keys[parsed.key] ?? 
{ key: parsed.key, leases: [], tokens: [], - }, - nowMs + waiters: [], + } + ); + const { keyState, wakeHolders } = await promoteWaiters( + parsed.key, + baseKeyState ); + state.keys[parsed.key] = keyState; const existingLease = keyState.leases.find( (lease) => lease.holderId === parsed.holderId ); if (existingLease) { - state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', lease: existingLease, @@ -175,47 +468,66 @@ export function createLimits(dataDir: string, tag?: string): Limits { const rateBlocked = parsed.definition.rate !== undefined && keyState.tokens.length >= parsed.definition.rate.count; + const existingWaiter = keyState.waiters.find( + (waiter) => waiter.holderId === parsed.holderId + ); + + if ( + existingWaiter || + concurrencyBlocked || + rateBlocked || + keyState.waiters.length > 0 + ) { + if (!existingWaiter) { + keyState.waiters.push({ + waiterId: `lmtwait_${monotonicUlid()}`, + holderId: parsed.holderId, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? null, + }); + } - if (concurrencyBlocked || rateBlocked) { state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'blocked', reason: getBlockedReason(concurrencyBlocked, rateBlocked), retryAfterMs: getRetryAfterMs( keyState, - nowMs, + Date.now(), concurrencyBlocked, rateBlocked ), }; } - const lease: LimitLease = { - leaseId: `lmt_${monotonicUlid()}`, - key: parsed.key, - holderId: parsed.holderId, - acquiredAt: now, - expiresAt: - parsed.leaseTtlMs !== undefined - ? 
new Date(nowMs + parsed.leaseTtlMs) - : undefined, - definition: parsed.definition, - }; + const acquiredAt = new Date(); + const lease = createLease( + parsed.key, + parsed.holderId, + parsed.definition, + acquiredAt, + parsed.leaseTtlMs + ); keyState.leases.push(lease); if (parsed.definition.rate) { - keyState.tokens.push({ - tokenId: `lmttok_${monotonicUlid()}`, - holderId: parsed.holderId, - acquiredAt: now, - expiresAt: new Date(nowMs + parsed.definition.rate.periodMs), - }); + insertToken( + keyState, + parsed.holderId, + acquiredAt, + parsed.definition.rate.periodMs + ); } state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', @@ -229,10 +541,12 @@ export function createLimits(dataDir: string, tag?: string): Limits { await withStateLock(async () => { const state = cloneState(await readState()); + const wakeHolders: string[] = []; for (const [key, keyStateValue] of Object.entries(state.keys)) { const keyState = pruneKeyState(keyStateValue); - const nextLeases = keyState.leases.filter((lease) => { + const beforeLeases = keyState.leases.length; + keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; if (parsed.key && lease.key !== parsed.key) return true; if (parsed.holderId && lease.holderId !== parsed.holderId) { @@ -241,20 +555,19 @@ export function createLimits(dataDir: string, tag?: string): Limits { return false; }); - state.keys[key] = { - ...keyState, - leases: nextLeases, - }; - - if ( - state.keys[key].leases.length === 0 && - state.keys[key].tokens.length === 0 - ) { - delete state.keys[key]; + if (keyState.leases.length !== beforeLeases) { + const promoted = await promoteWaiters(key, keyState); + state.keys[key] = promoted.keyState; + wakeHolders.push(...promoted.wakeHolders); + } else { + state.keys[key] = keyState; } + + deleteEmptyKey(state, key); } await writeState(state); + await 
Promise.all(wakeHolders.map(queueWakeForHolder)); }); }, diff --git a/packages/world-local/src/queue.test.ts b/packages/world-local/src/queue.test.ts index 32c8d1f834..f07677fe49 100644 --- a/packages/world-local/src/queue.test.ts +++ b/packages/world-local/src/queue.test.ts @@ -2,11 +2,6 @@ import type { StepInvokePayload } from '@workflow/world'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { createQueue } from './queue'; -// Mock node:timers/promises so setTimeout resolves immediately -vi.mock('node:timers/promises', () => ({ - setTimeout: vi.fn().mockResolvedValue(undefined), -})); - const stepPayload: StepInvokePayload = { workflowName: 'test-workflow', workflowRunId: 'run_01ABC', @@ -18,11 +13,13 @@ describe('queue timeout re-enqueue', () => { let localQueue: ReturnType; beforeEach(() => { + vi.useFakeTimers(); localQueue = createQueue({ baseUrl: 'http://localhost:3000' }); }); afterEach(async () => { await localQueue.close(); + vi.useRealTimers(); }); it('createQueueHandler returns 200 with timeoutSeconds in the body', async () => { @@ -72,29 +69,6 @@ describe('queue timeout re-enqueue', () => { expect(body).toEqual({ ok: true }); }); - it('createQueueHandler returns 200 with timeoutSeconds: 0', async () => { - const handler = localQueue.createQueueHandler('__wkf_step_', async () => ({ - timeoutSeconds: 0, - })); - - const req = new Request('http://localhost/step', { - method: 'POST', - headers: { - 'content-type': 'application/json', - 'x-vqs-queue-name': '__wkf_step_test', - 'x-vqs-message-id': 'msg_01ABC', - 'x-vqs-message-attempt': '1', - }, - body: JSON.stringify(stepPayload), - }); - - const response = await handler(req); - expect(response.status).toBe(200); - - const body = await response.json(); - expect(body).toEqual({ timeoutSeconds: 0 }); - }); - it('queue retries when handler returns timeoutSeconds > 0', async () => { let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { 
@@ -102,25 +76,18 @@ describe('queue timeout re-enqueue', () => { if (callCount < 3) { return { timeoutSeconds: 5 }; } - // Third call succeeds normally return undefined; }); localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); - // Wait for the async queue processing to complete - // The queue fires off processing asynchronously, so we need to wait - await vi.waitFor(() => { - expect(callCount).toBe(3); - }); + expect(callCount).toBe(3); }); it('queue retries immediately when handler returns timeoutSeconds: 0', async () => { - const { setTimeout: mockSetTimeout } = await import('node:timers/promises'); - vi.mocked(mockSetTimeout).mockClear(); - let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { callCount++; @@ -133,12 +100,37 @@ describe('queue timeout re-enqueue', () => { localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); - await vi.waitFor(() => { - expect(callCount).toBe(3); + expect(callCount).toBe(3); + }); + + it('replaces delayed idempotent deliveries with an immediate wake-up', async () => { + const seenStepIds: string[] = []; + const handler = localQueue.createQueueHandler( + '__wkf_step_', + async (body) => { + seenStepIds.push((body as StepInvokePayload).stepId); + return undefined; + } + ); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + idempotencyKey: 'step_01ABC', + delaySeconds: 30, }); + await localQueue.queue( + '__wkf_step_test' as any, + { ...stepPayload, stepId: 'step_replacement' }, + { + idempotencyKey: 'step_01ABC', + } + ); + + await vi.runAllTimersAsync(); - // setTimeout should NOT have been called for timeoutSeconds: 0 - expect(mockSetTimeout).not.toHaveBeenCalled(); + expect(seenStepIds).toEqual(['step_replacement']); }); }); diff 
--git a/packages/world-local/src/queue.ts b/packages/world-local/src/queue.ts index fd3b511509..c356730daf 100644 --- a/packages/world-local/src/queue.ts +++ b/packages/world-local/src/queue.ts @@ -1,4 +1,3 @@ -import { setTimeout } from 'node:timers/promises'; import { JsonTransport } from '@vercel/queue'; import { MessageId, type Queue, ValidQueueName } from '@workflow/world'; import { Sema } from 'async-sema'; @@ -9,20 +8,10 @@ import type { Config } from './config.js'; import { resolveBaseUrl } from './config.js'; import { getPackageInfo } from './init.js'; -// For local queue, there is no technical limit on the message visibility lifespan, -// but the environment variable can be used for testing purposes to set a max visibility limit. const LOCAL_QUEUE_MAX_VISIBILITY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_MAX_VISIBILITY ?? '0', 10) || Infinity; -// Maximum safe delay for setTimeout in Node.js (2^31 - 1 milliseconds ≈ 24.85 days) -// Larger values cause "TimeoutOverflowWarning: X does not fit into a 32-bit signed integer" -// When the clamped timeout fires, the handler will recalculate remaining time from -// persistent state and return another timeoutSeconds if needed. -const MAX_SAFE_TIMEOUT_MS = 2147483647; - -// The local workers share the same Node.js process and event loop, -// so we need to limit concurrency to avoid overwhelming the system. const DEFAULT_CONCURRENCY_LIMIT = 1000; const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_CONCURRENCY ?? '0', 10) || @@ -31,15 +20,27 @@ const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = export type DirectHandler = (req: Request) => Promise; export type LocalQueue = Queue & { - /** Close the HTTP agent and release resources. */ close(): Promise; - /** Register a direct in-process handler for a queue prefix, bypassing HTTP. 
*/ registerHandler( prefix: '__wkf_step_' | '__wkf_workflow_', handler: DirectHandler ): void; }; +type ScheduledMessage = { + attempt: number; + body: Uint8Array; + headers?: Record; + idempotencyKey?: string; + messageId: MessageId; + pendingExecution: boolean; + queueName: ValidQueueName; + remainingServerRetries: number; + running: boolean; + timer?: ReturnType; + version: number; +}; + function getQueueRoute(queueName: ValidQueueName): { pathname: 'flow' | 'step'; prefix: '__wkf_step_' | '__wkf_workflow_'; @@ -54,11 +55,6 @@ function getQueueRoute(queueName: ValidQueueName): { } export function createQueue(config: Partial): LocalQueue { - // Create a custom agent optimized for high-concurrency local workflows: - // - headersTimeout: 0 allows long-running steps - // - connections: 1000 allows many parallel connections to the same host - // - pipelining: 1 (default) for HTTP/1.1 compatibility - // - keepAliveTimeout: 30s keeps connections warm for rapid step execution const httpAgent = new Agent({ headersTimeout: 0, connections: 1000, @@ -67,139 +63,240 @@ export function createQueue(config: Partial): LocalQueue { const transport = new JsonTransport(); const generateId = monotonicFactory(); const semaphore = new Sema(WORKFLOW_LOCAL_QUEUE_CONCURRENCY); - - /** - * holds inflight messages by idempotency key to ensure - * that we don't queue the same message multiple times - */ - const inflightMessages = new Map(); - /** Direct in-process handlers by queue prefix, bypassing HTTP when set. 
*/ + const scheduledMessages = new Map(); const directHandlers = new Map(); + let closed = false; - const queue: Queue['queue'] = async (queueName, message, opts) => { - const cleanup = [] as (() => void)[]; + const cleanupMessage = (message: ScheduledMessage) => { + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } + if (message.idempotencyKey) { + scheduledMessages.delete(message.idempotencyKey); + } + }; - if (opts?.idempotencyKey) { - const existing = inflightMessages.get(opts.idempotencyKey); - if (existing) { - return { messageId: existing }; - } + const scheduleExecution = (message: ScheduledMessage, delayMs: number) => { + if (closed) { + cleanupMessage(message); + return; } - const body = transport.serialize(message); - const { pathname, prefix } = getQueueRoute(queueName); - const messageId = MessageId.parse(`msg_${generateId()}`); + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } - if (opts?.idempotencyKey) { - const key = opts.idempotencyKey; - inflightMessages.set(key, messageId); - cleanup.push(() => { - inflightMessages.delete(key); - }); + const version = ++message.version; + const enqueueRun = () => { + message.pendingExecution = true; + if (!message.running) { + void executeMessage(message); + } + }; + + if (delayMs <= 0) { + enqueueRun(); + return; } - (async () => { - const token = semaphore.tryAcquire(); - if (!token) { - console.warn( - `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` - ); - await semaphore.acquire(); + message.timer = globalThis.setTimeout(() => { + if (message.version !== version || closed) { + return; } + message.timer = undefined; + enqueueRun(); + }, delayMs); + }; + + const deliverMessage = async ( + message: ScheduledMessage + ): Promise< + | { kind: 'success' } + | { kind: 'timeout'; delayMs: number } + | { kind: 'server_error'; status: number; text: string } + > => { + const { pathname, 
prefix } = getQueueRoute(message.queueName); + const headers: Record = { + ...message.headers, + 'content-type': 'application/json', + 'x-vqs-queue-name': message.queueName, + 'x-vqs-message-id': message.messageId, + 'x-vqs-message-attempt': String(message.attempt + 1), + }; + const directHandler = directHandlers.get(prefix); + let response: Response; + + if (directHandler) { + const req = new Request( + `http://localhost/.well-known/workflow/v1/${pathname}`, + { + method: 'POST', + headers, + body: message.body, + } + ); + response = await directHandler(req); + } else { + const baseUrl = await resolveBaseUrl(config); + response = await fetch(`${baseUrl}/.well-known/workflow/v1/${pathname}`, { + method: 'POST', + duplex: 'half', + dispatcher: httpAgent, + headers, + body: message.body, + } as any); + } + + const text = await response.text(); + + if (response.ok) { try { - const maxAttempts = 3; - let defaultRetriesLeft = maxAttempts; - for (let attempt = 0; defaultRetriesLeft > 0; attempt++) { - defaultRetriesLeft--; - - const headers: Record = { - ...opts?.headers, - 'content-type': 'application/json', - 'x-vqs-queue-name': queueName, - 'x-vqs-message-id': messageId, - 'x-vqs-message-attempt': String(attempt + 1), + const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); + if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { + return { + kind: 'timeout', + delayMs: timeoutSeconds > 0 ? 
timeoutSeconds * 1000 : 0, }; - const directHandler = directHandlers.get(prefix); - let response: Response; - - if (directHandler) { - const req = new Request( - `http://localhost/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - headers, - body, - } - ); - response = await directHandler(req); - } else { - const baseUrl = await resolveBaseUrl(config); - // eslint-disable-next-line @typescript-eslint/no-explicit-any -- undici v7 dispatcher types don't match @types/node's RequestInit - response = await fetch( - `${baseUrl}/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - duplex: 'half', - dispatcher: httpAgent, - headers, - body, - } as any - ); + } + } catch {} + + return { kind: 'success' }; + } + + return { + kind: 'server_error', + status: response.status, + text, + }; + }; + + const executeMessage = async (message: ScheduledMessage): Promise => { + if (closed || message.running) { + return; + } + + message.running = true; + + try { + while (message.pendingExecution && !closed) { + message.pendingExecution = false; + const version = message.version; + const token = semaphore.tryAcquire(); + if (!token) { + console.warn( + `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` + ); + await semaphore.acquire(); + } + + try { + if (closed) { + cleanupMessage(message); + return; + } + + if (version !== message.version) { + continue; } - const text = await response.text(); - - if (response.ok) { - try { - const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); - if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { - // Clamp to MAX_SAFE_TIMEOUT_MS to avoid Node.js setTimeout overflow warning. - // When this fires early, the handler recalculates remaining time from - // persistent state and returns another timeoutSeconds if needed. 
- if (timeoutSeconds > 0) { - const timeoutMs = Math.min( - timeoutSeconds * 1000, - MAX_SAFE_TIMEOUT_MS - ); - await setTimeout(timeoutMs); - } - defaultRetriesLeft++; - continue; - } - } catch {} + const result = await deliverMessage(message); + + if (result.kind === 'success') { + cleanupMessage(message); return; } + if (result.kind === 'timeout') { + message.attempt += 1; + scheduleExecution( + message, + result.delayMs === 0 + ? 0 + : Math.min(result.delayMs, LOCAL_QUEUE_MAX_VISIBILITY * 1000) + ); + continue; + } + console.error( - `[world-local] Queue message failed (attempt ${attempt + 1}/${maxAttempts}, status ${response.status}): ${text}`, - { queueName, messageId } + `[world-local] Queue message failed (attempt ${ + message.attempt + 1 + }/3, status ${result.status}): ${result.text}`, + { queueName: message.queueName, messageId: message.messageId } ); + + message.attempt += 1; + message.remainingServerRetries -= 1; + if (message.remainingServerRetries > 0) { + scheduleExecution(message, 0); + continue; + } + + console.error(`[world-local] Queue message exhausted all retries`, { + queueName: message.queueName, + messageId: message.messageId, + }); + cleanupMessage(message); + return; + } finally { + semaphore.release(); } + } + } catch (err) { + const queueError = err as { name?: string }; + const isAbortError = + queueError.name === 'AbortError' || + queueError.name === 'ResponseAborted'; + if (!isAbortError) { + console.error('[local world] Queue operation failed:', err); + } + cleanupMessage(message); + } finally { + message.running = false; + if (message.pendingExecution && !closed) { + void executeMessage(message); + } + } + }; - console.error(`[world-local] Queue message exhausted all retries`, { - queueName, - messageId, - }); - } finally { - semaphore.release(); + const queue: Queue['queue'] = async (queueName, message, opts) => { + const body = transport.serialize(message); + const delayMs = + typeof opts?.delaySeconds === 'number' && 
opts.delaySeconds > 0 + ? opts.delaySeconds * 1000 + : 0; + + if (opts?.idempotencyKey) { + const existing = scheduledMessages.get(opts.idempotencyKey); + if (existing) { + existing.queueName = queueName; + existing.body = body; + existing.headers = opts.headers; + scheduleExecution(existing, delayMs); + return { messageId: existing.messageId }; } - })() - .catch((err) => { - // Silently ignore client disconnect errors (e.g., browser refresh during streaming) - // These are expected and should not cause unhandled rejection warnings - const isAbortError = - err?.name === 'AbortError' || err?.name === 'ResponseAborted'; - if (!isAbortError) { - console.error('[local world] Queue operation failed:', err); - } - }) - .finally(() => { - for (const fn of cleanup) { - fn(); - } - }); + } + + const scheduledMessage: ScheduledMessage = { + attempt: 0, + body, + headers: opts?.headers, + idempotencyKey: opts?.idempotencyKey, + messageId: MessageId.parse(`msg_${generateId()}`), + pendingExecution: false, + queueName, + remainingServerRetries: 3, + running: false, + version: 0, + }; - return { messageId }; + if (opts?.idempotencyKey) { + scheduledMessages.set(opts.idempotencyKey, scheduledMessage); + } + + scheduleExecution(scheduledMessage, delayMs); + return { messageId: scheduledMessage.messageId }; }; const HeaderParser = z.object({ @@ -270,6 +367,11 @@ export function createQueue(config: Partial): LocalQueue { directHandlers.set(prefix, handler); }, async close() { + closed = true; + for (const message of scheduledMessages.values()) { + cleanupMessage(message); + } + scheduledMessages.clear(); await httpAgent.close(); }, }; diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index 7e2888f69f..a96cf3b680 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -117,7 +117,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - **Durable Storage**: Stores workflow runs, events, 
steps, hooks, and webhooks in PostgreSQL - **Queue Processing**: Uses graphile-worker as the durable queue and executes jobs over the workflow HTTP routes - **Durable Delays**: Re-schedules waits and retries in PostgreSQL -- **Flow Limits**: Enforces durable concurrency/rate limits with PostgreSQL-backed leases, rate tokens, and waiter promotion +- **Flow Limits**: Implements the shared concurrency/rate-limit contract with PostgreSQL-backed leases, rate tokens, FIFO waiters, and prompt wake-ups - **Streaming**: Real-time event streaming capabilities - **Health Checks**: Built-in connection health monitoring - **Configurable Concurrency**: Adjustable worker concurrency for queue processing @@ -129,9 +129,12 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling - Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO order per key +- Cancelled workflow and failed/completed step waiters are pruned before promotion - Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` +PostgreSQL's main advantage over the local world is durability of the queue/backlog itself across host or process loss. The flow-limit behavior is intended to match other implemented worlds while the process is alive. 
+ ## Development For local development, you can use the included Docker Compose configuration: diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 35358b9f15..5d8e1a74f6 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,17 +1,12 @@ -import { asc, eq } from 'drizzle-orm'; -import { WorkflowWorldError } from '@workflow/errors'; -import { - afterAll, - beforeAll, - beforeEach, - describe, - expect, - it, - test, -} from 'vitest'; +import { afterAll, beforeAll, beforeEach, test } from 'vitest'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; -import * as Schema from './drizzle/schema.js'; import { createLimits } from './limits.js'; +import { + createEventsStorage, + createRunsStorage, + createStepsStorage, +} from './storage.js'; +import { createQueue } from './queue.js'; if (process.platform === 'win32') { test.skip('skipped on Windows since it relies on a docker container', () => {}); @@ -19,10 +14,16 @@ if (process.platform === 'win32') { let db: Awaited< ReturnType >; + let queue: ReturnType; beforeAll(async () => { const { createPostgresTestDb } = await import('../test/test-db.js'); db = await createPostgresTestDb(); + queue = createQueue( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.sql + ); + await queue.start(); }, 120_000); beforeEach(async () => { @@ -30,7 +31,8 @@ if (process.platform === 'win32') { }); afterAll(async () => { - await db.close(); + await queue?.close(); + await db?.close(); }); createLimitsContractSuite('postgres world limits', async () => { @@ -39,281 +41,11 @@ if (process.platform === 'win32') { { connectionString: db.connectionString, queueConcurrency: 1 }, db.drizzle ), + storage: { + runs: createRunsStorage(db.drizzle), + steps: createStepsStorage(db.drizzle), + events: createEventsStorage(db.drizzle), + }, }; }); - - describe('postgres waiter promotion', () 
=> { - it('throws WorkflowWorldError when heartbeating a missing lease', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await expect( - limits.heartbeat({ - leaseId: 'lmt_missing', - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - }); - - it('serializes concurrent acquires for the same key', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - const results = await Promise.all( - Array.from({ length: 12 }, (_, index) => - limits.acquire({ - key: 'workflow:user:concurrent', - holderId: `holder-${index}`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }) - ) - ); - - const acquired = results.filter((result) => result.status === 'acquired'); - const blocked = results.filter((result) => result.status === 'blocked'); - - expect(acquired).toHaveLength(1); - expect(blocked).toHaveLength(11); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, 'workflow:user:concurrent')); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, 'workflow:user:concurrent')); - - expect(leases).toHaveLength(1); - expect(waiters).toHaveLength(11); - }); - - it('promotes the earliest waiter on release', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - const first = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') throw new Error('expected acquisition'); - - const second = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 
'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const third = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('blocked'); - expect(third.status).toBe('blocked'); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy( - asc(Schema.limitLeases.acquiredAt), - asc(Schema.limitLeases.leaseId) - ); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy( - asc(Schema.limitWaiters.createdAt), - asc(Schema.limitWaiters.waiterId) - ); - - expect(leases).toEqual([{ holderId: 'holder-b' }]); - expect(waiters).toEqual([{ holderId: 'holder-c' }]); - - const stillWaiting = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - expect(stillWaiting.status).toBe('blocked'); - }); - - it('skips cancelled workflow waiters before promotion', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await db.drizzle.insert(Schema.runs).values([ - { - runId: 'wrun_dead_workflow', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'cancelled', - }, - ]); - - const first = await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-a', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') 
throw new Error('expected acquisition'); - - await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'wflock_wrun_dead_workflow:limitwait_dead', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-live', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitLeases.acquiredAt)); - const tokens = await db.drizzle - .select({ holderId: Schema.limitTokens.holderId }) - .from(Schema.limitTokens) - .where(eq(Schema.limitTokens.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitTokens.acquiredAt)); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitWaiters.createdAt)); - - expect(leases).toEqual([{ holderId: 'holder-live' }]); - expect(tokens).toEqual([ - { holderId: first.lease.holderId }, - { holderId: 'holder-live' }, - ]); - expect(waiters).toEqual([]); - }); - - it('skips failed step waiters before promotion', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await db.drizzle.insert(Schema.runs).values([ - { - runId: 'wrun_dead_step', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'running', - startedAt: new Date(), - }, - { - runId: 'wrun_live_step', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'running', - startedAt: new 
Date(), - }, - ]); - await db.drizzle.insert(Schema.steps).values([ - { - runId: 'wrun_dead_step', - stepId: 'step_dead', - stepName: 'test-step', - status: 'failed', - attempt: 1, - }, - { - runId: 'wrun_live_step', - stepId: 'step_live', - stepName: 'test-step', - status: 'pending', - attempt: 0, - }, - ]); - - const first = await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') throw new Error('expected acquisition'); - - await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'stplock_wrun_dead_step:step_dead:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'holder-live', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitLeases.acquiredAt)); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitWaiters.createdAt)); - - expect(leases).toEqual([{ holderId: 'holder-live' }]); - expect(waiters).toEqual([]); - }); - }); } diff --git a/packages/world-testing/src/index.mts b/packages/world-testing/src/index.mts index 4b59e15267..db42585942 100644 --- a/packages/world-testing/src/index.mts +++ b/packages/world-testing/src/index.mts @@ -2,6 +2,8 @@ import { addition } from './addition.mjs'; import { errors } from './errors.mjs'; import { hooks } from './hooks.mjs'; import { idempotency } from 
'./idempotency.mjs'; +export { createLimitsContractSuite } from './limits-contract.js'; +export { createLimitsRuntimeSuite } from './limits-runtime.js'; import { nullByte } from './null-byte.mjs'; export function createTestSuite(pkgName: string) { diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 5037039e83..2a65750181 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -1,17 +1,77 @@ import { setTimeout as sleep } from 'node:timers/promises'; -import type { Limits } from '@workflow/world'; +import { + SPEC_VERSION_CURRENT, + type Limits, + type Storage, +} from '@workflow/world'; import { describe, expect, it } from 'vitest'; export interface LimitsHarness { limits: Limits; + storage?: Pick; close?: () => Promise; } +async function createRun( + storage: Pick, + workflowName: string +) { + const result = await storage.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName, + input: [], + }, + }); + if (!result.run) { + throw new Error('expected run'); + } + return result.run; +} + +async function createStep( + storage: Pick, + runId: string, + stepId: string +) { + const result = await storage.events.create(runId, { + eventType: 'step_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + stepName: 'test-step', + input: [], + }, + }); + if (!result.step) { + throw new Error('expected step'); + } + return result.step; +} + export function createLimitsContractSuite( name: string, createHarness: () => Promise ) { describe(name, () => { + it('throws a workflow world error when heartbeating a missing lease', async () => { + const harness = await createHarness(); + try { + await expect( + harness.limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toMatchObject({ + name: 'WorkflowWorldError', + message: 
expect.stringContaining('not found'), + }); + } finally { + await harness.close?.(); + } + }); + it('enforces per-key concurrency limits', async () => { const harness = await createHarness(); try { @@ -54,13 +114,40 @@ export function createLimitsContractSuite( } }); - it('returns a retry path when rate limits block acquisition', async () => { + it('serializes concurrent acquires for the same key', async () => { const harness = await createHarness(); try { + const results = await Promise.all( + Array.from({ length: 12 }, (_, index) => + harness.limits.acquire({ + key: 'workflow:user:concurrent', + holderId: `holder-${index}`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }) + ) + ); + + const acquired = results.filter( + (result) => result.status === 'acquired' + ); + const blocked = results.filter((result) => result.status === 'blocked'); + + expect(acquired).toHaveLength(1); + expect(blocked).toHaveLength(11); + } finally { + await harness.close?.(); + } + }); + + it('keeps rate capacity consumed until the window expires', async () => { + const harness = await createHarness(); + try { + const periodMs = 200; const first = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-a', - definition: { rate: { count: 1, periodMs: 100 } }, + definition: { rate: { count: 1, periodMs } }, leaseTtlMs: 1_000, }); expect(first.status).toBe('acquired'); @@ -76,13 +163,31 @@ export function createLimitsContractSuite( const second = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-b', - definition: { rate: { count: 1, periodMs: 100 } }, + definition: { rate: { count: 1, periodMs } }, leaseTtlMs: 1_000, }); expect(second.status).toBe('blocked'); if (second.status !== 'blocked') throw new Error('expected blocked'); expect(second.reason).toBe('rate'); expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + + let third = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-c', + 
definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + const deadline = Date.now() + periodMs + 1_000; + while (third.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, third.retryAfterMs) + 50); + third = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-c', + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + } + expect(third.status).toBe('acquired'); } finally { await harness.close?.(); } @@ -187,5 +292,226 @@ export function createLimitsContractSuite( await harness.close?.(); } }); + + it('promotes waiters in FIFO order per key', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const third = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + expect(third.status).toBe('blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const stillWaiting = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(promoted.status).toBe('acquired'); + 
expect(stillWaiting.status).toBe('blocked'); + if (promoted.status !== 'acquired') + throw new Error('expected waiter-b promotion'); + + await harness.limits.release({ + leaseId: promoted.lease.leaseId, + holderId: promoted.lease.holderId, + key: promoted.lease.key, + }); + + const thirdPromoted = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(thirdPromoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('skips cancelled workflow waiters before promotion', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow waiter liveness'); + } + + const deadRun = await createRun(harness.storage, 'dead-workflow'); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_cancelled', + specVersion: SPEC_VERSION_CURRENT, + }); + + const liveRun = await createRun(harness.storage, 'live-workflow'); + await harness.storage.events.create(liveRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + + const first = await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${deadRun.runId}:limitwait_dead`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${liveRun.runId}:limitwait_live`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 
5_000, + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${liveRun.runId}:limitwait_live`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('skips failed step waiters before promotion', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for step waiter liveness'); + } + + const deadRun = await createRun(harness.storage, 'dead-step-workflow'); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const deadStep = await createStep( + harness.storage, + deadRun.runId, + 'step-dead' + ); + await harness.storage.events.create(deadRun.runId, { + eventType: 'step_started', + specVersion: SPEC_VERSION_CURRENT, + correlationId: deadStep.stepId, + }); + await harness.storage.events.create(deadRun.runId, { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: deadStep.stepId, + eventData: { + error: { name: 'Error', message: 'failed waiter' }, + }, + } as any); + + const liveRun = await createRun(harness.storage, 'live-step-workflow'); + await harness.storage.events.create(liveRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const liveStep = await createStep( + harness.storage, + liveRun.runId, + 'step-live' + ); + + const first = await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.acquire({ + key: 
'step:skip-dead-step', + holderId: `stplock_${deadRun.runId}:${deadStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); }); } diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts new file mode 100644 index 0000000000..60bfe5789d --- /dev/null +++ b/packages/world-testing/src/limits-runtime.ts @@ -0,0 +1,218 @@ +import { describe, expect, it } from 'vitest'; + +type WorkflowLockContentionResult = { + workflowLockAcquiredAt: number; + workflowLockReleasedAt: number; + stepLockAcquiredAt: number; + stepLockReleasedAt: number; +}; + +type StepLockNoRetriesResult = { + label: string; + attempt: number; + acquiredAt: number; + releasedAt: number; +}; + +type WorkflowOnlyLockResult = { + label: string; + workflowLockAcquiredAt: number; + workflowLockReleasedAt: number; +}; + +type WorkflowRateLimitResult = { + label: string; + workflowRateAcquiredAt: number; + workflowRateReleasedAt: number; + periodMs: number; +}; + +export interface LimitsRuntimeHarness { + runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ + workflowKey: string; + dbKey: string; + aiKey: string; + summary: string; + }>; + runWorkflowLockContention( + userId: string, + holdMs: number + ): Promise<[WorkflowLockContentionResult, WorkflowLockContentionResult]>; + 
runStepLockNoRetriesContention( + userId: string, + holdMs: number + ): Promise< + [StepLockNoRetriesResult, StepLockNoRetriesResult, StepLockNoRetriesResult] + >; + runWorkflowLockAcrossSuspension( + userId: string, + holdMs: number + ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; + runWorkflowRateLimitContention( + userId: string, + holdMs: number, + periodMs: number + ): Promise<[WorkflowRateLimitResult, WorkflowRateLimitResult]>; + runWorkflowFifoThreeWaiters( + userId: string, + holdMs: number + ): Promise< + [WorkflowOnlyLockResult, WorkflowOnlyLockResult, WorkflowOnlyLockResult] + >; + runCancelledWorkflowWaiter( + userId: string, + holdMs: number + ): Promise<{ + cancelledError: unknown; + resultA: WorkflowOnlyLockResult; + resultC: WorkflowOnlyLockResult; + }>; + runIndependentWorkflowKeys( + holdMs: number + ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; +} + +export function createLimitsRuntimeSuite( + name: string, + createHarness: () => Promise +) { + describe(name, () => { + it('runs workflow and step locks end-to-end', async () => { + const harness = await createHarness(); + const userId = 'shared-user'; + const result = await harness.runWorkflowWithWorkflowAndStepLocks(userId); + + expect(result).toMatchObject({ + workflowKey: `workflow:user:${userId}`, + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: `summary:profile:${userId}`, + }); + }); + + it('serializes workflow and step admission under contention', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runWorkflowLockContention( + 'shared-user', + 750 + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepLockReleasedAt + ); + }); + + it('wakes promoted workflow and step waiters promptly', async () => { + const harness = await createHarness(); + const [resultA, resultB] = 
await harness.runWorkflowLockContention( + 'shared-user', + 1_500 + ); + + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + expect( + resultB.stepLockAcquiredAt - resultA.stepLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('does not consume retries while blocked on a top-of-step lock', async () => { + const harness = await createHarness(); + const [resultA, resultB, resultC] = + await harness.runStepLockNoRetriesContention('shared-user', 750); + const [firstResult, secondResult, thirdResult] = [ + resultA, + resultB, + resultC, + ].sort((left, right) => left.acquiredAt - right.acquiredAt); + + expect(resultA.attempt).toBe(1); + expect(resultB.attempt).toBe(1); + expect(resultC.attempt).toBe(1); + expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( + firstResult.releasedAt + ); + expect(thirdResult.acquiredAt).toBeGreaterThanOrEqual( + secondResult.releasedAt + ); + }); + + it('keeps workflow locks held across suspension until the workflow finishes', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runWorkflowLockAcrossSuspension( + 'shared-user', + 1_500 + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('wakes rate-limited waiters only after the rate window expires', async () => { + const harness = await createHarness(); + const holdMs = 250; + const periodMs = 1_500; + const [resultA, resultB] = await harness.runWorkflowRateLimitContention( + 'shared-user', + holdMs, + periodMs + ); + + expect( + resultB.workflowRateAcquiredAt - resultA.workflowRateAcquiredAt + ).toBeGreaterThanOrEqual(periodMs - 100); + + const remainingWindowAfterRelease = + periodMs - + (resultA.workflowRateReleasedAt - resultA.workflowRateAcquiredAt); + expect( + resultB.workflowRateAcquiredAt - 
resultA.workflowRateReleasedAt + ).toBeGreaterThanOrEqual(Math.max(0, remainingWindowAfterRelease - 100)); + }); + + it('promotes 3 workflow waiters in FIFO order', async () => { + const harness = await createHarness(); + const [resultA, resultB, resultC] = + await harness.runWorkflowFifoThreeWaiters('shared-user', 750); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultB.workflowLockReleasedAt + ); + }); + + it('skips cancelled workflow waiters before promoting the next run', async () => { + const harness = await createHarness(); + const { cancelledError, resultA, resultC } = + await harness.runCancelledWorkflowWaiter('shared-user', 1_500); + + expect(cancelledError).toBeTruthy(); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('does not block unrelated workflow keys', async () => { + const harness = await createHarness(); + const [resultA, resultB] = + await harness.runIndependentWorkflowKeys(1_000); + + expect(resultB.workflowLockAcquiredAt).toBeLessThan( + resultA.workflowLockReleasedAt + ); + }); + }); +} diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 8306576d8a..565e258d71 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -7,12 +7,13 @@ implementations. ## Status - The shared `limits` interface and `lock()` API surface now exist. -- Local world has a working lease-based implementation for - acquire/release/heartbeat. -- Postgres now has a PostgreSQL-backed implementation with leases, rate tokens, - and durable waiters. +- Local world now implements the shared live-process limits semantics with + leases, rate tokens, FIFO waiters, and prompt wake-up with delayed fallback. 
+- Postgres implements the same limits semantics with PostgreSQL-backed leases, + rate tokens, durable waiters, and durable queue wake-up. - Vercel still exposes `limits` as a stub. -- The Next.js Turbopack workbench has E2E coverage for workflow and step locks. +- The Next.js Turbopack workbench has shared E2E coverage for workflow and step + locks on implemented worlds. ## Goals @@ -30,6 +31,34 @@ implementations. - `step limit`: execution control for a specific step/resource key. - `lease`: durable record that a workflow or step currently occupies capacity for a key. +## Shared Contract vs World-Specific Behavior + +The limits contract is intended to describe one shared set of observable +semantics across implemented worlds. That shared contract includes: + +- `acquire()`, `release()`, and `heartbeat()` surface behavior +- `WorkflowWorldError` when heartbeating a missing lease +- per-key concurrency and rate limiting outcomes +- same-holder lease reuse +- serialization of concurrent acquires for a single key +- FIFO waiter promotion per key +- pruning cancelled workflow waiters and failed/completed step waiters +- blocked acquisitions not consuming execution concurrency +- prompt wake-up with delayed fallback replay + +World-specific behavior should be limited to implementation mechanics and +durability characteristics, for example: + +- how waiter state is stored internally +- how per-key mutations are serialized internally +- how prompt wake-up is delivered +- whether queued wake-ups survive process or host loss +- backend-specific observability or debugging surfaces + +That means SQL row layout, advisory locks, and Graphile jobs are PostgreSQL +implementation details, while FIFO fairness and waiter skipping are contract +behavior that local and Postgres should both exhibit. + ## Decisions So Far ### 1. Use one shared limits model @@ -185,8 +214,8 @@ another holds a step lock and each waits on the other. ### 9. 
Waiters are FIFO per key -The PostgreSQL implementation uses a durable waiter queue and promotes waiters -in FIFO order for a single limit key. +Implemented worlds use a waiter queue and promote waiters in FIFO order for a +single limit key. Important details: @@ -204,33 +233,37 @@ global scheduler. Blocked flow limits and worker concurrency are intentionally separate. -In the PostgreSQL world: +For implemented worlds: - blocked workflows are suspended and re-queued, not left running on a worker - blocked steps exit the current attempt and are re-queued instead of polling in a live worker slot -- backlog remains durable in PostgreSQL while worker slots are free to service - unrelated work +- worker slots are free to service unrelated work while the blocked execution is + waiting to be retried or promoted -This is the main practical difference between a durable waiter model and a pure -polling loop. +PostgreSQL additionally keeps that backlog durable in the database. The local +world keeps queue delivery in-memory, so cross-process crash recovery for the +backlog is explicitly outside the shared limits contract today. ### 11. Wake-up is prompt, with a delayed fallback -The PostgreSQL world uses Graphile for wake-up delivery, but PostgreSQL tables -remain the source of truth for limit state. +Implemented worlds use the world-owned limit state as the source of truth and +try to resume promoted waiters promptly, with a delayed fallback still in place +so progress is possible if an immediate wake-up is missed. 
Current behavior: -- leases, rate tokens, and waiters live in PostgreSQL tables -- promotion decisions are made from SQL state +- leases, rate tokens, and waiters live in world-owned limit state +- promotion decisions are made from that limit state - when a waiter is promoted, the runtime is woken by enqueuing the appropriate workflow or step job - workflows also keep a delayed replay fallback so progress is still possible if an immediate wake-up is missed -This means Graphile is used to resume work quickly, not to decide fairness or -capacity ownership. +PostgreSQL uses Graphile jobs for that wake-up path and keeps the backlog +durable across host/process failure. The local world uses an in-memory queue, so +prompt wake behavior matches while the process is alive, but durable backlog +survival is not guaranteed after process loss. ### 12. V1 semantics are intentionally opinionated @@ -254,9 +287,10 @@ More concretely: For the current local implementation specifically: -- workflow locks already behave like durable logical-scope leases -- step locks are still simpler than Postgres and do not provide the same durable - waiter/wake-up behavior +- workflow and step locks now follow the same live-process waiter/fairness + semantics as Postgres +- the queue remains in-memory, so queued wake-ups are not durable across process + loss This means the current v1 interpretation of a workflow lock is: diff --git a/workbench/example/tsconfig.json b/workbench/example/tsconfig.json index 39c2f1ea68..4e131954f0 100644 --- a/workbench/example/tsconfig.json +++ b/workbench/example/tsconfig.json @@ -3,6 +3,7 @@ "target": "es2022", "module": "NodeNext", "lib": ["dom", "dom.iterable", "esnext"], + "baseUrl": ".", "allowJs": true, "skipLibCheck": true, "strict": true, diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index f7d43aab9f..0d3db80e83 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -1,6 
+1,6 @@ // Test path alias resolution - imports a helper from outside the workbench directory /** biome-ignore-all lint/complexity/noStaticOnlyClass: */ -import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test'; +import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test.js'; import { createHook, createWebhook, @@ -15,8 +15,8 @@ import { sleep, } from 'workflow'; import { getRun, start } from 'workflow/api'; -import { importedStepOnly } from './_imported_step_only'; -import { callThrower, stepThatThrowsFromHelper } from './helpers'; +import { importedStepOnly } from './_imported_step_only.js'; +import { callThrower, stepThatThrowsFromHelper } from './helpers.js'; ////////////////////////////////////////////////////////// @@ -333,11 +333,66 @@ stepLockNoRetriesStep.maxRetries = 0; export async function stepLockNoRetriesContentionWorkflow( userId = 'user-123', - holdMs = 750 + holdMs = 750, + label = userId +) { + 'use workflow'; + + return await stepLockNoRetriesStep(label, holdMs); +} + +////////////////////////////////////////////////////////// + +export async function workflowOnlyLockContentionWorkflow( + userId = 'user-123', + holdMs = 750, + label = userId +) { + 'use workflow'; + + await using _workflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const workflowLockAcquiredAt = Date.now(); + await sleep(holdMs); + const workflowLockReleasedAt = Date.now(); + + return { + label, + userId, + workflowLockAcquiredAt, + workflowLockReleasedAt, + }; +} + +export async function workflowRateLimitContentionWorkflow( + userId = 'user-123', + holdMs = 250, + periodMs = 1_500, + label = userId ) { 'use workflow'; - return await stepLockNoRetriesStep(userId, holdMs); + await using _workflowRateLimit = await lock({ + key: `workflow:rate:${userId}`, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + + const workflowRateAcquiredAt = Date.now(); + await 
sleep(holdMs); + const workflowRateReleasedAt = Date.now(); + + return { + label, + userId, + periodMs, + workflowRateAcquiredAt, + workflowRateReleasedAt, + }; } ////////////////////////////////////////////////////////// @@ -1277,7 +1332,7 @@ import { createVector, scaleVector, sumVectors, -} from './serde-steps'; +} from './serde-steps.js'; /** * Workflow that tests cross-context class registration. diff --git a/workbench/example/workflows/serde-steps.ts b/workbench/example/workflows/serde-steps.ts index 227de88399..9726bbe6c0 100644 --- a/workbench/example/workflows/serde-steps.ts +++ b/workbench/example/workflows/serde-steps.ts @@ -6,7 +6,7 @@ * step calls. This tests cross-context class registration. */ -import { Vector } from './serde-models'; +import { Vector } from './serde-models.js'; /** * Step that receives a Vector and scales it. From 39efdb38bffc6cac8cadeca4a6f63d6f9c068940 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 15:42:38 -0400 Subject: [PATCH 13/34] add more e2e test cases Signed-off-by: nathancolosimo --- .github/workflows/tests.yml | 13 + packages/core/e2e/e2e.test.ts | 104 +++++++- packages/world-local/src/limits.test.ts | 36 ++- packages/world-postgres/src/limits.test.ts | 28 ++ packages/world-postgres/src/limits.ts | 15 +- packages/world-testing/src/limits-contract.ts | 249 +++++++++++++++++- packages/world-testing/src/limits-runtime.ts | 123 ++++++++- packages/world/FLOW_LIMITS.md | 22 +- workbench/example/workflows/99_e2e.ts | 214 ++++++++++++++- 9 files changed, 770 insertions(+), 34 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 768e857a2f..4f6d697db7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -565,6 +565,19 @@ jobs: DEPLOYMENT_URL: "http://localhost:${{ matrix.app.name == 'sveltekit' && '4173' || (matrix.app.name == 'astro' && '4321' || '3000') }}" NEXT_CANARY: ${{ matrix.app.canary && '1' || '' }} + - name: Run Low-Concurrency 
Worker-Slot Test + if: ${{ !matrix.app.canary && matrix.app.name == 'nextjs-turbopack' }} + run: | + cd "${{ steps.prepare-workbench.outputs.workbench_app_path }}" && PORT=3001 WORKFLOW_POSTGRES_WORKER_CONCURRENCY=1 pnpm start & + echo "starting low-concurrency tests in 10 seconds" && sleep 10 + pnpm vitest run packages/core/e2e/e2e.test.ts -t "frees worker slots for unrelated workflows while a waiter is blocked" + env: + NODE_OPTIONS: "--enable-source-maps" + APP_NAME: ${{ matrix.app.name }} + WORKBENCH_APP_PATH: ${{ steps.prepare-workbench.outputs.workbench_app_path }} + DEPLOYMENT_URL: "http://localhost:3001" + WORKFLOW_LIMITS_LOW_CONCURRENCY: "1" + - name: Generate E2E summary if: always() run: node .github/scripts/aggregate-e2e-results.js . --job-name "E2E Local Postgres (${{ matrix.app.name }})" >> $GITHUB_STEP_SUMMARY || true diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 5f7aefc97e..d20f73e037 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -283,8 +283,40 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs, 'B']); return await Promise.all([runA.returnValue, runB.returnValue]); }, - async runWorkflowRateLimitContention(userId, holdMs, periodMs) { - const workflow = await e2e('workflowRateLimitContentionWorkflow'); + async runWorkflowExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('workflowLeakedLockWorkflow'); + const waiterWorkflow = await e2e( + 'workflowOnlyLockContentionWorkflow' + ); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [userId, 0, 'B']); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runStepExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('stepLeakedLockWorkflow'); + const waiterWorkflow = await 
e2e('stepKeyLockContentionWorkflow'); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [ + leakedResult.key, + 0, + 'B', + ]); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runWorkflowMixedLimitContention(userId, holdMs, periodMs) { + const workflow = await e2e('workflowMixedLimitContentionWorkflow'); const runA = await start(workflow, [userId, holdMs, periodMs, 'A']); await sleep(100); const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); @@ -325,6 +357,74 @@ describe('e2e', () => { const runB = await start(workflow, ['user-b', holdMs]); return await Promise.all([runA.returnValue, runB.returnValue]); }, + async runIndependentStepKeys(holdMs) { + const workflow = await e2e('stepKeyLockContentionWorkflow'); + const runA = await start(workflow, [ + 'step:db:isolation:a', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'step:db:isolation:b', + holdMs, + 'B', + ]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runBlockedWaiterWithUnrelatedWorkflow(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'B', + ]); + await sleep(100); + const runC = await start(workflow, [ + 'worker-slot-unrelated', + Math.max(100, Math.floor(holdMs / 4)), + 'C', + ]); + + const [holder, waiter, unrelated] = await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + return { holder, waiter, unrelated }; + }, + async runMidStepLockContract(holdMs) { + const holderWorkflow = await e2e('stepKeyLockContentionWorkflow'); + const waiterWorkflow = await e2e('midStepLockContentionWorkflow'); + 
const traceToken = `mid-step-${Date.now()}-${Math.random() + .toString(36) + .slice(2)}`; + const key = `step:db:mid-step:${traceToken}`; + + const holderRun = await start(holderWorkflow, [ + key, + holdMs, + 'holder', + ]); + await sleep(100); + const waiterRun = await start(waiterWorkflow, [ + key, + traceToken, + 'waiter', + ]); + + const [holder, waiter] = await Promise.all([ + holderRun.returnValue, + waiterRun.returnValue, + ]); + return { holder, waiter }; + }, }) ); } diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 2e248bb516..6428422dbb 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,6 +1,6 @@ import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; import { createLocalWorld } from './index.js'; -import { mkdtemp, rm } from 'node:fs/promises'; +import { mkdtemp, readFile, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; @@ -15,6 +15,40 @@ createLimitsContractSuite('local world limits', async () => { return { limits: world.limits, storage: world, + inspectKeyState: async (key) => { + const statePath = path.join(dir, 'limits', 'state.json'); + let raw: { + keys?: Record< + string, + { + leases?: { holderId: string }[]; + waiters?: { holderId: string }[]; + tokens?: { holderId: string }[]; + } + >; + }; + try { + raw = JSON.parse(await readFile(statePath, 'utf8')); + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + if (code === 'ENOENT') { + return { + leaseHolderIds: [], + waiterHolderIds: [], + tokenHolderIds: [], + }; + } + throw error; + } + + const keyState = raw.keys?.[key]; + return { + leaseHolderIds: keyState?.leases?.map((lease) => lease.holderId) ?? [], + waiterHolderIds: + keyState?.waiters?.map((waiter) => waiter.holderId) ?? [], + tokenHolderIds: keyState?.tokens?.map((token) => token.holderId) ?? 
[], + }; + }, close: async () => { await world.close?.(); await rm(dir, { recursive: true, force: true }); diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 5d8e1a74f6..e7c8193788 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -46,6 +46,34 @@ if (process.platform === 'win32') { steps: createStepsStorage(db.drizzle), events: createEventsStorage(db.drizzle), }, + inspectKeyState: async (key) => { + const [leases, waiters, tokens] = await Promise.all([ + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_leases + where limit_key = ${key} + order by holder_id asc + `, + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_waiters + where limit_key = ${key} + order by created_at asc, holder_id asc + `, + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_tokens + where limit_key = ${key} + order by acquired_at asc, holder_id asc + `, + ]); + + return { + leaseHolderIds: leases.map((row) => row.holderId), + waiterHolderIds: waiters.map((row) => row.holderId), + tokenHolderIds: tokens.map((row) => row.holderId), + }; + }, }; }); } diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 7e58f682f6..b83680a2f1 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -518,19 +518,22 @@ export function createLimits( // If there are already waiters for this key and holder no need to queue a new waiter. 
if (existingWaiter) { const now = Date.now(); + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + state.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + state.tokens.length >= parsed.definition.rate.count; return { status: 'blocked', - reason: getBlockedReason( - parsed.definition.concurrency !== undefined, - parsed.definition.rate !== undefined - ), + reason: getBlockedReason(concurrencyBlocked, rateBlocked), retryAfterMs: getRetryAfterMs( state.leases, state.tokens, now, - parsed.definition.concurrency !== undefined, - parsed.definition.rate !== undefined + concurrencyBlocked, + rateBlocked ) ?? 1000, } satisfies LimitAcquireResult; } diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 2a65750181..dba6f7d291 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -9,6 +9,11 @@ import { describe, expect, it } from 'vitest'; export interface LimitsHarness { limits: Limits; storage?: Pick; + inspectKeyState: (key: string) => Promise<{ + leaseHolderIds: string[]; + waiterHolderIds: string[]; + tokenHolderIds: string[]; + }>; close?: () => Promise; } @@ -114,6 +119,31 @@ export function createLimitsContractSuite( } }); + it('isolates unrelated keys at the raw limits layer', async () => { + const harness = await createHarness(); + try { + const [first, second] = await Promise.all([ + harness.limits.acquire({ + key: 'workflow:user:a', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }), + harness.limits.acquire({ + key: 'workflow:user:b', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }), + ]); + + expect(first.status).toBe('acquired'); + expect(second.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + it('serializes concurrent acquires for the 
same key', async () => { const harness = await createHarness(); try { @@ -196,12 +226,13 @@ export function createLimitsContractSuite( it('returns a combined blocked reason when both limits are saturated', async () => { const harness = await createHarness(); try { + const periodMs = 300; const first = await harness.limits.acquire({ key: 'step:mixed', holderId: 'holder-a', definition: { concurrency: { max: 1 }, - rate: { count: 1, periodMs: 1_000 }, + rate: { count: 1, periodMs }, }, leaseTtlMs: 1_000, }); @@ -214,7 +245,7 @@ export function createLimitsContractSuite( holderId: 'holder-b', definition: { concurrency: { max: 1 }, - rate: { count: 1, periodMs: 1_000 }, + rate: { count: 1, periodMs }, }, leaseTtlMs: 1_000, }); @@ -222,19 +253,96 @@ export function createLimitsContractSuite( status: 'blocked', reason: 'concurrency_and_rate', }); + if (second.status !== 'blocked') throw new Error('expected blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 1_000, + }); + expect(third).toMatchObject({ + status: 'blocked', + reason: 'rate', + }); + + let fourth = third; + const deadline = Date.now() + periodMs + 1_000; + while (fourth.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, fourth.retryAfterMs ?? 
0) + 50); + fourth = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 1_000, + }); + } + + expect(fourth.status).toBe('acquired'); } finally { await harness.close?.(); } }); - it('restores capacity when a lease is released or expires', async () => { + it('restores capacity immediately when a lease is released', async () => { const harness = await createHarness(); try { const first = await harness.limits.acquire({ key: 'workflow:user:123', holderId: 'holder-a', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 500, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('extends lease expiry when heartbeated', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:heartbeat', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 200, }); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') @@ -242,21 +350,55 @@ export function createLimitsContractSuite( const heartbeat = await harness.limits.heartbeat({ leaseId: first.lease.leaseId, - ttlMs: 1_000, + ttlMs: 600, }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( 
first.lease.expiresAt?.getTime() ?? 0 ); - await sleep(1_100); + const second = await harness.limits.acquire({ + key: 'workflow:user:heartbeat', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + } finally { + await harness.close?.(); + } + }); + + it('reclaims expired leases without manual cleanup', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:expired', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 250, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); const second = await harness.limits.acquire({ - key: 'workflow:user:123', + key: 'workflow:user:expired', holderId: 'holder-b', definition: { concurrency: { max: 1 } }, leaseTtlMs: 1_000, }); - expect(second.status).toBe('acquired'); + expect(second.status).toBe('blocked'); + + await sleep(400); + + const third = await harness.limits.acquire({ + key: 'workflow:user:expired', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); } finally { await harness.close?.(); } @@ -288,6 +430,21 @@ export function createLimitsContractSuite( holderId: first.lease.holderId, }, }); + + if (!harness.inspectKeyState) { + throw new Error( + 'inspectKeyState is required for duplicate lease checks' + ); + } + const keyState = await harness.inspectKeyState( + 'workflow:user:reacquire' + ); + expect( + keyState.leaseHolderIds.filter((holderId) => holderId === 'holder-a') + ).toHaveLength(1); + expect( + keyState.waiterHolderIds.filter((holderId) => holderId === 'holder-a') + ).toHaveLength(0); } finally { await harness.close?.(); } @@ -513,5 +670,81 @@ export function createLimitsContractSuite( await harness.close?.(); } }); + + it('does not duplicate a replayed blocked holder 
waiter or lease', async () => { + const harness = await createHarness(); + try { + const key = 'workflow:user:replay'; + const blockedHolderId = 'wflock_wrun_replay:corr_replay:holder_replay'; + + const first = await harness.limits.acquire({ + key, + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const blockedA = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const blockedB = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(blockedA.status).toBe('blocked'); + expect(blockedB.status).toBe('blocked'); + + const blockedState = await harness.inspectKeyState(key); + expect( + blockedState.waiterHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(1); + expect( + blockedState.leaseHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(0); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const acquired = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(acquired.status).toBe('acquired'); + if (acquired.status !== 'acquired') + throw new Error('expected replayed holder acquisition'); + + const acquiredState = await harness.inspectKeyState(key); + expect( + acquiredState.waiterHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(0); + expect( + acquiredState.leaseHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(1); + } finally { + await harness.close?.(); + } + }); }); } diff --git 
a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index 60bfe5789d..4627023ba9 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -9,6 +9,7 @@ type WorkflowLockContentionResult = { type StepLockNoRetriesResult = { label: string; + key?: string; attempt: number; acquiredAt: number; releasedAt: number; @@ -27,6 +28,32 @@ type WorkflowRateLimitResult = { periodMs: number; }; +type WorkflowLeakedLockResult = { + label: string; + key: string; + leaseTtlMs: number; + workflowLockAcquiredAt: number; + workflowCompletedAt: number; +}; + +type StepLeakedLockResult = { + label: string; + key: string; + leaseTtlMs: number; + stepLockAcquiredAt: number; + workflowCompletedAt: number; +}; + +type MidStepLockResult = { + label: string; + key: string; + attempt: number; + lockAcquiredAt: number; + preLockEffects: number; + postLockEffects: number; + trace: string[]; +}; + export interface LimitsRuntimeHarness { runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ workflowKey: string; @@ -48,7 +75,15 @@ export interface LimitsRuntimeHarness { userId: string, holdMs: number ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; - runWorkflowRateLimitContention( + runWorkflowExpiredLeaseRecovery( + userId: string, + leaseTtlMs: number + ): Promise<[WorkflowLeakedLockResult, WorkflowOnlyLockResult]>; + runStepExpiredLeaseRecovery( + userId: string, + leaseTtlMs: number + ): Promise<[StepLeakedLockResult, StepLockNoRetriesResult]>; + runWorkflowMixedLimitContention( userId: string, holdMs: number, periodMs: number @@ -70,6 +105,18 @@ export interface LimitsRuntimeHarness { runIndependentWorkflowKeys( holdMs: number ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; + runIndependentStepKeys( + holdMs: number + ): Promise<[StepLockNoRetriesResult, StepLockNoRetriesResult]>; + runBlockedWaiterWithUnrelatedWorkflow(holdMs: number): Promise<{ + holder: 
WorkflowOnlyLockResult; + waiter: WorkflowOnlyLockResult; + unrelated: WorkflowOnlyLockResult; + }>; + runMidStepLockContract(holdMs: number): Promise<{ + holder: StepLockNoRetriesResult; + waiter: MidStepLockResult; + }>; } export function createLimitsRuntimeSuite( @@ -156,11 +203,43 @@ export function createLimitsRuntimeSuite( ).toBeLessThan(4_000); }); - it('wakes rate-limited waiters only after the rate window expires', async () => { + it('reclaims expired leaked workflow leases without manual cleanup', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runWorkflowExpiredLeaseRecovery( + 'expired-workflow-user', + leaseTtlMs + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockAcquiredAt + ).toBeGreaterThanOrEqual(leaseTtlMs - 100); + }); + + it('reclaims expired leaked step leases without manual cleanup', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runStepExpiredLeaseRecovery( + 'expired-step-user', + leaseTtlMs + ); + + expect(resultB.acquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + expect( + resultB.acquiredAt - resultA.stepLockAcquiredAt + ).toBeGreaterThanOrEqual(leaseTtlMs - 100); + }); + + it('keeps mixed concurrency and rate waiters blocked until the rate window expires', async () => { const harness = await createHarness(); const holdMs = 250; const periodMs = 1_500; - const [resultA, resultB] = await harness.runWorkflowRateLimitContention( + const [resultA, resultB] = await harness.runWorkflowMixedLimitContention( 'shared-user', holdMs, periodMs @@ -214,5 +293,43 @@ export function createLimitsRuntimeSuite( resultA.workflowLockReleasedAt ); }); + + it('does not block unrelated step keys', async () => { + const harness = await createHarness(); + const 
[resultA, resultB] = await harness.runIndependentStepKeys(1_000); + + expect(resultB.acquiredAt).toBeLessThan(resultA.releasedAt); + }); + + it.skipIf(process.env.WORKFLOW_LIMITS_LOW_CONCURRENCY !== '1')( + 'frees worker slots for unrelated workflows while a waiter is blocked', + async () => { + const harness = await createHarness(); + const { holder, waiter, unrelated } = + await harness.runBlockedWaiterWithUnrelatedWorkflow(1_500); + + expect(waiter.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + holder.workflowLockReleasedAt + ); + expect(unrelated.workflowLockReleasedAt).toBeLessThan( + waiter.workflowLockAcquiredAt + ); + } + ); + + it('replays a mid-step lock at the acquire boundary without duplicating post-lock effects', async () => { + const harness = await createHarness(); + const { holder, waiter } = await harness.runMidStepLockContract(1_500); + + expect(waiter.lockAcquiredAt).toBeGreaterThanOrEqual(holder.releasedAt); + expect(waiter.preLockEffects).toBe(2); + expect(waiter.postLockEffects).toBe(1); + expect(waiter.trace.map((event) => event.split(':')[0])).toEqual([ + 'pre', + 'pre', + 'lock', + 'post', + ]); + }); }); } diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 565e258d71..07cc69168f 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -150,20 +150,22 @@ workflow scope, even though the workflow may suspend and resume many times. 
The current behavior is: -- declare the limit at the top of the step +- declare the limit at the top of the step when possible - the runtime treats a blocked acquisition as step-boundary admission failure - the step does not keep executing user code while waiting for capacity - the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes -Important caveat: +If `lock()` is called in the middle of a step, the intended contract is: -- zero-attempt semantics are only guaranteed when `lock()` is used as a top-of-step admission gate -- calling `lock()` after side effects or meaningful user work is unsupported/best-effort +- the current attempt stops at the blocked `lock()` call +- the step is deferred and re-queued rather than polling in-process +- code before the blocked `lock()` may replay on the next attempt +- code after the `lock()` runs only after the lock is actually acquired -This means step `lock()` is conceptually the same API, but it is not a literal -"spin inside already-running user step code until capacity appears" -implementation. +This means zero-attempt semantics are still strongest when `lock()` is used as +a top-of-step admission gate, but mid-step `lock()` is now part of the shared +runtime contract rather than unsupported behavior. ### 6. `await using` is the preferred user-facing shape @@ -200,9 +202,8 @@ The lease must not be disposed merely because one host process invocation ends. 
Current preferred model: - workflow-level limits may be held by a run -- step-level limits are acquired only at step boundaries +- blocked step-level limits return control to the runtime at the step boundary - step-level limits are short-lived -- step code should not acquire additional locks dynamically - step execution should not wait on workflow-level locks This keeps the dependency direction one-way: @@ -280,7 +281,8 @@ More concretely: - if a workflow is parked waiting for a step-level limit, it still counts as active for its workflow-level lock - a step-level lock should conceptually be an admission gate for the step - attempt, not a second workflow-level lock + attempt, not a second workflow-level lock, even when the `lock()` call + appears in the middle of user code - step-level rate limits should consume rate capacity when the step starts, and that rate usage should remain counted until the window expires even if the step releases its lease quickly diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 0d3db80e83..338dc863f3 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -261,15 +261,75 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } -async function serializedLimitStep(label: string, holdMs: number) { +type LimitTraceState = { + events: string[]; +}; + +function sanitizeLimitTraceToken(traceToken: string) { + return traceToken.replace(/[^a-zA-Z0-9_-]/g, '_'); +} + +async function getLimitTracePath(traceToken: string) { + const path = await import('node:path'); + return path.join( + process.cwd(), + '.workflow-e2e', + `limits-${sanitizeLimitTraceToken(traceToken)}.json` + ); +} + +async function readLimitTraceState( + traceToken: string +): Promise { + const { mkdir, readFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const tracePath = await getLimitTracePath(traceToken); + await 
mkdir(path.dirname(tracePath), { recursive: true }); + + try { + return JSON.parse(await readFile(tracePath, 'utf8')) as LimitTraceState; + } catch (error) { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + return { events: [] }; + } + throw error; + } +} + +async function writeLimitTraceState( + traceToken: string, + state: LimitTraceState +) { + const { mkdir, writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const tracePath = await getLimitTracePath(traceToken); + await mkdir(path.dirname(tracePath), { recursive: true }); + await writeFile(tracePath, JSON.stringify(state), 'utf8'); +} + +async function appendLimitTraceEvent(traceToken: string, event: string) { + const state = await readLimitTraceState(traceToken); + const nextState = { + events: [...state.events, event], + }; + await writeLimitTraceState(traceToken, nextState); + return nextState.events; +} + +async function serializedLimitStep( + label: string, + holdMs: number, + key = 'step:db:serialized' +) { 'use step'; const stepLock = await lock({ - key: 'step:db:serialized', + key, concurrency: { max: 1 }, leaseTtlMs: holdMs + 5_000, }); + const metadata = getStepMetadata(); const acquiredAt = Date.now(); await new Promise((resolve) => setTimeout(resolve, holdMs)); await stepLock.dispose(); @@ -277,6 +337,8 @@ async function serializedLimitStep(label: string, holdMs: number) { return { label, + key, + attempt: metadata.attempt, acquiredAt, releasedAt, }; @@ -308,11 +370,15 @@ export async function workflowLockContentionWorkflow( }; } -async function stepLockNoRetriesStep(label: string, holdMs: number) { +async function stepLockNoRetriesStep( + label: string, + holdMs: number, + key = 'step:db:no-retries' +) { 'use step'; await using _stepLock = await lock({ - key: 'step:db:no-retries', + key, concurrency: { max: 1 }, leaseTtlMs: holdMs + 5_000, }); @@ -324,6 +390,7 @@ async function stepLockNoRetriesStep(label: string, holdMs: number) { return { 
label, + key, attempt: metadata.attempt, acquiredAt, releasedAt, @@ -341,6 +408,16 @@ export async function stepLockNoRetriesContentionWorkflow( return await stepLockNoRetriesStep(label, holdMs); } +export async function stepKeyLockContentionWorkflow( + key = 'step:db:key-contention', + holdMs = 750, + label = key +) { + 'use workflow'; + + return await stepLockNoRetriesStep(label, holdMs, key); +} + ////////////////////////////////////////////////////////// export async function workflowOnlyLockContentionWorkflow( @@ -368,6 +445,69 @@ export async function workflowOnlyLockContentionWorkflow( }; } +export async function workflowLeakedLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + const leakedWorkflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + const workflowLockAcquiredAt = Date.now(); + + return { + label, + userId, + key: leakedWorkflowLock.key, + leaseTtlMs, + leakedLeaseId: leakedWorkflowLock.leaseId, + workflowLockAcquiredAt, + workflowCompletedAt: Date.now(), + }; +} + +async function leakedStepLockStep( + key: string, + leaseTtlMs: number, + label: string +) { + 'use step'; + + const leakedStepLock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + return { + label, + key, + leaseTtlMs, + leakedLeaseId: leakedStepLock.leaseId, + stepLockAcquiredAt: Date.now(), + workflowCompletedAt: Date.now(), + }; +} + +export async function stepLeakedLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + return await leakedStepLockStep( + `step:db:expired:${userId}`, + leaseTtlMs, + label + ); +} + export async function workflowRateLimitContentionWorkflow( userId = 'user-123', holdMs = 250, @@ -395,6 +535,72 @@ export async function workflowRateLimitContentionWorkflow( }; } +export async function workflowMixedLimitContentionWorkflow( + userId = 'user-123', + holdMs = 250, + periodMs = 
1_500, + label = userId +) { + 'use workflow'; + + await using _mixedLimit = await lock({ + key: `workflow:mixed:${userId}`, + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + + const workflowRateAcquiredAt = Date.now(); + await sleep(holdMs); + const workflowRateReleasedAt = Date.now(); + + return { + label, + userId, + periodMs, + workflowRateAcquiredAt, + workflowRateReleasedAt, + }; +} + +async function midStepLockStep(key: string, traceToken: string, label: string) { + 'use step'; + + const { attempt } = getStepMetadata(); + await appendLimitTraceEvent(traceToken, `pre:${attempt}`); + + await using _midStepLock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: 5_000, + }); + + const lockAcquiredAt = Date.now(); + await appendLimitTraceEvent(traceToken, `lock:${attempt}`); + const trace = await appendLimitTraceEvent(traceToken, `post:${attempt}`); + + return { + label, + key, + attempt, + lockAcquiredAt, + preLockEffects: trace.filter((event) => event.startsWith('pre:')).length, + postLockEffects: trace.filter((event) => event.startsWith('post:')).length, + trace, + }; +} +midStepLockStep.maxRetries = 0; + +export async function midStepLockContentionWorkflow( + key = 'step:db:mid-step', + traceToken = 'mid-step', + label = key +) { + 'use workflow'; + + return await midStepLockStep(key, traceToken, label); +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From eabe5ef14e264419107565fcca1ad220e06628cc Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 16:15:07 -0400 Subject: [PATCH 14/34] fixed type error Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index dba6f7d291..515bbf7fc4 100644 --- a/packages/world-testing/src/limits-contract.ts +++ 
b/packages/world-testing/src/limits-contract.ts @@ -209,7 +209,7 @@ export function createLimitsContractSuite( }); const deadline = Date.now() + periodMs + 1_000; while (third.status === 'blocked' && Date.now() < deadline) { - await sleep(Math.max(25, third.retryAfterMs) + 50); + await sleep(Math.max(25, third.retryAfterMs ?? 0) + 50); third = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-c', From b8480d394d9f7d2883f380969feb488fcad16d7e Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 17:45:21 -0400 Subject: [PATCH 15/34] fix ci issues Signed-off-by: nathancolosimo --- workbench/example/tsconfig.json | 4 ++-- workbench/example/workflows/99_e2e.ts | 8 ++++---- workbench/example/workflows/serde-steps.ts | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workbench/example/tsconfig.json b/workbench/example/tsconfig.json index 4e131954f0..58fb97394f 100644 --- a/workbench/example/tsconfig.json +++ b/workbench/example/tsconfig.json @@ -1,7 +1,7 @@ { "compilerOptions": { "target": "es2022", - "module": "NodeNext", + "module": "esnext", "lib": ["dom", "dom.iterable", "esnext"], "baseUrl": ".", "allowJs": true, @@ -9,7 +9,7 @@ "strict": true, "noEmit": true, "esModuleInterop": true, - "moduleResolution": "NodeNext", + "moduleResolution": "bundler", "resolveJsonModule": true, "isolatedModules": true, "jsx": "preserve", diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 338dc863f3..b85e49cf3e 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -1,6 +1,6 @@ // Test path alias resolution - imports a helper from outside the workbench directory /** biome-ignore-all lint/complexity/noStaticOnlyClass: */ -import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test.js'; +import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test'; import { createHook, createWebhook, @@ -15,8 +15,8 @@ import { sleep, } from 
'workflow'; import { getRun, start } from 'workflow/api'; -import { importedStepOnly } from './_imported_step_only.js'; -import { callThrower, stepThatThrowsFromHelper } from './helpers.js'; +import { importedStepOnly } from './_imported_step_only'; +import { callThrower, stepThatThrowsFromHelper } from './helpers'; ////////////////////////////////////////////////////////// @@ -1538,7 +1538,7 @@ import { createVector, scaleVector, sumVectors, -} from './serde-steps.js'; +} from './serde-steps'; /** * Workflow that tests cross-context class registration. diff --git a/workbench/example/workflows/serde-steps.ts b/workbench/example/workflows/serde-steps.ts index 9726bbe6c0..227de88399 100644 --- a/workbench/example/workflows/serde-steps.ts +++ b/workbench/example/workflows/serde-steps.ts @@ -6,7 +6,7 @@ * step calls. This tests cross-context class registration. */ -import { Vector } from './serde-models.js'; +import { Vector } from './serde-models'; /** * Step that receives a Vector and scales it. 
From a6b603a0d940adffe20c332619779a9b38f49870 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Fri, 20 Mar 2026 19:37:16 -0400 Subject: [PATCH 16/34] Removed step lock and added lock index Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 67 +- .../async-deserialization-ordering.test.ts | 1 + .../core/src/hook-sleep-interaction.test.ts | 1 + packages/core/src/lock.test.ts | 40 +- packages/core/src/lock.ts | 19 +- packages/core/src/private.ts | 1 + .../core/src/runtime/step-handler.test.ts | 115 +-- packages/core/src/runtime/step-handler.ts | 234 +----- packages/core/src/step.test.ts | 37 +- packages/core/src/step.ts | 5 - packages/core/src/step/context-storage.ts | 3 - packages/core/src/step/lock.ts | 103 --- packages/core/src/symbols.ts | 1 - packages/core/src/workflow.ts | 1 + packages/core/src/workflow/hook.test.ts | 1 + packages/core/src/workflow/lock.ts | 34 +- packages/core/src/workflow/sleep.test.ts | 1 + packages/world-local/src/limits.test.ts | 12 +- packages/world-local/src/limits.ts | 151 ++-- packages/world-local/src/storage.test.ts | 77 -- .../world-local/src/storage/events-storage.ts | 32 +- packages/world-postgres/README.md | 2 +- packages/world-postgres/src/limits.test.ts | 23 +- packages/world-postgres/src/limits.ts | 171 +--- packages/world-postgres/src/storage.ts | 59 +- packages/world-postgres/test/storage.test.ts | 116 --- packages/world-testing/src/limits-contract.ts | 795 +++++++++--------- packages/world-testing/src/limits-runtime.ts | 129 ++- packages/world-vercel/src/limits.test.ts | 44 +- packages/world/FLOW_LIMITS.md | 136 ++- packages/world/src/events.ts | 17 - packages/world/src/index.ts | 4 + packages/world/src/limits.ts | 41 +- workbench/example/workflows/99_e2e.ts | 257 ++---- 34 files changed, 904 insertions(+), 1826 deletions(-) delete mode 100644 packages/core/src/step/lock.ts diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index d20f73e037..77a5960231 100644 --- 
a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -249,11 +249,10 @@ describe('e2e', () => { createLimitsRuntimeSuite( `limits runtime (${isPostgresWorld ? 'postgres' : 'local'})`, async () => ({ - async runWorkflowWithWorkflowAndStepLocks(userId) { - const run = await start( - await e2e('workflowWithWorkflowAndStepLocks'), - [userId] - ); + async runWorkflowWithScopedLocks(userId) { + const run = await start(await e2e('workflowWithScopedLocks'), [ + userId, + ]); return await run.returnValue; }, async runWorkflowLockContention(userId, holdMs) { @@ -263,18 +262,17 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs]); return await Promise.all([runA.returnValue, runB.returnValue]); }, - async runStepLockNoRetriesContention(userId, holdMs) { - const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); - const runA = await start(workflow, [userId, holdMs, 'A']); - await sleep(100); - const runB = await start(workflow, [userId, holdMs, 'B']); + async runLockedStepCallContention( + key, + holdMs, + labelA = 'A', + labelB = 'B' + ) { + const workflow = await e2e('lockedStepCallContentionWorkflow'); + const runA = await start(workflow, [key, holdMs, labelA]); await sleep(100); - const runC = await start(workflow, [userId, holdMs, 'C']); - return await Promise.all([ - runA.returnValue, - runB.returnValue, - runC.returnValue, - ]); + const runB = await start(workflow, [key, holdMs, labelB]); + return await Promise.all([runA.returnValue, runB.returnValue]); }, async runWorkflowLockAcrossSuspension(userId, holdMs) { const workflow = await e2e('workflowOnlyLockContentionWorkflow'); @@ -298,9 +296,9 @@ describe('e2e', () => { const waiterResult = await waiterRun.returnValue; return [leakedResult, waiterResult]; }, - async runStepExpiredLeaseRecovery(userId, leaseTtlMs) { - const leakedWorkflow = await e2e('stepLeakedLockWorkflow'); - const waiterWorkflow = await e2e('stepKeyLockContentionWorkflow'); + async 
runLeakedKeyExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('leakedKeyLockWorkflow'); + const waiterWorkflow = await e2e('lockedStepCallContentionWorkflow'); const leakedRun = await start(leakedWorkflow, [ userId, leaseTtlMs, @@ -358,7 +356,7 @@ describe('e2e', () => { return await Promise.all([runA.returnValue, runB.returnValue]); }, async runIndependentStepKeys(holdMs) { - const workflow = await e2e('stepKeyLockContentionWorkflow'); + const workflow = await e2e('lockedStepCallContentionWorkflow'); const runA = await start(workflow, [ 'step:db:isolation:a', holdMs, @@ -399,31 +397,10 @@ describe('e2e', () => { ]); return { holder, waiter, unrelated }; }, - async runMidStepLockContract(holdMs) { - const holderWorkflow = await e2e('stepKeyLockContentionWorkflow'); - const waiterWorkflow = await e2e('midStepLockContentionWorkflow'); - const traceToken = `mid-step-${Date.now()}-${Math.random() - .toString(36) - .slice(2)}`; - const key = `step:db:mid-step:${traceToken}`; - - const holderRun = await start(holderWorkflow, [ - key, - holdMs, - 'holder', - ]); - await sleep(100); - const waiterRun = await start(waiterWorkflow, [ - key, - traceToken, - 'waiter', - ]); - - const [holder, waiter] = await Promise.all([ - holderRun.returnValue, - waiterRun.returnValue, - ]); - return { holder, waiter }; + async runWorkflowSingleLockAcrossMultipleSteps(holdMs) { + const workflow = await e2e('singleLockAcrossMultipleStepsWorkflow'); + const run = await start(workflow, ['step:db:batch', holdMs]); + return await run.returnValue; }, }) ); diff --git a/packages/core/src/async-deserialization-ordering.test.ts b/packages/core/src/async-deserialization-ordering.test.ts index 0774b7d9d8..463a661ec0 100644 --- a/packages/core/src/async-deserialization-ordering.test.ts +++ b/packages/core/src/async-deserialization-ordering.test.ts @@ -36,6 +36,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, 
getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/hook-sleep-interaction.test.ts b/packages/core/src/hook-sleep-interaction.test.ts index a706628b81..9ec1bca88d 100644 --- a/packages/core/src/hook-sleep-interaction.test.ts +++ b/packages/core/src/hook-sleep-interaction.test.ts @@ -42,6 +42,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => promiseQueueHolder.current, }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index c9237066e3..9cc1e2dcee 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -1,10 +1,14 @@ import { afterEach, describe, expect, it, vi } from 'vitest'; -import { lock, LIMITS_NOT_IMPLEMENTED_MESSAGE } from './lock.js'; -import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; +import { + lock, + LIMITS_NOT_IMPLEMENTED_MESSAGE, + LOCK_WORKFLOW_ONLY_MESSAGE, +} from './lock.js'; +import { contextStorage } from './step/context-storage.js'; +import { WORKFLOW_LOCK } from './symbols.js'; afterEach(() => { delete (globalThis as any)[WORKFLOW_LOCK]; - delete (globalThis as any)[STEP_LOCK]; }); describe('lock', () => { @@ -20,9 +24,7 @@ describe('lock', () => { it('prefers the workflow runtime lock when both runtimes are present', async () => { const workflowHandle = { leaseId: 'lease_workflow' }; const workflowLock = vi.fn().mockResolvedValue(workflowHandle); - const stepLock = vi.fn().mockResolvedValue({ leaseId: 'lease_step' }); (globalThis as any)[WORKFLOW_LOCK] = workflowLock; - (globalThis as any)[STEP_LOCK] = stepLock; const options = { key: 
'workflow:user:test', concurrency: { max: 1 }, @@ -30,19 +32,33 @@ describe('lock', () => { await expect(lock(options)).resolves.toBe(workflowHandle); expect(workflowLock).toHaveBeenCalledWith(options); - expect(stepLock).not.toHaveBeenCalled(); }); - it('falls back to the step runtime lock when no workflow runtime is present', async () => { - const handle = { leaseId: 'lease_step' }; - const stepLock = vi.fn().mockResolvedValue(handle); - (globalThis as any)[STEP_LOCK] = stepLock; + it('throws a workflow-only error when called inside a step context', async () => { const options = { key: 'step:db:cheap', concurrency: { max: 2 }, }; - await expect(lock(options)).resolves.toBe(handle); - expect(stepLock).toHaveBeenCalledWith(options); + await expect( + contextStorage.run( + { + stepMetadata: { + stepId: 'step_test', + stepName: 'testStep', + stepStartedAt: new Date(), + attempt: 1, + }, + workflowMetadata: { + workflowName: 'testWorkflow', + workflowRunId: 'wrun_test', + workflowStartedAt: new Date(), + url: 'http://localhost:3000', + }, + ops: [], + }, + () => lock(options) + ) + ).rejects.toThrow(LOCK_WORKFLOW_ONLY_MESSAGE); }); }); diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index 11829957d0..9791c39e13 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -4,10 +4,14 @@ import { type LimitKey, type LimitLease, } from '@workflow/world'; -import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; +import { contextStorage } from './step/context-storage.js'; +import { WORKFLOW_LOCK } from './symbols.js'; export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; +export const LOCK_WORKFLOW_ONLY_MESSAGE = + '`lock()` is only supported in workflow functions. Wrap the step call with `await using` in workflow code.'; + /** * Reserved first-pass user-facing API for future flow concurrency and rate * limiting inside workflow functions. 
@@ -21,7 +25,10 @@ export interface LockOptions extends LimitDefinition { * Reserved handle shape for future lock acquisition. */ export interface LockHandle - extends Pick { + extends Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' + > { dispose(): Promise; heartbeat(ttlMs?: number): Promise; [Symbol.asyncDispose](): Promise; @@ -39,12 +46,8 @@ export async function lock(options: LockOptions): Promise { return workflowLock(options); } - const stepLock = (globalThis as any)[STEP_LOCK] as - | ((options: LockOptions) => Promise) - | undefined; - - if (stepLock) { - return stepLock(options); + if (contextStorage.getStore()) { + throw new Error(LOCK_WORKFLOW_ONLY_MESSAGE); } throw createLimitsNotImplementedError(); diff --git a/packages/core/src/private.ts b/packages/core/src/private.ts index 0eabc7b70f..ac827aae05 100644 --- a/packages/core/src/private.ts +++ b/packages/core/src/private.ts @@ -93,6 +93,7 @@ export interface WorkflowOrchestratorContext { encryptionKey: CryptoKey | undefined; globalThis: typeof globalThis; eventsConsumer: EventsConsumer; + nextLockIndex: number; /** * Map of pending invocations keyed by correlationId. * Using Map instead of Array for O(1) lookup/delete operations. 
diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index 1951e7a162..3c7aae614f 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -1,6 +1,5 @@ import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; -import { StepLockBlockedError } from '../step/lock.js'; // Use vi.hoisted so these are available in mock factories const { @@ -287,116 +286,16 @@ describe('step-handler 409 handling', () => { mockStepFn.mockResolvedValue('step-result'); }); - it('returns a timeout when a step lock is blocked before user code can proceed', async () => { - mockEventsCreate.mockResolvedValue({ - step: { - stepId: 'step_abc', - status: 'running', - attempt: 1, - startedAt: new Date(), - input: [], - }, - }); - mockStepFn.mockRejectedValue( - new StepLockBlockedError( - { - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }, - 2_500 - ) - ); - - const result = await capturedHandler( - createMessage(), - createMetadata('myStep') - ); - - expect(result).toEqual({ timeoutSeconds: 3 }); - expect(mockQueueMessage).not.toHaveBeenCalled(); - expect(mockEventsCreate).toHaveBeenCalledTimes(2); - expect(mockEventsCreate).toHaveBeenNthCalledWith( - 1, - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_started', - }), - expect.anything() - ); - expect(mockEventsCreate).toHaveBeenNthCalledWith( - 2, - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_deferred', - correlationId: 'step_abc', - eventData: { - retryAfter: expect.any(Date), - lockRequest: expect.objectContaining({ - key: expect.any(String), - holderId: 'stplock_wrun_test123:step_abc:0', - }), - }, - }), - expect.anything() - ); + afterEach(() => { + vi.restoreAllMocks(); }); - it('rechecks a deferred lock before 
step_started and re-defers without running user code', async () => { - mockEventsListByCorrelationId.mockResolvedValue({ - data: [ - { - eventId: 'evnt_1', - runId: 'wrun_test123', - eventType: 'step_deferred', - correlationId: 'step_abc', - eventData: { - retryAfter: new Date(Date.now() - 1_000), - lockRequest: { - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }, - }, - createdAt: new Date(), - }, - ], - cursor: null, - hasMore: false, - }); - mockLimitsAcquire.mockResolvedValue({ - status: 'blocked', - reason: 'concurrency', - retryAfterMs: 2_500, - }); - - const result = await capturedHandler( - createMessage(), - createMetadata('myStep') - ); + it('does not call limits for ordinary step execution without lock()', async () => { + await capturedHandler(createMessage(), createMetadata('myStep')); - expect(result).toEqual({ timeoutSeconds: 3 }); - expect(mockStepFn).not.toHaveBeenCalled(); - expect(mockLimitsAcquire).toHaveBeenCalledWith({ - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(mockEventsCreate).toHaveBeenCalledTimes(1); - expect(mockEventsCreate).toHaveBeenCalledWith( - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_deferred', - }), - expect.anything() - ); - }); - - afterEach(() => { - vi.restoreAllMocks(); + expect(mockLimitsAcquire).not.toHaveBeenCalled(); + expect(mockLimitsHeartbeat).not.toHaveBeenCalled(); + expect(mockLimitsRelease).not.toHaveBeenCalled(); }); describe('step_completed 409', () => { diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index c0620e31d3..ec60d06b04 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -11,12 +11,7 @@ import { } from '@workflow/errors'; import { pluralize } from '@workflow/utils'; import { 
getPort } from '@workflow/utils/get-port'; -import { - LimitAcquireRequestSchema, - SPEC_VERSION_CURRENT, - StepInvokePayloadSchema, - type LimitLease, -} from '@workflow/world'; +import { SPEC_VERSION_CURRENT, StepInvokePayloadSchema } from '@workflow/world'; import { importKey } from '../encryption.js'; import { runtimeLogger, stepLogger } from '../logger.js'; import { getStepFunction } from '../private.js'; @@ -25,8 +20,6 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; -import { createStepLock, StepLockBlockedError } from '../step/lock.js'; -import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { getSpanKind, @@ -52,68 +45,6 @@ import { getWorld, getWorldHandlers } from './world.js'; const DEFAULT_STEP_MAX_RETRIES = 3; -async function getDeferredStepLock( - world: ReturnType, - workflowRunId: string, - stepId: string -) { - let step: Awaited>; - try { - step = await world.steps.get(workflowRunId, stepId); - } catch (error) { - if ( - WorkflowWorldError.is(error) && - (error.status === 404 || error.message === `Step not found: ${stepId}`) - ) { - return null; - } - throw error; - } - if (step.status !== 'pending') { - return null; - } - - const result = await world.events.listByCorrelationId({ - correlationId: stepId, - pagination: { - limit: 1, - sortOrder: 'desc', - }, - }); - const latestEvent = result.data[0]; - - if ( - !latestEvent || - latestEvent.runId !== workflowRunId || - latestEvent.eventType !== 'step_deferred' || - !latestEvent.eventData.lockRequest - ) { - return null; - } - - return { - step, - lockRequest: LimitAcquireRequestSchema.parse( - latestEvent.eventData.lockRequest - ), - }; -} - -async function releaseUnusedPreAcquiredLocks( - world: ReturnType, - preAcquiredLocks: Record -) { - await Promise.all( - Object.values(preAcquiredLocks).map((lease) => - world.limits.release({ - leaseId: lease.leaseId, - 
key: lease.key, - holderId: lease.holderId, - }) - ) - ); -} - const stepHandler = getWorldHandlers().createQueueHandler( '__wkf_step_', async (message_, metadata) => { @@ -185,56 +116,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( ...Attribute.StepTracePropagated(!!traceContext), }); - const preAcquiredLocks: Record = {}; - const deferredStepLock = await getDeferredStepLock( - world, - workflowRunId, - stepId - ); - if (deferredStepLock) { - const retryAfter = deferredStepLock.step.retryAfter; - if (retryAfter && retryAfter.getTime() > Date.now()) { - const timeoutSeconds = Math.max( - 1, - Math.ceil((retryAfter.getTime() - Date.now()) / 1000) - ); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - return { timeoutSeconds }; - } - - const lockResult = await world.limits.acquire( - deferredStepLock.lockRequest - ); - if (lockResult.status === 'blocked') { - const retryAfterMs = Math.max(1, lockResult.retryAfterMs ?? 1000); - const timeoutSeconds = Math.max( - 1, - Math.ceil(retryAfterMs / 1000) - ); - await world.events.create( - workflowRunId, - { - eventType: 'step_deferred', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - retryAfter: new Date(Date.now() + retryAfterMs), - lockRequest: deferredStepLock.lockRequest, - }, - }, - { requestId } - ); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - return { timeoutSeconds }; - } - - preAcquiredLocks[lockResult.lease.holderId] = lockResult.lease; - } - // step_started validates state and returns the step entity, so no separate // world.steps.get() call is needed. 
The server checks: // - Step not in terminal state (returns 409) @@ -260,7 +141,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( step = startResult.step; } catch (err) { if (ThrottleError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryRetryAfter = Math.max( 1, typeof err.retryAfter === 'number' ? err.retryAfter : 1 @@ -274,14 +154,12 @@ const stepHandler = getWorldHandlers().createQueueHandler( return { timeoutSeconds: retryRetryAfter }; } if (RunExpiredError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.info( `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` ); return; } if (EntityConflictError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.debug( 'Step in terminal state, re-enqueuing workflow', { @@ -311,7 +189,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( // Too early: retryAfter timestamp not reached yet // Return timeout to queue so it retries later if (TooEarlyError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryAfter = err.retryAfter ?? 
new Date(Date.now() + 1000); const timeoutSeconds = Math.max( 1, @@ -334,7 +211,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( }); return { timeoutSeconds }; } - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); // Re-throw other errors throw err; } @@ -497,50 +373,35 @@ const stepHandler = getWorldHandlers().createQueueHandler( const executionStartTime = Date.now(); try { - const previousStepLock = (globalThis as any)[STEP_LOCK]; - (globalThis as any)[STEP_LOCK] = createStepLock(world); - result = await trace('step.execute', {}, async () => { - try { - return await contextStorage.run( - { - stepMetadata: { - stepName, - stepId, - stepStartedAt: new Date(+stepStartedAt), - attempt, - }, - workflowMetadata: { - workflowName, - workflowRunId, - workflowStartedAt: new Date(+workflowStartedAt), - // TODO: there should be a getUrl method on the world interface itself. This - // solution only works for vercel + local worlds. - url: isVercel - ? `https://${process.env.VERCEL_URL}` - : `http://localhost:${port ?? 3000}`, - }, - ops, - closureVars: hydratedInput.closureVars, - encryptionKey, - lockCounter: 0, - preAcquiredLocks, + return await contextStorage.run( + { + stepMetadata: { + stepName, + stepId, + stepStartedAt: new Date(+stepStartedAt), + attempt, }, - () => stepFn.apply(thisVal, args) - ); - } finally { - if (previousStepLock === undefined) { - delete (globalThis as any)[STEP_LOCK]; - } else { - (globalThis as any)[STEP_LOCK] = previousStepLock; - } - } + workflowMetadata: { + workflowName, + workflowRunId, + workflowStartedAt: new Date(+workflowStartedAt), + // TODO: there should be a getUrl method on the world interface itself. This + // solution only works for vercel + local worlds. + url: isVercel + ? `https://${process.env.VERCEL_URL}` + : `http://localhost:${port ?? 
3000}`, + }, + ops, + closureVars: hydratedInput.closureVars, + encryptionKey, + }, + () => stepFn.apply(thisVal, args) + ); }); } catch (err) { userCodeError = err; userCodeFailed = true; - } finally { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); } const executionTimeMs = Date.now() - executionStartTime; @@ -552,53 +413,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( if (userCodeFailed) { const err = userCodeError; - if (StepLockBlockedError.is(err)) { - const retryAfterMs = Math.max(1, err.retryAfterMs ?? 1000); - const timeoutSeconds = Math.max( - 1, - Math.ceil(retryAfterMs / 1000) - ); - const retryAfter = new Date(Date.now() + retryAfterMs); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - span?.addEvent?.('step.lock_blocked', { - 'retry.timeout_seconds': timeoutSeconds, - 'step.id': stepId, - 'step.name': stepName, - }); - try { - await world.events.create( - workflowRunId, - { - eventType: 'step_deferred', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - retryAfter, - lockRequest: err.request, - }, - }, - { requestId } - ); - } catch (stepDeferredErr) { - if (EntityConflictError.is(stepDeferredErr)) { - runtimeLogger.info( - 'Tried deferring step, but step has already finished.', - { - workflowRunId, - stepId, - stepName, - message: stepDeferredErr.message, - } - ); - return; - } - throw stepDeferredErr; - } - return { timeoutSeconds }; - } - // Infrastructure errors that somehow surfaced through user code // should propagate to the queue handler for retry, not consume // step attempts. 
diff --git a/packages/core/src/step.test.ts b/packages/core/src/step.test.ts index 5a0e47af56..506def4361 100644 --- a/packages/core/src/step.test.ts +++ b/packages/core/src/step.test.ts @@ -26,6 +26,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), // All generated ulids use the workflow's started at time generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => @@ -412,42 +413,6 @@ describe('createUseStep', () => { expect(ctx.invocationsQueue.size).toBe(1); }); - it('should consume step_deferred event and continue waiting', async () => { - const ctx = setupWorkflowContext([ - { - eventId: 'evnt_0', - runId: 'wrun_123', - eventType: 'step_deferred', - correlationId: 'step_01K11TFZ62YS0YYFDQ3E8B9YCV', - eventData: { - retryAfter: new Date(), - }, - createdAt: new Date(), - }, - ]); - - let workflowErrorReject: (err: Error) => void; - const workflowErrorPromise = new Promise((_, reject) => { - workflowErrorReject = reject; - }); - ctx.onWorkflowError = (err) => { - workflowErrorReject(err); - }; - - const useStep = createUseStep(ctx); - const add = useStep('add'); - - let error: Error | undefined; - try { - await Promise.race([add(1, 2), workflowErrorPromise]); - } catch (err_) { - error = err_ as Error; - } - - expect(error).toBeInstanceOf(WorkflowSuspension); - expect(ctx.invocationsQueue.size).toBe(1); - }); - it('should remove queue item when step_completed (terminal state)', async () => { const ctx = setupWorkflowContext([ { diff --git a/packages/core/src/step.ts b/packages/core/src/step.ts index 3cc9e59ce4..33e544d19e 100644 --- a/packages/core/src/step.ts +++ b/packages/core/src/step.ts @@ -112,11 +112,6 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Consumed; } - if (event.eventType === 
'step_deferred') { - // Admission was blocked before user work could proceed, so keep waiting. - return EventConsumerResult.Consumed; - } - if (event.eventType === 'step_failed') { // Terminal state - we can remove the invocationQueue item ctx.invocationsQueue.delete(event.correlationId); diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index b63329dd20..2a9aa8b7e1 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,5 +1,4 @@ import { AsyncLocalStorage } from 'node:async_hooks'; -import type { LimitLease } from '@workflow/world'; import type { CryptoKey } from '../encryption.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,6 +9,4 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ ops: Promise[]; closureVars?: Record; encryptionKey?: CryptoKey; - lockCounter: number; - preAcquiredLocks?: Record; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts deleted file mode 100644 index b537cc7503..0000000000 --- a/packages/core/src/step/lock.ts +++ /dev/null @@ -1,103 +0,0 @@ -import type { LimitAcquireRequest, LimitLease, World } from '@workflow/world'; -import type { LockHandle, LockOptions } from '../lock.js'; -import { contextStorage } from './context-storage.js'; - -export class StepLockBlockedError extends Error { - retryAfterMs?: number; - request: LimitAcquireRequest; - - constructor(request: LimitAcquireRequest, retryAfterMs?: number) { - super('Step lock blocked'); - this.name = 'StepLockBlockedError'; - this.retryAfterMs = retryAfterMs; - this.request = request; - } - - static is(value: unknown): value is StepLockBlockedError { - return value instanceof StepLockBlockedError; - } -} - -function createStepLockHandle(lease: LimitLease, world: World): LockHandle { - let currentLease = lease; - let disposed = 
false; - - const dispose = async () => { - if (disposed) return; - disposed = true; - await world.limits.release({ - leaseId: currentLease.leaseId, - key: currentLease.key, - holderId: currentLease.holderId, - }); - }; - - const heartbeat = async (ttlMs?: number) => { - currentLease = await world.limits.heartbeat({ - leaseId: currentLease.leaseId, - ttlMs, - }); - }; - - return { - get leaseId() { - return currentLease.leaseId; - }, - get key() { - return currentLease.key; - }, - get holderId() { - return currentLease.holderId; - }, - get expiresAt() { - return currentLease.expiresAt; - }, - dispose, - heartbeat, - [Symbol.asyncDispose]: dispose, - }; -} - -export function createStepLock(world: World) { - return async function lockInStep(options: LockOptions): Promise { - const store = contextStorage.getStore(); - if (!store) { - throw new Error( - '`lock()` can only be called inside a workflow or step function' - ); - } - - const lockIndex = store.lockCounter++; - const holderId = `stplock_${store.workflowMetadata.workflowRunId}:${store.stepMetadata.stepId}:${lockIndex}`; - const definition = { - concurrency: options.concurrency, - rate: options.rate, - }; - const request = { - key: options.key, - holderId, - definition, - leaseTtlMs: options.leaseTtlMs, - } satisfies LimitAcquireRequest; - - const preAcquiredLease = store.preAcquiredLocks?.[holderId]; - if (preAcquiredLease) { - if (store.preAcquiredLocks) { - delete store.preAcquiredLocks[holderId]; - } - return createStepLockHandle(preAcquiredLease, world); - } - - const result = await world.limits.acquire(request); - - if (result.status === 'acquired') { - return createStepLockHandle(result.lease, world); - } - - /* - Steps do not sit inside user code polling for a lease. - The runtime catches this and re-queues the step attempt at the boundary. 
- */ - throw new StepLockBlockedError(request, result.retryAfterMs); - }; -} diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index cd9616b17e..790f2fe46f 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -2,7 +2,6 @@ export const WORKFLOW_USE_STEP = Symbol.for('WORKFLOW_USE_STEP'); export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); export const WORKFLOW_LOCK = Symbol.for('WORKFLOW_LOCK'); -export const STEP_LOCK = Symbol.for('STEP_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); export const WORKFLOW_GET_STREAM_ID = Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index ece1823196..01883a0fee 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -139,6 +139,7 @@ export async function runWorkflow( globalThis: vmGlobalThis, onWorkflowError: workflowDiscontinuation.reject, eventsConsumer, + nextLockIndex: 0, generateUlid: () => ulid(+startedAt), generateNanoid, invocationsQueue: new Map(), diff --git a/packages/core/src/workflow/hook.test.ts b/packages/core/src/workflow/hook.test.ts index baa108cb03..ead1169ea3 100644 --- a/packages/core/src/workflow/hook.test.ts +++ b/packages/core/src/workflow/hook.test.ts @@ -28,6 +28,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index f0905e06e9..2010c1fd61 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -1,18 +1,19 @@ import 
{ WorkflowSuspension } from '../global.js'; import type { LockHandle, LockOptions } from '../lock.js'; +import { createLockWakeCorrelationId, type LimitLease } from '@workflow/world'; import { scheduleWhenIdle, type WorkflowOrchestratorContext, } from '../private.js'; import { getWorld } from '../runtime/world.js'; +const DEFAULT_LOCK_LEASE_TTL_MS = 24 * 60 * 60 * 1000; + function createLockHandle( - lease: { - leaseId: string; - key: string; - holderId: string; - expiresAt?: Date; - }, + lease: Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' + >, ctx: WorkflowOrchestratorContext ): LockHandle { let currentLease = lease; @@ -24,7 +25,7 @@ function createLockHandle( await getWorld().limits.release({ leaseId: currentLease.leaseId, key: currentLease.key, - holderId: currentLease.holderId, + lockId: currentLease.lockId, }); }; @@ -42,8 +43,14 @@ function createLockHandle( get key() { return currentLease.key; }, - get holderId() { - return currentLease.holderId; + get lockId() { + return currentLease.lockId; + }, + get runId() { + return currentLease.runId; + }, + get lockIndex() { + return currentLease.lockIndex; }, get expiresAt() { return currentLease.expiresAt; @@ -68,8 +75,8 @@ export function createLock(ctx: WorkflowOrchestratorContext) { wait event. Postgres can wake this correlation id early when the waiter is promoted, and the delayed replay is just a fallback. 
*/ - const correlationId = `wflock_wait_${ctx.generateUlid()}`; - const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; + const lockIndex = ctx.nextLockIndex++; + const correlationId = createLockWakeCorrelationId(ctx.runId, lockIndex); const definition = { concurrency: options.concurrency, rate: options.rate, @@ -78,9 +85,10 @@ export function createLock(ctx: WorkflowOrchestratorContext) { while (true) { const result = await getWorld().limits.acquire({ key: options.key, - holderId, + runId: ctx.runId, + lockIndex, definition, - leaseTtlMs: options.leaseTtlMs, + leaseTtlMs: options.leaseTtlMs ?? DEFAULT_LOCK_LEASE_TTL_MS, }); if (result.status === 'acquired') { diff --git a/packages/core/src/workflow/sleep.test.ts b/packages/core/src/workflow/sleep.test.ts index 8b77ca2c76..b6853c4405 100644 --- a/packages/core/src/workflow/sleep.test.ts +++ b/packages/core/src/workflow/sleep.test.ts @@ -32,6 +32,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { }, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 6428422dbb..8b301c2d00 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -21,9 +21,9 @@ createLimitsContractSuite('local world limits', async () => { keys?: Record< string, { - leases?: { holderId: string }[]; - waiters?: { holderId: string }[]; - tokens?: { holderId: string }[]; + leases?: { lockId: string }[]; + waiters?: { lockId: string }[]; + tokens?: { lockId: string }[]; } >; }; @@ -43,10 +43,10 @@ createLimitsContractSuite('local world limits', async () => { const keyState = raw.keys?.[key]; return { - leaseHolderIds: keyState?.leases?.map((lease) => lease.holderId) ?? 
[], + leaseHolderIds: keyState?.leases?.map((lease) => lease.lockId) ?? [], waiterHolderIds: - keyState?.waiters?.map((waiter) => waiter.holderId) ?? [], - tokenHolderIds: keyState?.tokens?.map((token) => token.holderId) ?? [], + keyState?.waiters?.map((waiter) => waiter.lockId) ?? [], + tokenHolderIds: keyState?.tokens?.map((token) => token.lockId) ?? [], }; }, close: async () => { diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 081b95f63e..896b9ad3d6 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,12 +1,9 @@ import path from 'node:path'; import { WorkflowWorldError } from '@workflow/errors'; -import type { - Queue, - Storage, - WorkflowRunWithoutData, - StepWithoutData, -} from '@workflow/world'; +import type { Queue, Storage, WorkflowRunWithoutData } from '@workflow/world'; import { + createLockId, + createLockWakeCorrelationId, LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, @@ -14,6 +11,7 @@ import { LimitLeaseSchema, LimitReleaseRequestSchema, type Limits, + parseLockId, } from '@workflow/world'; import { z } from 'zod'; import { readJSON, writeJSON } from './fs.js'; @@ -21,14 +19,16 @@ import { monotonicUlid } from './storage/helpers.js'; const LimitTokenSchema = z.object({ tokenId: z.string(), - holderId: z.string(), + lockId: z.string(), acquiredAt: z.coerce.date(), expiresAt: z.coerce.date(), }); const LimitWaiterSchema = z.object({ waiterId: z.string(), - holderId: z.string(), + lockId: z.string(), + runId: z.string(), + lockIndex: z.number().int().nonnegative(), createdAt: z.coerce.date(), leaseTtlMs: z.number().int().positive().optional(), concurrencyMax: z.number().int().positive().nullable(), @@ -55,15 +55,10 @@ type LimitsState = z.infer; type HolderTarget = | { - kind: 'workflow'; + kind: 'lock'; runId: string; correlationId: string; } - | { - kind: 'step'; - runId: string; - stepId: string; - } | { kind: 'opaque'; }; @@ -71,7 
+66,7 @@ type HolderTarget = export interface LocalLimitsOptions { tag?: string; queue?: Pick; - storage?: Pick; + storage?: Pick; } const EMPTY_STATE: LimitsState = { @@ -164,7 +159,8 @@ function getRetryAfterMs( function createLease( key: string, - holderId: string, + runId: string, + lockIndex: number, definition: LimitLease['definition'], acquiredAt: Date, leaseTtlMs?: number @@ -172,7 +168,9 @@ function createLease( return { leaseId: `lmt_${monotonicUlid()}`, key, - holderId, + lockId: createLockId(runId, lockIndex), + runId, + lockIndex, acquiredAt, expiresAt: leaseTtlMs !== undefined @@ -184,31 +182,29 @@ function createLease( function insertToken( keyState: KeyState, - holderId: string, + lockId: string, acquiredAt: Date, periodMs: number ) { keyState.tokens.push({ tokenId: `lmttok_${monotonicUlid()}`, - holderId, + lockId, acquiredAt, expiresAt: new Date(acquiredAt.getTime() + periodMs), }); } -function parseHolderId(holderId: string): HolderTarget { - if (holderId.startsWith('wflock_')) { - const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); - if (runId && correlationId) { - return { kind: 'workflow', runId, correlationId }; - } - } - - if (holderId.startsWith('stplock_')) { - const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); - if (runId && stepId) { - return { kind: 'step', runId, stepId }; - } +function parseHolderId(lockId: string): HolderTarget { + const parsedLockId = parseLockId(lockId); + if (parsedLockId) { + return { + kind: 'lock', + runId: parsedLockId.runId, + correlationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; } return { kind: 'opaque' }; @@ -218,14 +214,6 @@ function isTerminalRun(run: WorkflowRunWithoutData | undefined) { return !run || ['completed', 'failed', 'cancelled'].includes(run.status); } -function isTerminalStep(step: StepWithoutData | undefined) { - return !step || ['completed', 'failed', 'cancelled'].includes(step.status); -} - -function 
toMillis(value: Date | undefined): number | undefined { - return value ? value.getTime() : undefined; -} - function deleteEmptyKey(state: LimitsState, key: string) { const keyState = state.keys[key]; if (!keyState) return; @@ -277,35 +265,14 @@ export function createLimits( } }; - const getStep = async ( - runId: string, - stepId: string - ): Promise => { - try { - return await options?.storage?.steps.get(runId, stepId, { - resolveData: 'none', - }); - } catch { - return undefined; - } - }; - const isHolderLive = async (holderId: string): Promise => { const target = parseHolderId(holderId); if (target.kind === 'opaque' || !options?.storage) { return true; } - if (target.kind === 'workflow') { - const run = await getRun(target.runId); - return !isTerminalRun(run); - } - - const [run, step] = await Promise.all([ - getRun(target.runId), - getStep(target.runId, target.stepId), - ]); - return !isTerminalRun(run) && !isTerminalStep(step); + const run = await getRun(target.runId); + return !isTerminalRun(run); }; const queueWakeForHolder = async (holderId: string): Promise => { @@ -315,40 +282,17 @@ export function createLimits( } try { - if (target.kind === 'workflow') { - const run = await getRun(target.runId); - if (isTerminalRun(run) || !run) return; - - await options.queue.queue( - `__wkf_workflow_${run.workflowName}`, - { - runId: target.runId, - requestedAt: new Date(), - }, - { - idempotencyKey: target.correlationId, - } - ); - return; - } - - const [run, step] = await Promise.all([ - getRun(target.runId), - getStep(target.runId, target.stepId), - ]); - if (isTerminalRun(run) || isTerminalStep(step) || !run || !step) return; + const run = await getRun(target.runId); + if (isTerminalRun(run) || !run) return; await options.queue.queue( - `__wkf_step_${step.stepName}`, + `__wkf_workflow_${run.workflowName}`, { - workflowName: run.workflowName, - workflowRunId: target.runId, - workflowStartedAt: toMillis(run.startedAt) ?? 
Date.now(), - stepId: target.stepId, + runId: target.runId, requestedAt: new Date(), }, { - idempotencyKey: target.stepId, + idempotencyKey: target.correlationId, } ); } catch (error) { @@ -369,7 +313,7 @@ export function createLimits( for (let index = 0; index < promotedKeyState.waiters.length; index++) { const waiter = promotedKeyState.waiters[index]; - if (!(await isHolderLive(waiter.holderId))) { + if (!(await isHolderLive(waiter.lockId))) { continue; } @@ -405,7 +349,8 @@ export function createLimits( promotedKeyState.leases.push( createLease( key, - waiter.holderId, + waiter.runId, + waiter.lockIndex, definition, acquiredAt, waiter.leaseTtlMs @@ -416,14 +361,14 @@ export function createLimits( if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { insertToken( promotedKeyState, - waiter.holderId, + waiter.lockId, acquiredAt, waiter.ratePeriodMs ); activeTokens += 1; } - wakeHolders.push(waiter.holderId); + wakeHolders.push(waiter.lockId); } promotedKeyState.waiters = remainingWaiters; @@ -433,6 +378,7 @@ export function createLimits( return { async acquire(request) { const parsed = LimitAcquireRequestSchema.parse(request); + const lockId = createLockId(parsed.runId, parsed.lockIndex); return withStateLock(async (): Promise => { const state = cloneState(await readState()); @@ -451,7 +397,7 @@ export function createLimits( state.keys[parsed.key] = keyState; const existingLease = keyState.leases.find( - (lease) => lease.holderId === parsed.holderId + (lease) => lease.lockId === lockId ); if (existingLease) { await writeState(state); @@ -469,7 +415,7 @@ export function createLimits( parsed.definition.rate !== undefined && keyState.tokens.length >= parsed.definition.rate.count; const existingWaiter = keyState.waiters.find( - (waiter) => waiter.holderId === parsed.holderId + (waiter) => waiter.lockId === lockId ); if ( @@ -481,7 +427,9 @@ export function createLimits( if (!existingWaiter) { keyState.waiters.push({ waiterId: `lmtwait_${monotonicUlid()}`, - 
holderId: parsed.holderId, + lockId, + runId: parsed.runId, + lockIndex: parsed.lockIndex, createdAt: new Date(), leaseTtlMs: parsed.leaseTtlMs, concurrencyMax: parsed.definition.concurrency?.max ?? null, @@ -508,7 +456,8 @@ export function createLimits( const acquiredAt = new Date(); const lease = createLease( parsed.key, - parsed.holderId, + parsed.runId, + parsed.lockIndex, parsed.definition, acquiredAt, parsed.leaseTtlMs @@ -519,7 +468,7 @@ export function createLimits( if (parsed.definition.rate) { insertToken( keyState, - parsed.holderId, + lockId, acquiredAt, parsed.definition.rate.periodMs ); @@ -549,7 +498,7 @@ export function createLimits( keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; if (parsed.key && lease.key !== parsed.key) return true; - if (parsed.holderId && lease.holderId !== parsed.holderId) { + if (parsed.lockId && lease.lockId !== parsed.lockId) { return true; } return false; diff --git a/packages/world-local/src/storage.test.ts b/packages/world-local/src/storage.test.ts index 6bfef563e6..7f36478179 100644 --- a/packages/world-local/src/storage.test.ts +++ b/packages/world-local/src/storage.test.ts @@ -2449,83 +2449,6 @@ describe('Storage', () => { }); }); - describe('step_deferred event handling', () => { - let testRunId: string; - - beforeEach(async () => { - const run = await createRun(storage, { - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - input: new Uint8Array(), - }); - testRunId = run.runId; - }); - - it('should roll back the first blocked attempt without recording an error', async () => { - await createStep(storage, testRunId, { - stepId: 'step_deferred_1', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep(storage, testRunId, 'step_deferred_1', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await storage.events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 
'step_deferred_1', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 0, - startedAt: undefined, - retryAfter, - error: undefined, - }); - }); - - it('should preserve the original startedAt after a prior real attempt', async () => { - await createStep(storage, testRunId, { - stepId: 'step_deferred_2', - stepName: 'test-step', - input: new Uint8Array(), - }); - - const started1 = await updateStep( - storage, - testRunId, - 'step_deferred_2', - 'step_started' - ); - await storage.events.create(testRunId, { - eventType: 'step_retrying', - correlationId: 'step_deferred_2', - eventData: { error: 'Temporary failure' }, - }); - await updateStep(storage, testRunId, 'step_deferred_2', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await storage.events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_2', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 1, - retryAfter, - error: undefined, - }); - expect(result.step?.startedAt).toEqual(started1.startedAt); - }); - }); - describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(storage, { diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index 2bd5025696..c4d0497e83 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -108,15 +108,11 @@ export function createEventsStorage( ['completed', 'failed', 'cancelled'].includes(status); // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate + // Skip run validation for step_completed and step_retrying - they only operate // on running 
steps, and running steps are always allowed to modify regardless // of run state. This optimization saves filesystem reads per step event. let currentRun: WorkflowRun | null = null; - const skipRunValidationEvents = [ - 'step_completed', - 'step_deferred', - 'step_retrying', - ]; + const skipRunValidationEvents = ['step_completed', 'step_retrying']; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -134,7 +130,7 @@ export function createEventsStorage( // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -223,7 +219,6 @@ export function createEventsStorage( 'step_started', 'step_completed', 'step_failed', - 'step_deferred', 'step_retrying', ]; if (stepEvents.includes(data.eventType) && data.correlationId) { @@ -605,27 +600,6 @@ export function createEventsStorage( { overwrite: true } ); } - } else if (data.eventType === 'step_deferred' && 'eventData' in data) { - // step_deferred: returns the step to pending without recording a failure - if (validatedStep) { - const stepCompositeKey = `${effectiveRunId}-${data.correlationId}`; - const rolledBackAttempt = Math.max(0, validatedStep.attempt - 1); - step = { - ...validatedStep, - status: 'pending', - attempt: rolledBackAttempt, - startedAt: - rolledBackAttempt === 0 ? 
undefined : validatedStep.startedAt, - error: undefined, - retryAfter: data.eventData.retryAfter, - updatedAt: now, - }; - await writeJSON( - taggedPath(basedir, 'steps', stepCompositeKey, tag), - step, - { overwrite: true } - ); - } } else if (data.eventType === 'step_retrying' && 'eventData' in data) { // step_retrying: Sets status back to 'pending', records error // Reuse validatedStep from validation (already read above) diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index a96cf3b680..1b48974de8 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -129,7 +129,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling - Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO order per key -- Cancelled workflow and failed/completed step waiters are pruned before promotion +- Cancelled workflow waiters are pruned before promotion - Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index e7c8193788..44ab39f16e 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -14,16 +14,16 @@ if (process.platform === 'win32') { let db: Awaited< ReturnType >; - let queue: ReturnType; beforeAll(async () => { const { createPostgresTestDb } = await import('../test/test-db.js'); db = await createPostgresTestDb(); - queue = createQueue( + const queue = createQueue( { connectionString: db.connectionString, queueConcurrency: 1 }, db.sql ); await queue.start(); + await queue.close(); }, 120_000); beforeEach(async () => { @@ -31,7 +31,6 @@ if 
(process.platform === 'win32') { }); afterAll(async () => { - await queue?.close(); await db?.close(); }); @@ -48,20 +47,20 @@ if (process.platform === 'win32') { }, inspectKeyState: async (key) => { const [leases, waiters, tokens] = await Promise.all([ - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_leases where limit_key = ${key} order by holder_id asc `, - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_waiters where limit_key = ${key} order by created_at asc, holder_id asc `, - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_tokens where limit_key = ${key} order by acquired_at asc, holder_id asc @@ -69,9 +68,9 @@ if (process.platform === 'win32') { ]); return { - leaseHolderIds: leases.map((row) => row.holderId), - waiterHolderIds: waiters.map((row) => row.holderId), - tokenHolderIds: tokens.map((row) => row.holderId), + leaseHolderIds: leases.map((row) => row.lockId), + waiterHolderIds: waiters.map((row) => row.lockId), + tokenHolderIds: tokens.map((row) => row.lockId), }; }, }; diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index b83680a2f1..22220ff8e3 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -2,6 +2,8 @@ import { JsonTransport } from '@vercel/queue'; import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; import { WorkflowWorldError } from '@workflow/errors'; import { + createLockId, + createLockWakeCorrelationId, LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, @@ -9,6 +11,7 @@ import { LimitReleaseRequestSchema, type Limits, MessageId, + parseLockId, } from '@workflow/world'; import { 
monotonicFactory } from 'ulid'; import type { PostgresWorldConfig } from './config.js'; @@ -23,21 +26,15 @@ type RunRow = Pick< typeof Schema.runs.$inferSelect, 'workflowName' | 'startedAt' | 'status' >; -type StepRow = Pick; type Tx = Parameters[0]>[0]; type Db = Drizzle | Tx; type HolderTarget = | { - kind: 'workflow'; + kind: 'lock'; runId: string; correlationId: string; } - | { - kind: 'step'; - runId: string; - stepId: string; - } | { kind: 'opaque'; }; @@ -49,7 +46,6 @@ function getQueues(config: PostgresWorldConfig) { const prefix = config.jobPrefix || 'workflow_'; return { workflow: `${prefix}flows`, - step: `${prefix}steps`, } as const; } @@ -72,29 +68,30 @@ function toMillis(value: Date | string | null | undefined): number | undefined { Holder ids double as wake-up hints. When a waiter is promoted, we decode the holder id to decide which queue to poke. */ -function parseHolderId(holderId: string): HolderTarget { - if (holderId.startsWith('wflock_')) { - const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); - if (runId && correlationId) { - return { kind: 'workflow', runId, correlationId }; - } - } - - if (holderId.startsWith('stplock_')) { - const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); - if (runId && stepId) { - return { kind: 'step', runId, stepId }; - } +function parseHolderId(lockId: string): HolderTarget { + const parsedLockId = parseLockId(lockId); + if (parsedLockId) { + return { + kind: 'lock', + runId: parsedLockId.runId, + correlationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; } return { kind: 'opaque' }; } function toLease(row: LeaseRow): LimitLease { + const parsedLockId = parseLockId(row.holderId); return { leaseId: row.leaseId, key: row.limitKey, - holderId: row.holderId, + lockId: row.holderId, + runId: parsedLockId?.runId ?? row.holderId, + lockIndex: parsedLockId?.lockIndex ?? 
0, acquiredAt: toDate(row.acquiredAt)!, expiresAt: toDate(row.expiresAt), definition: { @@ -183,45 +180,6 @@ async function queueWorkflowWake( `); } -async function queueStepWake( - tx: Db, - config: PostgresWorldConfig, - step: { - stepId: string; - stepName: string; - workflowName: string; - workflowStartedAt: number; - workflowRunId: string; - } -) { - const messageId = MessageId.parse(`msg_${generateId()}`); - const payload = MessageData.encode({ - id: step.stepName, - data: Buffer.from( - transport.serialize({ - workflowName: step.workflowName, - workflowRunId: step.workflowRunId, - workflowStartedAt: step.workflowStartedAt, - stepId: step.stepId, - requestedAt: new Date(), - }) - ), - attempt: 1, - idempotencyKey: step.stepId, - messageId, - }); - - await tx.execute(sql` - select graphile_worker.add_job( - ${getQueues(config).step}::text, - payload := ${JSON.stringify(payload)}::json, - max_attempts := 3, - job_key := ${step.stepId}::text, - job_key_mode := 'replace' - ) - `); -} - async function queueWakeForHolder( tx: Db, config: PostgresWorldConfig, @@ -229,50 +187,13 @@ async function queueWakeForHolder( ) { /* Limit state is durable in Postgres, but wake-ups still need a runtime target. - If the run or step is already terminal, there is nothing left to resume. + If the workflow is already terminal, there is nothing left to resume. 
*/ const target = parseHolderId(holderId); if (target.kind === 'opaque') { return; } - if (target.kind === 'workflow') { - const [run] = (await tx - .select({ - workflowName: Schema.runs.workflowName, - startedAt: Schema.runs.startedAt, - status: Schema.runs.status, - }) - .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) - .limit(1)) as RunRow[]; - - if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { - return; - } - - await queueWorkflowWake( - tx, - config, - target.runId, - run.workflowName, - target.correlationId - ); - return; - } - - const [step] = (await tx - .select({ - stepName: Schema.steps.stepName, - status: Schema.steps.status, - }) - .from(Schema.steps) - .where(eq(Schema.steps.stepId, target.stepId)) - .limit(1)) as StepRow[]; - if (!step || ['completed', 'failed'].includes(step.status)) { - return; - } - const [run] = (await tx .select({ workflowName: Schema.runs.workflowName, @@ -286,13 +207,13 @@ async function queueWakeForHolder( return; } - await queueStepWake(tx, config, { - stepId: target.stepId, - stepName: step.stepName, - workflowName: run.workflowName, - workflowStartedAt: toMillis(run.startedAt) ?? 
Date.now(), - workflowRunId: target.runId, - }); + await queueWorkflowWake( + tx, + config, + target.runId, + run.workflowName, + target.correlationId + ); } async function pruneExpired(tx: Db, key: string): Promise { @@ -373,29 +294,6 @@ async function isHolderLive(tx: Db, holderId: string): Promise { return true; } - if (target.kind === 'workflow') { - const [run] = (await tx - .select({ - status: Schema.runs.status, - }) - .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) - .limit(1)) as Pick[]; - - return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); - } - - const [step] = (await tx - .select({ - status: Schema.steps.status, - }) - .from(Schema.steps) - .where(eq(Schema.steps.stepId, target.stepId)) - .limit(1)) as Pick[]; - if (!step || ['completed', 'failed'].includes(step.status)) { - return false; - } - const [run] = (await tx .select({ status: Schema.runs.status, @@ -502,8 +400,9 @@ export function createLimits( await promoteWaiters(tx, config, parsed.key); const state = await getActiveState(tx, parsed.key); + const lockId = createLockId(parsed.runId, parsed.lockIndex); const existingLease = state.leases.find( - (lease) => lease.holderId === parsed.holderId + (lease) => lease.holderId === lockId ); if (existingLease) { return { @@ -513,7 +412,7 @@ export function createLimits( } const existingWaiter = state.waiters.find( - (waiter) => waiter.holderId === parsed.holderId + (waiter) => waiter.holderId === lockId ); // If there are already waiters for this key and holder no need to queue a new waiter. if (existingWaiter) { @@ -553,7 +452,7 @@ export function createLimits( .values({ leaseId: `lmt_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, acquiredAt: new Date(), expiresAt, concurrencyMax: parsed.definition.concurrency?.max ?? 
null, @@ -566,7 +465,7 @@ export function createLimits( await tx.insert(Schema.limitTokens).values({ tokenId: `lmttok_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, acquiredAt: new Date(), expiresAt: new Date(Date.now() + parsed.definition.rate.periodMs), }); @@ -584,7 +483,7 @@ export function createLimits( .values({ waiterId: `lmtwait_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, createdAt: new Date(), leaseTtlMs: parsed.leaseTtlMs ?? null, concurrencyMax: parsed.definition.concurrency?.max ?? null, @@ -630,8 +529,8 @@ export function createLimits( if (parsed.key) { where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; } - if (parsed.holderId) { - where = and(where, eq(Schema.limitLeases.holderId, parsed.holderId))!; + if (parsed.lockId) { + where = and(where, eq(Schema.limitLeases.holderId, parsed.lockId))!; } const [deleted] = await tx diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 5fa5adac83..882f7ec7d8 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -358,16 +358,12 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // ============================================================ // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate + // Skip run validation for step_completed and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves database queries per step event. 
let currentRun: { status: string; specVersion: number | null } | null = null; - const skipRunValidationEvents = [ - 'step_completed', - 'step_deferred', - 'step_retrying', - ]; + const skipRunValidationEvents = ['step_completed', 'step_retrying']; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -383,7 +379,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -478,11 +474,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { startedAt: Date | null; retryAfter: Date | null; } | null = null; - const stepEventsNeedingValidation = [ - 'step_started', - 'step_deferred', - 'step_retrying', - ]; + const stepEventsNeedingValidation = ['step_started', 'step_retrying']; if ( stepEventsNeedingValidation.includes(data.eventType) && data.correlationId @@ -913,49 +905,6 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } - // Handle step_deferred event: returns the step to pending without recording a failure - if (data.eventType === 'step_deferred') { - const eventData = (data as any).eventData as { - retryAfter?: Date; - }; - - const [stepValue] = await drizzle - .update(Schema.steps) - .set({ - status: 'pending', - attempt: sql`GREATEST(${Schema.steps.attempt} - 1, 0)`, - startedAt: sql`CASE WHEN ${Schema.steps.attempt} <= 1 THEN NULL ELSE ${Schema.steps.startedAt} END`, - error: null, - retryAfter: eventData.retryAfter, - }) - .where( - and( - 
eq(Schema.steps.runId, effectiveRunId), - eq(Schema.steps.stepId, data.correlationId!), - notInArray(Schema.steps.status, terminalStepStatuses) - ) - ) - .returning(); - if (stepValue) { - step = deserializeStepError(compact(stepValue)); - } else { - const [existing] = await getStepForValidation.execute({ - runId: effectiveRunId, - stepId: data.correlationId!, - }); - if (!existing) { - throw new WorkflowWorldError( - `Step "${data.correlationId}" not found` - ); - } - if (isStepTerminal(existing.status)) { - throw new EntityConflictError( - `Cannot modify step in terminal state "${existing.status}"` - ); - } - } - } - // Handle step_retrying event: sets status back to 'pending', records error // Uses conditional UPDATE to prevent retrying an already-terminal step. if (data.eventType === 'step_retrying') { diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index 8b2328c4c1..3023790d65 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1808,122 +1808,6 @@ describe('Storage (Postgres integration)', () => { }); }); - describe('step_deferred event handling', () => { - let testRunId: string; - - beforeEach(async () => { - const run = await createRun(events, { - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - input: new Uint8Array(), - }); - testRunId = run.runId; - }); - - it('should roll back the first blocked attempt without recording an error', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_1', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep(events, testRunId, 'step_deferred_1', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_1', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 0, 
- startedAt: undefined, - retryAfter, - error: undefined, - }); - }); - - it('should preserve the original startedAt after a prior real attempt', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_2', - stepName: 'test-step', - input: new Uint8Array(), - }); - - const started1 = await updateStep( - events, - testRunId, - 'step_deferred_2', - 'step_started' - ); - await events.create(testRunId, { - eventType: 'step_retrying', - correlationId: 'step_deferred_2', - eventData: { error: 'Temporary failure' }, - }); - await updateStep(events, testRunId, 'step_deferred_2', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_2', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 1, - retryAfter, - error: undefined, - }); - expect(result.step?.startedAt).toEqual(started1.startedAt); - }); - - it('throws WorkflowWorldError when step_deferred targets a missing step', async () => { - await expect( - events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_missing_deferred', - eventData: { - retryAfter: new Date(Date.now() + 5_000), - }, - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - }); - - it('throws EntityConflictError when step_deferred targets a terminal step', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_terminal', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep( - events, - testRunId, - 'step_deferred_terminal', - 'step_failed', - { - error: 'already failed', - } - ); - - await expect( - events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_terminal', - eventData: { - retryAfter: new Date(Date.now() + 5_000), - }, - }) - ).rejects.toBeInstanceOf(EntityConflictError); - }); - }); - describe('run cancellation with in-flight 
entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(events, { diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 515bbf7fc4..f36c33c410 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -1,6 +1,8 @@ import { setTimeout as sleep } from 'node:timers/promises'; import { SPEC_VERSION_CURRENT, + type LimitDefinition, + type LimitLease, type Limits, type Storage, } from '@workflow/world'; @@ -17,6 +19,16 @@ export interface LimitsHarness { close?: () => Promise; } +interface LockOwner { + lockId: string; + runId: string; + lockIndex: number; +} + +function createTestLockId(runId: string, lockIndex: number) { + return `${runId}:${lockIndex}`; +} + async function createRun( storage: Pick, workflowName: string @@ -36,24 +48,49 @@ async function createRun( return result.run; } -async function createStep( - storage: Pick, - runId: string, - stepId: string -) { - const result = await storage.events.create(runId, { - eventType: 'step_created', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - stepName: 'test-step', - input: [], - }, - }); - if (!result.step) { - throw new Error('expected step'); +function requireEventsStorage( + storage: LimitsHarness['storage'] +): Pick { + if (!storage) { + throw new Error('storage.events is required for limits tests'); } - return result.step; + return storage; +} + +async function createLockOwner( + storage: LimitsHarness['storage'], + workflowName: string, + lockIndex = 0 +): Promise { + const run = await createRun(requireEventsStorage(storage), workflowName); + return { + lockId: createTestLockId(run.runId, lockIndex), + runId: run.runId, + lockIndex, + }; +} + +function acquireRequest( + owner: LockOwner, + key: string, + definition: LimitDefinition, + leaseTtlMs?: number +) { + return { + key, + runId: 
owner.runId, + lockIndex: owner.lockIndex, + definition, + ...(leaseTtlMs !== undefined ? { leaseTtlMs } : {}), + }; +} + +function releaseRequest(lease: LimitLease) { + return { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }; } export function createLimitsContractSuite( @@ -80,39 +117,43 @@ export function createLimitsContractSuite( it('enforces per-key concurrency limits', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'blocked', reason: 'concurrency', }); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -122,19 +163,25 @@ 
export function createLimitsContractSuite( it('isolates unrelated keys at the raw limits layer', async () => { const harness = await createHarness(); try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); const [first, second] = await Promise.all([ - harness.limits.acquire({ - key: 'workflow:user:a', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }), - harness.limits.acquire({ - key: 'workflow:user:b', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }), + harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:a', + { concurrency: { max: 1 } }, + 1_000 + ) + ), + harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:b', + { concurrency: { max: 1 } }, + 1_000 + ) + ), ]); expect(first.status).toBe('acquired'); @@ -147,14 +194,21 @@ export function createLimitsContractSuite( it('serializes concurrent acquires for the same key', async () => { const harness = await createHarness(); try { - const results = await Promise.all( + const owners = await Promise.all( Array.from({ length: 12 }, (_, index) => - harness.limits.acquire({ - key: 'workflow:user:concurrent', - holderId: `holder-${index}`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }) + createLockOwner(harness.storage, `holder-${index}`) + ) + ); + const results = await Promise.all( + owners.map((owner) => + harness.limits.acquire( + acquireRequest( + owner, + 'workflow:user:concurrent', + { concurrency: { max: 1 } }, + 1_000 + ) + ) ) ); @@ -174,48 +228,55 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const periodMs = 200; - const first = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-a', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 
'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const second = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-b', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); if (second.status !== 'blocked') throw new Error('expected blocked'); expect(second.reason).toBe('rate'); expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); - let third = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-c', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + let third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); const deadline = Date.now() + periodMs + 1_000; while (third.status === 'blocked' && Date.now() < deadline) { await sleep(Math.max(25, third.retryAfterMs ?? 
0) + 50); - third = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-c', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); } expect(third.status).toBe('acquired'); } finally { @@ -227,49 +288,53 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const periodMs = 300; - const first = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-a', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'blocked', reason: 'concurrency_and_rate', }); if (second.status !== 'blocked') throw new Error('expected blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - 
rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(third).toMatchObject({ status: 'blocked', reason: 'rate', @@ -279,15 +344,17 @@ export function createLimitsContractSuite( const deadline = Date.now() + periodMs + 1_000; while (fourth.status === 'blocked' && Date.now() < deadline) { await sleep(Math.max(25, fourth.retryAfterMs ?? 0) + 50); - fourth = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + fourth = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); } expect(fourth.status).toBe('acquired'); @@ -299,36 +366,40 @@ export function createLimitsContractSuite( it('restores capacity immediately when a lease is released', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 
+ ) + ); expect(second.status).toBe('blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -338,12 +409,16 @@ export function createLimitsContractSuite( it('extends lease expiry when heartbeated', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:heartbeat', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 200, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 200 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); @@ -357,12 +432,14 @@ export function createLimitsContractSuite( first.lease.expiresAt?.getTime() ?? 
0 ); - const second = await harness.limits.acquire({ - key: 'workflow:user:heartbeat', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); } finally { await harness.close?.(); @@ -372,32 +449,40 @@ export function createLimitsContractSuite( it('reclaims expired leases without manual cleanup', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 250, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 250 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); await sleep(400); - const third = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -407,27 +492,32 @@ export function createLimitsContractSuite( 
it('reuses an existing lease for the same holder', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:reacquire', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:reacquire', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'acquired', lease: { leaseId: first.lease.leaseId, - holderId: first.lease.holderId, + lockId: first.lease.lockId, }, }); @@ -440,10 +530,10 @@ export function createLimitsContractSuite( 'workflow:user:reacquire' ); expect( - keyState.leaseHolderIds.filter((holderId) => holderId === 'holder-a') + keyState.leaseHolderIds.filter((lockId) => lockId === ownerA.lockId) ).toHaveLength(1); expect( - keyState.waiterHolderIds.filter((holderId) => holderId === 'holder-a') + keyState.waiterHolderIds.filter((lockId) => lockId === ownerA.lockId) ).toHaveLength(0); } finally { await harness.close?.(); @@ -453,68 +543,75 @@ export function createLimitsContractSuite( it('promotes waiters in FIFO order per key', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const 
ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const third = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + const third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); expect(third.status).toBe('blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const promoted = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const stillWaiting = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const promoted = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + const stillWaiting = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); 
expect(promoted.status).toBe('acquired'); expect(stillWaiting.status).toBe('blocked'); if (promoted.status !== 'acquired') throw new Error('expected waiter-b promotion'); - await harness.limits.release({ - leaseId: promoted.lease.leaseId, - holderId: promoted.lease.holderId, - key: promoted.lease.key, - }); + await harness.limits.release(releaseRequest(promoted.lease)); - const thirdPromoted = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const thirdPromoted = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(thirdPromoted.status).toBe('acquired'); } finally { @@ -544,126 +641,57 @@ export function createLimitsContractSuite( eventType: 'run_started', specVersion: SPEC_VERSION_CURRENT, }); - - const first = await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); + const liveOwner = { + lockId: createTestLockId(liveRun.runId, 0), + runId: liveRun.runId, + lockIndex: 0, + }; + const deadOwner = { + lockId: createTestLockId(deadRun.runId, 0), + runId: deadRun.runId, + lockIndex: 0, + }; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${deadRun.runId}:limitwait_dead`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${liveRun.runId}:limitwait_live`, - 
definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const promoted = await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${liveRun.runId}:limitwait_live`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - expect(promoted.status).toBe('acquired'); - } finally { - await harness.close?.(); - } - }); - - it('skips failed step waiters before promotion', async () => { - const harness = await createHarness(); - try { - if (!harness.storage) { - throw new Error('storage is required for step waiter liveness'); - } - - const deadRun = await createRun(harness.storage, 'dead-step-workflow'); - await harness.storage.events.create(deadRun.runId, { - eventType: 'run_started', - specVersion: SPEC_VERSION_CURRENT, - }); - const deadStep = await createStep( - harness.storage, - deadRun.runId, - 'step-dead' + await harness.limits.acquire( + acquireRequest( + deadOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) ); - await harness.storage.events.create(deadRun.runId, { - eventType: 'step_started', - specVersion: SPEC_VERSION_CURRENT, - correlationId: deadStep.stepId, - }); - await harness.storage.events.create(deadRun.runId, { - eventType: 'step_failed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: deadStep.stepId, - eventData: { - error: { name: 'Error', message: 'failed waiter' }, - }, - } as any); - - const liveRun = await createRun(harness.storage, 'live-step-workflow'); - await harness.storage.events.create(liveRun.runId, { - eventType: 'run_started', - specVersion: SPEC_VERSION_CURRENT, - }); - const liveStep = await createStep( - harness.storage, - liveRun.runId, - 'step-live' + await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) 
); - const first = await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') - throw new Error('expected acquisition'); - - await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${deadRun.runId}:${deadStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const promoted = await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); + const promoted = await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); expect(promoted.status).toBe('acquired'); } finally { @@ -675,30 +703,26 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const key = 'workflow:user:replay'; - const blockedHolderId = 'wflock_wrun_replay:corr_replay:holder_replay'; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const replayOwner = await createLockOwner( + harness.storage, + 'holder-replay' + ); + const blockedLockId = replayOwner.lockId; - const first = await harness.limits.acquire({ - key, - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const first = await harness.limits.acquire( + acquireRequest(ownerA, key, { concurrency: { max: 1 } }, 1_000) + ); 
expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const blockedA = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const blockedB = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const blockedA = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); + const blockedB = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); expect(blockedA.status).toBe('blocked'); expect(blockedB.status).toBe('blocked'); @@ -706,27 +730,20 @@ export function createLimitsContractSuite( const blockedState = await harness.inspectKeyState(key); expect( blockedState.waiterHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(1); expect( blockedState.leaseHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(0); - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const acquired = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const acquired = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); expect(acquired.status).toBe('acquired'); if (acquired.status !== 'acquired') throw new Error('expected replayed holder acquisition'); @@ -734,12 +751,12 @@ export function createLimitsContractSuite( const acquiredState = await harness.inspectKeyState(key); expect( acquiredState.waiterHolderIds.filter( - (holderId) => holderId === blockedHolderId + 
(lockId) => lockId === blockedLockId ) ).toHaveLength(0); expect( acquiredState.leaseHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(1); } finally { diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index 4627023ba9..807033e712 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -3,11 +3,11 @@ import { describe, expect, it } from 'vitest'; type WorkflowLockContentionResult = { workflowLockAcquiredAt: number; workflowLockReleasedAt: number; - stepLockAcquiredAt: number; - stepLockReleasedAt: number; + stepCallLockAcquiredAt: number; + stepCallLockReleasedAt: number; }; -type StepLockNoRetriesResult = { +type LockedStepCallResult = { label: string; key?: string; attempt: number; @@ -28,34 +28,24 @@ type WorkflowRateLimitResult = { periodMs: number; }; -type WorkflowLeakedLockResult = { +type LeakedLockResult = { label: string; key: string; leaseTtlMs: number; - workflowLockAcquiredAt: number; - workflowCompletedAt: number; -}; - -type StepLeakedLockResult = { - label: string; - key: string; - leaseTtlMs: number; - stepLockAcquiredAt: number; + lockAcquiredAt: number; workflowCompletedAt: number; }; -type MidStepLockResult = { - label: string; +type WorkflowMultiStepScopeResult = { key: string; - attempt: number; - lockAcquiredAt: number; - preLockEffects: number; - postLockEffects: number; - trace: string[]; + workflowLockAcquiredAt: number; + firstStepCompletedAt: number; + secondStepCompletedAt: number; + workflowLockReleasedAt: number; }; export interface LimitsRuntimeHarness { - runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ + runWorkflowWithScopedLocks(userId: string): Promise<{ workflowKey: string; dbKey: string; aiKey: string; @@ -65,12 +55,12 @@ export interface LimitsRuntimeHarness { userId: string, holdMs: number ): Promise<[WorkflowLockContentionResult, 
WorkflowLockContentionResult]>; - runStepLockNoRetriesContention( - userId: string, - holdMs: number - ): Promise< - [StepLockNoRetriesResult, StepLockNoRetriesResult, StepLockNoRetriesResult] - >; + runLockedStepCallContention( + key: string, + holdMs: number, + labelA?: string, + labelB?: string + ): Promise<[LockedStepCallResult, LockedStepCallResult]>; runWorkflowLockAcrossSuspension( userId: string, holdMs: number @@ -78,11 +68,11 @@ export interface LimitsRuntimeHarness { runWorkflowExpiredLeaseRecovery( userId: string, leaseTtlMs: number - ): Promise<[WorkflowLeakedLockResult, WorkflowOnlyLockResult]>; - runStepExpiredLeaseRecovery( + ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>; + runLeakedKeyExpiredLeaseRecovery( userId: string, leaseTtlMs: number - ): Promise<[StepLeakedLockResult, StepLockNoRetriesResult]>; + ): Promise<[LeakedLockResult, LockedStepCallResult]>; runWorkflowMixedLimitContention( userId: string, holdMs: number, @@ -107,16 +97,15 @@ export interface LimitsRuntimeHarness { ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; runIndependentStepKeys( holdMs: number - ): Promise<[StepLockNoRetriesResult, StepLockNoRetriesResult]>; + ): Promise<[LockedStepCallResult, LockedStepCallResult]>; runBlockedWaiterWithUnrelatedWorkflow(holdMs: number): Promise<{ holder: WorkflowOnlyLockResult; waiter: WorkflowOnlyLockResult; unrelated: WorkflowOnlyLockResult; }>; - runMidStepLockContract(holdMs: number): Promise<{ - holder: StepLockNoRetriesResult; - waiter: MidStepLockResult; - }>; + runWorkflowSingleLockAcrossMultipleSteps( + holdMs: number + ): Promise; } export function createLimitsRuntimeSuite( @@ -124,10 +113,10 @@ export function createLimitsRuntimeSuite( createHarness: () => Promise ) { describe(name, () => { - it('runs workflow and step locks end-to-end', async () => { + it('runs locks around individual step calls end-to-end', async () => { const harness = await createHarness(); const userId = 'shared-user'; - const result = 
await harness.runWorkflowWithWorkflowAndStepLocks(userId); + const result = await harness.runWorkflowWithScopedLocks(userId); expect(result).toMatchObject({ workflowKey: `workflow:user:${userId}`, @@ -137,7 +126,7 @@ export function createLimitsRuntimeSuite( }); }); - it('serializes workflow and step admission under contention', async () => { + it('serializes workflow locks and locks around step calls under contention', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runWorkflowLockContention( 'shared-user', @@ -147,12 +136,12 @@ export function createLimitsRuntimeSuite( expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( resultA.workflowLockReleasedAt ); - expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.stepLockReleasedAt + expect(resultB.stepCallLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepCallLockReleasedAt ); }); - it('wakes promoted workflow and step waiters promptly', async () => { + it('wakes promoted workflow and step-call lock waiters promptly', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runWorkflowLockContention( 'shared-user', @@ -163,28 +152,23 @@ export function createLimitsRuntimeSuite( resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt ).toBeLessThan(4_000); expect( - resultB.stepLockAcquiredAt - resultA.stepLockReleasedAt + resultB.stepCallLockAcquiredAt - resultA.stepCallLockReleasedAt ).toBeLessThan(4_000); }); - it('does not consume retries while blocked on a top-of-step lock', async () => { + it('can hold one workflow lock across multiple steps in the same scope', async () => { const harness = await createHarness(); - const [resultA, resultB, resultC] = - await harness.runStepLockNoRetriesContention('shared-user', 750); - const [firstResult, secondResult, thirdResult] = [ - resultA, - resultB, - resultC, - ].sort((left, right) => left.acquiredAt - right.acquiredAt); - - 
expect(resultA.attempt).toBe(1); - expect(resultB.attempt).toBe(1); - expect(resultC.attempt).toBe(1); - expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( - firstResult.releasedAt + const result = + await harness.runWorkflowSingleLockAcrossMultipleSteps(400); + + expect(result.firstStepCompletedAt).toBeGreaterThanOrEqual( + result.workflowLockAcquiredAt + ); + expect(result.secondStepCompletedAt).toBeGreaterThanOrEqual( + result.firstStepCompletedAt ); - expect(thirdResult.acquiredAt).toBeGreaterThanOrEqual( - secondResult.releasedAt + expect(result.workflowLockReleasedAt).toBeGreaterThanOrEqual( + result.secondStepCompletedAt ); }); @@ -203,7 +187,7 @@ export function createLimitsRuntimeSuite( ).toBeLessThan(4_000); }); - it('reclaims expired leaked workflow leases without manual cleanup', async () => { + it('reclaims expired leaked workflow locks without manual cleanup', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; const [resultA, resultB] = await harness.runWorkflowExpiredLeaseRecovery( @@ -215,15 +199,15 @@ export function createLimitsRuntimeSuite( resultA.workflowCompletedAt ); expect( - resultB.workflowLockAcquiredAt - resultA.workflowLockAcquiredAt + resultB.workflowLockAcquiredAt - resultA.lockAcquiredAt ).toBeGreaterThanOrEqual(leaseTtlMs - 100); }); - it('reclaims expired leaked step leases without manual cleanup', async () => { + it('reclaims expired leaked locks on arbitrary keys without manual cleanup', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; - const [resultA, resultB] = await harness.runStepExpiredLeaseRecovery( - 'expired-step-user', + const [resultA, resultB] = await harness.runLeakedKeyExpiredLeaseRecovery( + 'expired-key-user', leaseTtlMs ); @@ -231,7 +215,7 @@ export function createLimitsRuntimeSuite( resultA.workflowCompletedAt ); expect( - resultB.acquiredAt - resultA.stepLockAcquiredAt + resultB.acquiredAt - resultA.lockAcquiredAt ).toBeGreaterThanOrEqual(leaseTtlMs 
- 100); }); @@ -294,7 +278,7 @@ export function createLimitsRuntimeSuite( ); }); - it('does not block unrelated step keys', async () => { + it('does not block unrelated step-like keys', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runIndependentStepKeys(1_000); @@ -316,20 +300,5 @@ export function createLimitsRuntimeSuite( ); } ); - - it('replays a mid-step lock at the acquire boundary without duplicating post-lock effects', async () => { - const harness = await createHarness(); - const { holder, waiter } = await harness.runMidStepLockContract(1_500); - - expect(waiter.lockAcquiredAt).toBeGreaterThanOrEqual(holder.releasedAt); - expect(waiter.preLockEffects).toBe(2); - expect(waiter.postLockEffects).toBe(1); - expect(waiter.trace.map((event) => event.split(':')[0])).toEqual([ - 'pre', - 'pre', - 'lock', - 'post', - ]); - }); }); } diff --git a/packages/world-vercel/src/limits.test.ts b/packages/world-vercel/src/limits.test.ts index 2afdf8af80..ff6bf0151a 100644 --- a/packages/world-vercel/src/limits.test.ts +++ b/packages/world-vercel/src/limits.test.ts @@ -1,19 +1,41 @@ -import { describe, it } from 'vitest'; +import { describe, expect, it } from 'vitest'; +import { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; +import { createVercelWorld } from './index.js'; +import { createLimits } from './limits.js'; describe('vercel world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); - }); + it('exposes the required limits namespace', () => { + const limits = createLimits(); - it.fails('enforces per-key concurrency limits', () => { - throw new Error('TODO: implement'); + expect(limits).toMatchObject({ + acquire: expect.any(Function), + release: expect.any(Function), + heartbeat: expect.any(Function), + }); }); - it.fails('returns a retry path when rate limits block acquisition', () => { - throw new Error('TODO: implement'); - }); + it('keeps limits 
unimplemented until lock support exists', async () => { + const world = createVercelWorld(); + + await expect( + world.limits.acquire({ + key: 'workflow:user:test', + runId: 'wrun_test', + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + + await expect( + world.limits.release({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); - it.fails('restores capacity when a lease is released or expires', () => { - throw new Error('TODO: implement'); + await expect( + world.limits.heartbeat({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); }); }); diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 07cc69168f..b2d30b6376 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -12,15 +12,16 @@ implementations. - Postgres implements the same limits semantics with PostgreSQL-backed leases, rate tokens, durable waiters, and durable queue wake-up. - Vercel still exposes `limits` as a stub. -- The Next.js Turbopack workbench has shared E2E coverage for workflow and step - locks on implemented worlds. +- The Next.js Turbopack workbench has shared E2E coverage for `lock()` used + with `await using`, including locks that wrap individual step calls or + groups of steps. ## Goals - Support keyed concurrency limits. - Support keyed rate limits. - Allow concurrency and rate to be colocated in one interface. -- Support workflow-scoped limits and step-scoped limits. +- Support locks whose lifetime follows normal `await using` lexical scope. - Make crash recovery possible through leases with TTL/expiry. - Keep worker throughput controls separate from business-level flow limits. @@ -28,8 +29,10 @@ implementations. - `worker concurrency`: backend throughput setting for queue/job processing. - `workflow limit`: admission control for workflow runs that share a key. 
-- `step limit`: execution control for a specific step/resource key. -- `lease`: durable record that a workflow or step currently occupies capacity for a key. +- `scoped resource key`: any user-defined key acquired from workflow scope to + protect one step call, multiple step calls, or a whole workflow section. +- `lease`: durable record that a workflow currently occupies capacity for a + key. ## Shared Contract vs World-Specific Behavior @@ -42,7 +45,7 @@ semantics across implemented worlds. That shared contract includes: - same-holder lease reuse - serialization of concurrent acquires for a single key - FIFO waiter promotion per key -- pruning cancelled workflow waiters and failed/completed step waiters +- pruning cancelled workflow waiters - blocked acquisitions not consuming execution concurrency - prompt wake-up with delayed fallback replay @@ -87,6 +90,10 @@ Limits are modeled as leases with TTL/expiry so capacity can be recovered after: Normal completion should dispose/release the lease explicitly. Crash recovery comes from lease expiry plus future reclaim logic. +The default workflow lock TTL should be high enough to cover normal suspended +execution without making users tune it eagerly. The current runtime default is +24 hours unless the caller overrides `leaseTtlMs`. + ### 3. Keep worker concurrency separate from flow limits Current world-level concurrency settings are infrastructure controls, not @@ -127,7 +134,7 @@ Important distinction: Releasing a lease should free concurrency capacity immediately, but it should not restore rate capacity until the associated rate usage entry expires. -### 5. Use one `lock()` API in both workflows and steps +### 5. Use one `lock()` API from workflow scope We want one user-facing primitive: @@ -135,37 +142,14 @@ We want one user-facing primitive: await using lease = await lock({ ... }); ``` -But the runtime meaning differs by context. - -#### In workflows - -`lock()` means workflow admission / workflow-scope ownership. 
+`lock()` means workflow code acquires ownership of a keyed lease. If placed at the top of a workflow, it should hold the lease across the logical workflow scope, even though the workflow may suspend and resume many times. -#### In steps - -`lock()` acts like a step gate. - -The current behavior is: - -- declare the limit at the top of the step when possible -- the runtime treats a blocked acquisition as step-boundary admission failure -- the step does not keep executing user code while waiting for capacity -- the step is re-queued and retried after promotion or timeout -- lease is disposed automatically when the step attempt completes - -If `lock()` is called in the middle of a step, the intended contract is: - -- the current attempt stops at the blocked `lock()` call -- the step is deferred and re-queued rather than polling in-process -- code before the blocked `lock()` may replay on the next attempt -- code after the `lock()` runs only after the lock is actually acquired - -This means zero-attempt semantics are still strongest when `lock()` is used as -a top-of-step admission gate, but mid-step `lock()` is now part of the shared -runtime contract rather than unsupported behavior. +Steps themselves do not acquire locks directly. To limit one step category or a +group of steps, the workflow acquires the lock and then calls those steps while +the lease is held. ### 6. `await using` is the preferred user-facing shape @@ -175,8 +159,8 @@ The preferred API is explicit resource management: await using lease = await lock({ ... }); ``` -This gives automatic cleanup on scope exit and reads well for both workflow -scopes and step scopes. +This gives automatic cleanup on scope exit and reads well for critical sections +that may include one or many step calls. 
For manual early cleanup, the user-facing `LockHandle` should expose: @@ -185,7 +169,7 @@ For manual early cleanup, the user-facing `LockHandle` should expose: The backend-facing world contract can continue to use `release(...)` internally. -### 7. Workflow-scoped locks are logical-scope locks, not request-lifetime locks +### 7. Locks follow logical scope, not request lifetime For workflows, `await using` must be tied to the logical workflow scope across: @@ -197,21 +181,19 @@ For workflows, `await using` must be tied to the logical workflow scope across: The lease must not be disposed merely because one host process invocation ends. -### 8. Prefer step-boundary admission for deadlock avoidance +### 8. Keep admission decisions in workflow code Current preferred model: -- workflow-level limits may be held by a run -- blocked step-level limits return control to the runtime at the step boundary -- step-level limits are short-lived -- step execution should not wait on workflow-level locks +- workflow code acquires and releases limits +- steps execute inside whatever critical section the workflow establishes +- step code never waits on a separate lock of its own -This keeps the dependency direction one-way: +This keeps the dependency direction simple: -- workflow admission -> step admission -> step execution +- workflow admission / critical section -> step execution -That avoids the classic cycle where one workflow holds a workflow lock and -another holds a step lock and each waits on the other. +That avoids needing separate workflow-lock and step-lock runtime semantics. ### 9. Waiters are FIFO per key @@ -237,8 +219,6 @@ Blocked flow limits and worker concurrency are intentionally separate. 
For implemented worlds: - blocked workflows are suspended and re-queued, not left running on a worker -- blocked steps exit the current attempt and are re-queued instead of polling in - a live worker slot - worker slots are free to service unrelated work while the blocked execution is waiting to be retried or promoted @@ -256,8 +236,7 @@ Current behavior: - leases, rate tokens, and waiters live in world-owned limit state - promotion decisions are made from that limit state -- when a waiter is promoted, the runtime is woken by enqueuing the appropriate - workflow or step job +- when a waiter is promoted, the runtime is woken by enqueuing the workflow job - workflows also keep a delayed replay fallback so progress is still possible if an immediate wake-up is missed @@ -271,26 +250,24 @@ survival is not guaranteed after process loss. For v1, the intended semantics are: - workflow locks count admitted, in-flight workflows for a key -- step locks count or rate-limit specific step execution categories +- workflow-held keys may be used to serialize or rate-limit specific step categories - worker concurrency remains a separate infrastructure throttle More concretely: -- if a workflow acquires a workflow-scoped lock and then sleeps for 10 minutes, +- if a workflow acquires a lock and then sleeps for 10 minutes, it still counts as active for that workflow key during the sleep -- if a workflow is parked waiting for a step-level limit, it still counts as - active for its workflow-level lock -- a step-level lock should conceptually be an admission gate for the step - attempt, not a second workflow-level lock, even when the `lock()` call - appears in the middle of user code -- step-level rate limits should consume rate capacity when the step starts, and - that rate usage should remain counted until the window expires even if the - step releases its lease quickly +- if a workflow acquires a lock for a step-like key such as `step:db:cheap`, + that key remains occupied until the 
workflow releases it, even if the + protected work is just one step call or a small group of step calls +- rate-limited step-like keys still consume rate capacity when the workflow + acquires that key, and that usage remains counted until the window expires + even if the workflow releases the lease quickly For the current local implementation specifically: -- workflow and step locks now follow the same live-process waiter/fairness - semantics as Postgres +- workflow locks now follow the same live-process waiter/fairness semantics as + Postgres - the queue remains in-memory, so queued wake-ups are not durable across process loss @@ -318,31 +295,38 @@ With intended usage like: ```ts async function cheapDbStep(userId: string) { 'use step'; - await using _dbLimit = await lock({ - key: 'step:db:cheap', - concurrency: { max: 20 }, - }); return { userId, prompt: `profile:${userId}` }; } async function expensiveAIStep(prompt: string) { 'use step'; - await using _aiLimit = await lock({ - key: 'step:provider:openai', - rate: { count: 10, periodMs: 60_000 }, - }); return `summary:${prompt}`; } -export async function workflowWithWorkflowAndStepLocks(userId: string) { +export async function workflowWithScopedLocks(userId: string) { 'use workflow'; await using userLimit = await lock({ key: `workflow:user:${userId}`, concurrency: { max: 2 }, }); - const row = await cheapDbStep(userId); - const summary = await expensiveAIStep(row.prompt); + let row: Awaited>; + { + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + }); + row = await cheapDbStep(userId); + } + + let summary: Awaited>; + { + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + }); + summary = await expensiveAIStep(row.prompt); + } return { row, summary }; } ``` @@ -372,8 +356,8 @@ Two more practical clarifications: ## Open Questions - Whether workflow-level locks should always be whole-run admission locks or - also 
support narrower workflow-scoped blocks. + also support narrower lexical scopes within workflow code. - Whether `heartbeat()` should remain user-visible or become mostly internal. -- Whether step limits should only be expressed through `lock()` or also through - step metadata/config sugar. +- Whether `lock()` should eventually grow optional metadata or + config sugar for common per-step resource keys. - Exact event-log representation for acquire/block/dispose transitions. diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index eac141c1f7..2965906f7b 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,5 +1,4 @@ import { z } from 'zod'; -import { LimitAcquireRequestSchema } from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -65,7 +64,6 @@ export const EventTypeSchema = z.enum([ 'step_created', 'step_completed', 'step_failed', - 'step_deferred', 'step_retrying', 'step_started', // Hook lifecycle events @@ -111,19 +109,6 @@ const StepFailedEventSchema = BaseEventSchema.extend({ }), }); -/** - * Event created when a step is blocked on admission and should be retried - * without counting the blocked attempt against maxRetries. - */ -const StepDeferredEventSchema = BaseEventSchema.extend({ - eventType: z.literal('step_deferred'), - correlationId: z.string(), - eventData: z.object({ - retryAfter: z.coerce.date().optional(), - lockRequest: LimitAcquireRequestSchema.optional(), - }), -}); - /** * Event created when a step fails and will be retried. * Sets the step status back to 'pending' and records the error. 
@@ -287,7 +272,6 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, - StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events @@ -312,7 +296,6 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, - StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events diff --git a/packages/world/src/index.ts b/packages/world/src/index.ts index fd12d63d94..5e8f73d111 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -12,6 +12,8 @@ export { HookSchema } from './hooks.js'; export type * from './interfaces.js'; export type * from './limits.js'; export { + createLockId, + createLockWakeCorrelationId, createLimitsNotImplementedError, LimitAcquireAcquiredResultSchema, LimitAcquireBlockedResultSchema, @@ -24,9 +26,11 @@ export { LimitHeartbeatRequestSchema, LimitKeySchema, LimitLeaseSchema, + LimitLockIdSchema, LimitRateSchema, LimitReleaseRequestSchema, LIMITS_NOT_IMPLEMENTED_MESSAGE, + parseLockId, } from './limits.js'; export type * from './queue.js'; export { diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts index ec155b2d8d..495f29a84f 100644 --- a/packages/world/src/limits.ts +++ b/packages/world/src/limits.ts @@ -34,10 +34,44 @@ export const LimitDefinitionSchema = z ); export type LimitDefinition = z.infer; +export const LimitLockIdSchema = z.string().min(1); +export type LimitLockId = z.infer; + +export function createLockId(runId: string, lockIndex: number): LimitLockId { + return `${runId}:${lockIndex}`; +} + +export function parseLockId( + lockId: string +): { runId: string; lockIndex: number } | null { + const separatorIndex = lockId.lastIndexOf(':'); + if (separatorIndex <= 0 || separatorIndex === lockId.length - 1) { + return null; + } + + const runId = 
lockId.slice(0, separatorIndex); + const rawLockIndex = lockId.slice(separatorIndex + 1); + const lockIndex = Number.parseInt(rawLockIndex, 10); + if (!Number.isInteger(lockIndex) || lockIndex < 0) { + return null; + } + + return { runId, lockIndex }; +} + +export function createLockWakeCorrelationId( + runId: string, + lockIndex: number +): string { + return `wflock_wait_${runId}:${lockIndex}`; +} + export const LimitLeaseSchema = z.object({ leaseId: z.string().min(1), key: LimitKeySchema, - holderId: z.string().min(1), + lockId: LimitLockIdSchema, + runId: z.string().min(1), + lockIndex: z.number().int().nonnegative(), acquiredAt: z.coerce.date(), expiresAt: z.coerce.date().optional(), definition: LimitDefinitionSchema, @@ -46,7 +80,8 @@ export type LimitLease = z.infer; export const LimitAcquireRequestSchema = z.object({ key: LimitKeySchema, - holderId: z.string().min(1), + runId: z.string().min(1), + lockIndex: z.number().int().nonnegative(), definition: LimitDefinitionSchema, leaseTtlMs: z.number().int().positive().optional(), }); @@ -88,7 +123,7 @@ export type LimitAcquireResult = z.infer; export const LimitReleaseRequestSchema = z.object({ leaseId: z.string().min(1), key: LimitKeySchema.optional(), - holderId: z.string().min(1).optional(), + lockId: LimitLockIdSchema.optional(), }); export type LimitReleaseRequest = z.infer; diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index b85e49cf3e..1c9bd2ca0a 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -216,13 +216,6 @@ export async function parallelSleepWorkflow() { async function cheapDbStep(userId: string) { 'use step'; - - await using _dbLimit = await lock({ - key: 'step:db:cheap', - concurrency: { max: 20 }, - leaseTtlMs: 30_000, - }); - return { userId, prompt: `profile:${userId}`, @@ -231,17 +224,10 @@ async function cheapDbStep(userId: string) { async function expensiveAIStep(prompt: string) { 'use step'; - - 
await using _aiLimit = await lock({ - key: 'step:provider:openai', - rate: { count: 10, periodMs: 60_000 }, - leaseTtlMs: 30_000, - }); - return `summary:${prompt}`; } -export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { +export async function workflowWithScopedLocks(userId = 'user-123') { 'use workflow'; await using userLimit = await lock({ @@ -250,8 +236,25 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { leaseTtlMs: 30_000, }); - const row = await cheapDbStep(userId); - const summary = await expensiveAIStep(row.prompt); + let row: Awaited>; + { + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + leaseTtlMs: 30_000, + }); + row = await cheapDbStep(userId); + } + + let summary: Awaited>; + { + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + leaseTtlMs: 30_000, + }); + summary = await expensiveAIStep(row.prompt); + } return { workflowKey: userLimit.key, @@ -261,61 +264,6 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } -type LimitTraceState = { - events: string[]; -}; - -function sanitizeLimitTraceToken(traceToken: string) { - return traceToken.replace(/[^a-zA-Z0-9_-]/g, '_'); -} - -async function getLimitTracePath(traceToken: string) { - const path = await import('node:path'); - return path.join( - process.cwd(), - '.workflow-e2e', - `limits-${sanitizeLimitTraceToken(traceToken)}.json` - ); -} - -async function readLimitTraceState( - traceToken: string -): Promise { - const { mkdir, readFile } = await import('node:fs/promises'); - const path = await import('node:path'); - const tracePath = await getLimitTracePath(traceToken); - await mkdir(path.dirname(tracePath), { recursive: true }); - - try { - return JSON.parse(await readFile(tracePath, 'utf8')) as LimitTraceState; - } catch (error) { - if ((error as NodeJS.ErrnoException).code === 'ENOENT') { - return { events: [] 
}; - } - throw error; - } -} - -async function writeLimitTraceState( - traceToken: string, - state: LimitTraceState -) { - const { mkdir, writeFile } = await import('node:fs/promises'); - const path = await import('node:path'); - const tracePath = await getLimitTracePath(traceToken); - await mkdir(path.dirname(tracePath), { recursive: true }); - await writeFile(tracePath, JSON.stringify(state), 'utf8'); -} - -async function appendLimitTraceEvent(traceToken: string, event: string) { - const state = await readLimitTraceState(traceToken); - const nextState = { - events: [...state.events, event], - }; - await writeLimitTraceState(traceToken, nextState); - return nextState.events; -} - async function serializedLimitStep( label: string, holdMs: number, @@ -323,16 +271,9 @@ async function serializedLimitStep( ) { 'use step'; - const stepLock = await lock({ - key, - concurrency: { max: 1 }, - leaseTtlMs: holdMs + 5_000, - }); - const metadata = getStepMetadata(); const acquiredAt = Date.now(); await new Promise((resolve) => setTimeout(resolve, holdMs)); - await stepLock.dispose(); const releasedAt = Date.now(); return { @@ -357,7 +298,15 @@ export async function workflowLockContentionWorkflow( }); const workflowLockAcquiredAt = Date.now(); - const step = await serializedLimitStep(userId, holdMs); + let step: Awaited>; + { + await using _nestedLock = await lock({ + key: 'step:db:serialized', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + step = await serializedLimitStep(userId, holdMs); + } await workflowLock.dispose(); const workflowLockReleasedAt = Date.now(); @@ -365,57 +314,27 @@ export async function workflowLockContentionWorkflow( userId, workflowLockAcquiredAt, workflowLockReleasedAt, - stepLockAcquiredAt: step.acquiredAt, - stepLockReleasedAt: step.releasedAt, - }; -} - -async function stepLockNoRetriesStep( - label: string, - holdMs: number, - key = 'step:db:no-retries' -) { - 'use step'; - - await using _stepLock = await lock({ - key, - 
concurrency: { max: 1 }, - leaseTtlMs: holdMs + 5_000, - }); - - const metadata = getStepMetadata(); - const acquiredAt = Date.now(); - await new Promise((resolve) => setTimeout(resolve, holdMs)); - const releasedAt = Date.now(); - - return { - label, - key, - attempt: metadata.attempt, - acquiredAt, - releasedAt, + stepCallLockAcquiredAt: step.acquiredAt, + stepCallLockReleasedAt: step.releasedAt, }; } -stepLockNoRetriesStep.maxRetries = 0; -export async function stepLockNoRetriesContentionWorkflow( - userId = 'user-123', - holdMs = 750, - label = userId -) { - 'use workflow'; - - return await stepLockNoRetriesStep(label, holdMs); -} - -export async function stepKeyLockContentionWorkflow( +export async function lockedStepCallContentionWorkflow( key = 'step:db:key-contention', holdMs = 750, label = key ) { 'use workflow'; - return await stepLockNoRetriesStep(label, holdMs, key); + { + await using _lock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + return await serializedLimitStep(label, holdMs, key); + } } ////////////////////////////////////////////////////////// @@ -466,48 +385,34 @@ export async function workflowLeakedLockWorkflow( key: leakedWorkflowLock.key, leaseTtlMs, leakedLeaseId: leakedWorkflowLock.leaseId, - workflowLockAcquiredAt, + lockAcquiredAt: workflowLockAcquiredAt, workflowCompletedAt: Date.now(), }; } -async function leakedStepLockStep( - key: string, - leaseTtlMs: number, - label: string +export async function leakedKeyLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId ) { - 'use step'; + 'use workflow'; - const leakedStepLock = await lock({ - key, + const leakedLock = await lock({ + key: `workflow:key:expired:${userId}`, concurrency: { max: 1 }, leaseTtlMs, }); return { label, - key, + key: leakedLock.key, leaseTtlMs, - leakedLeaseId: leakedStepLock.leaseId, - stepLockAcquiredAt: Date.now(), + leakedLeaseId: leakedLock.leaseId, + lockAcquiredAt: Date.now(), 
workflowCompletedAt: Date.now(), }; } -export async function stepLeakedLockWorkflow( - userId = 'user-123', - leaseTtlMs = 1_250, - label = userId -) { - 'use workflow'; - - return await leakedStepLockStep( - `step:db:expired:${userId}`, - leaseTtlMs, - label - ); -} - export async function workflowRateLimitContentionWorkflow( userId = 'user-123', holdMs = 250, @@ -563,42 +468,48 @@ export async function workflowMixedLimitContentionWorkflow( }; } -async function midStepLockStep(key: string, traceToken: string, label: string) { +async function scopedMultiStepStep(label: string, holdMs: number) { 'use step'; - const { attempt } = getStepMetadata(); - await appendLimitTraceEvent(traceToken, `pre:${attempt}`); - - await using _midStepLock = await lock({ - key, - concurrency: { max: 1 }, - leaseTtlMs: 5_000, - }); - - const lockAcquiredAt = Date.now(); - await appendLimitTraceEvent(traceToken, `lock:${attempt}`); - const trace = await appendLimitTraceEvent(traceToken, `post:${attempt}`); - + const metadata = getStepMetadata(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); return { label, - key, - attempt, - lockAcquiredAt, - preLockEffects: trace.filter((event) => event.startsWith('pre:')).length, - postLockEffects: trace.filter((event) => event.startsWith('post:')).length, - trace, + attempt: metadata.attempt, + completedAt: Date.now(), }; } -midStepLockStep.maxRetries = 0; -export async function midStepLockContentionWorkflow( - key = 'step:db:mid-step', - traceToken = 'mid-step', - label = key +export async function singleLockAcrossMultipleStepsWorkflow( + key = 'step:db:batch', + holdMs = 400 ) { 'use workflow'; - return await midStepLockStep(key, traceToken, label); + let workflowLockAcquiredAt: number; + let first: Awaited>; + let second: Awaited>; + let workflowLockReleasedAt: number; + { + await using _lock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: holdMs * 2 + 5_000, + }); + + workflowLockAcquiredAt = Date.now(); + first = 
await scopedMultiStepStep('first', holdMs); + second = await scopedMultiStepStep('second', holdMs); + workflowLockReleasedAt = Date.now(); + } + + return { + key, + workflowLockAcquiredAt, + firstStepCompletedAt: first.completedAt, + secondStepCompletedAt: second.completedAt, + workflowLockReleasedAt, + }; } ////////////////////////////////////////////////////////// From 6f226767d5a85043e407ea13564bf187c68deedc Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Mon, 30 Mar 2026 21:42:31 -0400 Subject: [PATCH 17/34] Added event sourced flow limit architecture and simplify schema Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 14 +- packages/core/src/lock.test.ts | 4 +- packages/core/src/lock.ts | 8 +- packages/core/src/private.ts | 2 + packages/core/src/runtime.ts | 6 +- packages/core/src/step/context-storage.ts | 4 + packages/core/src/symbols.ts | 3 + packages/core/src/workflow.ts | 7 +- packages/core/src/workflow/lock.test.ts | 364 +++++++++++ packages/core/src/workflow/lock.ts | 443 +++++++++++-- packages/errors/src/index.ts | 24 + packages/world-local/src/index.ts | 16 +- packages/world-local/src/limits.test.ts | 188 +++++- packages/world-local/src/limits.ts | 325 ++++++---- packages/world-local/src/queue.test.ts | 52 ++ packages/world-local/src/queue.ts | 12 +- .../world-local/src/storage/events-storage.ts | 294 ++++++++- packages/world-local/src/storage/index.ts | 23 +- .../migrations/0010_add_flow_limits.sql | 32 +- .../migrations/meta/0010_snapshot.json | 208 +++--- .../src/drizzle/migrations/meta/_journal.json | 2 +- packages/world-postgres/src/drizzle/schema.ts | 35 +- packages/world-postgres/src/index.ts | 26 +- packages/world-postgres/src/limits.test.ts | 227 ++++++- packages/world-postgres/src/limits.ts | 591 ++++++++++-------- packages/world-postgres/src/queue.ts | 7 +- packages/world-postgres/src/storage.ts | 336 +++++++++- packages/world-postgres/test/test-db.ts | 3 +- packages/world-testing/src/limits-contract.ts | 31 +- 
packages/world-testing/src/limits-runtime.ts | 22 + packages/world/FLOW_LIMITS.md | 50 +- packages/world/src/events.ts | 59 ++ packages/world/src/index.ts | 3 + packages/world/src/limits.ts | 22 +- packages/world/src/queue.ts | 1 + workbench/example/workflows/99_e2e.ts | 25 + 36 files changed, 2879 insertions(+), 590 deletions(-) create mode 100644 packages/core/src/workflow/lock.test.ts diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 77a5960231..4ccc1c8747 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -53,15 +53,6 @@ if (!deploymentUrl) { */ type E2EWorkflowMetadata = Awaited>; -async function start( - workflow: E2EWorkflowMetadata, - options?: StartOptions -): Promise>; -async function start( - workflow: E2EWorkflowMetadata, - args: TArgs, - options?: StartOptions -): Promise>; async function start( workflow: E2EWorkflowMetadata, argsOrOptions?: unknown[] | StartOptions, @@ -320,6 +311,11 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); return await Promise.all([runA.returnValue, runB.returnValue]); }, + async runReleasedRateLimitReplay(userId, periodMs, sleepMs) { + const workflow = await e2e('releasedRateLimitReplayWorkflow'); + const run = await start(workflow, [userId, periodMs, sleepMs]); + return await run.returnValue; + }, async runWorkflowFifoThreeWaiters(userId, holdMs) { const workflow = await e2e('workflowOnlyLockContentionWorkflow'); const runA = await start(workflow, [userId, holdMs, 'A']); diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index 9cc1e2dcee..0b63b96fc5 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -5,10 +5,12 @@ import { LOCK_WORKFLOW_ONLY_MESSAGE, } from './lock.js'; import { contextStorage } from './step/context-storage.js'; -import { WORKFLOW_LOCK } from './symbols.js'; +import { WORKFLOW_HAS_STEP_CONTEXT, WORKFLOW_LOCK } from './symbols.js'; 
afterEach(() => { delete (globalThis as any)[WORKFLOW_LOCK]; + (globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] = () => + contextStorage.getStore() !== undefined; }); describe('lock', () => { diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index 9791c39e13..9419bfb475 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -4,8 +4,7 @@ import { type LimitKey, type LimitLease, } from '@workflow/world'; -import { contextStorage } from './step/context-storage.js'; -import { WORKFLOW_LOCK } from './symbols.js'; +import { WORKFLOW_HAS_STEP_CONTEXT, WORKFLOW_LOCK } from './symbols.js'; export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; @@ -46,7 +45,10 @@ export async function lock(options: LockOptions): Promise { return workflowLock(options); } - if (contextStorage.getStore()) { + const hasStepContext = (globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] as + | (() => boolean) + | undefined; + if (hasStepContext?.()) { throw new Error(LOCK_WORKFLOW_ONLY_MESSAGE); } diff --git a/packages/core/src/private.ts b/packages/core/src/private.ts index ac827aae05..6b84d2d549 100644 --- a/packages/core/src/private.ts +++ b/packages/core/src/private.ts @@ -90,8 +90,10 @@ export { __private_getClosureVars } from './step/get-closure-vars.js'; export interface WorkflowOrchestratorContext { runId: string; + lockPreApproval?: string; encryptionKey: CryptoKey | undefined; globalThis: typeof globalThis; + advanceTimestamp: (timestamp: number) => void; eventsConsumer: EventsConsumer; nextLockIndex: number; /** diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index a7d6d398ef..c2158ce3be 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -4,7 +4,6 @@ import { RunExpiredError, WorkflowRuntimeError, } from '@workflow/errors'; -import { classifyRunError } from './classify-error.js'; import { parseWorkflowName } from '@workflow/utils/parse-name'; import { type Event, @@ -12,6 +11,7 @@ import { 
WorkflowInvokePayloadSchema, type WorkflowRun, } from '@workflow/world'; +import { classifyRunError } from './classify-error.js'; import { importKey } from './encryption.js'; import { WorkflowSuspension } from './global.js'; import { runtimeLogger } from './logger.js'; @@ -101,6 +101,7 @@ export function workflowEntrypoint( const { runId, + lockPreApproval, traceCarrier: traceContext, requestedAt, } = WorkflowInvokePayloadSchema.parse(message_); @@ -314,7 +315,8 @@ export function workflowEntrypoint( workflowCode, workflowRun, events, - encryptionKey + encryptionKey, + lockPreApproval ); } ); diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index 2a9aa8b7e1..6b4ae846d3 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,5 +1,6 @@ import { AsyncLocalStorage } from 'node:async_hooks'; import type { CryptoKey } from '../encryption.js'; +import { WORKFLOW_HAS_STEP_CONTEXT } from '../symbols.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,3 +11,6 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ closureVars?: Record; encryptionKey?: CryptoKey; }>(); + +(globalThis as any)[WORKFLOW_HAS_STEP_CONTEXT] = () => + contextStorage.getStore() !== undefined; diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index 790f2fe46f..c9842d22e4 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -3,6 +3,9 @@ export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); export const WORKFLOW_LOCK = Symbol.for('WORKFLOW_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); +export const WORKFLOW_HAS_STEP_CONTEXT = Symbol.for( + 'WORKFLOW_HAS_STEP_CONTEXT' +); export const WORKFLOW_GET_STREAM_ID = 
Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); export const STREAM_NAME_SYMBOL = Symbol.for('WORKFLOW_STREAM_NAME'); diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index 01883a0fee..685245c77d 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -78,7 +78,8 @@ export async function runWorkflow( workflowCode: string, workflowRun: WorkflowRun, events: Event[], - encryptionKey: CryptoKey | undefined + encryptionKey: CryptoKey | undefined, + lockPreApproval?: string ): Promise { return trace(`workflow.run ${workflowRun.workflowName}`, async (span) => { span?.setAttributes({ @@ -135,8 +136,12 @@ export async function runWorkflow( const workflowContext: WorkflowOrchestratorContext = { runId: workflowRun.runId, + lockPreApproval, encryptionKey, globalThis: vmGlobalThis, + advanceTimestamp: (timestamp) => { + updateTimestamp(Math.max(timestamp, vmGlobalThis.Date.now())); + }, onWorkflowError: workflowDiscontinuation.reject, eventsConsumer, nextLockIndex: 0, diff --git a/packages/core/src/workflow/lock.test.ts b/packages/core/src/workflow/lock.test.ts new file mode 100644 index 0000000000..675d5b42ba --- /dev/null +++ b/packages/core/src/workflow/lock.test.ts @@ -0,0 +1,364 @@ +import { TooEarlyError, WorkflowRuntimeError } from '@workflow/errors'; +import { withResolvers } from '@workflow/utils'; +import type { Event, EventResult, LimitLease } from '@workflow/world'; +import { + createLockCorrelationId, + createLockWakeCorrelationId, +} from '@workflow/world'; +import * as nanoid from 'nanoid'; +import { monotonicFactory } from 'ulid'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { EventsConsumer } from '../events-consumer.js'; +import { WorkflowSuspension } from '../global.js'; +import type { WorkflowOrchestratorContext } from '../private.js'; +import { setWorld } from '../runtime/world.js'; +import { createContext } from 
'../vm/index.js'; +import { createLock } from './lock.js'; + +function createLease(): LimitLease { + return { + leaseId: 'lmt_lease', + key: 'workflow:user:test', + lockId: 'wrun_test:0', + runId: 'wrun_test', + lockIndex: 0, + acquiredAt: new Date('2025-01-01T00:00:00.000Z'), + expiresAt: new Date('2027-01-01T00:00:00.000Z'), + definition: { + concurrency: { max: 1 }, + }, + }; +} + +function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { + const context = createContext({ + seed: 'test', + fixedTimestamp: 1753481739458, + }); + const ulid = monotonicFactory(() => context.globalThis.Math.random()); + const workflowStartedAt = context.globalThis.Date.now(); + return { + runId: 'wrun_test', + lockPreApproval: undefined, + encryptionKey: undefined, + globalThis: context.globalThis, + advanceTimestamp: vi.fn(), + eventsConsumer: new EventsConsumer(events, { + onUnconsumedEvent: () => {}, + getPromiseQueue: () => Promise.resolve(), + }), + nextLockIndex: 0, + invocationsQueue: new Map(), + generateUlid: () => ulid(workflowStartedAt), + generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => + new Uint8Array(size).map(() => 256 * context.globalThis.Math.random()) + ), + onWorkflowError: vi.fn(), + promiseQueue: Promise.resolve(), + pendingDeliveries: 0, + }; +} + +function asEventResult(event: Event): EventResult { + return { event }; +} + +afterEach(() => { + setWorld(undefined as any); + vi.restoreAllMocks(); +}); + +describe('createLock', () => { + it('creates and immediately acquires a fresh lock via world events', async () => { + const lease = createLease(); + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease }, + createdAt: new Date(), + }) + ) + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_release', + runId: 
'wrun_test', + eventType: 'lock_release', + correlationId: createLockCorrelationId('wrun_test', 0), + createdAt: new Date(), + }) + ); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const ctx = setupWorkflowContext([]); + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + expect(createEvent).toHaveBeenNthCalledWith( + 1, + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_created', + correlationId: createLockCorrelationId('wrun_test', 0), + }) + ); + expect(handle.leaseId).toBe(lease.leaseId); + + await handle.dispose(); + + expect(createEvent).toHaveBeenNthCalledWith( + 2, + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_release', + correlationId: createLockCorrelationId('wrun_test', 0), + }) + ); + }); + + it('replays a rate-only lock from lock_acquired without creating new events', async () => { + const lease = { + ...createLease(), + key: 'workflow:rate:test', + definition: { + rate: { count: 1, periodMs: 60_000 }, + }, + }; + const createEvent = vi.fn(); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + rate: { count: 1, periodMs: 60_000 }, + }); + + expect(createEvent).not.toHaveBeenCalled(); + expect(handle.leaseId).toBe(lease.leaseId); + }); + + it('ignores an expired lock_acquired event and reacquires the lease', async () => { + const expiredLease = { + ...createLease(), + expiresAt: new Date('2025-01-01T00:00:00.000Z'), + }; + const 
freshLease = { + ...createLease(), + leaseId: 'lmt_fresh', + expiresAt: new Date('2027-06-01T00:00:00.000Z'), + }; + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired_fresh', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease: freshLease }, + createdAt: new Date(), + }) + ); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_created', + runId: 'wrun_test', + eventType: 'lock_created', + correlationId, + eventData: { + key: expiredLease.key, + definition: expiredLease.definition, + leaseTtlMs: 1_000, + }, + createdAt: new Date(), + }, + { + eventId: 'evnt_lock_acquired_expired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease: expiredLease }, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: expiredLease.key, + concurrency: { max: 1 }, + }); + + expect(createEvent).toHaveBeenCalledTimes(1); + expect(createEvent).toHaveBeenCalledWith( + 'wrun_test', + expect.objectContaining({ + eventType: 'lock_acquired', + correlationId, + }) + ); + expect(handle.leaseId).toBe(freshLease.leaseId); + }); + + it('replays a released scope as a no-op without double-releasing', async () => { + const lease = createLease(); + const createEvent = vi.fn(); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new Date(), + }, + { + eventId: 
'evnt_lock_release', + runId: 'wrun_test', + eventType: 'lock_release', + correlationId, + createdAt: new Date(), + }, + ]); + + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + await handle.dispose(); + + expect(createEvent).not.toHaveBeenCalled(); + }); + + it('re-suspends when a stale lock wake-up becomes too early again', async () => { + const retryAfter = new Date(Date.now() + 30_000); + const createEvent = vi + .fn<() => Promise>() + .mockRejectedValueOnce( + new TooEarlyError('not ready yet', { retryAfter }) + ); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_lock_created', + runId: 'wrun_test', + eventType: 'lock_created', + correlationId, + eventData: { + key: 'workflow:rate:test', + definition: { rate: { count: 1, periodMs: 60_000 } }, + acquireAt: new Date(Date.now() - 1_000), + }, + createdAt: new Date(), + }, + ]); + const errorReceived = withResolvers(); + ctx.onWorkflowError = errorReceived.resolve; + + const lock = createLock(ctx); + void lock({ + key: 'workflow:rate:test', + rate: { count: 1, periodMs: 60_000 }, + }); + + const workflowError = await errorReceived.promise; + expect(workflowError).toBeInstanceOf(WorkflowSuspension); + expect(createEvent).toHaveBeenCalledTimes(1); + const waitItem = ctx.invocationsQueue.get( + createLockWakeCorrelationId('wrun_test', 0) + ); + expect(waitItem).toMatchObject({ + type: 'limit_wait', + correlationId: createLockWakeCorrelationId('wrun_test', 0), + resumeAt: retryAfter, + }); + }); + + it('rejects heartbeat in workflow scope to preserve replay determinism', async () => { + const lease = createLease(); + const createEvent = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + asEventResult({ + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 
'lock_acquired', + correlationId: createLockCorrelationId('wrun_test', 0), + eventData: { lease }, + createdAt: new Date(), + }) + ); + const heartbeat = vi.fn().mockResolvedValue(lease); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat }, + } as any); + + const ctx = setupWorkflowContext([]); + const lock = createLock(ctx); + const handle = await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + + await expect(handle.heartbeat()).rejects.toBeInstanceOf( + WorkflowRuntimeError + ); + await expect(handle.heartbeat()).rejects.toThrow( + 'Lock heartbeat is not supported in workflow functions yet' + ); + expect(heartbeat).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 2010c1fd61..bf30fb55d6 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -1,6 +1,20 @@ +import { + EntityConflictError, + TooEarlyError, + WorkflowRuntimeError, +} from '@workflow/errors'; +import { withResolvers } from '@workflow/utils'; +import { + type CreateEventRequest, + createLockCorrelationId, + createLockWakeCorrelationId, + type LimitDefinition, + type LimitLease, + SPEC_VERSION_CURRENT, +} from '@workflow/world'; +import { EventConsumerResult } from '../events-consumer.js'; import { WorkflowSuspension } from '../global.js'; import type { LockHandle, LockOptions } from '../lock.js'; -import { createLockWakeCorrelationId, type LimitLease } from '@workflow/world'; import { scheduleWhenIdle, type WorkflowOrchestratorContext, @@ -8,52 +22,128 @@ import { import { getWorld } from '../runtime/world.js'; const DEFAULT_LOCK_LEASE_TTL_MS = 24 * 60 * 60 * 1000; +const LOCK_HEARTBEAT_UNSUPPORTED_MESSAGE = + 'Lock heartbeat is not supported in workflow functions yet because it cannot be replayed deterministically.'; + +type LockLeaseView = Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' +>; + +interface LockState { 
+ correlationId: string; + wakeCorrelationId: string; + key: string; + leaseTtlMs: number; + definition: LimitDefinition; + acquireAt?: Date; + lease?: LockLeaseView; + hasCreatedEvent: boolean; + hasAcquiredEvent: boolean; + hasReleaseEvent: boolean; +} + +function createSuspension(ctx: WorkflowOrchestratorContext) { + scheduleWhenIdle(ctx, () => { + ctx.onWorkflowError( + new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + ); + }); +} + +function isLeaseLive(lease: Pick): boolean { + return ( + lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now() + ); +} + +function getReleasedLeaseView( + ctx: WorkflowOrchestratorContext, + event: Extract | any +): LockLeaseView | undefined { + const data = event.eventData; + if (!data?.leaseId || !data?.key || !data?.lockId) { + return undefined; + } + + return { + leaseId: data.leaseId, + key: data.key, + lockId: data.lockId, + runId: ctx.runId, + lockIndex: Number.parseInt(data.lockId.split(':').at(-1) ?? '0', 10), + expiresAt: undefined, + }; +} function createLockHandle( - lease: Pick< - LimitLease, - 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' - >, + state: LockState, ctx: WorkflowOrchestratorContext ): LockHandle { - let currentLease = lease; let disposed = false; + const getLease = () => { + if (!state.lease) { + throw new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} is missing lease metadata` + ); + } + return state.lease; + }; + const dispose = async () => { - if (disposed) return; + if (disposed || state.hasReleaseEvent) { + return; + } + disposed = true; - await getWorld().limits.release({ - leaseId: currentLease.leaseId, - key: currentLease.key, - lockId: currentLease.lockId, - }); + let eventCreatedAt: Date | undefined; + try { + const result = await getWorld().events.create(ctx.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: state.correlationId, + }); + eventCreatedAt = 
result.event?.createdAt; + } catch (error) { + if (EntityConflictError.is(error)) { + state.hasReleaseEvent = true; + return; + } + throw error; + } + + state.hasReleaseEvent = true; + if (eventCreatedAt) { + ctx.advanceTimestamp(+eventCreatedAt); + } }; const heartbeat = async (ttlMs?: number) => { - currentLease = await getWorld().limits.heartbeat({ - leaseId: currentLease.leaseId, - ttlMs, - }); + if (state.hasReleaseEvent) return; + void ttlMs; + getLease(); + throw new WorkflowRuntimeError(LOCK_HEARTBEAT_UNSUPPORTED_MESSAGE); }; const handle: LockHandle = { get leaseId() { - return currentLease.leaseId; + return getLease().leaseId; }, get key() { - return currentLease.key; + return getLease().key; }, get lockId() { - return currentLease.lockId; + return getLease().lockId; }, get runId() { - return currentLease.runId; + return getLease().runId; }, get lockIndex() { - return currentLease.lockIndex; + return getLease().lockIndex; }, get expiresAt() { - return currentLease.expiresAt; + return getLease().expiresAt; }, dispose, heartbeat, @@ -68,46 +158,303 @@ function createLockHandle( return handle; } +function createLockCreatedEvent( + state: LockState +): Extract { + return { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: state.correlationId, + eventData: { + key: state.key, + definition: state.definition, + leaseTtlMs: state.leaseTtlMs, + }, + }; +} + +function createLockAcquiredEvent( + state: LockState +): Extract { + return { + eventType: 'lock_acquired', + specVersion: SPEC_VERSION_CURRENT, + correlationId: state.correlationId, + }; +} + export function createLock(ctx: WorkflowOrchestratorContext) { return async function lockImpl(options: LockOptions): Promise { - /* - Blocked workflow locks suspend the workflow turn instead of creating a real - wait event. Postgres can wake this correlation id early when the waiter is - promoted, and the delayed replay is just a fallback. 
- */ const lockIndex = ctx.nextLockIndex++; - const correlationId = createLockWakeCorrelationId(ctx.runId, lockIndex); - const definition = { - concurrency: options.concurrency, - rate: options.rate, + const state: LockState = { + correlationId: createLockCorrelationId(ctx.runId, lockIndex), + wakeCorrelationId: createLockWakeCorrelationId(ctx.runId, lockIndex), + key: options.key, + leaseTtlMs: options.leaseTtlMs ?? DEFAULT_LOCK_LEASE_TTL_MS, + definition: { + concurrency: options.concurrency, + rate: options.rate, + }, + hasCreatedEvent: false, + hasAcquiredEvent: false, + hasReleaseEvent: false, }; - while (true) { - const result = await getWorld().limits.acquire({ - key: options.key, - runId: ctx.runId, - lockIndex, - definition, - leaseTtlMs: options.leaseTtlMs ?? DEFAULT_LOCK_LEASE_TTL_MS, - }); + const { promise, resolve, reject } = withResolvers(); + let resolved = false; + let pendingRuntimeRequest = false; + let suspensionScheduled = false; + + const resolveHandle = () => { + if (resolved) return; + resolved = true; + ctx.invocationsQueue.delete(state.wakeCorrelationId); + resolve(createLockHandle(state, ctx)); + }; - if (result.status === 'acquired') { - return createLockHandle(result.lease, ctx); + const suspendWorkflow = () => { + if (suspensionScheduled || resolved) { + return; } + suspensionScheduled = true; + createSuspension(ctx); + }; - ctx.invocationsQueue.set(correlationId, { + const scheduleRateRetry = (acquireAt: Date) => { + ctx.invocationsQueue.set(state.wakeCorrelationId, { type: 'limit_wait', - correlationId, - resumeAt: new Date(Date.now() + (result.retryAfterMs || 1000)), + correlationId: state.wakeCorrelationId, + resumeAt: acquireAt, + }); + suspendWorkflow(); + }; + + const shouldAttemptAcquire = (acquireAt?: Date) => { + if (ctx.lockPreApproval === state.correlationId) { + return true; + } + if (!acquireAt) { + return false; + } + return acquireAt.getTime() <= Date.now(); + }; + + const requestLockCreated = async () => { + try { + 
const result = await getWorld().events.create( + ctx.runId, + createLockCreatedEvent(state) + ); + const event = result.event; + if (!event) { + throw new WorkflowRuntimeError( + `World did not return an event for lock ${state.correlationId}` + ); + } + + if (event.eventType === 'lock_acquired') { + if (!event.eventData?.lease) { + throw new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} acquisition is missing lease metadata` + ); + } + if (!isLeaseLive(event.eventData.lease)) { + state.hasCreatedEvent = true; + state.acquireAt = new Date(0); + suspendWorkflow(); + return; + } + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (event.eventType === 'lock_release') { + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + state.lease ??= getReleasedLeaseView(ctx, event); + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (event.eventType !== 'lock_created') { + throw new WorkflowRuntimeError( + `Unexpected event type for lock ${state.correlationId}: ${event.eventType}` + ); + } + + state.hasCreatedEvent = true; + ctx.advanceTimestamp(+event.createdAt); + if (event.eventData.acquireAt) { + state.acquireAt = event.eventData.acquireAt; + scheduleRateRetry(event.eventData.acquireAt); + return; + } + + state.acquireAt = undefined; + suspendWorkflow(); + } catch (error) { + reject(error); + } + }; + + const requestLockAcquired = async () => { + try { + const result = await getWorld().events.create( + ctx.runId, + createLockAcquiredEvent(state) + ); + const event = result.event; + if ( + !event || + (event.eventType !== 'lock_acquired' && + event.eventType !== 'lock_release') + ) { + throw new WorkflowRuntimeError( + `World did not acquire lock ${state.correlationId}` + ); + } + + if (event.eventType === 'lock_release') { + 
state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + state.lease ??= getReleasedLeaseView(ctx, event); + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + return; + } + + if (!event.eventData?.lease) { + throw new WorkflowRuntimeError( + `World did not acquire lock ${state.correlationId}` + ); + } + if (!isLeaseLive(event.eventData.lease)) { + state.acquireAt = new Date(0); + suspendWorkflow(); + return; + } + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + ctx.advanceTimestamp(+event.createdAt); + resolveHandle(); + } catch (error) { + if (TooEarlyError.is(error)) { + if (error.retryAfter) { + state.acquireAt = error.retryAfter; + scheduleRateRetry(error.retryAfter); + } else { + state.acquireAt = undefined; + suspendWorkflow(); + } + return; + } + reject(error); + } + }; + + const ensureRuntimeProgress = (acquireAt?: Date) => { + if (resolved || pendingRuntimeRequest) { + return; + } + + if (!state.hasCreatedEvent) { + pendingRuntimeRequest = true; + void requestLockCreated().finally(() => { + pendingRuntimeRequest = false; + }); + return; + } + + if (state.hasAcquiredEvent) { + resolveHandle(); + return; + } + + if (!shouldAttemptAcquire(acquireAt)) { + if (acquireAt) { + scheduleRateRetry(acquireAt); + } else { + suspendWorkflow(); + } + return; + } + + pendingRuntimeRequest = true; + void requestLockAcquired().finally(() => { + pendingRuntimeRequest = false; }); + }; + + ctx.eventsConsumer.subscribe((event) => { + if (!event) { + ensureRuntimeProgress(state.acquireAt); + return EventConsumerResult.NotConsumed; + } + + if (event.correlationId !== state.correlationId) { + return EventConsumerResult.NotConsumed; + } - scheduleWhenIdle(ctx, () => { + if (event.eventType === 'lock_created') { + state.hasCreatedEvent = true; + state.acquireAt = event.eventData.acquireAt; + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'lock_acquired') { + if 
(!event.eventData?.lease) { + ctx.promiseQueue = ctx.promiseQueue.then(() => { + ctx.onWorkflowError( + new WorkflowRuntimeError( + `Corrupted event log: lock ${state.correlationId} acquisition is missing lease metadata` + ) + ); + }); + return EventConsumerResult.Finished; + } + if (!isLeaseLive(event.eventData.lease)) { + state.hasCreatedEvent = true; + state.acquireAt = new Date(0); + return EventConsumerResult.Consumed; + } + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.lease = event.eventData.lease; + resolveHandle(); + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'lock_release') { + state.lease ??= getReleasedLeaseView(ctx, event); + state.hasCreatedEvent = true; + state.hasAcquiredEvent = true; + state.hasReleaseEvent = true; + ctx.invocationsQueue.delete(state.wakeCorrelationId); + resolveHandle(); + return EventConsumerResult.Finished; + } + + if (event.eventType === 'lock_waiter_queued') { + return EventConsumerResult.Consumed; + } + + ctx.promiseQueue = ctx.promiseQueue.then(() => { ctx.onWorkflowError( - new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + new WorkflowRuntimeError( + `Unexpected event type for lock ${state.correlationId} "${event.eventType}"` + ) ); }); + return EventConsumerResult.Finished; + }); - await new Promise(() => {}); - } + return promise; }; } diff --git a/packages/errors/src/index.ts b/packages/errors/src/index.ts index 224bc8ea56..bb86fb5478 100644 --- a/packages/errors/src/index.ts +++ b/packages/errors/src/index.ts @@ -250,6 +250,30 @@ export class EntityConflictError extends WorkflowError { } } +export class LimitDefinitionConflictError extends WorkflowError { + key: string; + existingDefinition: unknown; + requestedDefinition: unknown; + + constructor( + key: string, + existingDefinition: unknown, + requestedDefinition: unknown + ) { + super( + `Limit key "${key}" is already configured with a different definition` + ); + this.name = 
'LimitDefinitionConflictError'; + this.key = key; + this.existingDefinition = existingDefinition; + this.requestedDefinition = requestedDefinition; + } + + static is(value: unknown): value is LimitDefinitionConflictError { + return isError(value) && value.name === 'LimitDefinitionConflictError'; + } +} + /** * Thrown when a run is no longer available — either because it has been * cleaned up, expired, or already reached a terminal state (completed/failed). diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 142fe26ccf..d8fd515f89 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -61,13 +61,17 @@ export function createLocalWorld(args?: Partial): LocalWorld { const mergedConfig = { ...config.value, ...definedArgs }; const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); - const storage = createStorage(mergedConfig.dataDir, tag); + let limits: World['limits'] | undefined; + const storage = createStorage(mergedConfig.dataDir, tag, { + getLimits: () => limits, + queue, + }); + limits = createLimits(mergedConfig.dataDir, { + tag, + storage, + }); return { - limits: createLimits(mergedConfig.dataDir, { - tag, - queue, - storage, - }), + limits, ...queue, ...storage, ...createStreamer(mergedConfig.dataDir, tag), diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 8b301c2d00..5ff9129908 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,8 +1,12 @@ -import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; -import { createLocalWorld } from './index.js'; import { mkdtemp, readFile, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; +import { LimitDefinitionConflictError } from '@workflow/errors'; +import { describe, expect, it } from 'vitest'; +import { SPEC_VERSION_CURRENT, createLockCorrelationId } from 
'@workflow/world'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; createLimitsContractSuite('local world limits', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); @@ -55,3 +59,183 @@ createLimitsContractSuite('local world limits', async () => { }, }; }); + +describe('local world limit retry timing', () => { + it('persists nextWaiter metadata and emits lock_waiter_queued on release', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const world = createLocalWorld({ dataDir: dir }); + world.registerHandler('__wkf_workflow_', async () => + Response.json({ ok: true }) + ); + + try { + const runA = ( + await world.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName: 'holder-a', + input: [], + }, + }) + ).run; + const runB = ( + await world.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName: 'holder-b', + input: [], + }, + }) + ).run; + if (!runA || !runB) { + throw new Error('expected runs'); + } + const correlationA = createLockCorrelationId(runA.runId, 0); + const correlationB = createLockCorrelationId(runB.runId, 0); + + const first = await world.events.create(runA.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + const second = await world.events.create(runB.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationB, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + + 
expect(first.event?.eventType).toBe('lock_acquired'); + expect(second.event?.eventType).toBe('lock_created'); + + const released = await world.events.create(runA.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + }); + + expect(released.event?.eventType).toBe('lock_release'); + if (!released.event || released.event.eventType !== 'lock_release') { + throw new Error('expected lock_release event'); + } + expect(released.event.eventData?.nextWaiter).toMatchObject({ + runId: runB.runId, + lockIndex: 0, + lockCorrelationId: correlationB, + }); + + const correlated = await world.events.listByCorrelationId({ + correlationId: correlationB, + }); + expect( + correlated.data.some( + (event) => event.eventType === 'lock_waiter_queued' + ) + ).toBe(true); + } finally { + await world.close?.(); + await rm(dir, { recursive: true, force: true }); + } + }); + + it('throws when the same key is acquired with a conflicting definition', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 10, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 }, + }, + leaseTtlMs: 10, + }) + ).rejects.toBeInstanceOf(LimitDefinitionConflictError); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it('uses the head waiter retryAfter for backlog-only waiters', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + const key = 'shared-key'; + const periodMs = 5_000; + + const acquired = await limits.acquire({ + key, + runId: 'run-a', + lockIndex: 0, + definition: { + 
rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(acquired.status).toBe('acquired'); + + await new Promise((resolve) => setTimeout(resolve, 25)); + + const headWaiter = await limits.acquire({ + key, + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(headWaiter.status).toBe('blocked'); + if (headWaiter.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(headWaiter.retryAfterMs).toBeGreaterThan(0); + + const backlogOnlyWaiter = await limits.acquire({ + key, + runId: 'run-c', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 10, + }); + expect(backlogOnlyWaiter.status).toBe('blocked'); + if (backlogOnlyWaiter.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(backlogOnlyWaiter.retryAfterMs).toBeGreaterThan(0); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 896b9ad3d6..2a3be12023 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,14 +1,21 @@ import path from 'node:path'; -import { WorkflowWorldError } from '@workflow/errors'; -import type { Queue, Storage, WorkflowRunWithoutData } from '@workflow/world'; import { + LimitDefinitionConflictError, + WorkflowWorldError, +} from '@workflow/errors'; +import type { Storage, WorkflowRunWithoutData } from '@workflow/world'; +import { + createLockCorrelationId, createLockId, createLockWakeCorrelationId, + type LimitDefinition, LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, type LimitLease, LimitLeaseSchema, + type LimitNextWaiter, + type LimitReleaseResult, LimitReleaseRequestSchema, type Limits, parseLockId, @@ -38,13 +45,24 @@ const LimitWaiterSchema = z.object({ const KeyStateSchema = z.object({ key: z.string(), + definition: z + .object({ + concurrency: 
z.object({ max: z.number().int().positive() }).optional(), + rate: z + .object({ + count: z.number().int().positive(), + periodMs: z.number().int().positive(), + }) + .optional(), + }) + .optional(), leases: z.array(LimitLeaseSchema), tokens: z.array(LimitTokenSchema), waiters: z.array(LimitWaiterSchema), }); const LimitsStateSchema = z.object({ - version: z.literal(2), + version: z.union([z.literal(2), z.literal(3)]), keys: z.record(z.string(), KeyStateSchema), }); @@ -57,7 +75,8 @@ type HolderTarget = | { kind: 'lock'; runId: string; - correlationId: string; + wakeCorrelationId: string; + lockCorrelationId: string; } | { kind: 'opaque'; @@ -65,12 +84,11 @@ type HolderTarget = export interface LocalLimitsOptions { tag?: string; - queue?: Pick; storage?: Pick; } const EMPTY_STATE: LimitsState = { - version: 2, + version: 3, keys: {}, }; @@ -89,6 +107,7 @@ function cloneWaiter(waiter: LimitWaiter): LimitWaiter { function normalizeKeyState(keyState: KeyState): KeyState { return { key: keyState.key, + definition: keyState.definition, leases: keyState.leases.map((lease) => ({ ...lease })), tokens: keyState.tokens.map(cloneToken), waiters: keyState.waiters.map(cloneWaiter), @@ -97,7 +116,7 @@ function normalizeKeyState(keyState: KeyState): KeyState { function cloneState(state: LimitsState): LimitsState { return { - version: 2, + version: 3, keys: Object.fromEntries( Object.entries(state.keys).map(([key, keyState]) => [ key, @@ -110,6 +129,7 @@ function cloneState(state: LimitsState): LimitsState { function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { return { key: keyState.key, + definition: keyState.definition, leases: keyState.leases.filter( (lease) => lease.expiresAt === undefined || lease.expiresAt.getTime() > now @@ -119,6 +139,32 @@ function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { }; } +function areLimitDefinitionsEqual( + left: LimitDefinition | undefined, + right: LimitDefinition +): boolean { + return ( + 
left?.concurrency?.max === right.concurrency?.max && + left?.rate?.count === right.rate?.count && + left?.rate?.periodMs === right.rate?.periodMs + ); +} + +function assertCanonicalDefinition( + key: string, + keyState: KeyState, + requested: LimitDefinition +) { + if (!keyState.definition) { + keyState.definition = requested; + return; + } + + if (!areLimitDefinitionsEqual(keyState.definition, requested)) { + throw new LimitDefinitionConflictError(key, keyState.definition, requested); + } +} + function getBlockedReason( concurrencyBlocked: boolean, rateBlocked: boolean @@ -157,6 +203,35 @@ function getRetryAfterMs( return Math.min(...candidates); } +function getWaiterRetryAfterMs( + keyState: KeyState, + now: number, + waiter: Pick +): number | undefined { + return getRetryAfterMs( + keyState, + now, + waiter.concurrencyMax !== null && + keyState.leases.length >= waiter.concurrencyMax, + waiter.rateCount !== null && keyState.tokens.length >= waiter.rateCount + ); +} + +function getBlockedRetryAfterMs( + keyState: KeyState, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const headWaiter = keyState.waiters[0]; + return ( + (headWaiter + ? getWaiterRetryAfterMs(keyState, now, headWaiter) + : undefined) ?? 
+ getRetryAfterMs(keyState, now, concurrencyBlocked, rateBlocked) + ); +} + function createLease( key: string, runId: string, @@ -200,7 +275,11 @@ function parseHolderId(lockId: string): HolderTarget { return { kind: 'lock', runId: parsedLockId.runId, - correlationId: createLockWakeCorrelationId( + wakeCorrelationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + lockCorrelationId: createLockCorrelationId( parsedLockId.runId, parsedLockId.lockIndex ), @@ -210,6 +289,26 @@ function parseHolderId(lockId: string): HolderTarget { return { kind: 'opaque' }; } +function toNextWaiter(holderId: string): LimitNextWaiter | undefined { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { + return undefined; + } + + return { + runId: parsedLockId.runId, + lockIndex: parsedLockId.lockIndex, + wakeCorrelationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + lockCorrelationId: createLockCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; +} + function isTerminalRun(run: WorkflowRunWithoutData | undefined) { return !run || ['completed', 'failed', 'cancelled'].includes(run.status); } @@ -275,104 +374,67 @@ export function createLimits( return !isTerminalRun(run); }; - const queueWakeForHolder = async (holderId: string): Promise => { - const target = parseHolderId(holderId); - if (target.kind === 'opaque' || !options?.queue || !options?.storage) { - return; - } + const pruneDeadWaiters = async (keyState: KeyState): Promise => { + const prunedKeyState = pruneKeyState(keyState); + const waiters: LimitWaiter[] = []; - try { - const run = await getRun(target.runId); - if (isTerminalRun(run) || !run) return; - - await options.queue.queue( - `__wkf_workflow_${run.workflowName}`, - { - runId: target.runId, - requestedAt: new Date(), - }, - { - idempotencyKey: target.correlationId, - } - ); - } catch (error) { - console.warn('[world-local] Failed to queue lock wake-up', error); + for 
(const waiter of prunedKeyState.waiters) { + if (await isHolderLive(waiter.lockId)) { + waiters.push(waiter); + } } + + prunedKeyState.waiters = waiters; + return prunedKeyState; }; - const promoteWaiters = async ( + const promoteWaiter = ( key: string, - keyState: KeyState - ): Promise<{ keyState: KeyState; wakeHolders: string[] }> => { - const wakeHolders: string[] = []; - const promotedKeyState = pruneKeyState(keyState); - const remainingWaiters: LimitWaiter[] = []; - let activeLeases = promotedKeyState.leases.length; - let activeTokens = promotedKeyState.tokens.length; - - for (let index = 0; index < promotedKeyState.waiters.length; index++) { - const waiter = promotedKeyState.waiters[index]; - - if (!(await isHolderLive(waiter.lockId))) { - continue; - } - - const concurrencyBlocked = - waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; - const rateBlocked = - waiter.rateCount !== null && activeTokens >= waiter.rateCount; - - if (concurrencyBlocked || rateBlocked) { - remainingWaiters.push( - waiter, - ...promotedKeyState.waiters.slice(index + 1) - ); - promotedKeyState.waiters = remainingWaiters; - return { keyState: promotedKeyState, wakeHolders }; - } + keyState: KeyState, + waiter: LimitWaiter + ): { + keyState: KeyState; + lease: LimitLease; + nextWaiter?: LimitNextWaiter; + } => { + const acquiredAt = new Date(); + const definition = { + concurrency: + waiter.concurrencyMax !== null + ? { max: waiter.concurrencyMax } + : undefined, + rate: + waiter.rateCount !== null && waiter.ratePeriodMs !== null + ? { + count: waiter.rateCount, + periodMs: waiter.ratePeriodMs, + } + : undefined, + } satisfies LimitDefinition; + + const lease = createLease( + key, + waiter.runId, + waiter.lockIndex, + definition, + acquiredAt, + waiter.leaseTtlMs + ); - const acquiredAt = new Date(); - const definition = { - concurrency: - waiter.concurrencyMax !== null - ? 
{ max: waiter.concurrencyMax } - : undefined, - rate: - waiter.rateCount !== null && waiter.ratePeriodMs !== null - ? { - count: waiter.rateCount, - periodMs: waiter.ratePeriodMs, - } - : undefined, - }; - - promotedKeyState.leases.push( - createLease( - key, - waiter.runId, - waiter.lockIndex, - definition, - acquiredAt, - waiter.leaseTtlMs - ) - ); - activeLeases += 1; - - if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { - insertToken( - promotedKeyState, - waiter.lockId, - acquiredAt, - waiter.ratePeriodMs - ); - activeTokens += 1; - } + keyState.waiters = keyState.waiters.filter( + (candidate) => candidate.waiterId !== waiter.waiterId + ); + keyState.leases.push(lease); - wakeHolders.push(waiter.lockId); + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + insertToken(keyState, waiter.lockId, acquiredAt, waiter.ratePeriodMs); } - promotedKeyState.waiters = remainingWaiters; - return { keyState: promotedKeyState, wakeHolders }; + return { + keyState, + lease, + nextWaiter: toNextWaiter(waiter.lockId), + }; }; return { @@ -382,18 +444,23 @@ export function createLimits( return withStateLock(async (): Promise => { const state = cloneState(await readState()); - const baseKeyState = pruneKeyState( + const keyState = await pruneDeadWaiters( state.keys[parsed.key] ?? 
{ key: parsed.key, + definition: undefined, leases: [], tokens: [], waiters: [], } ); - const { keyState, wakeHolders } = await promoteWaiters( - parsed.key, - baseKeyState - ); + if ( + keyState.leases.length === 0 && + keyState.tokens.length === 0 && + keyState.waiters.length === 0 + ) { + keyState.definition = undefined; + } + assertCanonicalDefinition(parsed.key, keyState, parsed.definition); state.keys[parsed.key] = keyState; const existingLease = keyState.leases.find( @@ -401,7 +468,6 @@ export function createLimits( ); if (existingLease) { await writeState(state); - await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', lease: existingLease, @@ -418,6 +484,25 @@ export function createLimits( (waiter) => waiter.lockId === lockId ); + if ( + existingWaiter && + keyState.waiters[0]?.waiterId === existingWaiter.waiterId + ) { + if (!concurrencyBlocked && !rateBlocked) { + const promoted = promoteWaiter( + parsed.key, + keyState, + existingWaiter + ); + state.keys[parsed.key] = promoted.keyState; + await writeState(state); + return { + status: 'acquired', + lease: promoted.lease, + }; + } + } + if ( existingWaiter || concurrencyBlocked || @@ -440,11 +525,10 @@ export function createLimits( state.keys[parsed.key] = keyState; await writeState(state); - await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'blocked', reason: getBlockedReason(concurrencyBlocked, rateBlocked), - retryAfterMs: getRetryAfterMs( + retryAfterMs: getBlockedRetryAfterMs( keyState, Date.now(), concurrencyBlocked, @@ -476,7 +560,6 @@ export function createLimits( state.keys[parsed.key] = keyState; await writeState(state); - await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', @@ -488,12 +571,12 @@ export function createLimits( async release(request) { const parsed = LimitReleaseRequestSchema.parse(request); - await withStateLock(async () => { + return withStateLock(async (): Promise => { const state = 
cloneState(await readState()); - const wakeHolders: string[] = []; + let nextWaiter: LimitNextWaiter | undefined; for (const [key, keyStateValue] of Object.entries(state.keys)) { - const keyState = pruneKeyState(keyStateValue); + const keyState = await pruneDeadWaiters(keyStateValue); const beforeLeases = keyState.leases.length; keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; @@ -505,9 +588,25 @@ export function createLimits( }); if (keyState.leases.length !== beforeLeases) { - const promoted = await promoteWaiters(key, keyState); - state.keys[key] = promoted.keyState; - wakeHolders.push(...promoted.wakeHolders); + const headWaiter = keyState.waiters[0]; + if (headWaiter) { + const concurrencyBlocked = + headWaiter.concurrencyMax !== null && + keyState.leases.length >= headWaiter.concurrencyMax; + const rateBlocked = + headWaiter.rateCount !== null && + keyState.tokens.length >= headWaiter.rateCount; + + if (!concurrencyBlocked && !rateBlocked) { + const promoted = promoteWaiter(key, keyState, headWaiter); + nextWaiter = promoted.nextWaiter; + state.keys[key] = promoted.keyState; + } else { + state.keys[key] = keyState; + } + } else { + state.keys[key] = keyState; + } } else { state.keys[key] = keyState; } @@ -516,7 +615,7 @@ export function createLimits( } await writeState(state); - await Promise.all(wakeHolders.map(queueWakeForHolder)); + return { nextWaiter }; }); }, diff --git a/packages/world-local/src/queue.test.ts b/packages/world-local/src/queue.test.ts index f07677fe49..e96ed16695 100644 --- a/packages/world-local/src/queue.test.ts +++ b/packages/world-local/src/queue.test.ts @@ -10,6 +10,7 @@ const stepPayload: StepInvokePayload = { }; describe('queue timeout re-enqueue', () => { + const maxSetTimeoutDelayMs = 2_147_483_647; let localQueue: ReturnType; beforeEach(() => { @@ -133,4 +134,55 @@ describe('queue timeout re-enqueue', () => { expect(seenStepIds).toEqual(['step_replacement']); }); + + 
it('does not fire long delayed messages before the setTimeout max delay elapses', async () => { + let callCount = 0; + const delaySeconds = Math.ceil((maxSetTimeoutDelayMs + 5_000) / 1000); + const remainingDelayMs = delaySeconds * 1000 - maxSetTimeoutDelayMs; + const handler = localQueue.createQueueHandler('__wkf_step_', async () => { + callCount++; + return undefined; + }); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + delaySeconds, + }); + + await vi.advanceTimersByTimeAsync(maxSetTimeoutDelayMs); + expect(callCount).toBe(0); + + await vi.advanceTimersByTimeAsync(remainingDelayMs); + expect(callCount).toBe(1); + }); + + it('replaces chunked long-delay deliveries with an immediate idempotent wake-up', async () => { + const seenStepIds: string[] = []; + const handler = localQueue.createQueueHandler( + '__wkf_step_', + async (body) => { + seenStepIds.push((body as StepInvokePayload).stepId); + return undefined; + } + ); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + idempotencyKey: 'step_very_delayed', + delaySeconds: Math.ceil((maxSetTimeoutDelayMs + 5_000) / 1000), + }); + await localQueue.queue( + '__wkf_step_test' as any, + { ...stepPayload, stepId: 'step_immediate_replacement' }, + { + idempotencyKey: 'step_very_delayed', + } + ); + + await vi.runAllTimersAsync(); + + expect(seenStepIds).toEqual(['step_immediate_replacement']); + }); }); diff --git a/packages/world-local/src/queue.ts b/packages/world-local/src/queue.ts index c356730daf..c45e80e986 100644 --- a/packages/world-local/src/queue.ts +++ b/packages/world-local/src/queue.ts @@ -16,6 +16,7 @@ const DEFAULT_CONCURRENCY_LIMIT = 1000; const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_CONCURRENCY ?? 
'0', 10) || DEFAULT_CONCURRENCY_LIMIT; +const MAX_SET_TIMEOUT_DELAY_MS = 2_147_483_647; export type DirectHandler = (req: Request) => Promise; @@ -101,13 +102,18 @@ export function createQueue(config: Partial): LocalQueue { return; } + const timeoutMs = Math.min(delayMs, MAX_SET_TIMEOUT_DELAY_MS); message.timer = globalThis.setTimeout(() => { if (message.version !== version || closed) { return; } message.timer = undefined; + if (delayMs > MAX_SET_TIMEOUT_DELAY_MS) { + scheduleExecution(message, delayMs - MAX_SET_TIMEOUT_DELAY_MS); + return; + } enqueueRun(); - }, delayMs); + }, timeoutMs); }; const deliverMessage = async ( @@ -270,6 +276,10 @@ export function createQueue(config: Partial): LocalQueue { if (opts?.idempotencyKey) { const existing = scheduledMessages.get(opts.idempotencyKey); if (existing) { + if (existing.running) { + return { messageId: existing.messageId }; + } + existing.queueName = queueName; existing.body = body; existing.headers = opts.headers; diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index c4d0497e83..4389208c09 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -12,6 +12,8 @@ import type { Event, EventResult, Hook, + Limits, + Queue, SerializedData, Step, Storage, @@ -65,14 +67,38 @@ async function deleteAllWaitsForRun( } } +async function listEventsByCorrelationId( + basedir: string, + correlationId: string +): Promise { + const result = await paginatedFileSystemQuery({ + directory: path.join(basedir, 'events'), + schema: EventSchema, + filter: (event) => event.correlationId === correlationId, + sortOrder: 'asc', + getCreatedAt: getObjectCreatedAt('evnt'), + getId: (event) => event.eventId, + }); + + return result.data; +} + /** * Creates the events storage implementation using the filesystem. * Implements the Storage['events'] interface with create, list, and listByCorrelationId operations. 
*/ export function createEventsStorage( basedir: string, - tag?: string + tag?: string, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; + } ): Storage['events'] { + const isLeaseLive = (lease: { expiresAt?: Date }) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now(); + return { async create(runId, data, params): Promise { const eventId = `evnt_${monotonicUlid()}`; @@ -204,7 +230,11 @@ export function createEventsStorage( if ( data.eventType === 'step_created' || data.eventType === 'hook_created' || - data.eventType === 'wait_created' + data.eventType === 'wait_created' || + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' || + data.eventType === 'lock_waiter_queued' ) { throw new EntityConflictError( `Cannot create new entities on run in terminal state "${currentRun.status}"` @@ -273,7 +303,7 @@ export function createEventsStorage( throw new HookNotFoundError(data.correlationId); } } - const event: Event = { + let event: Event = { ...data, runId: effectiveRunId, eventId, @@ -287,6 +317,264 @@ export function createEventsStorage( let hook: Hook | undefined; let wait: Wait | undefined; + if ( + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' + ) { + const limits = options?.getLimits?.(); + if (!limits) { + throw new WorkflowWorldError( + `Flow limits are not configured for event type "${data.eventType}"` + ); + } + + const existingEvents = await listEventsByCorrelationId( + basedir, + data.correlationId + ); + const existingCreatedEvent = existingEvents.find( + (event) => event.eventType === 'lock_created' + ); + const existingAcquiredEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_acquired'); + const existingReleaseEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_release'); + + if (data.eventType === 
'lock_created') { + const existingEvent = + existingReleaseEvent ?? + (existingAcquiredEvent?.eventData?.lease && + isLeaseLive(existingAcquiredEvent.eventData.lease) + ? existingAcquiredEvent + : undefined) ?? + existingCreatedEvent; + if (existingEvent) { + const resolveData = + params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const result = await limits.acquire({ + key: data.eventData.key, + runId: effectiveRunId, + lockIndex: Number.parseInt( + data.correlationId.split(':').at(-1) ?? '0', + 10 + ), + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + }); + const eventCreatedAt = new Date(); + + event = + result.status === 'acquired' + ? EventSchema.parse({ + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }) + : EventSchema.parse({ + eventType: 'lock_created', + correlationId: data.correlationId, + eventData: { + key: data.eventData.key, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + acquireAt: + result.retryAfterMs !== undefined + ? new Date( + eventCreatedAt.getTime() + result.retryAfterMs + ) + : undefined, + }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } else if (data.eventType === 'lock_acquired') { + if (existingReleaseEvent) { + const resolveData = + params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingReleaseEvent, resolveData), + run, + step, + hook, + wait, + }; + } + if ( + existingAcquiredEvent?.eventData?.lease && + isLeaseLive(existingAcquiredEvent.eventData.lease) + ) { + const resolveData = + params?.resolveData ?? 
DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingAcquiredEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const createdEvent = existingCreatedEvent; + if (!createdEvent || !createdEvent.eventData) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be acquired before lock_created` + ); + } + + const result = await limits.acquire({ + key: createdEvent.eventData.key, + runId: effectiveRunId, + lockIndex: Number.parseInt( + data.correlationId.split(':').at(-1) ?? '0', + 10 + ), + definition: createdEvent.eventData.definition, + leaseTtlMs: createdEvent.eventData.leaseTtlMs, + }); + if (result.status !== 'acquired') { + const retryAfter = + result.retryAfterMs !== undefined + ? new Date(Date.now() + result.retryAfterMs) + : undefined; + throw new TooEarlyError( + `Lock "${data.correlationId}" is not ready to acquire`, + { retryAfter } + ); + } + const eventCreatedAt = new Date(); + + event = EventSchema.parse({ + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } else { + if (existingReleaseEvent) { + const resolveData = + params?.resolveData ?? 
DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(existingReleaseEvent, resolveData), + run, + step, + hook, + wait, + }; + } + + const acquiredEvent = existingAcquiredEvent; + const lease = acquiredEvent?.eventData?.lease; + if (!lease) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be released before lock_acquired` + ); + } + + const releaseResult = await limits.release({ + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }); + const eventCreatedAt = new Date(); + + event = EventSchema.parse({ + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + nextWaiter: releaseResult.nextWaiter, + }, + runId: effectiveRunId, + eventId, + createdAt: eventCreatedAt, + specVersion: effectiveSpecVersion, + }); + } + + const compositeKey = `${effectiveRunId}-${eventId}`; + await writeJSON( + taggedPath(basedir, 'events', compositeKey, tag), + event + ); + + if ( + event.eventType === 'lock_release' && + event.eventData?.nextWaiter && + options?.queue && + options?.runs + ) { + const nextRun = await options.runs.get( + event.eventData.nextWaiter.runId, + { + resolveData: 'none', + } + ); + if (!['completed', 'failed', 'cancelled'].includes(nextRun.status)) { + await options.queue.queue( + `__wkf_workflow_${nextRun.workflowName}`, + { + runId: event.eventData.nextWaiter.runId, + lockPreApproval: event.eventData.nextWaiter.lockCorrelationId, + requestedAt: new Date(), + }, + { + idempotencyKey: event.eventData.nextWaiter.wakeCorrelationId, + } + ); + + const waiterQueuedEvent = EventSchema.parse({ + eventType: 'lock_waiter_queued', + correlationId: event.eventData.nextWaiter.lockCorrelationId, + runId: event.eventData.nextWaiter.runId, + eventId: `evnt_${monotonicUlid()}`, + createdAt: new Date(), + specVersion: effectiveSpecVersion, + }); + await writeJSON( + taggedPath( + basedir, + 'events', + 
`${waiterQueuedEvent.runId}-${waiterQueuedEvent.eventId}`, + tag + ), + waiterQueuedEvent + ); + } + } + + const resolveData = params?.resolveData ?? DEFAULT_RESOLVE_DATA_OPTION; + return { + event: stripEventDataRefs(event, resolveData), + run, + step, + hook, + wait, + }; + } + // Create/update entity based on event type (event-sourced architecture) // Run lifecycle events if (data.eventType === 'run_created' && 'eventData' in data) { diff --git a/packages/world-local/src/storage/index.ts b/packages/world-local/src/storage/index.ts index e5304e0104..ac21a408ce 100644 --- a/packages/world-local/src/storage/index.ts +++ b/packages/world-local/src/storage/index.ts @@ -1,10 +1,16 @@ -import type { Storage } from '@workflow/world'; +import type { Limits, Queue, Storage } from '@workflow/world'; import { instrumentObject } from '../instrumentObject.js'; import { createEventsStorage } from './events-storage.js'; import { createHooksStorage } from './hooks-storage.js'; import { createRunsStorage } from './runs-storage.js'; import { createStepsStorage } from './steps-storage.js'; +export interface LocalStorageOptions { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; +} + /** * Creates a complete storage implementation using the filesystem. * This is the main entry point that composes all storage implementations. 
@@ -14,12 +20,19 @@ import { createStepsStorage } from './steps-storage.js'; * @param basedir - The base directory for storing workflow data * @returns A complete Storage implementation with tracing */ -export function createStorage(basedir: string, tag?: string): Storage { - // Create raw storage implementations +export function createStorage( + basedir: string, + tag?: string, + options?: LocalStorageOptions +): Storage { + const runs = createRunsStorage(basedir, tag); const storage: Storage = { - runs: createRunsStorage(basedir, tag), + runs, steps: createStepsStorage(basedir, tag), - events: createEventsStorage(basedir, tag), + events: createEventsStorage(basedir, tag, { + ...options, + runs, + }), hooks: createHooksStorage(basedir, tag), }; diff --git a/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql index 01892d0bfe..dcb6198c4c 100644 --- a/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql +++ b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql @@ -1,20 +1,16 @@ -CREATE TABLE "workflow"."workflow_limit_leases" ( - "lease_id" varchar PRIMARY KEY NOT NULL, - "limit_key" varchar NOT NULL, - "holder_id" varchar NOT NULL, - "acquired_at" timestamp DEFAULT now() NOT NULL, - "expires_at" timestamp, +CREATE TABLE "workflow"."workflow_limit_keys" ( + "limit_key" varchar PRIMARY KEY NOT NULL, "concurrency_max" integer, "rate_count" integer, "rate_period_ms" integer ); --> statement-breakpoint -CREATE TABLE "workflow"."workflow_limit_tokens" ( - "token_id" varchar PRIMARY KEY NOT NULL, +CREATE TABLE "workflow"."workflow_limit_leases" ( + "lease_id" varchar PRIMARY KEY NOT NULL, "limit_key" varchar NOT NULL, "holder_id" varchar NOT NULL, "acquired_at" timestamp DEFAULT now() NOT NULL, - "expires_at" timestamp NOT NULL + "expires_at" timestamp ); --> statement-breakpoint CREATE TABLE "workflow"."workflow_limit_waiters" ( @@ -22,14 
+18,22 @@ CREATE TABLE "workflow"."workflow_limit_waiters" ( "limit_key" varchar NOT NULL, "holder_id" varchar NOT NULL, "created_at" timestamp DEFAULT now() NOT NULL, - "lease_ttl_ms" integer, - "concurrency_max" integer, - "rate_count" integer, - "rate_period_ms" integer + "lease_ttl_ms" integer +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_rate_limit_tokens" ( + "token_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp NOT NULL ); --> statement-breakpoint +ALTER TABLE "workflow"."workflow_limit_leases" ADD CONSTRAINT "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "workflow"."workflow_limit_waiters" ADD CONSTRAINT "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint +ALTER TABLE "workflow"."workflow_rate_limit_tokens" ADD CONSTRAINT "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk" FOREIGN KEY ("limit_key") REFERENCES "workflow"."workflow_limit_keys"("limit_key") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint CREATE UNIQUE INDEX "workflow_limit_leases_limit_key_holder_id_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","holder_id");--> statement-breakpoint CREATE INDEX "workflow_limit_leases_limit_key_expires_at_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","expires_at");--> statement-breakpoint -CREATE INDEX "workflow_limit_tokens_limit_key_expires_at_index" ON "workflow"."workflow_limit_tokens" USING btree ("limit_key","expires_at");--> statement-breakpoint CREATE UNIQUE INDEX 
"workflow_limit_waiters_limit_key_holder_id_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","holder_id");--> statement-breakpoint CREATE INDEX "workflow_limit_waiters_limit_key_created_at_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","created_at");--> statement-breakpoint +CREATE INDEX "workflow_rate_limit_tokens_limit_key_expires_at_index" ON "workflow"."workflow_rate_limit_tokens" USING btree ("limit_key","expires_at");--> statement-breakpoint diff --git a/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json index 97ddba3774..e6be10d9f3 100644 --- a/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json +++ b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json @@ -1,5 +1,5 @@ { - "id": "c4af56df-d588-4810-a8b4-f4eb68b270b2", + "id": "c3c21664-f021-4db5-be29-7c2991e325eb", "prevId": "7adbbd35-ca90-4353-bb34-3d1b2435a027", "version": "7", "dialect": "postgresql", @@ -209,6 +209,43 @@ "checkConstraints": {}, "isRLSEnabled": false }, + "workflow.workflow_limit_keys": { + "name": "workflow_limit_keys", + "schema": "workflow", + "columns": { + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, "workflow.workflow_limit_leases": { "name": "workflow_limit_leases", "schema": "workflow", @@ -243,24 +280,6 @@ "type": "timestamp", 
"primaryKey": false, "notNull": false - }, - "concurrency_max": { - "name": "concurrency_max", - "type": "integer", - "primaryKey": false, - "notNull": false - }, - "rate_count": { - "name": "rate_count", - "type": "integer", - "primaryKey": false, - "notNull": false - }, - "rate_period_ms": { - "name": "rate_period_ms", - "type": "integer", - "primaryKey": false, - "notNull": false } }, "indexes": { @@ -307,19 +326,30 @@ "with": {} } }, - "foreignKeys": {}, + "foreignKeys": { + "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_limit_leases_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_limit_leases", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, "compositePrimaryKeys": {}, "uniqueConstraints": {}, "policies": {}, "checkConstraints": {}, "isRLSEnabled": false }, - "workflow.workflow_limit_tokens": { - "name": "workflow_limit_tokens", + "workflow.workflow_limit_waiters": { + "name": "workflow_limit_waiters", "schema": "workflow", "columns": { - "token_id": { - "name": "token_id", + "waiter_id": { + "name": "waiter_id", "type": "varchar", "primaryKey": true, "notNull": true @@ -336,23 +366,23 @@ "primaryKey": false, "notNull": true }, - "acquired_at": { - "name": "acquired_at", + "created_at": { + "name": "created_at", "type": "timestamp", "primaryKey": false, "notNull": true, "default": "now()" }, - "expires_at": { - "name": "expires_at", - "type": "timestamp", + "lease_ttl_ms": { + "name": "lease_ttl_ms", + "type": "integer", "primaryKey": false, - "notNull": true + "notNull": false } }, "indexes": { - "workflow_limit_tokens_limit_key_expires_at_index": { - "name": "workflow_limit_tokens_limit_key_expires_at_index", + "workflow_limit_waiters_limit_key_holder_id_index": { + "name": "workflow_limit_waiters_limit_key_holder_id_index", "columns": [ { "expression": 
"limit_key", @@ -361,7 +391,28 @@ "nulls": "last" }, { - "expression": "expires_at", + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_waiters_limit_key_created_at_index": { + "name": "workflow_limit_waiters_limit_key_created_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", "isExpression": false, "asc": true, "nulls": "last" @@ -373,19 +424,30 @@ "with": {} } }, - "foreignKeys": {}, + "foreignKeys": { + "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_limit_waiters_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_limit_waiters", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, "compositePrimaryKeys": {}, "uniqueConstraints": {}, "policies": {}, "checkConstraints": {}, "isRLSEnabled": false }, - "workflow.workflow_limit_waiters": { - "name": "workflow_limit_waiters", + "workflow.workflow_rate_limit_tokens": { + "name": "workflow_rate_limit_tokens", "schema": "workflow", "columns": { - "waiter_id": { - "name": "waiter_id", + "token_id": { + "name": "token_id", "type": "varchar", "primaryKey": true, "notNull": true @@ -402,41 +464,23 @@ "primaryKey": false, "notNull": true }, - "created_at": { - "name": "created_at", + "acquired_at": { + "name": "acquired_at", "type": "timestamp", "primaryKey": false, "notNull": true, "default": "now()" }, - "lease_ttl_ms": { - "name": "lease_ttl_ms", - "type": "integer", - "primaryKey": false, - "notNull": false - }, - "concurrency_max": { - "name": "concurrency_max", - "type": "integer", - "primaryKey": false, - "notNull": false - }, - "rate_count": { - "name": "rate_count", - "type": 
"integer", - "primaryKey": false, - "notNull": false - }, - "rate_period_ms": { - "name": "rate_period_ms", - "type": "integer", + "expires_at": { + "name": "expires_at", + "type": "timestamp", "primaryKey": false, - "notNull": false + "notNull": true } }, "indexes": { - "workflow_limit_waiters_limit_key_holder_id_index": { - "name": "workflow_limit_waiters_limit_key_holder_id_index", + "workflow_rate_limit_tokens_limit_key_expires_at_index": { + "name": "workflow_rate_limit_tokens_limit_key_expires_at_index", "columns": [ { "expression": "limit_key", @@ -445,28 +489,7 @@ "nulls": "last" }, { - "expression": "holder_id", - "isExpression": false, - "asc": true, - "nulls": "last" - } - ], - "isUnique": true, - "concurrently": false, - "method": "btree", - "with": {} - }, - "workflow_limit_waiters_limit_key_created_at_index": { - "name": "workflow_limit_waiters_limit_key_created_at_index", - "columns": [ - { - "expression": "limit_key", - "isExpression": false, - "asc": true, - "nulls": "last" - }, - { - "expression": "created_at", + "expression": "expires_at", "isExpression": false, "asc": true, "nulls": "last" @@ -478,7 +501,18 @@ "with": {} } }, - "foreignKeys": {}, + "foreignKeys": { + "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk": { + "name": "workflow_rate_limit_tokens_limit_key_workflow_limit_keys_limit_key_fk", + "tableFrom": "workflow_rate_limit_tokens", + "tableTo": "workflow_limit_keys", + "schemaTo": "workflow", + "columnsFrom": ["limit_key"], + "columnsTo": ["limit_key"], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, "compositePrimaryKeys": {}, "uniqueConstraints": {}, "policies": {}, diff --git a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json index e98c400c01..43f0daa548 100644 --- a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json +++ b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json @@ -75,7 
+75,7 @@ { "idx": 10, "version": "7", - "when": 1773863098757, + "when": 1774917672940, "tag": "0010_add_flow_limits", "breakpoints": true } diff --git a/packages/world-postgres/src/drizzle/schema.ts b/packages/world-postgres/src/drizzle/schema.ts index b6e8205237..8ff82f7d9b 100644 --- a/packages/world-postgres/src/drizzle/schema.ts +++ b/packages/world-postgres/src/drizzle/schema.ts @@ -193,17 +193,25 @@ export const waits = schema.table( (tb) => [index().on(tb.runId)] ); +export const limitKeys = schema.table('workflow_limit_keys', { + limitKey: varchar('limit_key').primaryKey(), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), +}); + export const limitLeases = schema.table( 'workflow_limit_leases', { leaseId: varchar('lease_id').primaryKey(), - limitKey: varchar('limit_key').notNull(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + onDelete: 'cascade', + }) + .notNull(), holderId: varchar('holder_id').notNull(), acquiredAt: timestamp('acquired_at').defaultNow().notNull(), expiresAt: timestamp('expires_at'), - concurrencyMax: integer('concurrency_max'), - rateCount: integer('rate_count'), - ratePeriodMs: integer('rate_period_ms'), }, (tb) => [ uniqueIndex().on(tb.limitKey, tb.holderId), @@ -211,11 +219,15 @@ export const limitLeases = schema.table( ] ); -export const limitTokens = schema.table( - 'workflow_limit_tokens', +export const rateLimitTokens = schema.table( + 'workflow_rate_limit_tokens', { tokenId: varchar('token_id').primaryKey(), - limitKey: varchar('limit_key').notNull(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + onDelete: 'cascade', + }) + .notNull(), holderId: varchar('holder_id').notNull(), acquiredAt: timestamp('acquired_at').defaultNow().notNull(), expiresAt: timestamp('expires_at').notNull(), @@ -227,13 +239,14 @@ export const limitWaiters = schema.table( 'workflow_limit_waiters', { waiterId: 
varchar('waiter_id').primaryKey(), - limitKey: varchar('limit_key').notNull(), + limitKey: varchar('limit_key') + .references(() => limitKeys.limitKey, { + onDelete: 'cascade', + }) + .notNull(), holderId: varchar('holder_id').notNull(), createdAt: timestamp('created_at').defaultNow().notNull(), leaseTtlMs: integer('lease_ttl_ms'), - concurrencyMax: integer('concurrency_max'), - rateCount: integer('rate_count'), - ratePeriodMs: integer('rate_period_ms'), }, (tb) => [ uniqueIndex().on(tb.limitKey, tb.holderId), diff --git a/packages/world-postgres/src/index.ts b/packages/world-postgres/src/index.ts index ad1a4c0028..cf59d254e3 100644 --- a/packages/world-postgres/src/index.ts +++ b/packages/world-postgres/src/index.ts @@ -1,5 +1,5 @@ import type { Socket } from 'node:net'; -import type { Storage, World } from '@workflow/world'; +import type { Limits, Queue, Storage, World } from '@workflow/world'; import createPostgres from 'postgres'; import type { PostgresWorldConfig } from './config.js'; import { createClient, type Drizzle } from './drizzle/index.js'; @@ -13,10 +13,20 @@ import { } from './storage.js'; import { createStreamer } from './streamer.js'; -function createStorage(drizzle: Drizzle): Storage { +function createStorage( + drizzle: Drizzle, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + } +): Storage { + const runs = createRunsStorage(drizzle); return { - runs: createRunsStorage(drizzle), - events: createEventsStorage(drizzle), + runs, + events: createEventsStorage(drizzle, { + ...options, + runs, + }), hooks: createHooksStorage(drizzle), steps: createStepsStorage(drizzle), }; @@ -36,9 +46,13 @@ export function createWorld( const postgres = createPostgres(config.connectionString); const drizzle = createClient(postgres); const queue = createQueue(config, postgres); - const storage = createStorage(drizzle); const streamer = createStreamer(postgres, drizzle); - const limits = createLimits(config, drizzle); + let limits: Limits | 
undefined; + const storage = createStorage(drizzle, { + getLimits: () => limits, + queue, + }); + limits = createLimits(config, drizzle); return { limits, diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 44ab39f16e..bfa21e47e0 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,4 +1,6 @@ -import { afterAll, beforeAll, beforeEach, test } from 'vitest'; +import { afterAll, beforeAll, beforeEach, expect, test, vi } from 'vitest'; +import { LimitDefinitionConflictError } from '@workflow/errors'; +import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; import { createLimits } from './limits.js'; import { @@ -30,6 +32,26 @@ if (process.platform === 'win32') { await db.truncateLimits(); }); + async function createLockOwner(workflowName: string, lockIndex = 0) { + const events = createEventsStorage(db.drizzle); + const result = await events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName, + input: [], + }, + }); + if (!result.run) { + throw new Error('expected run'); + } + return { + runId: result.run.runId, + lockIndex, + }; + } + afterAll(async () => { await db?.close(); }); @@ -61,7 +83,7 @@ if (process.platform === 'win32') { `, db.sql<{ lockId: string }[]>` select holder_id as "lockId" - from workflow.workflow_limit_tokens + from workflow.workflow_rate_limit_tokens where limit_key = ${key} order by acquired_at asc, holder_id asc `, @@ -75,4 +97,205 @@ if (process.platform === 'win32') { }, }; }); + + test('uses the head waiter retryAfter for waiters queued behind a long rate window', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + const key = 
'workflow:fifo:head-waiter-rate'; + const periodMs = 60_000; + const ownerA = await createLockOwner('holder-a'); + const ownerB = await createLockOwner('holder-b'); + const ownerC = await createLockOwner('holder-c'); + + const first = await limits.acquire({ + key, + runId: ownerA.runId, + lockIndex: ownerA.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + lockId: first.lease.lockId, + }); + + const headWaiter = await limits.acquire({ + key, + runId: ownerB.runId, + lockIndex: ownerB.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(headWaiter.status).toBe('blocked'); + if (headWaiter.status !== 'blocked') throw new Error('expected blocked'); + + const behindHead = await limits.acquire({ + key, + runId: ownerC.runId, + lockIndex: ownerC.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(behindHead.status).toBe('blocked'); + if (behindHead.status !== 'blocked') throw new Error('expected blocked'); + expect(behindHead.retryAfterMs).toBeGreaterThan(5_000); + + const existingWaiterRetry = await limits.acquire({ + key, + runId: ownerC.runId, + lockIndex: ownerC.lockIndex, + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + expect(existingWaiterRetry.status).toBe('blocked'); + if (existingWaiterRetry.status !== 'blocked') { + throw new Error('expected blocked'); + } + expect(existingWaiterRetry.retryAfterMs).toBeGreaterThan(5_000); + }); + + test('persists nextWaiter metadata and emits lock_waiter_queued on release', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + const runs = createRunsStorage(db.drizzle); + const queue = { queue: 
vi.fn().mockResolvedValue(undefined) }; + const events = createEventsStorage(db.drizzle, { + getLimits: () => limits, + queue, + runs, + }); + const ownerA = await createLockOwner('holder-a'); + const ownerB = await createLockOwner('holder-b'); + const correlationA = createLockCorrelationId( + ownerA.runId, + ownerA.lockIndex + ); + const correlationB = createLockCorrelationId( + ownerB.runId, + ownerB.lockIndex + ); + + const first = await events.create(ownerA.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + const second = await events.create(ownerB.runId, { + eventType: 'lock_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationB, + eventData: { + key: 'workflow:user:test', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }, + }); + + expect(first.event?.eventType).toBe('lock_acquired'); + expect(second.event?.eventType).toBe('lock_created'); + + const released = await events.create(ownerA.runId, { + eventType: 'lock_release', + specVersion: SPEC_VERSION_CURRENT, + correlationId: correlationA, + }); + + if (!released.event || released.event.eventType !== 'lock_release') { + throw new Error('expected lock_release event'); + } + expect(released.event?.eventData?.nextWaiter).toMatchObject({ + runId: ownerB.runId, + lockIndex: ownerB.lockIndex, + lockCorrelationId: correlationB, + }); + expect(queue.queue).toHaveBeenCalledWith( + '__wkf_workflow_holder-b', + expect.objectContaining({ + runId: ownerB.runId, + lockPreApproval: correlationB, + }), + expect.objectContaining({ + idempotencyKey: expect.any(String), + }) + ); + + const correlated = await events.listByCorrelationId({ + correlationId: correlationB, + }); + expect( + correlated.data.some((event) => event.eventType === 'lock_waiter_queued') + ).toBe(true); + }); + + test('throws when the same key 
is acquired with a conflicting definition', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 1_000, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 }, + }, + leaseTtlMs: 1_000, + }) + ).rejects.toBeInstanceOf(LimitDefinitionConflictError); + }); + + test('does not resurrect an expired lease when heartbeating after the key lock', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const first = await limits.acquire({ + key: 'workflow:user:heartbeat-expired', + runId: 'run-a', + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 50, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await new Promise((resolve) => setTimeout(resolve, 75)); + + await expect( + limits.heartbeat({ + leaseId: first.lease.leaseId, + }) + ).rejects.toMatchObject({ + name: 'WorkflowWorldError', + message: expect.stringContaining('not found'), + }); + }); } diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 22220ff8e3..76b0fd47b0 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -1,54 +1,34 @@ -import { JsonTransport } from '@vercel/queue'; import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; -import { WorkflowWorldError } from '@workflow/errors'; +import { + LimitDefinitionConflictError, + WorkflowWorldError, +} from '@workflow/errors'; import { createLockId, - createLockWakeCorrelationId, + type LimitDefinition, 
LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, type LimitLease, + type LimitNextWaiter, + type LimitReleaseResult, LimitReleaseRequestSchema, type Limits, - MessageId, parseLockId, } from '@workflow/world'; import { monotonicFactory } from 'ulid'; import type { PostgresWorldConfig } from './config.js'; import type { Drizzle } from './drizzle/index.js'; import * as Schema from './drizzle/schema.js'; -import { MessageData } from './message.js'; type LeaseRow = typeof Schema.limitLeases.$inferSelect; -type TokenRow = typeof Schema.limitTokens.$inferSelect; +type LimitKeyRow = typeof Schema.limitKeys.$inferSelect; +type TokenRow = typeof Schema.rateLimitTokens.$inferSelect; type WaiterRow = typeof Schema.limitWaiters.$inferSelect; -type RunRow = Pick< - typeof Schema.runs.$inferSelect, - 'workflowName' | 'startedAt' | 'status' ->; type Tx = Parameters[0]>[0]; type Db = Drizzle | Tx; - -type HolderTarget = - | { - kind: 'lock'; - runId: string; - correlationId: string; - } - | { - kind: 'opaque'; - }; - -const transport = new JsonTransport(); const generateId = monotonicFactory(); -function getQueues(config: PostgresWorldConfig) { - const prefix = config.jobPrefix || 'workflow_'; - return { - workflow: `${prefix}flows`, - } as const; -} - function nowPlus(ms?: number): Date | undefined { if (ms === undefined) return undefined; return new Date(Date.now() + ms); @@ -64,27 +44,7 @@ function toMillis(value: Date | string | null | undefined): number | undefined { return date ? date.getTime() : undefined; } -/* -Holder ids double as wake-up hints. -When a waiter is promoted, we decode the holder id to decide which queue to poke. 
-*/ -function parseHolderId(lockId: string): HolderTarget { - const parsedLockId = parseLockId(lockId); - if (parsedLockId) { - return { - kind: 'lock', - runId: parsedLockId.runId, - correlationId: createLockWakeCorrelationId( - parsedLockId.runId, - parsedLockId.lockIndex - ), - }; - } - - return { kind: 'opaque' }; -} - -function toLease(row: LeaseRow): LimitLease { +function toLease(row: LeaseRow, definition: LimitDefinition): LimitLease { const parsedLockId = parseLockId(row.holderId); return { leaseId: row.leaseId, @@ -94,17 +54,45 @@ function toLease(row: LeaseRow): LimitLease { lockIndex: parsedLockId?.lockIndex ?? 0, acquiredAt: toDate(row.acquiredAt)!, expiresAt: toDate(row.expiresAt), - definition: { - concurrency: - row.concurrencyMax !== null ? { max: row.concurrencyMax } : undefined, - rate: - row.rateCount !== null && row.ratePeriodMs !== null - ? { - count: row.rateCount, - periodMs: row.ratePeriodMs, - } - : undefined, - }, + definition, + }; +} + +function definitionFromRow( + row: Pick +): LimitDefinition { + return { + concurrency: + row.concurrencyMax !== null ? { max: row.concurrencyMax } : undefined, + rate: + row.rateCount !== null && row.ratePeriodMs !== null + ? 
{ count: row.rateCount, periodMs: row.ratePeriodMs } + : undefined, + }; +} + +function areLimitDefinitionsEqual( + left: LimitDefinition | undefined, + right: LimitDefinition +): boolean { + return ( + left?.concurrency?.max === right.concurrency?.max && + left?.rate?.count === right.rate?.count && + left?.rate?.periodMs === right.rate?.periodMs + ); +} + +function toNextWaiter(holderId: string): LimitNextWaiter | undefined { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { + return undefined; + } + + return { + runId: parsedLockId.runId, + lockIndex: parsedLockId.lockIndex, + wakeCorrelationId: `wflock_wait_${parsedLockId.runId}:${parsedLockId.lockIndex}`, + lockCorrelationId: `wflock_${parsedLockId.runId}:${parsedLockId.lockIndex}`, }; } @@ -148,71 +136,47 @@ function getRetryAfterMs( return Math.min(...candidates); } -async function queueWorkflowWake( - tx: Db, - config: PostgresWorldConfig, - runId: string, - workflowName: string, - idempotencyKey: string -) { - const messageId = MessageId.parse(`msg_${generateId()}`); - const payload = MessageData.encode({ - id: workflowName, - data: Buffer.from( - transport.serialize({ - runId, - requestedAt: new Date(), - }) - ), - attempt: 1, - idempotencyKey, - messageId, - }); - - await tx.execute(sql` - select graphile_worker.add_job( - ${getQueues(config).workflow}::text, - payload := ${JSON.stringify(payload)}::json, - max_attempts := 3, - job_key := ${idempotencyKey}::text, - job_key_mode := 'replace' - ) - `); +function getWaiterRetryAfterMs( + leases: LeaseRow[], + tokens: TokenRow[], + now: number, + definition: LimitDefinition +): number | undefined { + return getRetryAfterMs( + leases, + tokens, + now, + definition.concurrency !== undefined && + leases.length >= definition.concurrency.max, + definition.rate !== undefined && tokens.length >= definition.rate.count + ); } -async function queueWakeForHolder( - tx: Db, - config: PostgresWorldConfig, - holderId: string -) { - /* - Limit state is 
durable in Postgres, but wake-ups still need a runtime target. - If the workflow is already terminal, there is nothing left to resume. - */ - const target = parseHolderId(holderId); - if (target.kind === 'opaque') { - return; - } - - const [run] = (await tx - .select({ - workflowName: Schema.runs.workflowName, - startedAt: Schema.runs.startedAt, - status: Schema.runs.status, - }) - .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) - .limit(1)) as RunRow[]; - if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { - return; - } - - await queueWorkflowWake( - tx, - config, - target.runId, - run.workflowName, - target.correlationId +function getBlockedRetryAfterMs( + state: { + keyRow?: LimitKeyRow; + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; + }, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number { + const headWaiter = state.waiters[0]; + const definition = state.keyRow ? definitionFromRow(state.keyRow) : undefined; + return ( + (headWaiter && definition + ? getWaiterRetryAfterMs(state.leases, state.tokens, now, definition) + : undefined) ?? + getRetryAfterMs( + state.leases, + state.tokens, + now, + concurrencyBlocked, + rateBlocked + ) ?? 
+ 1000 ); } @@ -224,11 +188,11 @@ async function pruneExpired(tx: Db, key: string): Promise { const now = new Date(); await tx - .delete(Schema.limitTokens) + .delete(Schema.rateLimitTokens) .where( and( - eq(Schema.limitTokens.limitKey, key), - lte(Schema.limitTokens.expiresAt, now) + eq(Schema.rateLimitTokens.limitKey, key), + lte(Schema.rateLimitTokens.expiresAt, now) ) ); @@ -247,11 +211,15 @@ async function getActiveState( tx: Db, key: string ): Promise<{ + keyRow?: LimitKeyRow; leases: LeaseRow[]; tokens: TokenRow[]; waiters: WaiterRow[]; }> { - const [leases, tokens, waiters] = await Promise.all([ + const [keyRow, leases, tokens, waiters] = await Promise.all([ + tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, key), + }), tx .select() .from(Schema.limitLeases) @@ -262,9 +230,9 @@ async function getActiveState( ), tx .select() - .from(Schema.limitTokens) - .where(eq(Schema.limitTokens.limitKey, key)) - .orderBy(asc(Schema.limitTokens.expiresAt)), + .from(Schema.rateLimitTokens) + .where(eq(Schema.rateLimitTokens.limitKey, key)) + .orderBy(asc(Schema.rateLimitTokens.expiresAt)), tx .select() .from(Schema.limitWaiters) @@ -275,7 +243,7 @@ async function getActiveState( ), ]); - return { leases, tokens, waiters }; + return { keyRow, leases, tokens, waiters }; } /* @@ -289,8 +257,8 @@ async function lockLimitKey(tx: Db, key: string): Promise { } async function isHolderLive(tx: Db, holderId: string): Promise { - const target = parseHolderId(holderId); - if (target.kind === 'opaque') { + const parsedLockId = parseLockId(holderId); + if (!parsedLockId) { return true; } @@ -299,94 +267,134 @@ async function isHolderLive(tx: Db, holderId: string): Promise { status: Schema.runs.status, }) .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) + .where(eq(Schema.runs.runId, parsedLockId.runId)) .limit(1)) as Pick[]; return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); } -async function promoteWaiters( - tx: Db, - 
config: PostgresWorldConfig, - key: string -): Promise { - /* - We walk waiters in FIFO order and stop at the first waiter that is still blocked. - Later waiters cannot jump ahead of an earlier waiter for the same key. (getActiveState returns waiters in FIFO order) - */ - const state = await getActiveState(tx, key); - let activeLeases = state.leases.length; - let activeTokens = state.tokens.length; +async function pruneDeadWaiters(tx: Db, key: string): Promise { + const waiters = await tx + .select({ + waiterId: Schema.limitWaiters.waiterId, + holderId: Schema.limitWaiters.holderId, + }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, key)); - for (const waiter of state.waiters) { + for (const waiter of waiters) { if (!(await isHolderLive(tx, waiter.holderId))) { await tx .delete(Schema.limitWaiters) .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); - continue; } + } +} - const concurrencyBlocked = - waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; - const rateBlocked = - waiter.rateCount !== null && activeTokens >= waiter.rateCount; +async function ensureCanonicalDefinition( + tx: Db, + key: string, + requested: LimitDefinition, + state: { + keyRow?: LimitKeyRow; + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; + } +) { + const existing = state.keyRow; + + if ( + existing && + state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ) { + await tx.delete(Schema.limitKeys).where(eq(Schema.limitKeys.limitKey, key)); + } - if (concurrencyBlocked || rateBlocked) { - break; - } + const current = + existing && + state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ? undefined + : (existing ?? + (await tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, key), + }))); + + if (!current) { + await tx.insert(Schema.limitKeys).values({ + limitKey: key, + concurrencyMax: requested.concurrency?.max ?? 
null, + rateCount: requested.rate?.count ?? null, + ratePeriodMs: requested.rate?.periodMs ?? null, + }); + return; + } - const leaseId = `lmt_${generateId()}`; - const expiresAt = nowPlus(waiter.leaseTtlMs ?? undefined); - const [lease] = await tx - .insert(Schema.limitLeases) - .values({ - leaseId, - limitKey: key, - holderId: waiter.holderId, - acquiredAt: new Date(), - expiresAt, - concurrencyMax: waiter.concurrencyMax, - rateCount: waiter.rateCount, - ratePeriodMs: waiter.ratePeriodMs, - }) - .onConflictDoNothing() - .returning(); - - const acquiredLease = - lease ?? - (await tx.query.limitLeases.findFirst({ - where: and( - eq(Schema.limitLeases.limitKey, key), - eq(Schema.limitLeases.holderId, waiter.holderId) - ), - })); - - if (!acquiredLease) { - continue; - } + const currentDefinition = definitionFromRow(current); + if (!areLimitDefinitionsEqual(currentDefinition, requested)) { + throw new LimitDefinitionConflictError(key, currentDefinition, requested); + } +} - if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { - await tx.insert(Schema.limitTokens).values({ - tokenId: `lmttok_${generateId()}`, - limitKey: key, - holderId: waiter.holderId, - acquiredAt: new Date(), - expiresAt: new Date(Date.now() + waiter.ratePeriodMs), - }); - activeTokens += 1; - } +async function promoteWaiter( + tx: Db, + key: string, + waiter: WaiterRow, + definition: LimitDefinition +): Promise<{ lease: LimitLease; nextWaiter?: LimitNextWaiter }> { + const leaseId = `lmt_${generateId()}`; + const expiresAt = nowPlus(waiter.leaseTtlMs ?? undefined); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt, + }) + .onConflictDoNothing() + .returning(); + + const acquiredLease = + lease ?? 
+ (await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.limitKey, key), + eq(Schema.limitLeases.holderId, waiter.holderId) + ), + })); - await tx - .delete(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + if (!acquiredLease) { + throw new WorkflowWorldError(`Failed to promote waiter for key "${key}"`); + } - activeLeases += 1; - await queueWakeForHolder(tx, config, acquiredLease.holderId); + if (definition.rate) { + await tx.insert(Schema.rateLimitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + definition.rate.periodMs), + }); } + + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + + return { + lease: toLease(acquiredLease, definition), + nextWaiter: toNextWaiter(waiter.holderId), + }; } export function createLimits( - config: PostgresWorldConfig, + _config: PostgresWorldConfig, drizzle: Drizzle ): Limits { return { @@ -395,57 +403,93 @@ export function createLimits( return drizzle.transaction(async (tx) => { await lockLimitKey(tx, parsed.key); - // Prune expired leases and tokens, promote pre-existing waiters before attempting to acquire a new lease or token. 
await pruneExpired(tx, parsed.key); - await promoteWaiters(tx, config, parsed.key); + await pruneDeadWaiters(tx, parsed.key); const state = await getActiveState(tx, parsed.key); + await ensureCanonicalDefinition( + tx, + parsed.key, + parsed.definition, + state + ); + const currentState = await getActiveState(tx, parsed.key); + const definition = + currentState.keyRow && definitionFromRow(currentState.keyRow); const lockId = createLockId(parsed.runId, parsed.lockIndex); - const existingLease = state.leases.find( + const existingLease = currentState.leases.find( (lease) => lease.holderId === lockId ); if (existingLease) { + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${parsed.key}"` + ); + } return { status: 'acquired', - lease: toLease(existingLease), + lease: toLease(existingLease, definition), } satisfies LimitAcquireResult; } - const existingWaiter = state.waiters.find( + const existingWaiter = currentState.waiters.find( (waiter) => waiter.holderId === lockId ); - // If there are already waiters for this key and holder no need to queue a new waiter. 
if (existingWaiter) { - const now = Date.now(); const concurrencyBlocked = parsed.definition.concurrency !== undefined && - state.leases.length >= parsed.definition.concurrency.max; + currentState.leases.length >= parsed.definition.concurrency.max; const rateBlocked = parsed.definition.rate !== undefined && - state.tokens.length >= parsed.definition.rate.count; + currentState.tokens.length >= parsed.definition.rate.count; + + if ( + currentState.waiters[0]?.waiterId === existingWaiter.waiterId && + !concurrencyBlocked && + !rateBlocked + ) { + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${parsed.key}"` + ); + } + const promoted = await promoteWaiter( + tx, + parsed.key, + existingWaiter, + definition + ); + return { + status: 'acquired', + lease: promoted.lease, + } satisfies LimitAcquireResult; + } + + const now = Date.now(); return { status: 'blocked', reason: getBlockedReason(concurrencyBlocked, rateBlocked), - retryAfterMs: - getRetryAfterMs( - state.leases, - state.tokens, - now, - concurrencyBlocked, - rateBlocked - ) ?? 1000, + retryAfterMs: getBlockedRetryAfterMs( + currentState, + now, + concurrencyBlocked, + rateBlocked + ), } satisfies LimitAcquireResult; } const concurrencyBlocked = parsed.definition.concurrency !== undefined && - state.leases.length >= parsed.definition.concurrency.max; + currentState.leases.length >= parsed.definition.concurrency.max; const rateBlocked = parsed.definition.rate !== undefined && - state.tokens.length >= parsed.definition.rate.count; + currentState.tokens.length >= parsed.definition.rate.count; - // If we are not blocked, and there are no waiters for this key and holder, we can acquire a new lease or token. 
- if (!concurrencyBlocked && !rateBlocked && state.waiters.length === 0) { + if ( + !concurrencyBlocked && + !rateBlocked && + currentState.waiters.length === 0 + ) { const expiresAt = nowPlus(parsed.leaseTtlMs); const [lease] = await tx .insert(Schema.limitLeases) @@ -455,14 +499,11 @@ export function createLimits( holderId: lockId, acquiredAt: new Date(), expiresAt, - concurrencyMax: parsed.definition.concurrency?.max ?? null, - rateCount: parsed.definition.rate?.count ?? null, - ratePeriodMs: parsed.definition.rate?.periodMs ?? null, }) .returning(); if (parsed.definition.rate) { - await tx.insert(Schema.limitTokens).values({ + await tx.insert(Schema.rateLimitTokens).values({ tokenId: `lmttok_${generateId()}`, limitKey: parsed.key, holderId: lockId, @@ -473,11 +514,10 @@ export function createLimits( return { status: 'acquired', - lease: toLease(lease), + lease: toLease(lease, definition ?? parsed.definition), } satisfies LimitAcquireResult; } - // If we are blocked, we need to queue a waiter. await tx .insert(Schema.limitWaiters) .values({ @@ -486,9 +526,6 @@ export function createLimits( holderId: lockId, createdAt: new Date(), leaseTtlMs: parsed.leaseTtlMs ?? null, - concurrencyMax: parsed.definition.concurrency?.max ?? null, - rateCount: parsed.definition.rate?.count ?? null, - ratePeriodMs: parsed.definition.rate?.periodMs ?? null, }) .onConflictDoNothing(); @@ -496,14 +533,12 @@ export function createLimits( return { status: 'blocked', reason: getBlockedReason(concurrencyBlocked, rateBlocked), - retryAfterMs: - getRetryAfterMs( - state.leases, - state.tokens, - now, - parsed.definition.concurrency !== undefined, - parsed.definition.rate !== undefined - ) ?? 
1000, + retryAfterMs: getBlockedRetryAfterMs( + currentState, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ), } satisfies LimitAcquireResult; }); }, @@ -511,7 +546,7 @@ export function createLimits( async release(request) { const parsed = LimitReleaseRequestSchema.parse(request); - await drizzle.transaction(async (tx) => { + return drizzle.transaction(async (tx): Promise => { const key = parsed.key ?? ( @@ -536,12 +571,54 @@ export function createLimits( const [deleted] = await tx .delete(Schema.limitLeases) .where(where) - .returning({ limitKey: Schema.limitLeases.limitKey }); + .returning({ + limitKey: Schema.limitLeases.limitKey, + holderId: Schema.limitLeases.holderId, + }); if (deleted?.limitKey) { await pruneExpired(tx, deleted.limitKey); - await promoteWaiters(tx, config, deleted.limitKey); + await pruneDeadWaiters(tx, deleted.limitKey); + const state = await getActiveState(tx, deleted.limitKey); + const headWaiter = state.waiters[0]; + + if (headWaiter) { + const definition = state.keyRow && definitionFromRow(state.keyRow); + if (!definition) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${deleted.limitKey}"` + ); + } + const concurrencyBlocked = + definition.concurrency !== undefined && + state.leases.length >= definition.concurrency.max; + const rateBlocked = + definition.rate !== undefined && + state.tokens.length >= definition.rate.count; + + if (!concurrencyBlocked && !rateBlocked) { + const promoted = await promoteWaiter( + tx, + deleted.limitKey, + headWaiter, + definition + ); + return { nextWaiter: promoted.nextWaiter }; + } + } + + if ( + state.leases.length === 0 && + state.tokens.length === 0 && + state.waiters.length === 0 + ) { + await tx + .delete(Schema.limitKeys) + .where(eq(Schema.limitKeys.limitKey, deleted.limitKey)); + } } + + return {}; }); }, @@ -559,9 +636,21 @@ export function createLimits( } await lockLimitKey(tx, existing.limitKey); + await 
pruneExpired(tx, existing.limitKey); + + const current = await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.leaseId, parsed.leaseId), + eq(Schema.limitLeases.limitKey, existing.limitKey) + ), + }); + + if (!current) { + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + } const now = Date.now(); - const currentExpiry = toMillis(existing.expiresAt); + const currentExpiry = toMillis(current.expiresAt); const ttlMs = parsed.ttlMs ?? (currentExpiry ? currentExpiry - now : 30_000); const expiresAt = new Date(now + Math.max(1, ttlMs)); @@ -572,7 +661,21 @@ export function createLimits( .where(eq(Schema.limitLeases.leaseId, parsed.leaseId)) .returning(); - return toLease(updated); + if (!updated) { + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); + } + + const keyRow = await tx.query.limitKeys.findFirst({ + where: eq(Schema.limitKeys.limitKey, current.limitKey), + }); + + if (!keyRow) { + throw new WorkflowWorldError( + `Missing canonical definition for key "${current.limitKey}"` + ); + } + + return toLease(updated, definitionFromRow(keyRow)); }); }, }; diff --git a/packages/world-postgres/src/queue.ts b/packages/world-postgres/src/queue.ts index f4ac31fb00..39a37fdc16 100644 --- a/packages/world-postgres/src/queue.ts +++ b/packages/world-postgres/src/queue.ts @@ -65,9 +65,10 @@ type HttpExecutionResult = * - `step` for step jobs * * When a message is queued, it is sent to graphile-worker with the appropriate job type. - * When a job is processed, it is deserialized and then re-queued into the _local world_, showing that - * we can reuse the local world, mix and match worlds to build - * hybrid architectures, and even migrate between worlds. + * When a job is processed, the worker POSTs the payload directly to the + * workflow HTTP endpoints. 
We reuse `world-local` only for its + * `createQueueHandler()` HTTP adapter so the request/response contract stays + * consistent across worlds; execution is not re-enqueued into the local queue. */ export type PostgresQueue = Queue & { start(): Promise; diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 882f7ec7d8..eb6988a849 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -4,17 +4,20 @@ import { RunExpiredError, RunNotSupportedError, TooEarlyError, - WorkflowWorldError, WorkflowRunNotFoundError, + WorkflowWorldError, } from '@workflow/errors'; import type { Event, EventResult, GetEventParams, Hook, + LimitLease, + Limits, ListEventsParams, ListHooksParams, PaginatedResponse, + Queue, ResolveData, Step, StepWithoutData, @@ -41,6 +44,26 @@ import { type Drizzle, Schema } from './drizzle/index.js'; import type { SerializedContent } from './drizzle/schema.js'; import { compact } from './util.js'; +function getAcquiredLease( + event: + | { + eventType: string; + eventData?: unknown; + } + | undefined +): LimitLease | undefined { + if (!event || event.eventType !== 'lock_acquired') { + return undefined; + } + + const data = event.eventData; + if (!data || typeof data !== 'object' || !('lease' in data)) { + return undefined; + } + + return (data as { lease?: LimitLease }).lease; +} + /** * Parse legacy errorJson (text column with JSON-stringified StructuredError). * Used for backwards compatibility when reading from deprecated error column. 
@@ -260,9 +283,18 @@ async function handleLegacyEventPostgres( } } -export function createEventsStorage(drizzle: Drizzle): Storage['events'] { +export function createEventsStorage( + drizzle: Drizzle, + options?: { + getLimits?: () => Limits | undefined; + queue?: Pick; + runs?: Pick; + } +): Storage['events'] { const ulid = monotonicFactory(); const { events } = Schema; + const isLeaseLive = (lease: { expiresAt?: Date }) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > Date.now(); // Prepared statements for validation queries (performance optimization) const getRunForValidation = drizzle @@ -458,7 +490,11 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { if ( data.eventType === 'step_created' || data.eventType === 'hook_created' || - data.eventType === 'wait_created' + data.eventType === 'wait_created' || + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' || + data.eventType === 'lock_waiter_queued' ) { throw new EntityConflictError( `Cannot create new entities on run in terminal state "${currentRun.status}"` @@ -528,6 +564,300 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } + if ( + data.eventType === 'lock_created' || + data.eventType === 'lock_acquired' || + data.eventType === 'lock_release' + ) { + const limits = options?.getLimits?.(); + if (!limits) { + throw new WorkflowWorldError( + `Flow limits are not configured for event type "${data.eventType}"` + ); + } + + const lockIndex = Number.parseInt( + data.correlationId.split(':').at(-1) ?? 
'0', + 10 + ); + const existingEvents = await drizzle + .select({ + eventType: Schema.events.eventType, + eventData: Schema.events.eventData, + createdAt: Schema.events.createdAt, + eventId: Schema.events.eventId, + }) + .from(Schema.events) + .where( + and( + eq(Schema.events.runId, effectiveRunId), + eq(Schema.events.correlationId, data.correlationId) + ) + ) + .orderBy(asc(Schema.events.createdAt), asc(Schema.events.eventId)); + const existingCreatedEvent = existingEvents.find( + (event) => event.eventType === 'lock_created' + ); + const existingAcquiredEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_acquired'); + const existingReleaseEvent = [...existingEvents] + .reverse() + .find((event) => event.eventType === 'lock_release'); + let eventToStore: + | { + eventType: 'lock_created' | 'lock_acquired' | 'lock_release'; + correlationId: string; + eventData?: unknown; + } + | undefined; + let eventToReturn: + | { + eventType: 'lock_created' | 'lock_acquired' | 'lock_release'; + correlationId: string; + eventData?: unknown; + createdAt: Date; + eventId: string; + } + | undefined; + + if (data.eventType === 'lock_created') { + const existingLeaseData = getAcquiredLease(existingAcquiredEvent); + const existingEvent = + existingReleaseEvent ?? + (existingLeaseData && isLeaseLive(existingLeaseData) + ? existingAcquiredEvent + : undefined) ?? + existingCreatedEvent; + if (existingEvent) { + eventToReturn = { + eventType: existingEvent.eventType as + | 'lock_created' + | 'lock_acquired' + | 'lock_release', + correlationId: data.correlationId, + eventData: existingEvent.eventData ?? 
undefined, + createdAt: existingEvent.createdAt, + eventId: existingEvent.eventId, + }; + } else { + const result = await limits.acquire({ + key: data.eventData.key, + runId: effectiveRunId, + lockIndex, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + }); + const eventCreatedAt = new Date(); + + eventToStore = + result.status === 'acquired' + ? { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + } + : { + eventType: 'lock_created', + correlationId: data.correlationId, + eventData: { + key: data.eventData.key, + definition: data.eventData.definition, + leaseTtlMs: data.eventData.leaseTtlMs, + acquireAt: + result.retryAfterMs !== undefined + ? new Date( + eventCreatedAt.getTime() + result.retryAfterMs + ) + : undefined, + }, + }; + } + } else if (data.eventType === 'lock_acquired') { + const existingLeaseData = getAcquiredLease(existingAcquiredEvent); + if (existingReleaseEvent) { + eventToReturn = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: existingReleaseEvent.eventData ?? undefined, + createdAt: existingReleaseEvent.createdAt, + eventId: existingReleaseEvent.eventId, + }; + } else if ( + existingAcquiredEvent && + existingLeaseData && + isLeaseLive(existingLeaseData) + ) { + eventToReturn = { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: existingAcquiredEvent.eventData ?? 
undefined, + createdAt: existingAcquiredEvent.createdAt, + eventId: existingAcquiredEvent.eventId, + }; + } else { + const createdEvent = existingCreatedEvent; + const createdData = createdEvent?.eventData as + | { + key: string; + definition: any; + leaseTtlMs?: number; + } + | undefined; + if (!createdData) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be acquired before lock_created` + ); + } + + const result = await limits.acquire({ + key: createdData.key, + runId: effectiveRunId, + lockIndex, + definition: createdData.definition, + leaseTtlMs: createdData.leaseTtlMs, + }); + if (result.status !== 'acquired') { + const retryAfter = + result.retryAfterMs !== undefined + ? new Date(Date.now() + result.retryAfterMs) + : undefined; + throw new TooEarlyError( + `Lock "${data.correlationId}" is not ready to acquire`, + { retryAfter } + ); + } + + eventToStore = { + eventType: 'lock_acquired', + correlationId: data.correlationId, + eventData: { lease: result.lease }, + }; + } + } else { + if (existingReleaseEvent) { + eventToReturn = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: existingReleaseEvent.eventData ?? 
undefined, + createdAt: existingReleaseEvent.createdAt, + eventId: existingReleaseEvent.eventId, + }; + } else { + const acquiredEvent = existingAcquiredEvent; + const lease = getAcquiredLease(acquiredEvent); + if (!lease) { + throw new WorkflowWorldError( + `Lock "${data.correlationId}" cannot be released before lock_acquired` + ); + } + + const releaseResult = await limits.release({ + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }); + + eventToStore = { + eventType: 'lock_release', + correlationId: data.correlationId, + eventData: { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + nextWaiter: releaseResult.nextWaiter, + }, + }; + } + } + + if (eventToReturn) { + const parsed = EventSchema.parse({ + eventType: eventToReturn.eventType, + correlationId: eventToReturn.correlationId, + eventData: eventToReturn.eventData, + createdAt: eventToReturn.createdAt, + runId: effectiveRunId, + eventId: eventToReturn.eventId, + specVersion: effectiveSpecVersion, + }); + const resolveData = params?.resolveData ?? 'all'; + return { + event: stripEventDataRefs(parsed, resolveData), + run, + step, + hook, + wait, + }; + } + if (!eventToStore) { + throw new WorkflowWorldError( + `Lock event "${data.eventType}" did not resolve for "${data.correlationId}"` + ); + } + + const [value] = await drizzle + .insert(Schema.events) + .values({ + runId: effectiveRunId, + eventId, + correlationId: eventToStore.correlationId, + eventType: eventToStore.eventType, + eventData: eventToStore.eventData as SerializedContent | undefined, + specVersion: effectiveSpecVersion, + }) + .returning({ createdAt: Schema.events.createdAt }); + + const parsed = EventSchema.parse({ + ...eventToStore, + ...value, + runId: effectiveRunId, + eventId, + }); + const resolveData = params?.resolveData ?? 
'all'; + if ( + parsed.eventType === 'lock_release' && + parsed.eventData?.nextWaiter && + options?.queue && + options?.runs + ) { + const nextRun = await options.runs.get( + parsed.eventData.nextWaiter.runId, + { + resolveData: 'none', + } + ); + if (!['completed', 'failed', 'cancelled'].includes(nextRun.status)) { + await options.queue.queue( + `__wkf_workflow_${nextRun.workflowName}`, + { + runId: parsed.eventData.nextWaiter.runId, + lockPreApproval: parsed.eventData.nextWaiter.lockCorrelationId, + requestedAt: new Date(), + }, + { + idempotencyKey: parsed.eventData.nextWaiter.wakeCorrelationId, + } + ); + + await drizzle.insert(Schema.events).values({ + runId: parsed.eventData.nextWaiter.runId, + eventId: `wevt_${ulid()}`, + correlationId: parsed.eventData.nextWaiter.lockCorrelationId, + eventType: 'lock_waiter_queued', + specVersion: effectiveSpecVersion, + }); + } + } + return { + event: stripEventDataRefs(parsed, resolveData), + run, + step, + hook, + wait, + }; + } + // ============================================================ // Entity creation/updates based on event type // ============================================================ diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index ef27f70052..333909dd52 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -42,8 +42,9 @@ export async function createPostgresTestDb(): Promise { async truncateLimits() { await sql` truncate table + workflow.workflow_limit_keys, workflow.workflow_limit_waiters, - workflow.workflow_limit_tokens, + workflow.workflow_rate_limit_tokens, workflow.workflow_limit_leases, workflow.workflow_steps, workflow.workflow_events, diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index f36c33c410..cf593399b5 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -258,6 +258,33 
@@ export function createLimitsContractSuite( expect(second.reason).toBe('rate'); expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + let secondRetry = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); + const deadline = Date.now() + periodMs + 1_000; + while (secondRetry.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, secondRetry.retryAfterMs ?? 0) + 50); + secondRetry = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); + } + + expect(secondRetry.status).toBe('acquired'); + if (secondRetry.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.release(releaseRequest(secondRetry.lease)); + let third = await harness.limits.acquire( acquireRequest( ownerC, @@ -266,8 +293,8 @@ export function createLimitsContractSuite( 1_000 ) ); - const deadline = Date.now() + periodMs + 1_000; - while (third.status === 'blocked' && Date.now() < deadline) { + const thirdDeadline = Date.now() + periodMs + 1_000; + while (third.status === 'blocked' && Date.now() < thirdDeadline) { await sleep(Math.max(25, third.retryAfterMs ?? 
0) + 50); third = await harness.limits.acquire( acquireRequest( diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index 807033e712..c6943bd675 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -28,6 +28,12 @@ type WorkflowRateLimitResult = { periodMs: number; }; +type ReleasedRateLimitReplayResult = { + elapsedMs: number; + periodMs: number; + sleepMs: number; +}; + type LeakedLockResult = { label: string; key: string; @@ -78,6 +84,11 @@ export interface LimitsRuntimeHarness { holdMs: number, periodMs: number ): Promise<[WorkflowRateLimitResult, WorkflowRateLimitResult]>; + runReleasedRateLimitReplay( + userId: string, + periodMs: number, + sleepMs: number + ): Promise; runWorkflowFifoThreeWaiters( userId: string, holdMs: number @@ -241,6 +252,17 @@ export function createLimitsRuntimeSuite( ).toBeGreaterThanOrEqual(Math.max(0, remainingWindowAfterRelease - 100)); }); + it('does not reacquire a released rate-only lock on later replay', async () => { + const harness = await createHarness(); + const result = await harness.runReleasedRateLimitReplay( + 'replay-user', + 6_000, + 100 + ); + + expect(result.elapsedMs).toBeLessThan(4_000); + }); + it('promotes 3 workflow waiters in FIFO order', async () => { const harness = await createHarness(); const [resultA, resultB, resultC] = diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index b2d30b6376..b2ef665fad 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -109,30 +109,27 @@ independent from flow limits like: - `step:db:cheap` - `step:provider:openai` -### 4. Use a sliding-window model for rate limits in v1 +### 4. Rate-limited waits are scheduled with `acquireAt` -The current rate-limit model is a sliding-window log model, not a token bucket. 
- -For a limit like: +For a rate limit like: - `rate: { count: 10, periodMs: 60_000 }` -the intended semantics are: - -- allow at most 10 successful acquires in the last 60 seconds -- each successful acquire records a timestamped rate usage entry -- rate capacity returns only when that entry ages out of the window - -This is simpler than a token bucket and matches the current local-world -implementation direction well. +the observable contract is: -Important distinction: +- blocked acquires receive an `acquireAt` time through `lock_created` +- a workflow retries `lock_acquired` only once that `acquireAt` has arrived, or + sooner if it is explicitly re-queued with lock pre-approval +- a historical `lock_acquired` is only valid while its lease is still live +- once the lease has expired, replay must ignore that old acquisition and + acquire again -- `lease`: active occupancy / ownership for a holder -- `token`: internal rate-usage record that remains until the rate window expires +The important distinction in the event log is: -Releasing a lease should free concurrency capacity immediately, but it should -not restore rate capacity until the associated rate usage entry expires. +- `lock_created`: reservation / retry scheduling information +- `lock_acquired`: proof that a live lease was actually granted +- `lock_release`: disposal of the granted lease, optionally with a nominated + next waiter to wake ### 5. Use one `lock()` API from workflow scope @@ -212,6 +209,16 @@ Important details: This gives deterministic and inspectable fairness for a key without requiring a global scheduler. +### 9.5. First writer wins for key configuration + +Each limit key has one canonical definition while it is live. + +- the first acquire for a key seeds that definition +- later acquires for the same key must match it exactly +- a mismatched definition is a hard error +- once a key fully drains, the canonical definition is forgotten and the next + acquire may seed a new one + ### 10. 
Blocked limits do not consume worker concurrency Blocked flow limits and worker concurrency are intentionally separate. @@ -236,7 +243,9 @@ Current behavior: - leases, rate tokens, and waiters live in world-owned limit state - promotion decisions are made from that limit state -- when a waiter is promoted, the runtime is woken by enqueuing the workflow job +- `lock_release` may nominate the next waiter to wake +- event storage is responsible for enqueuing that waiter with lock pre-approval + and then appending `lock_waiter_queued` for the waiter correlation - workflows also keep a delayed replay fallback so progress is still possible if an immediate wake-up is missed @@ -350,8 +359,8 @@ Two more practical clarifications: - a blocked workflow lock should not monopolize `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` or `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` just because it is waiting -- a released concurrency lease frees concurrency immediately, but associated - rate usage still remains counted until its token ages out of the rate window +- a released lease may nominate one waiter for prompt wake-up, but delayed retry + remains in place as the fallback path ## Open Questions @@ -360,4 +369,3 @@ Two more practical clarifications: - Whether `heartbeat()` should remain user-visible or become mostly internal. - Whether `lock()` should eventually grow optional metadata or config sugar for common per-step resource keys. -- Exact event-log representation for acquire/block/dispose transitions. 
diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index 2965906f7b..9fc6675e5a 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,4 +1,9 @@ import { z } from 'zod'; +import { + LimitDefinitionSchema, + LimitLeaseSchema, + LimitNextWaiterSchema, +} from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -74,6 +79,11 @@ export const EventTypeSchema = z.enum([ // Wait lifecycle events 'wait_created', 'wait_completed', + // Lock lifecycle events + 'lock_created', + 'lock_acquired', + 'lock_release', + 'lock_waiter_queued', ]); // Base event schema with common properties @@ -202,6 +212,45 @@ const WaitCompletedEventSchema = BaseEventSchema.extend({ correlationId: z.string(), }); +const LockCreatedEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_created'), + correlationId: z.string(), + eventData: z.object({ + key: z.string(), + definition: LimitDefinitionSchema, + leaseTtlMs: z.number().int().positive().optional(), + acquireAt: z.coerce.date().optional(), + }), +}); + +const LockAcquiredEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_acquired'), + correlationId: z.string(), + eventData: z + .object({ + lease: LimitLeaseSchema, + }) + .optional(), +}); + +const LockReleaseEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_release'), + correlationId: z.string(), + eventData: z + .object({ + leaseId: z.string().min(1), + key: z.string(), + lockId: z.string(), + nextWaiter: LimitNextWaiterSchema.optional(), + }) + .optional(), +}); + +const LockWaiterQueuedEventSchema = BaseEventSchema.extend({ + eventType: z.literal('lock_waiter_queued'), + correlationId: z.string(), +}); + // ============================================================================= // Run lifecycle events // ============================================================================= @@ -281,6 
+330,11 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ // Wait lifecycle events WaitCreatedEventSchema, WaitCompletedEventSchema, + // Lock lifecycle events + LockCreatedEventSchema, + LockAcquiredEventSchema, + LockReleaseEventSchema, + LockWaiterQueuedEventSchema, ]); // Discriminated union for ALL events (includes World-only events like hook_conflict) @@ -306,6 +360,11 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ // Wait lifecycle events WaitCreatedEventSchema, WaitCompletedEventSchema, + // Lock lifecycle events + LockCreatedEventSchema, + LockAcquiredEventSchema, + LockReleaseEventSchema, + LockWaiterQueuedEventSchema, ]); // Server response includes runId, eventId, and createdAt diff --git a/packages/world/src/index.ts b/packages/world/src/index.ts index 5e8f73d111..7a56a65b0a 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -13,6 +13,7 @@ export type * from './interfaces.js'; export type * from './limits.js'; export { createLockId, + createLockCorrelationId, createLockWakeCorrelationId, createLimitsNotImplementedError, LimitAcquireAcquiredResultSchema, @@ -27,7 +28,9 @@ export { LimitKeySchema, LimitLeaseSchema, LimitLockIdSchema, + LimitNextWaiterSchema, LimitRateSchema, + LimitReleaseResultSchema, LimitReleaseRequestSchema, LIMITS_NOT_IMPLEMENTED_MESSAGE, parseLockId, diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts index 495f29a84f..ad8d2b7af4 100644 --- a/packages/world/src/limits.ts +++ b/packages/world/src/limits.ts @@ -66,6 +66,13 @@ export function createLockWakeCorrelationId( return `wflock_wait_${runId}:${lockIndex}`; } +export function createLockCorrelationId( + runId: string, + lockIndex: number +): string { + return `wflock_${runId}:${lockIndex}`; +} + export const LimitLeaseSchema = z.object({ leaseId: z.string().min(1), key: LimitKeySchema, @@ -127,6 +134,19 @@ export const LimitReleaseRequestSchema = z.object({ }); export type LimitReleaseRequest = 
z.infer; +export const LimitNextWaiterSchema = z.object({ + runId: z.string().min(1), + lockIndex: z.number().int().nonnegative(), + wakeCorrelationId: z.string().min(1), + lockCorrelationId: z.string().min(1), +}); +export type LimitNextWaiter = z.infer; + +export const LimitReleaseResultSchema = z.object({ + nextWaiter: LimitNextWaiterSchema.optional(), +}); +export type LimitReleaseResult = z.infer; + export const LimitHeartbeatRequestSchema = z.object({ leaseId: z.string().min(1), ttlMs: z.number().int().positive().optional(), @@ -135,6 +155,6 @@ export type LimitHeartbeatRequest = z.infer; export interface Limits { acquire(request: LimitAcquireRequest): Promise; - release(request: LimitReleaseRequest): Promise; + release(request: LimitReleaseRequest): Promise; heartbeat(request: LimitHeartbeatRequest): Promise; } diff --git a/packages/world/src/queue.ts b/packages/world/src/queue.ts index 5093b62dd3..78eb23bbe8 100644 --- a/packages/world/src/queue.ts +++ b/packages/world/src/queue.ts @@ -23,6 +23,7 @@ export type TraceCarrier = z.infer; export const WorkflowInvokePayloadSchema = z.object({ runId: z.string(), + lockPreApproval: z.string().optional(), traceCarrier: TraceCarrierSchema.optional(), requestedAt: z.coerce.date().optional(), /** Number of times this message has been re-enqueued due to server errors (5xx) */ diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 1c9bd2ca0a..2c76998e2f 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -440,6 +440,31 @@ export async function workflowRateLimitContentionWorkflow( }; } +export async function releasedRateLimitReplayWorkflow( + userId = 'user-123', + periodMs = 6_000, + sleepMs = 100 +) { + 'use workflow'; + + const startedAt = Date.now(); + { + await using _releasedRateLimit = await lock({ + key: `workflow:replay-rate:${userId}`, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + } + + await 
sleep(sleepMs); + + return { + elapsedMs: Date.now() - startedAt, + periodMs, + sleepMs, + }; +} + export async function workflowMixedLimitContentionWorkflow( userId = 'user-123', holdMs = 250, From a0bdb8e0941d900c749fbe4395d6177083226af2 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Mon, 30 Mar 2026 22:16:17 -0400 Subject: [PATCH 18/34] Fix concurrent rustup target installs in CI Signed-off-by: nathancolosimo --- packages/swc-playground-wasm/build.js | 110 +++++++++++++++++++----- packages/swc-plugin-workflow/build.js | 116 +++++++++++++++++++++----- 2 files changed, 184 insertions(+), 42 deletions(-) diff --git a/packages/swc-playground-wasm/build.js b/packages/swc-playground-wasm/build.js index 2ac8b99a25..d9a41b1ee6 100644 --- a/packages/swc-playground-wasm/build.js +++ b/packages/swc-playground-wasm/build.js @@ -1,10 +1,16 @@ import { execSync } from 'node:child_process'; -import { existsSync } from 'node:fs'; +import { existsSync, mkdirSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; import { fileURLToPath } from 'node:url'; -function runCommand(command) { +function execCommand(command, options = {}) { + return execSync(command, { stdio: 'inherit', shell: true, ...options }); +} + +function runCommand(command, options = {}) { try { - execSync(command, { stdio: 'inherit', shell: true }); + execCommand(command, options); } catch (error) { console.error(`Command failed: ${command}: ${error}`); process.exit(1); @@ -52,31 +58,95 @@ function ensureRustup() { } } -console.log('Building swc-playground-wasm...'); - -ensureRustup(); +function sleepMs(ms) { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms); +} -// Check if wasm32-unknown-unknown target exists and install if needed -console.log('Checking wasm32-unknown-unknown target...'); -try { +function isRustTargetInstalled(target) { const installedTargets = execSync('rustup target list --installed', { stdio: 'pipe', shell: true, }).toString(); - 
if (!installedTargets.includes('wasm32-unknown-unknown')) { - console.log('wasm32-unknown-unknown target not found, installing...'); - runCommand('rustup target add wasm32-unknown-unknown'); - } else { - console.log('wasm32-unknown-unknown target already installed'); - } -} catch (error) { - console.error( - 'Failed to check/install wasm32-unknown-unknown target:', - error.message + return installedTargets.includes(target); +} + +function withTargetInstallLock(target, callback) { + const lockDir = path.join( + tmpdir(), + `workflow-rustup-target-${target.replaceAll(/[^a-z0-9_-]/gi, '-')}.lock` ); - process.exit(1); + const timeoutMs = 2 * 60 * 1000; + const startedAt = Date.now(); + + while (true) { + try { + mkdirSync(lockDir); + break; + } catch (error) { + if (error?.code !== 'EEXIST') { + throw error; + } + + if (Date.now() - startedAt > timeoutMs) { + throw new Error( + `Timed out waiting for rustup target install lock for ${target}` + ); + } + + console.log( + `Another process is installing ${target}; waiting for the lock...` + ); + sleepMs(1000); + } + } + + try { + return callback(); + } finally { + rmSync(lockDir, { recursive: true, force: true }); + } } +function ensureRustTarget(target) { + console.log(`Checking ${target} target...`); + + try { + if (isRustTargetInstalled(target)) { + console.log(`${target} target already installed`); + return; + } + + withTargetInstallLock(target, () => { + if (isRustTargetInstalled(target)) { + console.log(`${target} target was installed by another process`); + return; + } + + console.log(`${target} target not found, installing...`); + try { + execCommand(`rustup target add ${target}`); + } catch (error) { + if (isRustTargetInstalled(target)) { + console.warn( + `${target} target appears installed after a rustup error; continuing` + ); + return; + } + throw error; + } + }); + } catch (error) { + console.error(`Failed to check/install ${target} target:`, error.message); + process.exit(1); + } +} + +console.log('Building 
swc-playground-wasm...'); + +ensureRustup(); + +ensureRustTarget('wasm32-unknown-unknown'); + // Check if wasm-pack is installed if (!commandExists('wasm-pack')) { console.log('Installing wasm-pack...'); diff --git a/packages/swc-plugin-workflow/build.js b/packages/swc-plugin-workflow/build.js index 95c5d430ac..17b691ec67 100644 --- a/packages/swc-plugin-workflow/build.js +++ b/packages/swc-plugin-workflow/build.js @@ -3,14 +3,22 @@ import { execSync } from 'node:child_process'; import { copyFileSync, existsSync, + mkdirSync, readdirSync, readFileSync, + rmSync, writeFileSync, } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; -function runCommand(command) { +function execCommand(command, options = {}) { + return execSync(command, { stdio: 'inherit', shell: true, ...options }); +} + +function runCommand(command, options = {}) { try { - execSync(command, { stdio: 'inherit', shell: true }); + execCommand(command, options); } catch (error) { console.error(`Command failed: ${command}: ${error}`); process.exit(1); @@ -26,6 +34,89 @@ function commandExists(command) { } } +function sleepMs(ms) { + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms); +} + +function isRustTargetInstalled(target) { + const installedTargets = execSync('rustup target list --installed', { + stdio: 'pipe', + shell: true, + }).toString(); + return installedTargets.includes(target); +} + +function withTargetInstallLock(target, callback) { + const lockDir = path.join( + tmpdir(), + `workflow-rustup-target-${target.replaceAll(/[^a-z0-9_-]/gi, '-')}.lock` + ); + const timeoutMs = 2 * 60 * 1000; + const startedAt = Date.now(); + + while (true) { + try { + mkdirSync(lockDir); + break; + } catch (error) { + if (error?.code !== 'EEXIST') { + throw error; + } + + if (Date.now() - startedAt > timeoutMs) { + throw new Error( + `Timed out waiting for rustup target install lock for ${target}` + ); + } + + console.log( + `Another process is installing ${target}; 
waiting for the lock...` + ); + sleepMs(1000); + } + } + + try { + return callback(); + } finally { + rmSync(lockDir, { recursive: true, force: true }); + } +} + +function ensureRustTarget(target) { + console.log(`Checking ${target} target...`); + + try { + if (isRustTargetInstalled(target)) { + console.log(`${target} target already installed`); + return; + } + + withTargetInstallLock(target, () => { + if (isRustTargetInstalled(target)) { + console.log(`${target} target was installed by another process`); + return; + } + + console.log(`${target} target not found, installing...`); + try { + execCommand(`rustup target add ${target}`); + } catch (error) { + if (isRustTargetInstalled(target)) { + console.warn( + `${target} target appears installed after a rustup error; continuing` + ); + return; + } + throw error; + } + }); + } catch (error) { + console.error(`Failed to check/install ${target} target:`, error.message); + process.exit(1); + } +} + console.log('Building swc-plugin-workflow WASM...'); // Check if cargo is installed @@ -57,26 +148,7 @@ if (!commandExists('cargo')) { } } -// Check if wasm32-unknown-unknown target exists and install if needed -console.log('Checking wasm32-unknown-unknown target...'); -try { - const installedTargets = execSync('rustup target list --installed', { - stdio: 'pipe', - shell: true, - }).toString(); - if (!installedTargets.includes('wasm32-unknown-unknown')) { - console.log('wasm32-unknown-unknown target not found, installing...'); - runCommand('rustup target add wasm32-unknown-unknown'); - } else { - console.log('wasm32-unknown-unknown target already installed'); - } -} catch (error) { - console.error( - 'Failed to check/install wasm32-unknown-unknown target:', - error.message - ); - process.exit(1); -} +ensureRustTarget('wasm32-unknown-unknown'); // Build the WASM plugin console.log('Running cargo build...'); From 25ebab3bd825f2ad46daff7a13a17fbc7b677a34 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Mon, 30 Mar 2026 
22:22:20 -0400 Subject: [PATCH 19/34] Fix local TooEarlyError retryAfter type Signed-off-by: nathancolosimo --- packages/world-local/src/storage/events-storage.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index 6f32e53e56..e798064c4f 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -452,7 +452,7 @@ export function createEventsStorage( if (result.status !== 'acquired') { const retryAfter = result.retryAfterMs !== undefined - ? new Date(Date.now() + result.retryAfterMs) + ? Math.ceil(result.retryAfterMs / 1000) : undefined; throw new TooEarlyError( `Lock "${data.correlationId}" is not ready to acquire`, From 5973860423805933b423ed6a885250026eb5f935 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Mon, 30 Mar 2026 23:57:25 -0400 Subject: [PATCH 20/34] Fix lock retryAfter unit test contract Signed-off-by: nathancolosimo --- packages/core/src/workflow/lock.test.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/core/src/workflow/lock.test.ts b/packages/core/src/workflow/lock.test.ts index 675d5b42ba..b903d216f1 100644 --- a/packages/core/src/workflow/lock.test.ts +++ b/packages/core/src/workflow/lock.test.ts @@ -276,11 +276,14 @@ describe('createLock', () => { }); it('re-suspends when a stale lock wake-up becomes too early again', async () => { - const retryAfter = new Date(Date.now() + 30_000); + const now = Date.parse('2026-03-31T03:50:29.624Z'); + const retryAfterSeconds = 30; + const retryAfter = new Date(now + retryAfterSeconds * 1000); + vi.spyOn(Date, 'now').mockReturnValue(now); const createEvent = vi .fn<() => Promise>() .mockRejectedValueOnce( - new TooEarlyError('not ready yet', { retryAfter }) + new TooEarlyError('not ready yet', { retryAfter: retryAfterSeconds }) ); setWorld({ From 0b7be6939485bdef149936fcfe4b1800d73171da 
Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 00:19:47 -0400 Subject: [PATCH 21/34] Harden limits timing tests across CI Signed-off-by: nathancolosimo --- packages/world-local/src/limits.test.ts | 43 +++++++++++++++++-- packages/world-testing/src/limits-contract.ts | 32 +++++++------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 5ff9129908..31b48957a7 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -104,7 +104,7 @@ describe('local world limit retry timing', () => { eventData: { key: 'workflow:user:test', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, + leaseTtlMs: 10_000, }, }); const second = await world.events.create(runB.runId, { @@ -114,7 +114,7 @@ describe('local world limit retry timing', () => { eventData: { key: 'workflow:user:test', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, + leaseTtlMs: 10_000, }, }); @@ -164,7 +164,7 @@ describe('local world limit retry timing', () => { definition: { concurrency: { max: 1 }, }, - leaseTtlMs: 10, + leaseTtlMs: 60_000, }) ).resolves.toMatchObject({ status: 'acquired' }); @@ -176,7 +176,7 @@ describe('local world limit retry timing', () => { definition: { rate: { count: 1, periodMs: 5_000 }, }, - leaseTtlMs: 10, + leaseTtlMs: 60_000, }) ).rejects.toBeInstanceOf(LimitDefinitionConflictError); } finally { @@ -184,6 +184,41 @@ describe('local world limit retry timing', () => { } }); + it('allows a key definition to be reseeded after the key fully drains', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-a', + lockIndex: 0, + definition: { + concurrency: { max: 1 }, + }, + leaseTtlMs: 200, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + + await new 
Promise((resolve) => setTimeout(resolve, 400)); + + await expect( + limits.acquire({ + key: 'shared-key', + runId: 'run-b', + lockIndex: 0, + definition: { + rate: { count: 1, periodMs: 5_000 }, + }, + leaseTtlMs: 200, + }) + ).resolves.toMatchObject({ status: 'acquired' }); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + it('uses the head waiter retryAfter for backlog-only waiters', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); const limits = createLimits(dir); diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index cf593399b5..c1e346c9f6 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -227,7 +227,7 @@ export function createLimitsContractSuite( it('keeps rate capacity consumed until the window expires', async () => { const harness = await createHarness(); try { - const periodMs = 200; + const periodMs = 1_000; const ownerA = await createLockOwner(harness.storage, 'holder-a'); const ownerB = await createLockOwner(harness.storage, 'holder-b'); const ownerC = await createLockOwner(harness.storage, 'holder-c'); @@ -236,7 +236,7 @@ export function createLimitsContractSuite( ownerA, 'step:provider:openai', { rate: { count: 1, periodMs } }, - 1_000 + 5_000 ) ); expect(first.status).toBe('acquired'); @@ -250,7 +250,7 @@ export function createLimitsContractSuite( ownerB, 'step:provider:openai', { rate: { count: 1, periodMs } }, - 1_000 + 5_000 ) ); expect(second.status).toBe('blocked'); @@ -263,7 +263,7 @@ export function createLimitsContractSuite( ownerB, 'step:provider:openai', { rate: { count: 1, periodMs } }, - 1_000 + 5_000 ) ); const deadline = Date.now() + periodMs + 1_000; @@ -290,7 +290,7 @@ export function createLimitsContractSuite( ownerC, 'step:provider:openai', { rate: { count: 1, periodMs } }, - 1_000 + 5_000 ) ); const thirdDeadline = Date.now() + periodMs + 1_000; @@ 
-314,7 +314,7 @@ export function createLimitsContractSuite( it('returns a combined blocked reason when both limits are saturated', async () => { const harness = await createHarness(); try { - const periodMs = 300; + const periodMs = 1_500; const ownerA = await createLockOwner(harness.storage, 'holder-a'); const ownerB = await createLockOwner(harness.storage, 'holder-b'); const first = await harness.limits.acquire( @@ -325,7 +325,7 @@ export function createLimitsContractSuite( concurrency: { max: 1 }, rate: { count: 1, periodMs }, }, - 1_000 + 5_000 ) ); expect(first.status).toBe('acquired'); @@ -340,7 +340,7 @@ export function createLimitsContractSuite( concurrency: { max: 1 }, rate: { count: 1, periodMs }, }, - 1_000 + 5_000 ) ); expect(second).toMatchObject({ @@ -379,7 +379,7 @@ export function createLimitsContractSuite( concurrency: { max: 1 }, rate: { count: 1, periodMs }, }, - 1_000 + 5_000 ) ); } @@ -443,7 +443,7 @@ export function createLimitsContractSuite( ownerA, 'workflow:user:heartbeat', { concurrency: { max: 1 } }, - 200 + 1_000 ) ); expect(first.status).toBe('acquired'); @@ -452,7 +452,7 @@ export function createLimitsContractSuite( const heartbeat = await harness.limits.heartbeat({ leaseId: first.lease.leaseId, - ttlMs: 600, + ttlMs: 5_000, }); expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( @@ -464,7 +464,7 @@ export function createLimitsContractSuite( ownerB, 'workflow:user:heartbeat', { concurrency: { max: 1 } }, - 1_000 + 5_000 ) ); expect(second.status).toBe('blocked'); @@ -483,7 +483,7 @@ export function createLimitsContractSuite( ownerA, 'workflow:user:expired', { concurrency: { max: 1 } }, - 250 + 1_000 ) ); expect(first.status).toBe('acquired'); @@ -495,19 +495,19 @@ export function createLimitsContractSuite( ownerB, 'workflow:user:expired', { concurrency: { max: 1 } }, - 1_000 + 5_000 ) ); expect(second.status).toBe('blocked'); - await sleep(400); + await sleep(1_500); const third = await harness.limits.acquire( acquireRequest( 
ownerB, 'workflow:user:expired', { concurrency: { max: 1 } }, - 1_000 + 5_000 ) ); expect(third.status).toBe('acquired'); From 7c7175c80da6d7087b9def7f7ae8a7a900659408 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 09:17:28 -0400 Subject: [PATCH 22/34] Fix postgres world tests to use pg Signed-off-by: nathancolosimo --- packages/world-postgres/src/limits.test.ts | 53 +++++++++++++--------- packages/world-postgres/test/spec.test.ts | 10 +++- packages/world-postgres/test/test-db.ts | 18 ++++---- pnpm-lock.yaml | 10 +--- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index bfa21e47e0..ff7f2cf729 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -22,7 +22,7 @@ if (process.platform === 'win32') { db = await createPostgresTestDb(); const queue = createQueue( { connectionString: db.connectionString, queueConcurrency: 1 }, - db.sql + db.pool ); await queue.start(); await queue.close(); @@ -69,30 +69,39 @@ if (process.platform === 'win32') { }, inspectKeyState: async (key) => { const [leases, waiters, tokens] = await Promise.all([ - db.sql<{ lockId: string }[]>` - select holder_id as "lockId" - from workflow.workflow_limit_leases - where limit_key = ${key} - order by holder_id asc - `, - db.sql<{ lockId: string }[]>` - select holder_id as "lockId" - from workflow.workflow_limit_waiters - where limit_key = ${key} - order by created_at asc, holder_id asc - `, - db.sql<{ lockId: string }[]>` - select holder_id as "lockId" - from workflow.workflow_rate_limit_tokens - where limit_key = ${key} - order by acquired_at asc, holder_id asc - `, + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from workflow.workflow_limit_leases + where limit_key = $1 + order by holder_id asc + `, + [key] + ), + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from 
workflow.workflow_limit_waiters + where limit_key = $1 + order by created_at asc, holder_id asc + `, + [key] + ), + db.pool.query<{ lockId: string }>( + ` + select holder_id as "lockId" + from workflow.workflow_rate_limit_tokens + where limit_key = $1 + order by acquired_at asc, holder_id asc + `, + [key] + ), ]); return { - leaseHolderIds: leases.map((row) => row.lockId), - waiterHolderIds: waiters.map((row) => row.lockId), - tokenHolderIds: tokens.map((row) => row.lockId), + leaseHolderIds: leases.rows.map((row) => row.lockId), + waiterHolderIds: waiters.rows.map((row) => row.lockId), + tokenHolderIds: tokens.rows.map((row) => row.lockId), }; }, }; diff --git a/packages/world-postgres/test/spec.test.ts b/packages/world-postgres/test/spec.test.ts index 1be4cb2636..22f69e63ad 100644 --- a/packages/world-postgres/test/spec.test.ts +++ b/packages/world-postgres/test/spec.test.ts @@ -1,6 +1,6 @@ import { execSync } from 'node:child_process'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; -import { createTestSuite } from '@workflow/world-testing'; +import { createTestSuite } from '../../world-testing/dist/src/index.mjs'; import { afterAll, beforeAll, test } from 'vitest'; // Skip these tests on Windows since it relies on a docker container @@ -15,7 +15,13 @@ if (process.platform === 'win32') { process.env.WORKFLOW_POSTGRES_URL = dbUrl; process.env.DATABASE_URL = dbUrl; - execSync('pnpm db:push', { + execSync('pnpm build', { + stdio: 'inherit', + cwd: process.cwd(), + env: process.env, + }); + + execSync('pnpm exec tsx src/cli.ts', { stdio: 'inherit', cwd: process.cwd(), env: process.env, diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index 333909dd52..02d347fe97 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -2,7 +2,7 @@ import { execSync } from 'node:child_process'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { 
PostgreSqlContainer } from '@testcontainers/postgresql'; -import postgres from 'postgres'; +import { Pool } from 'pg'; import { createClient } from '../src/drizzle/index.js'; const packageDir = path.resolve( @@ -12,7 +12,7 @@ const packageDir = path.resolve( export interface PostgresTestDb { container: Awaited>; - sql: ReturnType; + pool: Pool; drizzle: ReturnType; connectionString: string; truncateLimits(): Promise; @@ -25,22 +25,22 @@ export async function createPostgresTestDb(): Promise { process.env.DATABASE_URL = connectionString; process.env.WORKFLOW_POSTGRES_URL = connectionString; - execSync('pnpm db:push', { + execSync('pnpm exec tsx src/cli.ts', { stdio: 'inherit', cwd: packageDir, env: process.env, }); - const sql = postgres(connectionString, { max: 10 }); - const drizzle = createClient(sql); + const pool = new Pool({ connectionString, max: 10 }); + const drizzle = createClient(pool); return { container, - sql, + pool, drizzle, connectionString, async truncateLimits() { - await sql` + await pool.query(` truncate table workflow.workflow_limit_keys, workflow.workflow_limit_waiters, @@ -50,10 +50,10 @@ export async function createPostgresTestDb(): Promise { workflow.workflow_events, workflow.workflow_runs restart identity cascade - `; + `); }, async close() { - await sql.end(); + await pool.end(); await container.stop(); }, }; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a4c8750339..cc6346e3ce 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -22712,14 +22712,6 @@ snapshots: optionalDependencies: vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3) - '@vitest/mocker@4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3))': - dependencies: - '@vitest/spy': 4.0.18 - estree-walker: 3.0.3 - magic-string: 0.30.21 - optionalDependencies: - vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3) - 
'@vitest/mocker@4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3))': dependencies: '@vitest/spy': 4.0.18 @@ -31858,7 +31850,7 @@ snapshots: vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3): dependencies: '@vitest/expect': 4.0.18 - '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3)) + '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.3)) '@vitest/pretty-format': 4.0.18 '@vitest/runner': 4.0.18 '@vitest/snapshot': 4.0.18 From 1ff210ee72cb99e03a83c71d06649ffe8176ff15 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 09:29:57 -0400 Subject: [PATCH 23/34] Use workspace tsx for postgres test setup Signed-off-by: nathancolosimo --- packages/world-postgres/test/spec.test.ts | 31 ++++++++++++++++++----- packages/world-postgres/test/test-db.ts | 22 +++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/packages/world-postgres/test/spec.test.ts b/packages/world-postgres/test/spec.test.ts index 22f69e63ad..dae128d998 100644 --- a/packages/world-postgres/test/spec.test.ts +++ b/packages/world-postgres/test/spec.test.ts @@ -1,8 +1,16 @@ -import { execSync } from 'node:child_process'; +import { execFileSync, execSync } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; import { createTestSuite } from '../../world-testing/dist/src/index.mjs'; import { afterAll, beforeAll, test } from 'vitest'; +const packageDir = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + '..' 
+); +const workspaceDir = path.resolve(packageDir, '..', '..'); + // Skip these tests on Windows since it relies on a docker container if (process.platform === 'win32') { test.skip('skipped on Windows since it relies on a docker container', () => {}); @@ -17,15 +25,24 @@ if (process.platform === 'win32') { execSync('pnpm build', { stdio: 'inherit', - cwd: process.cwd(), + cwd: packageDir, env: process.env, }); - execSync('pnpm exec tsx src/cli.ts', { - stdio: 'inherit', - cwd: process.cwd(), - env: process.env, - }); + execFileSync( + 'pnpm', + [ + '--dir', + workspaceDir, + 'exec', + 'tsx', + 'packages/world-postgres/src/cli.ts', + ], + { + stdio: 'inherit', + env: process.env, + } + ); }, 120_000); afterAll(async () => { diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index 02d347fe97..e882335f89 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -1,4 +1,4 @@ -import { execSync } from 'node:child_process'; +import { execFileSync } from 'node:child_process'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; @@ -9,6 +9,7 @@ const packageDir = path.resolve( path.dirname(fileURLToPath(import.meta.url)), '..' 
); +const workspaceDir = path.resolve(packageDir, '..', '..'); export interface PostgresTestDb { container: Awaited>; @@ -25,11 +26,20 @@ export async function createPostgresTestDb(): Promise { process.env.DATABASE_URL = connectionString; process.env.WORKFLOW_POSTGRES_URL = connectionString; - execSync('pnpm exec tsx src/cli.ts', { - stdio: 'inherit', - cwd: packageDir, - env: process.env, - }); + execFileSync( + 'pnpm', + [ + '--dir', + workspaceDir, + 'exec', + 'tsx', + 'packages/world-postgres/src/cli.ts', + ], + { + stdio: 'inherit', + env: process.env, + } + ); const pool = new Pool({ connectionString, max: 10 }); const drizzle = createClient(pool); From 7800784a01b531cc8ad25102714ce7b672e6b260 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 09:55:05 -0400 Subject: [PATCH 24/34] Fix pg tests and stabilize lock contention e2e Signed-off-by: nathancolosimo --- packages/world-postgres/src/limits.test.ts | 2 +- packages/world-postgres/test/spec.test.ts | 20 ++++----------- packages/world-postgres/test/test-db.ts | 26 +++++++++----------- packages/world-testing/src/limits-runtime.ts | 18 +++++++++----- 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index ff7f2cf729..0282c7eac9 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,7 +1,7 @@ import { afterAll, beforeAll, beforeEach, expect, test, vi } from 'vitest'; import { LimitDefinitionConflictError } from '@workflow/errors'; import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; -import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.ts'; import { createLimits } from './limits.js'; import { createEventsStorage, diff --git a/packages/world-postgres/test/spec.test.ts 
b/packages/world-postgres/test/spec.test.ts index dae128d998..b22e22b35e 100644 --- a/packages/world-postgres/test/spec.test.ts +++ b/packages/world-postgres/test/spec.test.ts @@ -9,7 +9,6 @@ const packageDir = path.resolve( path.dirname(fileURLToPath(import.meta.url)), '..' ); -const workspaceDir = path.resolve(packageDir, '..', '..'); // Skip these tests on Windows since it relies on a docker container if (process.platform === 'win32') { @@ -29,20 +28,11 @@ if (process.platform === 'win32') { env: process.env, }); - execFileSync( - 'pnpm', - [ - '--dir', - workspaceDir, - 'exec', - 'tsx', - 'packages/world-postgres/src/cli.ts', - ], - { - stdio: 'inherit', - env: process.env, - } - ); + execFileSync('node', ['dist/cli.js'], { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); }, 120_000); afterAll(async () => { diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index e882335f89..400337db74 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -9,7 +9,6 @@ const packageDir = path.resolve( path.dirname(fileURLToPath(import.meta.url)), '..' 
); -const workspaceDir = path.resolve(packageDir, '..', '..'); export interface PostgresTestDb { container: Awaited>; @@ -26,20 +25,17 @@ export async function createPostgresTestDb(): Promise { process.env.DATABASE_URL = connectionString; process.env.WORKFLOW_POSTGRES_URL = connectionString; - execFileSync( - 'pnpm', - [ - '--dir', - workspaceDir, - 'exec', - 'tsx', - 'packages/world-postgres/src/cli.ts', - ], - { - stdio: 'inherit', - env: process.env, - } - ); + execFileSync('pnpm', ['build'], { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); + + execFileSync('node', ['dist/cli.js'], { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); const pool = new Pool({ connectionString, max: 10 }); const drizzle = createClient(pool); diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index c6943bd675..100794c6f4 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -50,6 +50,14 @@ type WorkflowMultiStepScopeResult = { workflowLockReleasedAt: number; }; +function sortContentionResults( + results: [T, T] +): [T, T] { + return [...results].sort( + (a, b) => a.workflowLockAcquiredAt - b.workflowLockAcquiredAt + ) as [T, T]; +} + export interface LimitsRuntimeHarness { runWorkflowWithScopedLocks(userId: string): Promise<{ workflowKey: string; @@ -139,9 +147,8 @@ export function createLimitsRuntimeSuite( it('serializes workflow locks and locks around step calls under contention', async () => { const harness = await createHarness(); - const [resultA, resultB] = await harness.runWorkflowLockContention( - 'shared-user', - 750 + const [resultA, resultB] = sortContentionResults( + await harness.runWorkflowLockContention('shared-user', 750) ); expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( @@ -154,9 +161,8 @@ export function createLimitsRuntimeSuite( it('wakes promoted workflow and step-call lock waiters promptly', async () => { 
const harness = await createHarness(); - const [resultA, resultB] = await harness.runWorkflowLockContention( - 'shared-user', - 1_500 + const [resultA, resultB] = sortContentionResults( + await harness.runWorkflowLockContention('shared-user', 1_500) ); expect( From f1cba2074b99aad333dfc899dc124a8ebe2b6078 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 10:10:03 -0400 Subject: [PATCH 25/34] Fix shared limits test module format Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 2 +- packages/world-local/src/limits.test.ts | 2 +- packages/world-postgres/src/limits.test.ts | 2 +- packages/world-testing/src/index.mts | 4 ++-- .../src/{limits-contract.ts => limits-contract.mts} | 2 +- .../src/{limits-runtime.ts => limits-runtime.mts} | 0 6 files changed, 6 insertions(+), 6 deletions(-) rename packages/world-testing/src/{limits-contract.ts => limits-contract.mts} (99%) rename packages/world-testing/src/{limits-runtime.ts => limits-runtime.mts} (100%) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index cea6ed40eb..72e3039a06 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -14,7 +14,7 @@ import { expect, test, } from 'vitest'; -import { createLimitsRuntimeSuite } from '../../world-testing/src/limits-runtime.js'; +import { createLimitsRuntimeSuite } from '../../world-testing/src/limits-runtime.mts'; import type { Run, StartOptions } from '../src/runtime.js'; import { cancelRun, diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 31b48957a7..6243de2290 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -4,7 +4,7 @@ import path from 'node:path'; import { LimitDefinitionConflictError } from '@workflow/errors'; import { describe, expect, it } from 'vitest'; import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; -import { createLimitsContractSuite } from 
'../../world-testing/src/limits-contract.js'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.mts'; import { createLocalWorld } from './index.js'; import { createLimits } from './limits.js'; diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 0282c7eac9..7cf3b5bdb2 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,7 +1,7 @@ import { afterAll, beforeAll, beforeEach, expect, test, vi } from 'vitest'; import { LimitDefinitionConflictError } from '@workflow/errors'; import { SPEC_VERSION_CURRENT, createLockCorrelationId } from '@workflow/world'; -import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.ts'; +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.mts'; import { createLimits } from './limits.js'; import { createEventsStorage, diff --git a/packages/world-testing/src/index.mts b/packages/world-testing/src/index.mts index db42585942..b65248f5c7 100644 --- a/packages/world-testing/src/index.mts +++ b/packages/world-testing/src/index.mts @@ -2,8 +2,8 @@ import { addition } from './addition.mjs'; import { errors } from './errors.mjs'; import { hooks } from './hooks.mjs'; import { idempotency } from './idempotency.mjs'; -export { createLimitsContractSuite } from './limits-contract.js'; -export { createLimitsRuntimeSuite } from './limits-runtime.js'; +export { createLimitsContractSuite } from './limits-contract.mjs'; +export { createLimitsRuntimeSuite } from './limits-runtime.mjs'; import { nullByte } from './null-byte.mjs'; export function createTestSuite(pkgName: string) { diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.mts similarity index 99% rename from packages/world-testing/src/limits-contract.ts rename to packages/world-testing/src/limits-contract.mts index c1e346c9f6..be25c0da0f 100644 --- 
a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.mts @@ -227,7 +227,7 @@ export function createLimitsContractSuite( it('keeps rate capacity consumed until the window expires', async () => { const harness = await createHarness(); try { - const periodMs = 1_000; + const periodMs = 3_000; const ownerA = await createLockOwner(harness.storage, 'holder-a'); const ownerB = await createLockOwner(harness.storage, 'holder-b'); const ownerC = await createLockOwner(harness.storage, 'holder-c'); diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.mts similarity index 100% rename from packages/world-testing/src/limits-runtime.ts rename to packages/world-testing/src/limits-runtime.mts From ad5768b5fefad6435ce3a49aba007cf8b404216f Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 10:19:29 -0400 Subject: [PATCH 26/34] Harden concurrent limits contract timing Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.mts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/world-testing/src/limits-contract.mts b/packages/world-testing/src/limits-contract.mts index be25c0da0f..4c7f8bb293 100644 --- a/packages/world-testing/src/limits-contract.mts +++ b/packages/world-testing/src/limits-contract.mts @@ -206,7 +206,7 @@ export function createLimitsContractSuite( owner, 'workflow:user:concurrent', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ) ) From 4c1a767b3b198e54d0287af45d89011015d1f068 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 11:20:52 -0400 Subject: [PATCH 27/34] Relax cancelled waiter e2e timing Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-runtime.mts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/world-testing/src/limits-runtime.mts b/packages/world-testing/src/limits-runtime.mts index 100794c6f4..c2f79cc4e1 100644 --- 
a/packages/world-testing/src/limits-runtime.mts +++ b/packages/world-testing/src/limits-runtime.mts @@ -293,7 +293,7 @@ export function createLimitsRuntimeSuite( ); expect( resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt - ).toBeLessThan(4_000); + ).toBeLessThan(6_000); }); it('does not block unrelated workflow keys', async () => { From ded5a4a370e22d60bcf3c418d187cee5a06448e9 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 11:38:51 -0400 Subject: [PATCH 28/34] Stabilize unrelated-key limits e2e timing Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-runtime.mts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/world-testing/src/limits-runtime.mts b/packages/world-testing/src/limits-runtime.mts index c2f79cc4e1..2e2de236c9 100644 --- a/packages/world-testing/src/limits-runtime.mts +++ b/packages/world-testing/src/limits-runtime.mts @@ -299,7 +299,7 @@ export function createLimitsRuntimeSuite( it('does not block unrelated workflow keys', async () => { const harness = await createHarness(); const [resultA, resultB] = - await harness.runIndependentWorkflowKeys(1_000); + await harness.runIndependentWorkflowKeys(3_000); expect(resultB.workflowLockAcquiredAt).toBeLessThan( resultA.workflowLockReleasedAt @@ -308,7 +308,7 @@ export function createLimitsRuntimeSuite( it('does not block unrelated step-like keys', async () => { const harness = await createHarness(); - const [resultA, resultB] = await harness.runIndependentStepKeys(1_000); + const [resultA, resultB] = await harness.runIndependentStepKeys(3_000); expect(resultB.acquiredAt).toBeLessThan(resultA.releasedAt); }); From 17208ac142b962b922609f3b9e5acb98fa62ec0f Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 13:52:26 -0400 Subject: [PATCH 29/34] Add changeset Signed-off-by: nathancolosimo --- .changeset/eight-colts-agree.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 
.changeset/eight-colts-agree.md diff --git a/.changeset/eight-colts-agree.md b/.changeset/eight-colts-agree.md new file mode 100644 index 0000000000..0484c8d6da --- /dev/null +++ b/.changeset/eight-colts-agree.md @@ -0,0 +1,28 @@ +--- +"@workflow/swc-playground-wasm": patch +"@workflow/swc-plugin": patch +"@workflow/world-postgres": patch +"@workflow/world-testing": patch +"@workflow/world-vercel": patch +"@workflow/world-local": patch +"@workflow/web-shared": patch +"@workflow/sveltekit": patch +"@workflow/builders": patch +"workflow": patch +"@workflow/errors": patch +"@workflow/rollup": patch +"@workflow/vitest": patch +"@workflow/astro": patch +"@workflow/nitro": patch +"@workflow/world": patch +"@workflow/core": patch +"@workflow/nest": patch +"@workflow/next": patch +"@workflow/nuxt": patch +"@workflow/vite": patch +"@workflow/cli": patch +"@workflow/web": patch +"@workflow/ai": patch +--- + +Add experimental rate limiting and flow concurrency control From 8a25c91573483df871aea2cf077a8e261260968c Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 16:49:37 -0400 Subject: [PATCH 30/34] Stabilize limits test timing Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 18 ++++++++++++++++-- packages/world-testing/src/limits-contract.mts | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 72e3039a06..a535bfec02 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -96,6 +96,20 @@ function writeE2EMetadata() { const e2e = (fn: string) => getWorkflowMetadata(deploymentUrl, 'workflows/99_e2e.ts', fn); +async function waitForRunLockAttempt(runId: string, timeoutMs = 10_000) { + const deadline = Date.now() + timeoutMs; + + while (Date.now() < deadline) { + const { data: events } = await getWorld().events.list({ runId }); + if (events.some((event) => event.eventType === 'lock_created')) { + return; + } + await sleep(50); + } + + 
throw new Error(`Timed out waiting for lock attempt on run ${runId}`); +} + /** * Triggers a workflow via HTTP POST. Used only for Pages Router tests * that specifically need to validate the HTTP trigger endpoint. @@ -321,7 +335,7 @@ describe('e2e', () => { const runA = await start(workflow, [userId, holdMs, 'A']); await sleep(100); const runB = await start(workflow, [userId, holdMs, 'B']); - await sleep(100); + await waitForRunLockAttempt(runB.runId); const runC = await start(workflow, [userId, holdMs, 'C']); return await Promise.all([ runA.returnValue, @@ -334,7 +348,7 @@ describe('e2e', () => { const runA = await start(workflow, [userId, holdMs, 'A']); await sleep(100); const runB = await start(workflow, [userId, holdMs, 'B']); - await sleep(100); + await waitForRunLockAttempt(runB.runId); await cancelRun(getWorld(), runB.runId); const cancelledError = await runB.returnValue.catch((error) => error); const runC = await start(workflow, [userId, holdMs, 'C']); diff --git a/packages/world-testing/src/limits-contract.mts b/packages/world-testing/src/limits-contract.mts index 4c7f8bb293..b5f5ca8c1d 100644 --- a/packages/world-testing/src/limits-contract.mts +++ b/packages/world-testing/src/limits-contract.mts @@ -314,7 +314,7 @@ export function createLimitsContractSuite( it('returns a combined blocked reason when both limits are saturated', async () => { const harness = await createHarness(); try { - const periodMs = 1_500; + const periodMs = 3_000; const ownerA = await createLockOwner(harness.storage, 'holder-a'); const ownerB = await createLockOwner(harness.storage, 'holder-b'); const first = await harness.limits.acquire( From 6cc468de1cadbf5426fff7d273ea06c13dc3e7ae Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 17:03:14 -0400 Subject: [PATCH 31/34] Harden FIFO limits contract timing Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.mts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/packages/world-testing/src/limits-contract.mts b/packages/world-testing/src/limits-contract.mts index b5f5ca8c1d..f4820b43c3 100644 --- a/packages/world-testing/src/limits-contract.mts +++ b/packages/world-testing/src/limits-contract.mts @@ -578,7 +578,7 @@ export function createLimitsContractSuite( ownerA, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); expect(first.status).toBe('acquired'); @@ -590,7 +590,7 @@ export function createLimitsContractSuite( ownerB, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); const third = await harness.limits.acquire( @@ -598,7 +598,7 @@ export function createLimitsContractSuite( ownerC, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); @@ -612,7 +612,7 @@ export function createLimitsContractSuite( ownerB, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); const stillWaiting = await harness.limits.acquire( @@ -620,7 +620,7 @@ export function createLimitsContractSuite( ownerC, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); @@ -636,7 +636,7 @@ export function createLimitsContractSuite( ownerC, 'workflow:user:ordered', { concurrency: { max: 1 } }, - 1_000 + 10_000 ) ); From a22fd76f6b2fd9d4554acdeb24074fb0259e4926 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 31 Mar 2026 17:33:59 -0400 Subject: [PATCH 32/34] Fix cancelled waiter e2e timing Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index a535bfec02..b30b257221 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -350,9 +350,9 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs, 'B']); await waitForRunLockAttempt(runB.runId); await cancelRun(getWorld(), runB.runId); - const cancelledError = await runB.returnValue.catch((error) => 
error); const runC = await start(workflow, [userId, holdMs, 'C']); - const [resultA, resultC] = await Promise.all([ + const [cancelledError, resultA, resultC] = await Promise.all([ + runB.returnValue.catch((error) => error), runA.returnValue, runC.returnValue, ]); From 216c9ee539102c470c1ff5779044ebd1e55e7994 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 1 Apr 2026 00:33:49 -0400 Subject: [PATCH 33/34] Prune dead lock holders on terminal state runs and disable lock() on vercel backend Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 15 ++ packages/core/src/workflow.test.ts | 46 +++++- packages/core/src/workflow.ts | 12 +- packages/world-local/src/limits.ts | 25 ++- packages/world-postgres/src/limits.ts | 55 ++++--- .../world-testing/src/limits-contract.mts | 142 ++++++++++++++++++ packages/world-testing/src/limits-runtime.mts | 27 +++- packages/world/FLOW_LIMITS.md | 7 +- workbench/example/workflows/99_e2e.ts | 3 +- 9 files changed, 298 insertions(+), 34 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index b30b257221..fd7fc12479 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -301,6 +301,21 @@ describe('e2e', () => { const waiterResult = await waiterRun.returnValue; return [leakedResult, waiterResult]; }, + async runWorkflowTerminalHolderRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('workflowLeakedLockWorkflow'); + const waiterWorkflow = await e2e( + 'workflowOnlyLockContentionWorkflow' + ); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [userId, 0, 'B']); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, async runLeakedKeyExpiredLeaseRecovery(userId, leaseTtlMs) { const leakedWorkflow = await e2e('leakedKeyLockWorkflow'); const waiterWorkflow = await 
e2e('lockedStepCallContentionWorkflow'); diff --git a/packages/core/src/workflow.test.ts b/packages/core/src/workflow.test.ts index 2d9c9a65d0..0253d95b04 100644 --- a/packages/core/src/workflow.test.ts +++ b/packages/core/src/workflow.test.ts @@ -1,6 +1,10 @@ import { types } from 'node:util'; import { HookConflictError, WorkflowRuntimeError } from '@workflow/errors'; -import type { Event, WorkflowRun } from '@workflow/world'; +import { + LIMITS_NOT_IMPLEMENTED_MESSAGE, + type Event, + type WorkflowRun, +} from '@workflow/world'; import { assert, describe, expect, it, vi } from 'vitest'; import type { WorkflowSuspension } from './global.js'; import { @@ -147,6 +151,46 @@ describe('runWorkflow', () => { }); }); + it('keeps lock() unsupported in the workflow vm on Vercel', async () => { + vi.stubEnv('VERCEL_URL', 'workflow.vercel.app'); + + try { + const ops: Promise[] = []; + const workflowCode = ` + const lock = globalThis[Symbol.for("WORKFLOW_LOCK")]; + async function workflow() { + await lock({ + key: 'workflow:user:test', + concurrency: { max: 1 }, + }); + } + ${getWorkflowTransformCode('workflow')} + `; + + const workflowRun: WorkflowRun = { + runId: 'wrun_123', + workflowName: 'workflow', + status: 'running', + input: await dehydrateWorkflowArguments( + [], + 'wrun_123', + noEncryptionKey, + ops + ), + createdAt: new Date('2024-01-01T00:00:00.000Z'), + updatedAt: new Date('2024-01-01T00:00:00.000Z'), + startedAt: new Date('2024-01-01T00:00:00.000Z'), + deploymentId: 'test-deployment', + }; + + await expect( + runWorkflow(workflowCode, workflowRun, [], noEncryptionKey) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + } finally { + vi.unstubAllEnvs(); + } + }); + it('should resolve a step that has a `step_completed` event', async () => { const ops: Promise[] = []; const workflowRunId = 'wrun_123'; diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index df3536faa8..725c27f668 100644 --- a/packages/core/src/workflow.ts +++ 
b/packages/core/src/workflow.ts @@ -7,7 +7,11 @@ import { import { withResolvers } from '@workflow/utils'; import { getPort } from '@workflow/utils/get-port'; import { parseWorkflowName } from '@workflow/utils/parse-name'; -import type { Event, WorkflowRun } from '@workflow/world'; +import { + createLimitsNotImplementedError, + type Event, + type WorkflowRun, +} from '@workflow/world'; import * as nanoid from 'nanoid'; import { monotonicFactory } from 'ulid'; import type { CryptoKey } from './encryption.js'; @@ -196,7 +200,11 @@ export async function runWorkflow( const useStep = createUseStep(workflowContext); const createHook = createCreateHook(workflowContext); - const lock = createLock(workflowContext); + const lock = isVercel + ? async () => { + throw createLimitsNotImplementedError(); + } + : createLock(workflowContext); const sleep = createSleep(workflowContext); // @ts-expect-error - `@types/node` says symbol is not valid, but it does work diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 2a3be12023..be765ca3c0 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -310,7 +310,7 @@ function toNextWaiter(holderId: string): LimitNextWaiter | undefined { } function isTerminalRun(run: WorkflowRunWithoutData | undefined) { - return !run || ['completed', 'failed', 'cancelled'].includes(run.status); + return !!run && ['completed', 'failed', 'cancelled'].includes(run.status); } function deleteEmptyKey(state: LimitsState, key: string) { @@ -374,16 +374,26 @@ export function createLimits( return !isTerminalRun(run); }; - const pruneDeadWaiters = async (keyState: KeyState): Promise => { + const pruneDeadHoldersAndWaiters = async ( + keyState: KeyState + ): Promise => { const prunedKeyState = pruneKeyState(keyState); + const leases: LimitLease[] = []; const waiters: LimitWaiter[] = []; + for (const lease of prunedKeyState.leases) { + if (await isHolderLive(lease.lockId)) { + 
leases.push(lease); + } + } + for (const waiter of prunedKeyState.waiters) { if (await isHolderLive(waiter.lockId)) { waiters.push(waiter); } } + prunedKeyState.leases = leases; prunedKeyState.waiters = waiters; return prunedKeyState; }; @@ -444,7 +454,7 @@ export function createLimits( return withStateLock(async (): Promise => { const state = cloneState(await readState()); - const keyState = await pruneDeadWaiters( + const keyState = await pruneDeadHoldersAndWaiters( state.keys[parsed.key] ?? { key: parsed.key, definition: undefined, @@ -576,8 +586,10 @@ export function createLimits( let nextWaiter: LimitNextWaiter | undefined; for (const [key, keyStateValue] of Object.entries(state.keys)) { - const keyState = await pruneDeadWaiters(keyStateValue); - const beforeLeases = keyState.leases.length; + const beforeLeases = keyStateValue.leases.length; + const keyState = await pruneDeadHoldersAndWaiters(keyStateValue); + let capacityFreed = keyState.leases.length !== beforeLeases; + const beforeExplicitRelease = keyState.leases.length; keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; if (parsed.key && lease.key !== parsed.key) return true; @@ -586,8 +598,9 @@ export function createLimits( } return false; }); + capacityFreed ||= keyState.leases.length !== beforeExplicitRelease; - if (keyState.leases.length !== beforeLeases) { + if (capacityFreed) { const headWaiter = keyState.waiters[0]; if (headWaiter) { const concurrencyBlocked = diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 76b0fd47b0..4afd869a89 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -270,7 +270,7 @@ async function isHolderLive(tx: Db, holderId: string): Promise { .where(eq(Schema.runs.runId, parsedLockId.runId)) .limit(1)) as Pick[]; - return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); + return !run || !['completed', 'failed', 
'cancelled'].includes(run.status); } async function pruneDeadWaiters(tx: Db, key: string): Promise { @@ -291,6 +291,24 @@ async function pruneDeadWaiters(tx: Db, key: string): Promise { } } +async function pruneDeadHolders(tx: Db, key: string): Promise { + const leases = await tx + .select({ + leaseId: Schema.limitLeases.leaseId, + holderId: Schema.limitLeases.holderId, + }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, key)); + + for (const lease of leases) { + if (!(await isHolderLive(tx, lease.holderId))) { + await tx + .delete(Schema.limitLeases) + .where(eq(Schema.limitLeases.leaseId, lease.leaseId)); + } + } +} + async function ensureCanonicalDefinition( tx: Db, key: string, @@ -404,6 +422,7 @@ export function createLimits( return drizzle.transaction(async (tx) => { await lockLimitKey(tx, parsed.key); await pruneExpired(tx, parsed.key); + await pruneDeadHolders(tx, parsed.key); await pruneDeadWaiters(tx, parsed.key); const state = await getActiveState(tx, parsed.key); @@ -558,8 +577,11 @@ export function createLimits( if (key) { await lockLimitKey(tx, key); + await pruneExpired(tx, key); } + const beforeState = key ? 
await getActiveState(tx, key) : undefined; + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); if (parsed.key) { where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; @@ -568,25 +590,24 @@ export function createLimits( where = and(where, eq(Schema.limitLeases.holderId, parsed.lockId))!; } - const [deleted] = await tx - .delete(Schema.limitLeases) - .where(where) - .returning({ - limitKey: Schema.limitLeases.limitKey, - holderId: Schema.limitLeases.holderId, - }); - - if (deleted?.limitKey) { - await pruneExpired(tx, deleted.limitKey); - await pruneDeadWaiters(tx, deleted.limitKey); - const state = await getActiveState(tx, deleted.limitKey); + await tx.delete(Schema.limitLeases).where(where).returning({ + limitKey: Schema.limitLeases.limitKey, + holderId: Schema.limitLeases.holderId, + }); + + if (key) { + await pruneDeadHolders(tx, key); + await pruneDeadWaiters(tx, key); + const state = await getActiveState(tx, key); const headWaiter = state.waiters[0]; + const capacityFreed = + (beforeState?.leases.length ?? 
0) > state.leases.length; - if (headWaiter) { + if (headWaiter && capacityFreed) { const definition = state.keyRow && definitionFromRow(state.keyRow); if (!definition) { throw new WorkflowWorldError( - `Missing canonical definition for key "${deleted.limitKey}"` + `Missing canonical definition for key "${key}"` ); } const concurrencyBlocked = @@ -599,7 +620,7 @@ export function createLimits( if (!concurrencyBlocked && !rateBlocked) { const promoted = await promoteWaiter( tx, - deleted.limitKey, + key, headWaiter, definition ); @@ -614,7 +635,7 @@ export function createLimits( ) { await tx .delete(Schema.limitKeys) - .where(eq(Schema.limitKeys.limitKey, deleted.limitKey)); + .where(eq(Schema.limitKeys.limitKey, key)); } } diff --git a/packages/world-testing/src/limits-contract.mts b/packages/world-testing/src/limits-contract.mts index f4820b43c3..cef3d7a8d6 100644 --- a/packages/world-testing/src/limits-contract.mts +++ b/packages/world-testing/src/limits-contract.mts @@ -726,6 +726,148 @@ export function createLimitsContractSuite( } }); + it('reclaims a terminal workflow holder lease before its ttl expires', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow holder liveness'); + } + + const terminalRun = await createRun(harness.storage, 'terminal-holder'); + await harness.storage.events.create(terminalRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const waiterRun = await createRun(harness.storage, 'waiter-holder'); + await harness.storage.events.create(waiterRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + + const acquired = await harness.limits.acquire({ + key: 'workflow:user:terminal-holder', + runId: terminalRun.runId, + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 60_000, + }); + expect(acquired.status).toBe('acquired'); + if (acquired.status !== 'acquired') { + throw new 
Error('expected acquisition'); + } + + await harness.storage.events.create(terminalRun.runId, { + eventType: 'run_completed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { output: null }, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:terminal-holder', + runId: waiterRun.runId, + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('prunes terminal holders during release before promoting the next waiter', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow holder liveness'); + } + + const liveRun = await createRun(harness.storage, 'live-holder'); + const deadRunA = await createRun(harness.storage, 'dead-holder-a'); + const deadRunB = await createRun(harness.storage, 'dead-holder-b'); + const waiterRun = await createRun(harness.storage, 'waiter-holder'); + + for (const run of [liveRun, deadRunA, deadRunB, waiterRun]) { + await harness.storage.events.create(run.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + } + + const key = 'workflow:user:terminal-holder-release'; + const definition = { concurrency: { max: 3 } } as const; + const acquiredLive = await harness.limits.acquire({ + key, + runId: liveRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + const acquiredDeadA = await harness.limits.acquire({ + key, + runId: deadRunA.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + const acquiredDeadB = await harness.limits.acquire({ + key, + runId: deadRunB.runId, + lockIndex: 0, + definition, + leaseTtlMs: 60_000, + }); + + expect(acquiredLive.status).toBe('acquired'); + expect(acquiredDeadA.status).toBe('acquired'); + expect(acquiredDeadB.status).toBe('acquired'); + if ( + acquiredLive.status !== 'acquired' || + acquiredDeadA.status !== 
'acquired' || + acquiredDeadB.status !== 'acquired' + ) { + throw new Error('expected acquisition'); + } + + const blockedWaiter = await harness.limits.acquire({ + key, + runId: waiterRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 5_000, + }); + expect(blockedWaiter.status).toBe('blocked'); + + for (const run of [deadRunA, deadRunB]) { + await harness.storage.events.create(run.runId, { + eventType: 'run_completed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { output: null }, + }); + } + + const released = await harness.limits.release( + releaseRequest(acquiredLive.lease) + ); + expect(released.nextWaiter).toMatchObject({ + runId: waiterRun.runId, + lockIndex: 0, + }); + + const promoted = await harness.limits.acquire({ + key, + runId: waiterRun.runId, + lockIndex: 0, + definition, + leaseTtlMs: 5_000, + }); + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + it('does not duplicate a replayed blocked holder waiter or lease', async () => { const harness = await createHarness(); try { diff --git a/packages/world-testing/src/limits-runtime.mts b/packages/world-testing/src/limits-runtime.mts index 2e2de236c9..d9a75c95b6 100644 --- a/packages/world-testing/src/limits-runtime.mts +++ b/packages/world-testing/src/limits-runtime.mts @@ -83,6 +83,10 @@ export interface LimitsRuntimeHarness { userId: string, leaseTtlMs: number ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>; + runWorkflowTerminalHolderRecovery( + userId: string, + leaseTtlMs: number + ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>; runLeakedKeyExpiredLeaseRecovery( userId: string, leaseTtlMs: number @@ -204,7 +208,7 @@ export function createLimitsRuntimeSuite( ).toBeLessThan(4_000); }); - it('reclaims expired leaked workflow locks without manual cleanup', async () => { + it('reclaims terminal workflow-held locks on workflow keys', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; const [resultA, resultB] = 
await harness.runWorkflowExpiredLeaseRecovery( @@ -212,15 +216,29 @@ export function createLimitsRuntimeSuite( leaseTtlMs ); + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + }); + + it('reclaims terminal workflow holder leases promptly before ttl expiry', async () => { + const harness = await createHarness(); + const leaseTtlMs = 30_000; + const [resultA, resultB] = + await harness.runWorkflowTerminalHolderRecovery( + 'terminal-holder-user', + leaseTtlMs + ); + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( resultA.workflowCompletedAt ); expect( resultB.workflowLockAcquiredAt - resultA.lockAcquiredAt - ).toBeGreaterThanOrEqual(leaseTtlMs - 100); + ).toBeLessThan(leaseTtlMs - 5_000); }); - it('reclaims expired leaked locks on arbitrary keys without manual cleanup', async () => { + it('reclaims terminal workflow-held locks on arbitrary keys', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; const [resultA, resultB] = await harness.runLeakedKeyExpiredLeaseRecovery( @@ -231,9 +249,6 @@ export function createLimitsRuntimeSuite( expect(resultB.acquiredAt).toBeGreaterThanOrEqual( resultA.workflowCompletedAt ); - expect( - resultB.acquiredAt - resultA.lockAcquiredAt - ).toBeGreaterThanOrEqual(leaseTtlMs - 100); }); it('keeps mixed concurrency and rate waiters blocked until the rate window expires', async () => { diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index b2ef665fad..20aa5e227b 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -45,7 +45,7 @@ semantics across implemented worlds. 
That shared contract includes: - same-holder lease reuse - serialization of concurrent acquires for a single key - FIFO waiter promotion per key -- pruning cancelled workflow waiters +- pruning terminal workflow holders and waiters - blocked acquisitions not consuming execution concurrency - prompt wake-up with delayed fallback replay @@ -201,11 +201,16 @@ Important details: - FIFO is per key, not global across all limit keys - promotion order is based on waiter creation order +- terminal holders are pruned before capacity decisions - dead or terminal waiters are pruned before promotion - a live waiter may still be skipped if it is no longer eligible when promotion runs - releasing a lease or reclaiming an expired lease can both trigger promotion - rate-window expiry can also make the head waiter eligible again +Implemented worlds currently reclaim terminal holders opportunistically when a +key is touched, so completed, failed, or cancelled workflows do not hold +concurrency capacity until lease TTL expiry. + This gives deterministic and inspectable fairness for a key without requiring a global scheduler. 
diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index fd03705fc5..bf95d34fe8 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -307,6 +307,7 @@ export async function workflowLockContentionWorkflow( }); step = await serializedLimitStep(userId, holdMs); } + const stepCallLockReleasedAt = Date.now(); await workflowLock.dispose(); const workflowLockReleasedAt = Date.now(); @@ -315,7 +316,7 @@ export async function workflowLockContentionWorkflow( workflowLockAcquiredAt, workflowLockReleasedAt, stepCallLockAcquiredAt: step.acquiredAt, - stepCallLockReleasedAt: step.releasedAt, + stepCallLockReleasedAt, }; } From c1be9379cf2c36de947635384ab9fd9647ecdb69 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 1 Apr 2026 01:09:07 -0400 Subject: [PATCH 34/34] fix race condition in lock event replay Signed-off-by: nathancolosimo --- packages/core/src/workflow/lock.test.ts | 84 +++++++++++++++++++++++-- packages/core/src/workflow/lock.ts | 4 +- 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/packages/core/src/workflow/lock.test.ts b/packages/core/src/workflow/lock.test.ts index b903d216f1..0142944ac6 100644 --- a/packages/core/src/workflow/lock.test.ts +++ b/packages/core/src/workflow/lock.test.ts @@ -14,6 +14,7 @@ import type { WorkflowOrchestratorContext } from '../private.js'; import { setWorld } from '../runtime/world.js'; import { createContext } from '../vm/index.js'; import { createLock } from './lock.js'; +import { createSleep } from './sleep.js'; function createLease(): LimitLease { return { @@ -30,22 +31,26 @@ function createLease(): LimitLease { }; } -function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { +function setupWorkflowContext( + events: Event[], + options?: { onUnconsumedEvent?: (event: Event) => void } +): WorkflowOrchestratorContext { const context = createContext({ seed: 'test', fixedTimestamp: 1753481739458, }); const ulid = 
monotonicFactory(() => context.globalThis.Math.random()); const workflowStartedAt = context.globalThis.Date.now(); - return { + const promiseQueueHolder = { current: Promise.resolve() }; + const workflowContext: WorkflowOrchestratorContext = { runId: 'wrun_test', lockPreApproval: undefined, encryptionKey: undefined, globalThis: context.globalThis, advanceTimestamp: vi.fn(), eventsConsumer: new EventsConsumer(events, { - onUnconsumedEvent: () => {}, - getPromiseQueue: () => Promise.resolve(), + onUnconsumedEvent: options?.onUnconsumedEvent ?? (() => {}), + getPromiseQueue: () => promiseQueueHolder.current, }), nextLockIndex: 0, invocationsQueue: new Map(), @@ -54,9 +59,20 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { new Uint8Array(size).map(() => 256 * context.globalThis.Math.random()) ), onWorkflowError: vi.fn(), - promiseQueue: Promise.resolve(), pendingDeliveries: 0, }; + Object.defineProperty(workflowContext, 'promiseQueue', { + get() { + return promiseQueueHolder.current; + }, + set(value: Promise) { + promiseQueueHolder.current = value; + }, + enumerable: true, + configurable: true, + }); + workflowContext.promiseQueue = Promise.resolve(); + return workflowContext; } function asEventResult(event: Event): EventResult { @@ -328,6 +344,64 @@ describe('createLock', () => { }); }); + it('does not orphan wait_created when a replayed lock is immediately followed by sleep', async () => { + const lease = createLease(); + const createEvent = vi.fn(); + const tempCtx = setupWorkflowContext([]); + const waitCorrelationId = `wait_${tempCtx.generateUlid()}`; + const onUnconsumedEvent = vi.fn(); + + setWorld({ + events: { create: createEvent }, + limits: { heartbeat: vi.fn() }, + } as any); + + const correlationId = createLockCorrelationId('wrun_test', 0); + const ctx = setupWorkflowContext( + [ + { + eventId: 'evnt_lock_acquired', + runId: 'wrun_test', + eventType: 'lock_acquired', + correlationId, + eventData: { lease }, + createdAt: new 
Date('2025-01-01T00:00:00.000Z'), + }, + { + eventId: 'evnt_wait_created', + runId: 'wrun_test', + eventType: 'wait_created', + correlationId: waitCorrelationId, + eventData: { + resumeAt: new Date('2025-01-01T00:00:01.000Z'), + }, + createdAt: new Date('2025-01-01T00:00:00.010Z'), + }, + { + eventId: 'evnt_wait_completed', + runId: 'wrun_test', + eventType: 'wait_completed', + correlationId: waitCorrelationId, + createdAt: new Date('2025-01-01T00:00:01.000Z'), + }, + ], + { onUnconsumedEvent } + ); + const lock = createLock(ctx); + const sleep = createSleep(ctx); + + await lock({ + key: lease.key, + concurrency: { max: 1 }, + }); + await sleep(1_000); + await new Promise((resolve) => setTimeout(resolve, 150)); + + expect(createEvent).not.toHaveBeenCalled(); + expect(onUnconsumedEvent).not.toHaveBeenCalled(); + expect(ctx.onWorkflowError).not.toHaveBeenCalled(); + }); + it('rejects heartbeat in workflow scope to preserve replay determinism', async () => { const lease = createLease(); const createEvent = vi diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 473d7ac44c..f18ead94ad 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -209,7 +209,9 @@ export function createLock(ctx: WorkflowOrchestratorContext) { if (resolved) return; resolved = true; ctx.invocationsQueue.delete(state.wakeCorrelationId); - resolve(createLockHandle(state, ctx)); + ctx.promiseQueue = ctx.promiseQueue.then(() => { + resolve(createLockHandle(state, ctx)); + }); }; const suspendWorkflow = () => {