From ce1ed48a77435c41209a524f61826ffd5ffb5507 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Wed, 4 Feb 2026 14:16:06 -0500
Subject: [PATCH 01/16] feat: generate endpoint with SSE streaming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Add batch video generation endpoint with SSE streaming

## Summary

Adds `/api/v1/generate` endpoint for batch video generation with server-side chunking and SSE progress streaming. Supports text-to-video, video-to-video, VACE conditioning, and comprehensive per-chunk parameter scheduling.

This is important for the ComfyUI node wrapper for Scope. It could also conceivably replace `test.py`/`test_vace.py`, or at least their boilerplate code.

## Changes

- **`schema.py`**: Add `GenerateRequest`/`GenerateResponse` models with `EncodedArray` for binary data
- **`generate.py`**: New module handling chunked generation with SSE progress events
- **`app.py`**: Wire up the endpoint
- **`test_generate_endpoint.py`**: Integration tests for v2v, depth, inpainting, LoRA ramps
- **ComfyUI nodes**: Update `ScopeSampler` to use new schema

## Features

### Generation modes
- **Text-to-video**: Generate from prompt alone
- **Video-to-video**: Transform input video with configurable noise scale

### VACE conditioning
- **Reference images**: Style/identity conditioning via image paths
- **Depth/structure guidance**: Pass conditioning frames for structural control
- **Inpainting**: Binary masks specify regions to regenerate vs preserve

### Per-chunk parameter scheduling

All scheduling parameters accept either a single value (applied to all chunks) or a list (applied per-chunk, last value repeats if list is shorter than chunk count).

| Parameter | Type | Description |
|-----------|------|-------------|
| `seed` | `int \| list[int]` | Random seed per chunk |
| `noise_scale` | `float \| list[float]` | V2V noise injection strength |
| `vace_context_scale` | `float \| list[float]` | VACE conditioning influence |
| `lora_scales` | `dict[str, float \| list[float]]` | Per-LoRA strength scheduling |

### Sparse keyframe updates

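These parameters are given as a sparse list of chunk-indexed entries. Each entry is sent to the pipeline only at the chunk it names; chunks without an entry send no update, so the previous value stays in effect (sticky behavior).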

| Parameter | Type | Description |
|-----------|------|-------------|
| `chunk_prompts` | `list[{chunk, text}]` | Prompt changes at specific chunks |
| `first_frames` | `list[{chunk, image}]` | First frame anchors for extension mode |
| `last_frames` | `list[{chunk, image}]` | Last frame anchors for extension mode |
| `vace_ref_images` | `list[{chunk, images}]` | Reference images at specific chunks |

## Design decisions

Some features were left out of this PR for simplicity (e.g., prompt spatial/temporal blending). They can be added in a follow-up.

### SSE streaming

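Clients such as test scripts or ComfyUI nodes need progress and performance feedback while a long generation runs. SSE delivers one `progress` event per chunk without requiring WebSocket infrastructure, followed by a final `complete` event carrying the encoded video (or an `error` event if generation fails):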

```
event: progress
data: {"chunk": 1, "total_chunks": 8, "fps": 4.2, "latency": 2.85}

event: progress
data: {"chunk": 2, "total_chunks": 8, "fps": 4.5, "latency": 2.67}

event: complete
data: {"video_base64": "...", "video_shape": [96, 320, 576, 3], ...}
```

### Server-side chunking

The server determines chunk size from the pipeline, handles frame padding, and manages KV cache initialization. Callers specify total frames and per-chunk parameters—the server handles the rest.

## Example usage

### LoRA strength ramp (dissolve effect)

```python
request = GenerateRequest(
    pipeline_id="longlive",
    prompt="a woman dissolving into particles",
    num_frames=96,  # 8 chunks × 12 frames
    lora_scales={
        "path/to/dissolve.safetensors": [0.0, 0.15, 0.3, 0.5, 0.7, 0.85, 1.0, 1.0]
    },
)
```

### Video-to-video with prompt changes

```python
request = GenerateRequest(
    pipeline_id="longlive",
    prompt="a cat sitting calmly",
    chunk_prompts=[
        {"chunk": 3, "text": "a cat jumping"},
        {"chunk": 6, "text": "a cat landing gracefully"},
    ],
    input_video=EncodedArray(base64="...", shape=[96, 512, 512, 3]),
    noise_scale=0.6,
)
```

### Depth-guided generation

```python
request = GenerateRequest(
    pipeline_id="longlive",
    prompt="a robot walking through a forest",
    vace_frames=EncodedArray(base64="...", shape=[1, 3, 48, 320, 576]),
    vace_context_scale=1.5,
)
```

## Test plan

- [x] `uv run daydream-scope` starts without errors
- [x] V2V generation produces correct output
- [x] VACE depth conditioning works
- [x] VACE inpainting with masks works
- [x] LoRA scale ramping works across chunks
- [x] Per-chunk noise scale scheduling works
- [x] Prompt keyframing updates at correct chunks
- [x] ComfyUI ScopeSampler node works (WIP)
- [x] Test with Longlive
- [x] Same test with StreamDiffusionv2

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/app.py         |  26 +++
 src/scope/server/generate.py    | 324 ++++++++++++++++++++++++++++++++
 src/scope/server/schema.py      | 123 ++++++++++++
 tests/test_generate_endpoint.py | 301 +++++++++++++++++++++++++++++
 4 files changed, 774 insertions(+)
 create mode 100644 src/scope/server/generate.py
 create mode 100644 tests/test_generate_endpoint.py

diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index ac8080f58..c5a79d485 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -50,6 +50,7 @@
     is_kafka_enabled,
     set_kafka_publisher,
 )
+from .generate import generate_video_stream
 from .logs_config import (
     cleanup_old_logs,
     ensure_logs_dir,
@@ -78,6 +79,7 @@
     AssetsResponse,
     CloudConnectRequest,
     CloudStatusResponse,
+    GenerateRequest,
     HardwareInfoResponse,
     HealthResponse,
     IceCandidateRequest,
@@ -1126,6 +1128,30 @@ def download_in_background():
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
+@app.post("/api/v1/generate")
+async def generate_video(
+    request: "GenerateRequest",
+    pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager),
+):
+    """Generate video frames in batch mode with SSE progress streaming."""
+    status_info = await pipeline_manager.get_status_info_async()
+    if status_info["status"] != "loaded":
+        raise HTTPException(
+            status_code=400,
+            detail="Pipeline not loaded. Please load pipeline first.",
+        )
+
+    return StreamingResponse(
+        generate_video_stream(request, pipeline_manager, status_info, logger),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+        },
+    )
+
+
 def is_spout_available() -> bool:
     """Check if Spout is available (native Windows only, not WSL)."""
     return sys.platform == "win32"
diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
new file mode 100644
index 000000000..9ce59d808
--- /dev/null
+++ b/src/scope/server/generate.py
@@ -0,0 +1,324 @@
+"""Video generation service for batch mode with chunked processing."""
+
+import base64
+import gc
+import json
+import time
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+
+# Defaults
+DEFAULT_HEIGHT = 320
+DEFAULT_WIDTH = 576
+DEFAULT_CHUNK_SIZE = 12
+DEFAULT_SEED = 42
+DEFAULT_NOISE_SCALE = 0.7
+PROMPT_WEIGHT = 100
+
+if TYPE_CHECKING:
+    from logging import Logger
+
+    from .pipeline_manager import PipelineManager
+    from .schema import EncodedArray, GenerateRequest
+
+
+def decode_array(encoded: "EncodedArray", dtype: np.dtype) -> np.ndarray:
+    """Decode EncodedArray to numpy array."""
+    data = base64.b64decode(encoded.base64)
+    return np.frombuffer(data, dtype=dtype).reshape(encoded.shape)
+
+
+def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray:
+    """Tile array along axis to reach target length."""
+    current = arr.shape[axis]
+    if current >= target:
+        return arr
+    repeats = (target + current - 1) // current
+    tiled = np.concatenate([arr] * repeats, axis=axis)
+    slices = [slice(None)] * arr.ndim
+    slices[axis] = slice(0, target)
+    return tiled[tuple(slices)]
+
+
+def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray:
+    """Pad array with last frame along axis to reach target size."""
+    current = arr.shape[axis]
+    if current >= target_size:
+        return arr
+    slices = [slice(None)] * arr.ndim
+    slices[axis] = slice(-1, None)
+    last_frame = arr[tuple(slices)]
+    padding = np.repeat(last_frame, target_size - current, axis=axis)
+    return np.concatenate([arr, padding], axis=axis)
+
+
+def build_lookup(specs: list | None, value_attr: str = "image") -> dict:
+    """Build chunk -> value lookup from list of specs."""
+    if not specs:
+        return {}
+    return {spec.chunk: getattr(spec, value_attr) for spec in specs}
+
+
+def get_chunk_value(value, chunk_idx: int, default=None):
+    """Get per-chunk value from scalar or list."""
+    if value is None:
+        return default
+    if isinstance(value, list):
+        return value[chunk_idx] if chunk_idx < len(value) else value[-1]
+    return value
+
+
+def sse_event(event_type: str, data: dict) -> str:
+    """Format a server-sent event."""
+    return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+
+
+@dataclass
+class DecodedInputs:
+    """Decoded and preprocessed inputs for generation."""
+
+    input_video: np.ndarray | None = None
+    vace_frames: np.ndarray | None = None
+    vace_masks: np.ndarray | None = None
+    first_frames: dict[int, str] = field(default_factory=dict)
+    last_frames: dict[int, str] = field(default_factory=dict)
+    ref_images: dict[int, list[str]] = field(default_factory=dict)
+    prompts: dict[int, str] = field(default_factory=dict)
+
+
+def decode_inputs(request: "GenerateRequest", num_frames: int) -> DecodedInputs:
+    """Decode all base64 inputs from request."""
+    inputs = DecodedInputs()
+
+    if request.input_video:
+        inputs.input_video = decode_array(request.input_video, np.uint8)
+        inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0)
+
+    if request.vace_frames:
+        inputs.vace_frames = decode_array(request.vace_frames, np.float32)
+        inputs.vace_frames = loop_to_length(inputs.vace_frames, num_frames, axis=2)
+
+    if request.vace_masks:
+        inputs.vace_masks = decode_array(request.vace_masks, np.float32)
+        inputs.vace_masks = loop_to_length(inputs.vace_masks, num_frames, axis=2)
+
+    inputs.first_frames = build_lookup(request.first_frames, "image")
+    inputs.last_frames = build_lookup(request.last_frames, "image")
+    inputs.ref_images = build_lookup(request.vace_ref_images, "images")
+    inputs.prompts = {0: request.prompt}
+    inputs.prompts.update(build_lookup(request.chunk_prompts, "text"))
+
+    return inputs
+
+
+def build_chunk_kwargs(
+    request: "GenerateRequest",
+    inputs: DecodedInputs,
+    chunk_idx: int,
+    chunk_size: int,
+    start_frame: int,
+    end_frame: int,
+    status_info: dict,
+    device: torch.device,
+    dtype: torch.dtype,
+    logger: "Logger",
+) -> dict:
+    """Build pipeline kwargs for a single chunk."""
+    kwargs = {
+        "height": request.height
+        or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT),
+        "width": request.width
+        or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH),
+        "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED),
+        "init_cache": chunk_idx == 0,
+        "manage_cache": request.manage_cache,
+    }
+
+    # Prompt (sticky behavior - only send when it changes)
+    if chunk_idx in inputs.prompts:
+        kwargs["prompts"] = [
+            {"text": inputs.prompts[chunk_idx], "weight": PROMPT_WEIGHT}
+        ]
+
+    if request.denoising_steps:
+        kwargs["denoising_step_list"] = request.denoising_steps
+
+    # Video-to-video
+    if inputs.input_video is not None:
+        chunk_frames = inputs.input_video[start_frame:end_frame]
+        chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0)
+        kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames]
+        kwargs["noise_scale"] = get_chunk_value(
+            request.noise_scale, chunk_idx, DEFAULT_NOISE_SCALE
+        )
+    else:
+        kwargs["num_frames"] = chunk_size
+
+    # VACE context scale
+    kwargs["vace_context_scale"] = get_chunk_value(
+        request.vace_context_scale, chunk_idx, 1.0
+    )
+
+    # LoRA scales
+    if request.lora_scales:
+        lora_scale_updates = []
+        for path, scale_value in request.lora_scales.items():
+            scale = get_chunk_value(scale_value, chunk_idx, 1.0)
+            lora_scale_updates.append({"path": path, "scale": scale})
+            logger.info(
+                f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}"
+            )
+        if lora_scale_updates:
+            kwargs["lora_scales"] = lora_scale_updates
+
+    # Keyframes
+    if chunk_idx in inputs.first_frames:
+        kwargs["first_frame_image"] = inputs.first_frames[chunk_idx]
+        kwargs["extension_mode"] = (
+            "firstlastframe" if chunk_idx in inputs.last_frames else "firstframe"
+        )
+
+    if chunk_idx in inputs.last_frames:
+        kwargs["last_frame_image"] = inputs.last_frames[chunk_idx]
+        if chunk_idx not in inputs.first_frames:
+            kwargs["extension_mode"] = "lastframe"
+
+    if chunk_idx in inputs.ref_images:
+        kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx]
+
+    # VACE conditioning frames [1, C, T, H, W]
+    if inputs.vace_frames is not None:
+        chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :]
+        chunk = pad_chunk(chunk, chunk_size, axis=2)
+        kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype)
+
+    # VACE masks [1, 1, T, H, W]
+    if inputs.vace_masks is not None:
+        chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :]
+        chunk = pad_chunk(chunk, chunk_size, axis=2)
+        kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype)
+
+    return kwargs
+
+
+def generate_video_stream(
+    request: "GenerateRequest",
+    pipeline_manager: "PipelineManager",
+    status_info: dict,
+    logger: "Logger",
+) -> Iterator[str]:
+    """Generate video frames, yielding SSE events."""
+    try:
+        pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id)
+
+        # Determine chunk size from pipeline
+        has_video = request.input_video is not None
+        requirements = pipeline.prepare(video=[] if has_video else None)
+        chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE
+        num_chunks = (request.num_frames + chunk_size - 1) // chunk_size
+
+        # Decode inputs
+        inputs = decode_inputs(request, request.num_frames)
+
+        # Setup
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        dtype = torch.bfloat16
+        output_chunks = []
+        latency_measures = []
+        fps_measures = []
+
+        for chunk_idx in range(num_chunks):
+            start_frame = chunk_idx * chunk_size
+            end_frame = min(start_frame + chunk_size, request.num_frames)
+            actual_frames = end_frame - start_frame
+
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            kwargs = build_chunk_kwargs(
+                request,
+                inputs,
+                chunk_idx,
+                chunk_size,
+                start_frame,
+                end_frame,
+                status_info,
+                device,
+                dtype,
+                logger,
+            )
+
+            # Run pipeline
+            chunk_start = time.time()
+            with torch.amp.autocast("cuda", dtype=dtype):
+                result = pipeline(**kwargs)
+            chunk_latency = time.time() - chunk_start
+
+            chunk_output = result["video"]
+            num_output_frames = chunk_output.shape[0]
+            chunk_fps = num_output_frames / chunk_latency
+
+            latency_measures.append(chunk_latency)
+            fps_measures.append(chunk_fps)
+
+            logger.info(
+                f"Chunk {chunk_idx + 1}/{num_chunks}: "
+                f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
+            )
+
+            # Trim padding from output
+            if chunk_output.shape[0] > actual_frames:
+                chunk_output = chunk_output[:actual_frames]
+
+            output_chunks.append(chunk_output.detach().cpu())
+
+            yield sse_event(
+                "progress",
+                {
+                    "chunk": chunk_idx + 1,
+                    "total_chunks": num_chunks,
+                    "frames": num_output_frames,
+                    "latency": round(chunk_latency, 3),
+                    "fps": round(chunk_fps, 2),
+                },
+            )
+
+        # Concatenate and encode output
+        output_video = torch.cat(output_chunks, dim=0)
+        output_np = output_video.numpy()
+
+        # Log performance summary
+        if latency_measures:
+            avg_latency = sum(latency_measures) / len(latency_measures)
+            avg_fps = sum(fps_measures) / len(fps_measures)
+            logger.info(
+                f"=== Performance Summary ({num_chunks} chunks) ===\n"
+                f"  Latency - Avg: {avg_latency:.2f}s, "
+                f"Max: {max(latency_measures):.2f}s, Min: {min(latency_measures):.2f}s\n"
+                f"  FPS - Avg: {avg_fps:.2f}, "
+                f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}"
+            )
+
+        video_bytes = output_np.astype(np.float32).tobytes()
+        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+
+        yield sse_event(
+            "complete",
+            {
+                "video_base64": video_base64,
+                "video_shape": list(output_np.shape),
+                "num_frames": output_np.shape[0],
+                "num_chunks": num_chunks,
+                "chunk_size": chunk_size,
+            },
+        )
+
+    except Exception as e:
+        logger.exception("Error generating video")
+        yield sse_event("error", {"error": str(e)})
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index c50127aab..51c201345 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -816,3 +816,126 @@ class ApiKeySetResponse(BaseModel):
 class ApiKeyDeleteResponse(BaseModel):
     success: bool
     message: str
+class ChunkFrameSpec(BaseModel):
+    """Specification for a frame image at a specific chunk."""
+
+    chunk: int = Field(..., ge=0, description="Chunk index")
+    image: str = Field(..., description="Path to image file")
+
+
+class ChunkPromptSpec(BaseModel):
+    """Specification for a prompt at a specific chunk."""
+
+    chunk: int = Field(..., ge=0, description="Chunk index")
+    text: str = Field(..., description="Prompt text for this chunk")
+
+
+class ChunkRefImagesSpec(BaseModel):
+    """Specification for reference images at a specific chunk."""
+
+    chunk: int = Field(default=0, ge=0, description="Chunk index (default: 0)")
+    images: list[str] = Field(..., description="List of reference image paths")
+
+
+class EncodedArray(BaseModel):
+    """Base64-encoded numpy array with shape metadata."""
+
+    base64: str = Field(..., description="Base64-encoded numpy array bytes")
+    shape: list[int] = Field(..., description="Array shape for decoding")
+
+
+class GenerateRequest(BaseModel):
+    """Request for batch video generation."""
+
+    pipeline_id: str = Field(..., description="Pipeline ID to use for generation")
+    prompt: str = Field(..., description="Text prompt for generation (sent on chunk 0)")
+    chunk_prompts: list[ChunkPromptSpec] | None = Field(
+        default=None,
+        description="Prompt changes at later chunks (sticky behavior). The prompt persists until the next specified chunk.",
+    )
+    num_frames: int = Field(
+        default=64,
+        ge=1,
+        le=1024,
+        description="Total number of frames to generate",
+    )
+    height: int | None = Field(
+        default=None,
+        ge=64,
+        le=2048,
+        description="Output height (defaults to pipeline's native resolution)",
+    )
+    width: int | None = Field(
+        default=None,
+        ge=64,
+        le=2048,
+        description="Output width (defaults to pipeline's native resolution)",
+    )
+    seed: int | list[int] = Field(
+        default=42,
+        description="Random seed. Single int applies to all chunks; list applies per-chunk.",
+    )
+    # Video-to-video input (optional)
+    input_video: EncodedArray | None = Field(
+        default=None,
+        description="Input video frames (THWC, uint8). If provided, enables video-to-video mode.",
+    )
+    noise_scale: float | list[float] = Field(
+        default=0.7,
+        description="Noise scale for video-to-video mode. Single float applies to all chunks; list applies per-chunk.",
+    )
+    denoising_steps: list[int] | None = Field(
+        default=None,
+        description="Denoising timesteps (e.g., [1000, 750, 500, 250])",
+    )
+    manage_cache: bool = Field(
+        default=True,
+        description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).",
+    )
+    # Per-chunk parameters
+    lora_scales: dict[str, float | list[float]] | None = Field(
+        default=None,
+        description="LoRA scales by path. Single float applies to all chunks; list applies per-chunk. Example: {'path/to/lora.pt': 0.8} or {'path/to/lora.pt': [0.5, 0.7, 0.9]}",
+    )
+    vace_context_scale: float | list[float] = Field(
+        default=1.0,
+        description="VACE context scale. Single float applies to all chunks; list applies per-chunk.",
+    )
+    # Keyframe specifications (chunk, image) pairs
+    first_frames: list[ChunkFrameSpec] | None = Field(
+        default=None,
+        description="First frame anchors. Each specifies a chunk index and image path to use as that chunk's first frame.",
+    )
+    last_frames: list[ChunkFrameSpec] | None = Field(
+        default=None,
+        description="Last frame anchors. Each specifies a chunk index and image path to use as that chunk's last frame.",
+    )
+    vace_ref_images: list[ChunkRefImagesSpec] | None = Field(
+        default=None,
+        description="Reference images for VACE conditioning. Each specifies a chunk index and list of image paths.",
+    )
+    # VACE conditioning frames/masks (for depth guidance, inpainting, etc.)
+    vace_frames: EncodedArray | None = Field(
+        default=None,
+        description="VACE conditioning frames ([1, C, T, H, W] float32 [-1, 1]). Used for depth guidance, structural control, etc.",
+    )
+    vace_masks: EncodedArray | None = Field(
+        default=None,
+        description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).",
+    )
+
+
+class GenerateResponse(BaseModel):
+    """Response from batch video generation."""
+
+    video_base64: str = Field(
+        ...,
+        description="Base64-encoded output video frames as numpy array bytes (THWC, float32, [0,1] range)",
+    )
+    video_shape: list[int] = Field(
+        ...,
+        description="Shape of output video [T, H, W, C]",
+    )
+    num_frames: int = Field(..., description="Number of frames generated")
+    num_chunks: int = Field(..., description="Number of chunks processed")
+    chunk_size: int = Field(..., description="Frames per chunk")
diff --git a/tests/test_generate_endpoint.py b/tests/test_generate_endpoint.py
new file mode 100644
index 000000000..327db105b
--- /dev/null
+++ b/tests/test_generate_endpoint.py
@@ -0,0 +1,301 @@
+"""Test script for the /api/v1/generate endpoint.
+
+Usage:
+    python test_generate_endpoint.py <test_name>
+    python test_generate_endpoint.py --list
+"""
+
+import base64
+import json
+import sys
+import time
+
+import numpy as np
+import requests
+from diffusers.utils import export_to_video
+
+from scope.core.pipelines.video import load_video
+from scope.server.schema import (
+    GenerateRequest,
+    LongLiveLoadParams,
+    LoRAConfig,
+    LoRAMergeMode,
+    PipelineLoadRequest,
+    PipelineStatusResponse,
+)
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+SERVER_URL = "http://localhost:8000"
+DEFAULT_PIPELINE = "longlive"
+
+# Asset paths (tests skip gracefully if missing)
+LORA = "path/to/a/lora.safetensors"
+TEST_VIDEO = "path/to/test_video.mp4"
+VACE_CONDITIONING_VIDEO = "path/to/depth_video.mp4"
+MASK_VIDEO = "path/to/mask_video.mp4"
+
+# =============================================================================
+# Test Definitions
+# =============================================================================
+
+TESTS = {
+    "lora": {
+        "description": "LoRA strength ramping over chunks",
+        "pipeline": "longlive",
+        "resolution": (576, 320),
+        "num_frames": 96,
+        "prompt": "a woman dissolving into particles, ethereal, magical transformation",
+        "lora": LORA,
+        "lora_ramp": [0.0, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.0],
+        "manage_cache": False,
+    },
+    "v2v": {
+        "description": "Video-to-video transformation",
+        "resolution": (512, 512),
+        "num_frames": 48,
+        "prompt": "A 3D animated scene. A **panda** sitting in the grass, looking around.",
+        "input_video": TEST_VIDEO,
+        "noise_scale": 0.6,
+    },
+    "v2v_lora": {
+        "description": "Video-to-video with LoRA ramp (0 -> 1.5 -> 0)",
+        "resolution": (512, 512),
+        "num_frames": 120,
+        "prompt": "a woman made of ral-dissolve, dissolving into particles",
+        "input_video": TEST_VIDEO,
+        "noise_scale": 0.7,
+        "lora": LORA,
+        "lora_ramp": [0.0, 0.3, 0.6, 1.0, 1.5, 1.5, 1.0, 0.6, 0.3, 0.0],
+    },
+    "vace_conditioning": {
+        "description": "VACE structural conditioning (depth, pose, etc.)",
+        "resolution": (576, 320),
+        "num_frames": 48,
+        "prompt": "a cat walking towards the camera",
+        "vace_frames": VACE_CONDITIONING_VIDEO,
+        "vace_context_scale": 1.5,
+    },
+    "inpainting": {
+        "description": "VACE inpainting with mask",
+        "resolution": (512, 512),
+        "num_frames": 48,
+        "prompt": "fireball doom flames",
+        "vace_frames": TEST_VIDEO,
+        "vace_masks": MASK_VIDEO,
+    },
+}
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+
+def encode_array(arr: np.ndarray) -> dict:
+    """Encode numpy array as EncodedArray dict."""
+    return {
+        "base64": base64.b64encode(arr.tobytes()).decode("utf-8"),
+        "shape": list(arr.shape),
+    }
+
+
+def load_video_for_v2v(path: str, height: int, width: int) -> dict:
+    """Load video as [T, H, W, C] uint8 for video-to-video mode."""
+    tensor = load_video(path, resize_hw=(height, width), normalize=False)
+    arr = tensor.permute(1, 2, 3, 0).numpy().astype(np.uint8)
+    return encode_array(arr)
+
+
+def load_video_for_vace(path: str, height: int, width: int) -> dict:
+    """Load video as [1, C, T, H, W] float32 for VACE conditioning."""
+    tensor = load_video(path, resize_hw=(height, width))
+    arr = tensor.unsqueeze(0).numpy().astype(np.float32)
+    return encode_array(arr)
+
+
+def load_mask_for_vace(path: str, height: int, width: int) -> dict:
+    """Load video as [1, 1, T, H, W] binary mask for VACE inpainting."""
+    tensor = load_video(path, resize_hw=(height, width))
+    arr = (tensor[0:1].unsqueeze(0).numpy() > 0.0).astype(np.float32)
+    return encode_array(arr)
+
+
+def parse_sse_events(response):
+    """Parse SSE events using iter_content (handles large payloads)."""
+    buffer = ""
+    event_type = None
+    data_lines = []
+
+    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
+        buffer += chunk
+        while "\n" in buffer:
+            line, buffer = buffer.split("\n", 1)
+            line = line.rstrip("\r")
+
+            if line.startswith("event:"):
+                event_type = line[6:].strip()
+            elif line.startswith("data:"):
+                data_lines.append(line[5:].strip())
+            elif line == "":
+                if data_lines:
+                    yield (event_type or "message", json.loads("\n".join(data_lines)))
+                event_type = None
+                data_lines = []
+
+
+def wait_for_pipeline(timeout: int = 300):
+    """Wait for pipeline to finish loading."""
+    start = time.time()
+    while time.time() - start < timeout:
+        resp = requests.get(f"{SERVER_URL}/api/v1/pipeline/status")
+        status = PipelineStatusResponse.model_validate(resp.json())
+        if status.status.value == "loaded":
+            return time.time() - start
+        if status.status.value == "error":
+            raise RuntimeError(f"Pipeline failed: {status.error}")
+        time.sleep(1)
+    raise TimeoutError(f"Pipeline did not load within {timeout}s")
+
+
+# =============================================================================
+# Test Runner
+# =============================================================================
+
+
+def run_test(name: str):
+    """Run a single test by name."""
+    if name not in TESTS:
+        print(f"Unknown test: {name}")
+        print(f"Available: {', '.join(TESTS.keys())}")
+        return
+
+    cfg = TESTS[name]
+    width, height = cfg.get("resolution", (576, 320))
+    pipeline_id = cfg.get("pipeline", DEFAULT_PIPELINE)
+
+    print(f"\n{'=' * 60}")
+    print(f"Test: {name}")
+    print(f"Description: {cfg['description']}")
+    print(f"{'=' * 60}")
+
+    # Build LoRA config if specified
+    loras = None
+    lora_scales = None
+    if "lora" in cfg:
+        loras = [
+            LoRAConfig(
+                path=cfg["lora"], scale=0.0, merge_mode=LoRAMergeMode.RUNTIME_PEFT
+            )
+        ]
+        if "lora_ramp" in cfg:
+            lora_scales = {cfg["lora"]: cfg["lora_ramp"]}
+            print(f"LoRA ramp: {cfg['lora_ramp']}")
+
+    # Load pipeline
+    print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...")
+    request = PipelineLoadRequest(
+        pipeline_ids=[pipeline_id],
+        load_params=LongLiveLoadParams(
+            height=height,
+            width=width,
+            loras=loras,
+            lora_merge_mode=LoRAMergeMode.RUNTIME_PEFT
+            if loras
+            else LoRAMergeMode.PERMANENT_MERGE,
+        ),
+    )
+    requests.post(
+        f"{SERVER_URL}/api/v1/pipeline/load", json=request.model_dump(mode="json")
+    ).raise_for_status()
+    load_time = wait_for_pipeline()
+    print(f"Pipeline loaded in {load_time:.1f}s")
+
+    # Load input video if specified
+    input_video = None
+    if "input_video" in cfg:
+        input_video = load_video_for_v2v(cfg["input_video"], height, width)
+        print(f"Input video: {input_video['shape']}")
+
+    # Load VACE frames if specified
+    vace_frames = None
+    if "vace_frames" in cfg:
+        vace_frames = load_video_for_vace(cfg["vace_frames"], height, width)
+        print(f"VACE frames: {vace_frames['shape']}")
+
+    # Load VACE masks if specified
+    vace_masks = None
+    if "vace_masks" in cfg:
+        vace_masks = load_mask_for_vace(cfg["vace_masks"], height, width)
+        print(f"VACE masks: {vace_masks['shape']}")
+
+    # Build and send request
+    gen_request = GenerateRequest(
+        pipeline_id=pipeline_id,
+        prompt=cfg["prompt"],
+        num_frames=cfg["num_frames"],
+        input_video=input_video,
+        noise_scale=cfg.get("noise_scale", 0.7),
+        vace_frames=vace_frames,
+        vace_masks=vace_masks,
+        vace_context_scale=cfg.get("vace_context_scale", 1.0),
+        lora_scales=lora_scales,
+        manage_cache=cfg.get("manage_cache", True),
+    )
+
+    print(f"Generating {cfg['num_frames']} frames...")
+    start = time.time()
+
+    with requests.post(
+        f"{SERVER_URL}/api/v1/generate",
+        json=gen_request.model_dump(exclude_none=True),
+        stream=True,
+        headers={"Accept": "text/event-stream"},
+    ) as resp:
+        resp.raise_for_status()
+        result = None
+        for event_type, data in parse_sse_events(resp):
+            if event_type == "progress":
+                print(
+                    f"  Chunk {data['chunk']}/{data['total_chunks']}: {data['fps']:.1f} fps"
+                )
+            elif event_type == "complete":
+                result = data
+                break
+            elif event_type == "error":
+                raise RuntimeError(f"Generation failed: {data['error']}")
+
+    if result is None:
+        raise RuntimeError("No complete event received")
+
+    # Decode and save
+    video = np.frombuffer(
+        base64.b64decode(result["video_base64"]), dtype=np.float32
+    ).reshape(result["video_shape"])
+
+    output_path = f"test_{name}.mp4"
+    export_to_video(video, output_path, fps=16)
+
+    print(f"\nComplete in {time.time() - start:.1f}s")
+    print(f"Output: {output_path} ({result['video_shape']})")
+
+
+def main():
+    if len(sys.argv) < 2 or sys.argv[1] == "--list":
+        print("Available tests:")
+        for name, cfg in TESTS.items():
+            print(f"  {name:20} - {cfg['description']}")
+        print("\nUsage: python test_generate_endpoint.py <test_name>")
+        print("       python test_generate_endpoint.py all")
+        return
+
+    if sys.argv[1] == "all":
+        for name in TESTS:
+            run_test(name)
+    else:
+        run_test(sys.argv[1])
+
+
+if __name__ == "__main__":
+    main()

From f626b5b440e9501f1d7b496fe8f5e1697401f701 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Wed, 4 Feb 2026 14:46:46 -0500
Subject: [PATCH 02/16] remove edge case padding

enables rife

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 9ce59d808..0e270ee01 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -235,7 +235,6 @@ def generate_video_stream(
         for chunk_idx in range(num_chunks):
             start_frame = chunk_idx * chunk_size
             end_frame = min(start_frame + chunk_size, request.num_frames)
-            actual_frames = end_frame - start_frame
 
             gc.collect()
             if torch.cuda.is_available():
@@ -272,10 +271,6 @@ def generate_video_stream(
                 f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
             )
 
-            # Trim padding from output
-            if chunk_output.shape[0] > actual_frames:
-                chunk_output = chunk_output[:actual_frames]
-
             output_chunks.append(chunk_output.detach().cpu())
 
             yield sse_event(

From 4e38e70722b8afaf640b15990fd3c4f62daeeb18 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Wed, 4 Feb 2026 15:21:57 -0500
Subject: [PATCH 03/16] rm longliveloadparams

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 tests/test_generate_endpoint.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/tests/test_generate_endpoint.py b/tests/test_generate_endpoint.py
index 327db105b..c184e5da8 100644
--- a/tests/test_generate_endpoint.py
+++ b/tests/test_generate_endpoint.py
@@ -17,7 +17,6 @@
 from scope.core.pipelines.video import load_video
 from scope.server.schema import (
     GenerateRequest,
-    LongLiveLoadParams,
     LoRAConfig,
     LoRAMergeMode,
     PipelineLoadRequest,
@@ -195,17 +194,11 @@ def run_test(name: str):
 
     # Load pipeline
     print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...")
-    request = PipelineLoadRequest(
-        pipeline_ids=[pipeline_id],
-        load_params=LongLiveLoadParams(
-            height=height,
-            width=width,
-            loras=loras,
-            lora_merge_mode=LoRAMergeMode.RUNTIME_PEFT
-            if loras
-            else LoRAMergeMode.PERMANENT_MERGE,
-        ),
-    )
+    load_params = {"height": height, "width": width}
+    if loras:
+        load_params["loras"] = [lora.model_dump() for lora in loras]
+        load_params["lora_merge_mode"] = "runtime_peft"
+    request = PipelineLoadRequest(pipeline_ids=[pipeline_id], load_params=load_params)
     requests.post(
         f"{SERVER_URL}/api/v1/pipeline/load", json=request.model_dump(mode="json")
     ).raise_for_status()

From 7f5c9fd390dd60424b2ce79b1bd363617e2e8524 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Wed, 4 Feb 2026 15:22:13 -0500
Subject: [PATCH 04/16] move scripts

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 {tests => scripts}/test_generate_endpoint.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
 rename {tests => scripts}/test_generate_endpoint.py (96%)

diff --git a/tests/test_generate_endpoint.py b/scripts/test_generate_endpoint.py
similarity index 96%
rename from tests/test_generate_endpoint.py
rename to scripts/test_generate_endpoint.py
index c184e5da8..986133100 100644
--- a/tests/test_generate_endpoint.py
+++ b/scripts/test_generate_endpoint.py
@@ -31,10 +31,10 @@
 DEFAULT_PIPELINE = "longlive"
 
 # Asset paths (tests skip gracefully if missing)
-LORA = "path/to/a/lora.safetensors"
-TEST_VIDEO = "path/to/test_video.mp4"
-VACE_CONDITIONING_VIDEO = "path/to/depth_video.mp4"
-MASK_VIDEO = "path/to/mask_video.mp4"
+LORA = r"C:\Users\ryanf\.daydream-scope\models\lora\lora\output\model_245889_dissolve_imgvid\dissolve-000064.safetensors"
+TEST_VIDEO = r"frontend\public\assets\test.mp4"
+VACE_CONDITIONING_VIDEO = r"controlnet_test\control_frames_depth.mp4"
+MASK_VIDEO = r"src\scope\core\pipelines\longlive\vace_tests\static_mask_half_white_half_black.mp4"
 
 # =============================================================================
 # Test Definitions

From 3da2f2f40dd47e1a93c51c4442946cff339aa7ff Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Thu, 5 Feb 2026 15:55:41 -0500
Subject: [PATCH 05/16] Add file-based transfer for generate endpoint

- Reuse RecordingManager temp file pattern for large video I/O
- Add POST /generate/upload and GET /generate/download endpoints
- Write output chunks incrementally to disk (constant memory)
- Add generate_input/generate_output prefixes to TEMP_FILE_PREFIXES

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/app.py       | 133 +++++++++++++++++++++++
 src/scope/server/generate.py  | 196 ++++++++++++++++++++++------------
 src/scope/server/recording.py |   2 +
 src/scope/server/schema.py    |  37 +++++--
 4 files changed, 295 insertions(+), 73 deletions(-)

diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index c5a79d485..916ce9c0c 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -1152,6 +1152,139 @@ async def generate_video(
     )
 
 
+@app.post("/api/v1/generate/upload")
+async def upload_video_for_generate(request: Request):
+    """Upload a video for batch generation (file-based transfer for large videos).
+
+    Accepts raw binary video data with metadata headers:
+    - X-Video-Frames: number of frames (T)
+    - X-Video-Height: frame height (H)
+    - X-Video-Width: frame width (W)
+    - X-Video-Channels: number of channels (C), typically 3 for RGB
+
+    Video data should be raw uint8 bytes in THWC order.
+
+    Returns input_path to use in the generate request.
+    """
+    from .recording import TEMP_FILE_PREFIXES, RecordingManager
+    from .schema import VideoUploadResponse
+
+    try:
+        # Get video dimensions from headers
+        num_frames = int(request.headers.get("X-Video-Frames", 0))
+        height = int(request.headers.get("X-Video-Height", 0))
+        width = int(request.headers.get("X-Video-Width", 0))
+        channels = int(request.headers.get("X-Video-Channels", 3))
+
+        if not all([num_frames, height, width]):
+            raise HTTPException(
+                status_code=400,
+                detail="Missing required headers: X-Video-Frames, X-Video-Height, X-Video-Width",
+            )
+
+        expected_size = num_frames * height * width * channels
+        shape = (num_frames, height, width, channels)
+
+        # Create temp file (reuse recording pattern)
+        file_path = RecordingManager._create_temp_file(
+            ".bin", TEMP_FILE_PREFIXES["generate_input"]
+        )
+
+        # Stream body to file
+        with open(file_path, "wb") as f:
+            # Write header: ndim (4 bytes) + shape (ndim * 4 bytes)
+            f.write(len(shape).to_bytes(4, "little"))
+            for dim in shape:
+                f.write(dim.to_bytes(4, "little"))
+
+            # Stream video data
+            bytes_written = 0
+            async for chunk in request.stream():
+                f.write(chunk)
+                bytes_written += len(chunk)
+
+        if bytes_written != expected_size:
+            Path(file_path).unlink(missing_ok=True)
+            raise HTTPException(
+                status_code=400,
+                detail=f"Video data size mismatch: expected {expected_size}, got {bytes_written}",
+            )
+
+        logger.info(f"Uploaded video: {file_path} (shape: {shape})")
+
+        return VideoUploadResponse(
+            input_path=file_path,
+            num_frames=num_frames,
+            shape=list(shape),
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error uploading video: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+@app.get("/api/v1/generate/download")
+async def download_generated_video(
+    path: str = Query(..., description="Path to output video file"),
+    background_tasks: BackgroundTasks = None,
+):
+    """Download a generated video by path.
+
+    Returns raw binary video data with metadata headers:
+    - X-Video-Frames: number of frames (T)
+    - X-Video-Height: frame height (H)
+    - X-Video-Width: frame width (W)
+    - X-Video-Channels: number of channels (C)
+
+    Video data is raw uint8 bytes in THWC order.
+    """
+    import tempfile
+
+    from .recording import TEMP_FILE_PREFIXES, cleanup_temp_file
+
+    try:
+        file_path = Path(path)
+
+        # Security: only allow files in temp dir with our prefix
+        temp_dir = Path(tempfile.gettempdir())
+        if not file_path.is_relative_to(temp_dir):
+            raise HTTPException(status_code=403, detail="Invalid file path")
+        if not file_path.name.startswith(TEMP_FILE_PREFIXES["generate_output"]):
+            raise HTTPException(status_code=403, detail="Invalid file path")
+
+        if not file_path.exists():
+            raise HTTPException(status_code=404, detail="Output video not found")
+
+        # Read header to get shape
+        with open(file_path, "rb") as f:
+            ndim = int.from_bytes(f.read(4), "little")
+            shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim))
+
+        # Schedule cleanup after download
+        if background_tasks:
+            background_tasks.add_task(cleanup_temp_file, str(file_path))
+
+        # Return file with metadata headers
+        return FileResponse(
+            file_path,
+            media_type="application/octet-stream",
+            headers={
+                "X-Video-Frames": str(shape[0]),
+                "X-Video-Height": str(shape[1]),
+                "X-Video-Width": str(shape[2]),
+                "X-Video-Channels": str(shape[3]) if len(shape) > 3 else "3",
+            },
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error downloading generated video: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
 def is_spout_available() -> bool:
     """Check if Spout is available (native Windows only, not WSL)."""
     return sys.platform == "win32"
diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 0e270ee01..64c71dc9c 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -91,11 +91,34 @@ class DecodedInputs:
     prompts: dict[int, str] = field(default_factory=dict)
 
 
-def decode_inputs(request: "GenerateRequest", num_frames: int) -> DecodedInputs:
-    """Decode all base64 inputs from request."""
+def load_video_from_file(file_path: str) -> np.ndarray:
+    """Load video from temp file.
+
+    Args:
+        file_path: Path to video file with header
+
+    Returns:
+        Video array [T, H, W, C] uint8
+    """
+    with open(file_path, "rb") as f:
+        ndim = int.from_bytes(f.read(4), "little")
+        shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim))
+        data = np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)
+    return data
+
+
+def decode_inputs(
+    request: "GenerateRequest", num_frames: int, logger: "Logger"
+) -> DecodedInputs:
+    """Decode all inputs from request (base64 or file-based)."""
     inputs = DecodedInputs()
 
-    if request.input_video:
+    # Handle input video - either from file path or base64
+    if request.input_path:
+        logger.info(f"Loading input video from file: {request.input_path}")
+        inputs.input_video = load_video_from_file(request.input_path)
+        inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0)
+    elif request.input_video:
         inputs.input_video = decode_array(request.input_video, np.uint8)
         inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0)
 
@@ -212,81 +235,121 @@ def generate_video_stream(
     status_info: dict,
     logger: "Logger",
 ) -> Iterator[str]:
-    """Generate video frames, yielding SSE events."""
+    """Generate video frames, yielding SSE events.
+
+    Writes output to temp file incrementally, returns output_path for download.
+    """
     try:
         pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id)
 
         # Determine chunk size from pipeline
-        has_video = request.input_video is not None
+        has_video = request.input_video is not None or request.input_path is not None
         requirements = pipeline.prepare(video=[] if has_video else None)
         chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE
         num_chunks = (request.num_frames + chunk_size - 1) // chunk_size
 
-        # Decode inputs
-        inputs = decode_inputs(request, request.num_frames)
+        # Decode inputs (supports both file-based and base64)
+        inputs = decode_inputs(request, request.num_frames, logger)
 
         # Setup
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         dtype = torch.bfloat16
-        output_chunks = []
         latency_measures = []
         fps_measures = []
 
-        for chunk_idx in range(num_chunks):
-            start_frame = chunk_idx * chunk_size
-            end_frame = min(start_frame + chunk_size, request.num_frames)
-
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            kwargs = build_chunk_kwargs(
-                request,
-                inputs,
-                chunk_idx,
-                chunk_size,
-                start_frame,
-                end_frame,
-                status_info,
-                device,
-                dtype,
-                logger,
-            )
-
-            # Run pipeline
-            chunk_start = time.time()
-            with torch.amp.autocast("cuda", dtype=dtype):
-                result = pipeline(**kwargs)
-            chunk_latency = time.time() - chunk_start
-
-            chunk_output = result["video"]
-            num_output_frames = chunk_output.shape[0]
-            chunk_fps = num_output_frames / chunk_latency
-
-            latency_measures.append(chunk_latency)
-            fps_measures.append(chunk_fps)
+        # Create output file for incremental writing (reuse recording pattern)
+        from .recording import TEMP_FILE_PREFIXES, RecordingManager
 
-            logger.info(
-                f"Chunk {chunk_idx + 1}/{num_chunks}: "
-                f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
-            )
-
-            output_chunks.append(chunk_output.detach().cpu())
-
-            yield sse_event(
-                "progress",
-                {
-                    "chunk": chunk_idx + 1,
-                    "total_chunks": num_chunks,
-                    "frames": num_output_frames,
-                    "latency": round(chunk_latency, 3),
-                    "fps": round(chunk_fps, 2),
-                },
-            )
-
-        # Concatenate and encode output
-        output_video = torch.cat(output_chunks, dim=0)
-        output_np = output_video.numpy()
+        output_file_path = RecordingManager._create_temp_file(
+            ".bin", TEMP_FILE_PREFIXES["generate_output"]
+        )
+        output_file = open(output_file_path, "wb")
+
+        # We'll write a placeholder header, then update it at the end
+        # Header format: ndim (4 bytes) + shape (4 * ndim bytes)
+        # For video [T, H, W, C], that's 4 + 16 = 20 bytes
+        header_size = 4 + 4 * 4  # ndim + 4 dimensions
+        output_file.write(b"\x00" * header_size)  # Placeholder
+
+        total_frames = 0
+        video_height = None
+        video_width = None
+        video_channels = None
+
+        try:
+            for chunk_idx in range(num_chunks):
+                start_frame = chunk_idx * chunk_size
+                end_frame = min(start_frame + chunk_size, request.num_frames)
+
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+                kwargs = build_chunk_kwargs(
+                    request,
+                    inputs,
+                    chunk_idx,
+                    chunk_size,
+                    start_frame,
+                    end_frame,
+                    status_info,
+                    device,
+                    dtype,
+                    logger,
+                )
+
+                # Run pipeline
+                chunk_start = time.time()
+                with torch.amp.autocast("cuda", dtype=dtype):
+                    result = pipeline(**kwargs)
+                chunk_latency = time.time() - chunk_start
+
+                chunk_output = result["video"]
+                num_output_frames = chunk_output.shape[0]
+                chunk_fps = num_output_frames / chunk_latency
+
+                latency_measures.append(chunk_latency)
+                fps_measures.append(chunk_fps)
+
+                logger.info(
+                    f"Chunk {chunk_idx + 1}/{num_chunks}: "
+                    f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
+                )
+
+                # Write chunk to file immediately (convert to uint8)
+                chunk_np = chunk_output.detach().cpu().numpy()
+                chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8)
+                output_file.write(chunk_uint8.tobytes())
+
+                # Track dimensions
+                total_frames += num_output_frames
+                if video_height is None:
+                    video_height = chunk_np.shape[1]
+                    video_width = chunk_np.shape[2]
+                    video_channels = chunk_np.shape[3]
+
+                yield sse_event(
+                    "progress",
+                    {
+                        "chunk": chunk_idx + 1,
+                        "total_chunks": num_chunks,
+                        "frames": num_output_frames,
+                        "latency": round(chunk_latency, 3),
+                        "fps": round(chunk_fps, 2),
+                    },
+                )
+
+            # Update header with actual shape
+            output_file.seek(0)
+            shape = (total_frames, video_height, video_width, video_channels)
+            output_file.write(len(shape).to_bytes(4, "little"))
+            for dim in shape:
+                output_file.write(dim.to_bytes(4, "little"))
+
+        finally:
+            output_file.close()
+
+        logger.info(f"Output video saved: {output_file_path}")
 
         # Log performance summary
         if latency_measures:
@@ -300,15 +363,14 @@ def generate_video_stream(
                 f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}"
             )
 
-        video_bytes = output_np.astype(np.float32).tobytes()
-        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        output_shape = [total_frames, video_height, video_width, video_channels]
 
         yield sse_event(
             "complete",
             {
-                "video_base64": video_base64,
-                "video_shape": list(output_np.shape),
-                "num_frames": output_np.shape[0],
+                "output_path": output_file_path,
+                "video_shape": output_shape,
+                "num_frames": total_frames,
                 "num_chunks": num_chunks,
                 "chunk_size": chunk_size,
             },
diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py
index 5ac39fc2b..839109a05 100644
--- a/src/scope/server/recording.py
+++ b/src/scope/server/recording.py
@@ -17,6 +17,8 @@
 TEMP_FILE_PREFIXES = {
     "recording": "scope_recording_",
     "download": "scope_download_",
+    "generate_input": "scope_gen_input_",
+    "generate_output": "scope_gen_output_",
 }
 
 # Environment variables
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index 51c201345..3e785f685 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -844,6 +844,16 @@ class EncodedArray(BaseModel):
     shape: list[int] = Field(..., description="Array shape for decoding")
 
 
+class VideoUploadResponse(BaseModel):
+    """Response after uploading a video for generation."""
+
+    input_path: str = Field(
+        ..., description="Path to uploaded video file for generate request"
+    )
+    num_frames: int = Field(..., description="Number of frames in uploaded video")
+    shape: list[int] = Field(..., description="Video shape [T, H, W, C]")
+
+
 class GenerateRequest(BaseModel):
     """Request for batch video generation."""
 
@@ -875,10 +885,14 @@ class GenerateRequest(BaseModel):
         default=42,
         description="Random seed. Single int applies to all chunks; list applies per-chunk.",
     )
-    # Video-to-video input (optional)
+    # Video-to-video input (optional) - two mutually exclusive options
     input_video: EncodedArray | None = Field(
         default=None,
-        description="Input video frames (THWC, uint8). If provided, enables video-to-video mode.",
+        description="Input video frames (THWC, uint8). If provided, enables video-to-video mode. For large videos, use input_path instead.",
+    )
+    input_path: str | None = Field(
+        default=None,
+        description="Path to uploaded video file (from /generate/upload). Alternative to input_video for large files.",
     )
     noise_scale: float | list[float] = Field(
         default=0.7,
@@ -926,11 +940,22 @@ class GenerateRequest(BaseModel):
 
 
 class GenerateResponse(BaseModel):
-    """Response from batch video generation."""
+    """Response from batch video generation.
 
-    video_base64: str = Field(
-        ...,
-        description="Base64-encoded output video frames as numpy array bytes (THWC, float32, [0,1] range)",
+    Supports two modes:
+    - Legacy: video_base64 contains the full video (for small videos)
+    - File-based: output_path references a downloadable file (for large videos)
+    """
+
+    # File-based output (preferred for large videos)
+    output_path: str | None = Field(
+        default=None,
+        description="Path to output video file for download via /generate/download. Preferred for large videos.",
+    )
+    # Legacy base64 output (kept for backwards compatibility)
+    video_base64: str | None = Field(
+        default=None,
+        description="Base64-encoded output video frames (THWC, uint8). Deprecated for large videos, use output_path.",
     )
     video_shape: list[int] = Field(
         ...,

From 4791f8efe1ed4c055916d0456c12ae422aa4ae5d Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 13 Feb 2026 14:55:40 -0500
Subject: [PATCH 06/16] add noise controller, bias, use vace input,
 interpolation method

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 16 ++++++++++++++++
 src/scope/server/schema.py   | 18 ++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 64c71dc9c..b7f980592 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -187,6 +187,22 @@ def build_chunk_kwargs(
         request.vace_context_scale, chunk_idx, 1.0
     )
 
+    # Noise controller
+    if request.noise_controller is not None:
+        kwargs["noise_controller"] = request.noise_controller
+
+    # KV cache attention bias
+    kv_bias = get_chunk_value(request.kv_cache_attention_bias, chunk_idx)
+    if kv_bias is not None:
+        kwargs["kv_cache_attention_bias"] = kv_bias
+
+    # Prompt interpolation method
+    kwargs["prompt_interpolation_method"] = request.prompt_interpolation_method
+
+    # VACE use input video
+    if request.vace_use_input_video is not None:
+        kwargs["vace_use_input_video"] = request.vace_use_input_video
+
     # LoRA scales
     if request.lora_scales:
         lora_scale_updates = []
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index 3e785f685..4b590a53a 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -816,6 +816,8 @@ class ApiKeySetResponse(BaseModel):
 class ApiKeyDeleteResponse(BaseModel):
     success: bool
     message: str
+
+
 class ChunkFrameSpec(BaseModel):
     """Specification for a frame image at a specific chunk."""
 
@@ -906,6 +908,22 @@ class GenerateRequest(BaseModel):
         default=True,
         description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).",
     )
+    noise_controller: bool | None = Field(
+        default=None,
+        description="Enable automatic noise scale adjustment based on motion detection.",
+    )
+    kv_cache_attention_bias: float | list[float] | None = Field(
+        default=None,
+        description="Controls reliance on past frames in cache. Lower values mitigate error accumulation. Single float applies to all chunks; list applies per-chunk. Typical values: 0.3-0.7 moderate, 0.1-0.2 strong.",
+    )
+    prompt_interpolation_method: Literal["linear", "slerp"] = Field(
+        default="linear",
+        description="Spatial interpolation method for blending multiple prompts: linear (weighted average) or slerp (spherical).",
+    )
+    vace_use_input_video: bool | None = Field(
+        default=None,
+        description="When enabled in video-to-video mode, input video is used for VACE conditioning instead of latent initialization.",
+    )
     # Per-chunk parameters
     lora_scales: dict[str, float | list[float]] | None = Field(
         default=None,

From c23446dd99e7f18bf0ca6127b039fa44ff3caea7 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 13 Feb 2026 15:48:44 -0500
Subject: [PATCH 07/16] cancellation

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/app.py      | 11 ++++++++++-
 src/scope/server/generate.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index 916ce9c0c..61e79d1d8 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -45,12 +45,12 @@
     VIDEO_EXTENSIONS,
     iter_files,
 )
+from .generate import generate_video_stream
 from .kafka_publisher import (
     KafkaPublisher,
     is_kafka_enabled,
     set_kafka_publisher,
 )
-from .generate import generate_video_stream
 from .logs_config import (
     cleanup_old_logs,
     ensure_logs_dir,
@@ -1152,6 +1152,15 @@ async def generate_video(
     )
 
 
+@app.post("/api/v1/generate/cancel")
+async def cancel_generate():
+    """Cancel the current video generation after the current chunk completes."""
+    from .generate import cancel_generation
+
+    cancel_generation()
+    return {"status": "cancelling"}
+
+
 @app.post("/api/v1/generate/upload")
 async def upload_video_for_generate(request: Request):
     """Upload a video for batch generation (file-based transfer for large videos).
diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index b7f980592..55d6d4a4c 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -3,6 +3,7 @@
 import base64
 import gc
 import json
+import threading
 import time
 from collections.abc import Iterator
 from dataclasses import dataclass, field
@@ -12,6 +13,20 @@
 import numpy as np
 import torch
 
+# Cancellation support (single-client, so one event suffices)
+_cancel_event = threading.Event()
+
+
+def cancel_generation():
+    """Signal the current generation to stop after the current chunk."""
+    _cancel_event.set()
+
+
+def is_generation_cancelled() -> bool:
+    """Check if cancellation has been requested."""
+    return _cancel_event.is_set()
+
+
 # Defaults
 DEFAULT_HEIGHT = 320
 DEFAULT_WIDTH = 576
@@ -255,6 +270,8 @@ def generate_video_stream(
 
     Writes output to temp file incrementally, returns output_path for download.
     """
+    _cancel_event.clear()
+
     try:
         pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id)
 
@@ -294,6 +311,18 @@ def generate_video_stream(
 
         try:
             for chunk_idx in range(num_chunks):
+                if _cancel_event.is_set():
+                    logger.info("Generation cancelled by user")
+                    yield sse_event(
+                        "cancelled",
+                        {
+                            "chunk": chunk_idx,
+                            "total_chunks": num_chunks,
+                            "frames_completed": total_frames,
+                        },
+                    )
+                    return
+
                 start_frame = chunk_idx * chunk_size
                 end_frame = min(start_frame + chunk_size, request.num_frames)
 

From 5e6a9653fcdb9681e049c6da2861d72a190d2c90 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 13 Feb 2026 15:54:56 -0500
Subject: [PATCH 08/16] temp file cleanup

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py  | 9 +++++++++
 src/scope/server/recording.py | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 55d6d4a4c..28dee1ca2 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -424,3 +424,12 @@ def generate_video_stream(
     except Exception as e:
         logger.exception("Error generating video")
         yield sse_event("error", {"error": str(e)})
+
+    finally:
+        # Clean up uploaded input file
+        if request.input_path:
+            try:
+                Path(request.input_path).unlink(missing_ok=True)
+                logger.info(f"Cleaned up input file: {request.input_path}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up input file: {e}")
diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py
index 839109a05..bd06a3bca 100644
--- a/src/scope/server/recording.py
+++ b/src/scope/server/recording.py
@@ -439,6 +439,8 @@ def cleanup_recording_files():
     patterns = [
         f"{TEMP_FILE_PREFIXES['recording']}*.mp4",
         f"{TEMP_FILE_PREFIXES['download']}*.mp4",
+        f"{TEMP_FILE_PREFIXES['generate_input']}*.bin",
+        f"{TEMP_FILE_PREFIXES['generate_output']}*.bin",
     ]
 
     deleted_count = 0

From 090a0ea07c86a110ee0d29d2e75256497285e9ff Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Mon, 16 Feb 2026 10:21:21 -0500
Subject: [PATCH 09/16] prompt blending

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 49 ++++++++++++++++++++++++++++++------
 src/scope/server/schema.py   | 47 +++++++++++++++++++++++++++++++---
 2 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 28dee1ca2..d97494f5f 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -103,7 +103,8 @@ class DecodedInputs:
     first_frames: dict[int, str] = field(default_factory=dict)
     last_frames: dict[int, str] = field(default_factory=dict)
     ref_images: dict[int, list[str]] = field(default_factory=dict)
-    prompts: dict[int, str] = field(default_factory=dict)
+    prompts: dict[int, list[dict]] = field(default_factory=dict)
+    transitions: dict[int, dict] = field(default_factory=dict)
 
 
 def load_video_from_file(file_path: str) -> np.ndarray:
@@ -148,8 +149,36 @@ def decode_inputs(
     inputs.first_frames = build_lookup(request.first_frames, "image")
     inputs.last_frames = build_lookup(request.last_frames, "image")
     inputs.ref_images = build_lookup(request.vace_ref_images, "images")
-    inputs.prompts = {0: request.prompt}
-    inputs.prompts.update(build_lookup(request.chunk_prompts, "text"))
+    # Normalize prompt to weighted list format
+    if isinstance(request.prompt, str):
+        inputs.prompts = {0: [{"text": request.prompt, "weight": PROMPT_WEIGHT}]}
+    else:
+        inputs.prompts = {
+            0: [{"text": p.text, "weight": p.weight} for p in request.prompt]
+        }
+
+    # Chunk prompts: support both text and weighted prompt lists
+    if request.chunk_prompts:
+        for spec in request.chunk_prompts:
+            if spec.prompts:
+                inputs.prompts[spec.chunk] = [
+                    {"text": p.text, "weight": p.weight} for p in spec.prompts
+                ]
+            elif spec.text:
+                inputs.prompts[spec.chunk] = [
+                    {"text": spec.text, "weight": PROMPT_WEIGHT}
+                ]
+
+    # Build transitions lookup
+    if request.transitions:
+        for t in request.transitions:
+            inputs.transitions[t.chunk] = {
+                "target_prompts": [
+                    {"text": p.text, "weight": p.weight} for p in t.target_prompts
+                ],
+                "num_steps": t.num_steps,
+                "temporal_interpolation_method": t.temporal_interpolation_method,
+            }
 
     return inputs
 
@@ -173,15 +202,21 @@ def build_chunk_kwargs(
         "width": request.width
         or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH),
         "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED),
-        "init_cache": chunk_idx == 0,
+        "init_cache": chunk_idx == 0
+        or (
+            request.cache_reset_chunks is not None
+            and chunk_idx in request.cache_reset_chunks
+        ),
         "manage_cache": request.manage_cache,
     }
 
     # Prompt (sticky behavior - only send when it changes)
     if chunk_idx in inputs.prompts:
-        kwargs["prompts"] = [
-            {"text": inputs.prompts[chunk_idx], "weight": PROMPT_WEIGHT}
-        ]
+        kwargs["prompts"] = inputs.prompts[chunk_idx]
+
+    # Temporal transition
+    if chunk_idx in inputs.transitions:
+        kwargs["transition"] = inputs.transitions[chunk_idx]
 
     if request.denoising_steps:
         kwargs["denoising_step_list"] = request.denoising_steps
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index 4b590a53a..07aa9c223 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -826,10 +826,38 @@ class ChunkFrameSpec(BaseModel):
 
 
 class ChunkPromptSpec(BaseModel):
-    """Specification for a prompt at a specific chunk."""
+    """Specification for a prompt at a specific chunk.
+
+    Supports both simple text and weighted prompt lists for spatial blending.
+    """
 
     chunk: int = Field(..., ge=0, description="Chunk index")
-    text: str = Field(..., description="Prompt text for this chunk")
+    text: str | None = Field(
+        default=None,
+        description="Simple prompt text for this chunk (mutually exclusive with prompts)",
+    )
+    prompts: list[PromptItem] | None = Field(
+        default=None,
+        description="Weighted prompt list for spatial blending at this chunk (mutually exclusive with text)",
+    )
+
+
+class ChunkTransitionSpec(BaseModel):
+    """Specification for a temporal transition starting at a specific chunk."""
+
+    chunk: int = Field(..., ge=0, description="Chunk index where transition starts")
+    target_prompts: list[PromptItem] = Field(
+        ..., description="Target prompt blend to interpolate to"
+    )
+    num_steps: int = Field(
+        default=4,
+        ge=0,
+        description="Number of generation calls to transition over (0 = instant)",
+    )
+    temporal_interpolation_method: Literal["linear", "slerp"] = Field(
+        default="linear",
+        description="Method for temporal interpolation between blends across frames",
+    )
 
 
 class ChunkRefImagesSpec(BaseModel):
@@ -860,10 +888,17 @@ class GenerateRequest(BaseModel):
     """Request for batch video generation."""
 
     pipeline_id: str = Field(..., description="Pipeline ID to use for generation")
-    prompt: str = Field(..., description="Text prompt for generation (sent on chunk 0)")
+    prompt: str | list[PromptItem] = Field(
+        ...,
+        description="Text prompt for generation (sent on chunk 0). Can be a simple string or a list of weighted prompts for spatial blending.",
+    )
     chunk_prompts: list[ChunkPromptSpec] | None = Field(
         default=None,
-        description="Prompt changes at later chunks (sticky behavior). The prompt persists until the next specified chunk.",
+        description="Prompt changes at later chunks (sticky behavior). Each entry supports simple text or weighted prompt lists.",
+    )
+    transitions: list[ChunkTransitionSpec] | None = Field(
+        default=None,
+        description="Temporal transitions at specific chunks. Each specifies a target prompt blend and number of interpolation steps.",
     )
     num_frames: int = Field(
         default=64,
@@ -908,6 +943,10 @@ class GenerateRequest(BaseModel):
         default=True,
         description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).",
     )
+    cache_reset_chunks: list[int] | None = Field(
+        default=None,
+        description="List of chunk indices where the KV cache should be forcibly reset (init_cache=True). Chunk 0 always resets.",
+    )
     noise_controller: bool | None = Field(
         default=None,
         description="Enable automatic noise scale adjustment based on motion detection.",

From 7741692ef7a3cc773125330fb7134918ff6046f7 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Mon, 16 Feb 2026 10:27:49 -0500
Subject: [PATCH 10/16] additional per-chunk logging and temp file cleanup

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 106 +++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index d97494f5f..4b3a268a2 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -306,6 +306,8 @@ def generate_video_stream(
     Writes output to temp file incrementally, returns output_path for download.
     """
     _cancel_event.clear()
+    output_file_path = None
+    completed = False
 
     try:
         pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id)
@@ -378,6 +380,101 @@ def generate_video_stream(
                     logger,
                 )
 
+                # Log chunk operations
+                logger.info(
+                    f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}"
+                )
+
+                # Cache management
+                if kwargs.get("init_cache"):
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)"
+                    )
+
+                # Prompt updates
+                if "prompts" in kwargs:
+                    prompt_texts = [p["text"] for p in kwargs["prompts"]]
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}"
+                    )
+
+                # Temporal transitions
+                if "transition" in kwargs:
+                    target_texts = [
+                        p["text"] for p in kwargs["transition"]["target_prompts"]
+                    ]
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} "
+                        f"over {kwargs['transition']['num_steps']} steps "
+                        f"(method: {kwargs['transition']['temporal_interpolation_method']})"
+                    )
+
+                # Keyframes
+                if "first_frame_image" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe"
+                    )
+                if "last_frame_image" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe"
+                    )
+                if "extension_mode" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}"
+                    )
+
+                # VACE
+                if "vace_ref_images" in kwargs:
+                    num_refs = len(kwargs["vace_ref_images"])
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Using {num_refs} VACE reference images"
+                    )
+                if "vace_input_frames" in kwargs:
+                    vace_shape = kwargs["vace_input_frames"].shape
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {vace_shape}"
+                    )
+                if "vace_input_masks" in kwargs:
+                    mask_shape = kwargs["vace_input_masks"].shape
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {mask_shape}"
+                    )
+                if (
+                    "vace_context_scale" in kwargs
+                    and kwargs["vace_context_scale"] != 1.0
+                ):
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}"
+                    )
+                if "vace_use_input_video" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}"
+                    )
+
+                # Video-to-video
+                if "video" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}"
+                    )
+                elif "num_frames" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames"
+                    )
+
+                # Other parameters
+                if "denoising_step_list" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}"
+                    )
+                if "noise_controller" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}"
+                    )
+                if "kv_cache_attention_bias" in kwargs:
+                    logger.info(
+                        f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}"
+                    )
+
                 # Run pipeline
                 chunk_start = time.time()
                 with torch.amp.autocast("cuda", dtype=dtype):
@@ -455,6 +552,7 @@ def generate_video_stream(
                 "chunk_size": chunk_size,
             },
         )
+        completed = True
 
     except Exception as e:
         logger.exception("Error generating video")
@@ -468,3 +566,11 @@ def generate_video_stream(
                 logger.info(f"Cleaned up input file: {request.input_path}")
             except Exception as e:
                 logger.warning(f"Failed to clean up input file: {e}")
+
+        # Clean up output file if generation didn't complete successfully
+        if not completed and output_file_path:
+            try:
+                Path(output_file_path).unlink(missing_ok=True)
+                logger.info(f"Cleaned up orphaned output file: {output_file_path}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up output file: {e}")

From 67b0a95d3315375d547c578b7a9855f3523e0ab3 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Mon, 16 Feb 2026 17:47:20 -0500
Subject: [PATCH 11/16] pre and post processors

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py           | 522 +++++++++++++++++--------
 src/scope/server/pipeline_processor.py |  80 +++-
 src/scope/server/schema.py             |   8 +
 3 files changed, 449 insertions(+), 161 deletions(-)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 4b3a268a2..ec6ecf545 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -3,6 +3,7 @@
 import base64
 import gc
 import json
+import queue
 import threading
 import time
 from collections.abc import Iterator
@@ -295,6 +296,332 @@ def build_chunk_kwargs(
     return kwargs
 
 
+def _log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logger"):
+    """Log, at INFO level, the optional settings present in this chunk's kwargs."""
+    logger.info(f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}")
+    if kwargs.get("init_cache"):
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)"
+        )
+    if "prompts" in kwargs:
+        prompt_texts = [p["text"] for p in kwargs["prompts"]]
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}"
+        )
+    if "transition" in kwargs:
+        target_texts = [p["text"] for p in kwargs["transition"]["target_prompts"]]
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} "
+            f"over {kwargs['transition']['num_steps']} steps "
+            f"(method: {kwargs['transition']['temporal_interpolation_method']})"
+        )
+    if "first_frame_image" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe"
+        )
+    if "last_frame_image" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe"
+        )
+    if "extension_mode" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}"
+        )
+    if "vace_ref_images" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Using {len(kwargs['vace_ref_images'])} VACE reference images"
+        )
+    if "vace_input_frames" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {kwargs['vace_input_frames'].shape}"
+        )
+    if "vace_input_masks" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {kwargs['vace_input_masks'].shape}"
+        )
+    if "vace_context_scale" in kwargs and kwargs["vace_context_scale"] != 1.0:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}"
+        )
+    if "vace_use_input_video" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}"
+        )
+    if "video" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}"
+        )
+    elif "num_frames" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames"
+        )
+    if "denoising_step_list" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}"
+        )
+    if "noise_controller" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}"
+        )
+    if "kv_cache_attention_bias" in kwargs:
+        logger.info(
+            f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}"
+        )
+
+
+def _write_chunk_output(
+    result: dict,
+    chunk_idx: int,
+    num_chunks: int,
+    chunk_latency: float,
+    output_file,
+    latency_measures: list,
+    fps_measures: list,
+    logger: "Logger",
+    total_frames_ref: list,
+    dimensions_ref: list,
+) -> str:
+    """Append one chunk's frames to ``output_file`` and return its "progress" event.
+
+    Side effects: appends to the stats lists, adds to ``total_frames_ref[0]``, and
+    fills ``dimensions_ref`` on the first chunk. FPS is 0.0 if latency is not positive.
+    """
+    chunk_output = result["video"]
+    num_output_frames = chunk_output.shape[0]
+    chunk_fps = num_output_frames / chunk_latency if chunk_latency > 0 else 0.0
+    latency_measures.append(chunk_latency)
+    fps_measures.append(chunk_fps)
+    logger.info(
+        f"Chunk {chunk_idx + 1}/{num_chunks}: "
+        f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
+    )
+    chunk_np = chunk_output.detach().cpu().numpy()
+    chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8)  # assumes 0..1 floats
+    output_file.write(chunk_uint8.tobytes())
+    total_frames_ref[0] += num_output_frames
+    if dimensions_ref[0] is None:
+        dimensions_ref[0] = chunk_np.shape[1]
+        dimensions_ref[1] = chunk_np.shape[2]
+        dimensions_ref[2] = chunk_np.shape[3]
+
+    return sse_event(
+        "progress",
+        {
+            "chunk": chunk_idx + 1,
+            "total_chunks": num_chunks,
+            "frames": num_output_frames,
+            "latency": round(chunk_latency, 3),
+            "fps": round(chunk_fps, 2),
+        },
+    )
+
+
+def _generate_sequential(
+    request: "GenerateRequest",
+    pipeline,
+    inputs: DecodedInputs,
+    num_chunks: int,
+    chunk_size: int,
+    status_info: dict,
+    device: torch.device,
+    dtype: torch.dtype,
+    output_file,
+    latency_measures: list,
+    fps_measures: list,
+    logger: "Logger",
+    total_frames_ref: list,
+    dimensions_ref: list,
+) -> Iterator[str]:
+    """Run each chunk directly on ``pipeline`` in order (no pre/post processors)."""
+    for chunk_idx in range(num_chunks):
+        if _cancel_event.is_set():
+            logger.info("Generation cancelled by user")
+            yield sse_event(
+                "cancelled",
+                {
+                    "chunk": chunk_idx,
+                    "total_chunks": num_chunks,
+                    "frames_completed": total_frames_ref[0],
+                },
+            )
+            return  # ends this helper only; the ``yield from`` caller keeps running
+
+        start_frame = chunk_idx * chunk_size
+        end_frame = min(start_frame + chunk_size, request.num_frames)
+
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        kwargs = build_chunk_kwargs(
+            request,
+            inputs,
+            chunk_idx,
+            chunk_size,
+            start_frame,
+            end_frame,
+            status_info,
+            device,
+            dtype,
+            logger,
+        )
+        _log_chunk_info(kwargs, chunk_idx, num_chunks, logger)
+
+        chunk_start = time.time()  # latency covers only the pipeline call below
+        with torch.amp.autocast("cuda", dtype=dtype):  # "cuda" is hard-coded
+            result = pipeline(**kwargs)
+        chunk_latency = time.time() - chunk_start
+
+        yield _write_chunk_output(
+            result,
+            chunk_idx,
+            num_chunks,
+            chunk_latency,
+            output_file,
+            latency_measures,
+            fps_measures,
+            logger,
+            total_frames_ref,
+            dimensions_ref,
+        )
+
+
+def _generate_with_processors(
+    request: "GenerateRequest",
+    pipeline,
+    pipeline_manager: "PipelineManager",
+    inputs: DecodedInputs,
+    num_chunks: int,
+    chunk_size: int,
+    status_info: dict,
+    device: torch.device,
+    dtype: torch.dtype,
+    output_file,
+    latency_measures: list,
+    fps_measures: list,
+    logger: "Logger",
+    total_frames_ref: list,
+    dimensions_ref: list,
+) -> Iterator[str]:
+    """Run chunks through a pre -> main -> post processor chain, yielding SSE events.
+
+    Yields one "progress" event per finished chunk. If ``_cancel_event`` is seen,
+    either before a chunk or while waiting for its result, yields "cancelled" and stops.
+    """
+    from .pipeline_processor import _SENTINEL, PipelineProcessor
+
+    def cancelled_event(chunks_done: int) -> str:
+        return sse_event(
+            "cancelled",
+            {
+                "chunk": chunks_done,
+                "total_chunks": num_chunks,
+                "frames_completed": total_frames_ref[0],
+            },
+        )
+
+    # Optional pre-processor, then the main pipeline, then optional post-processor
+    processors: list[PipelineProcessor] = []
+    if request.pre_processor_id:
+        pre_pipeline = pipeline_manager.get_pipeline_by_id(request.pre_processor_id)
+        pre_proc = PipelineProcessor(
+            pipeline=pre_pipeline,
+            pipeline_id=request.pre_processor_id,
+            batch_mode=True,
+        )
+        processors.append(pre_proc)
+        logger.info(f"Pre-processor: {request.pre_processor_id}")
+
+    main_proc = PipelineProcessor(
+        pipeline=pipeline,
+        pipeline_id=request.pipeline_id,
+        batch_mode=True,
+    )
+    processors.append(main_proc)
+
+    if request.post_processor_id:
+        post_pipeline = pipeline_manager.get_pipeline_by_id(request.post_processor_id)
+        post_proc = PipelineProcessor(
+            pipeline=post_pipeline,
+            pipeline_id=request.post_processor_id,
+            batch_mode=True,
+        )
+        processors.append(post_proc)
+        logger.info(f"Post-processor: {request.post_processor_id}")
+
+    # Link each stage to its successor, then start every stage
+    for i in range(len(processors) - 1):
+        processors[i].set_next_processor(processors[i + 1])
+    for proc in processors:
+        proc.start()
+
+    first_proc, last_proc = processors[0], processors[-1]
+    try:
+        for chunk_idx in range(num_chunks):
+            if _cancel_event.is_set():
+                logger.info("Generation cancelled by user")
+                yield cancelled_event(chunk_idx)
+                return  # NOTE(review): the ``yield from`` caller resumes after this
+
+            start_frame = chunk_idx * chunk_size
+            end_frame = min(start_frame + chunk_size, request.num_frames)
+
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            kwargs = build_chunk_kwargs(
+                request,
+                inputs,
+                chunk_idx,
+                chunk_size,
+                start_frame,
+                end_frame,
+                status_info,
+                device,
+                dtype,
+                logger,
+            )
+            _log_chunk_info(kwargs, chunk_idx, num_chunks, logger)
+
+            chunk_start = time.time()
+            # Feed kwargs into chain (blocking put)
+            first_proc.input_queue.put(kwargs)
+
+            # Poll the last stage with a 1s timeout so cancellation is noticed
+            while True:
+                try:
+                    result = last_proc.output_queue.get(timeout=1.0)
+                    break
+                except queue.Empty:
+                    if _cancel_event.is_set():
+                        logger.info("Generation cancelled by user")
+                        yield cancelled_event(chunk_idx)
+                        return
+
+            chunk_latency = time.time() - chunk_start
+
+            yield _write_chunk_output(
+                result,
+                chunk_idx,
+                num_chunks,
+                chunk_latency,
+                output_file,
+                latency_measures,
+                fps_measures,
+                logger,
+                total_frames_ref,
+                dimensions_ref,
+            )
+
+        # Signal end of input
+        first_proc.input_queue.put(_SENTINEL)
+    finally:
+        # Stop all processors
+        for proc in processors:
+            proc.stop()
+
+
 def generate_video_stream(
     request: "GenerateRequest",
     pipeline_manager: "PipelineManager",
@@ -346,175 +673,52 @@ def generate_video_stream(
         video_width = None
         video_channels = None
 
+        # Determine if we need processor chaining
+        use_processors = (
+            request.pre_processor_id is not None
+            or request.post_processor_id is not None
+        )
+
         try:
-            for chunk_idx in range(num_chunks):
-                if _cancel_event.is_set():
-                    logger.info("Generation cancelled by user")
-                    yield sse_event(
-                        "cancelled",
-                        {
-                            "chunk": chunk_idx,
-                            "total_chunks": num_chunks,
-                            "frames_completed": total_frames,
-                        },
-                    )
-                    return
-
-                start_frame = chunk_idx * chunk_size
-                end_frame = min(start_frame + chunk_size, request.num_frames)
-
-                gc.collect()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-                kwargs = build_chunk_kwargs(
+            if use_processors:
+                yield from _generate_with_processors(
                     request,
+                    pipeline,
+                    pipeline_manager,
                     inputs,
-                    chunk_idx,
+                    num_chunks,
                     chunk_size,
-                    start_frame,
-                    end_frame,
                     status_info,
                     device,
                     dtype,
+                    output_file,
+                    latency_measures,
+                    fps_measures,
                     logger,
+                    _total_frames_ref := [0],
+                    _dimensions_ref := [None, None, None],
                 )
-
-                # Log chunk operations
-                logger.info(
-                    f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}"
-                )
-
-                # Cache management
-                if kwargs.get("init_cache"):
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)"
-                    )
-
-                # Prompt updates
-                if "prompts" in kwargs:
-                    prompt_texts = [p["text"] for p in kwargs["prompts"]]
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}"
-                    )
-
-                # Temporal transitions
-                if "transition" in kwargs:
-                    target_texts = [
-                        p["text"] for p in kwargs["transition"]["target_prompts"]
-                    ]
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} "
-                        f"over {kwargs['transition']['num_steps']} steps "
-                        f"(method: {kwargs['transition']['temporal_interpolation_method']})"
-                    )
-
-                # Keyframes
-                if "first_frame_image" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe"
-                    )
-                if "last_frame_image" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe"
-                    )
-                if "extension_mode" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}"
-                    )
-
-                # VACE
-                if "vace_ref_images" in kwargs:
-                    num_refs = len(kwargs["vace_ref_images"])
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Using {num_refs} VACE reference images"
-                    )
-                if "vace_input_frames" in kwargs:
-                    vace_shape = kwargs["vace_input_frames"].shape
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {vace_shape}"
-                    )
-                if "vace_input_masks" in kwargs:
-                    mask_shape = kwargs["vace_input_masks"].shape
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {mask_shape}"
-                    )
-                if (
-                    "vace_context_scale" in kwargs
-                    and kwargs["vace_context_scale"] != 1.0
-                ):
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}"
-                    )
-                if "vace_use_input_video" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}"
-                    )
-
-                # Video-to-video
-                if "video" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}"
-                    )
-                elif "num_frames" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames"
-                    )
-
-                # Other parameters
-                if "denoising_step_list" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}"
-                    )
-                if "noise_controller" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}"
-                    )
-                if "kv_cache_attention_bias" in kwargs:
-                    logger.info(
-                        f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}"
-                    )
-
-                # Run pipeline
-                chunk_start = time.time()
-                with torch.amp.autocast("cuda", dtype=dtype):
-                    result = pipeline(**kwargs)
-                chunk_latency = time.time() - chunk_start
-
-                chunk_output = result["video"]
-                num_output_frames = chunk_output.shape[0]
-                chunk_fps = num_output_frames / chunk_latency
-
-                latency_measures.append(chunk_latency)
-                fps_measures.append(chunk_fps)
-
-                logger.info(
-                    f"Chunk {chunk_idx + 1}/{num_chunks}: "
-                    f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
-                )
-
-                # Write chunk to file immediately (convert to uint8)
-                chunk_np = chunk_output.detach().cpu().numpy()
-                chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8)
-                output_file.write(chunk_uint8.tobytes())
-
-                # Track dimensions
-                total_frames += num_output_frames
-                if video_height is None:
-                    video_height = chunk_np.shape[1]
-                    video_width = chunk_np.shape[2]
-                    video_channels = chunk_np.shape[3]
-
-                yield sse_event(
-                    "progress",
-                    {
-                        "chunk": chunk_idx + 1,
-                        "total_chunks": num_chunks,
-                        "frames": num_output_frames,
-                        "latency": round(chunk_latency, 3),
-                        "fps": round(chunk_fps, 2),
-                    },
+                total_frames = _total_frames_ref[0]
+                video_height, video_width, video_channels = _dimensions_ref
+            else:
+                yield from _generate_sequential(
+                    request,
+                    pipeline,
+                    inputs,
+                    num_chunks,
+                    chunk_size,
+                    status_info,
+                    device,
+                    dtype,
+                    output_file,
+                    latency_measures,
+                    fps_measures,
+                    logger,
+                    _total_frames_ref := [0],
+                    _dimensions_ref := [None, None, None],
                 )
+                total_frames = _total_frames_ref[0]
+                video_height, video_width, video_channels = _dimensions_ref
 
             # Update header with actual shape
             output_file.seek(0)
diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py
index b11638996..1ac243a91 100644
--- a/src/scope/server/pipeline_processor.py
+++ b/src/scope/server/pipeline_processor.py
@@ -23,6 +23,9 @@
 
 SLEEP_TIME = 0.01
 
+# Sentinel value to signal end of batch input
+_SENTINEL = object()
+
 # FPS calculation constants
 MIN_FPS = 1.0  # Minimum FPS to prevent division by zero
 MAX_FPS = 60.0  # Maximum FPS cap
@@ -42,6 +45,7 @@ def __init__(
         user_id: str | None = None,
         connection_id: str | None = None,
         connection_info: dict | None = None,
+        batch_mode: bool = False,
     ):
         """Initialize a pipeline processor.
 
@@ -60,10 +64,15 @@ def __init__(
         self.user_id = user_id
         self.connection_id = connection_id
         self.connection_info = connection_info
+        self.batch_mode = batch_mode
 
         # Each processor creates its own queues
-        self.input_queue = queue.Queue(maxsize=30)
-        self.output_queue = queue.Queue(maxsize=8)
+        if batch_mode:
+            self.input_queue = queue.Queue(maxsize=2)
+            self.output_queue = queue.Queue(maxsize=2)
+        else:
+            self.input_queue = queue.Queue(maxsize=30)
+            self.output_queue = queue.Queue(maxsize=8)
         # Lock to protect input_queue assignment for thread-safe reference swapping
         self.input_queue_lock = threading.Lock()
 
@@ -226,6 +235,10 @@ def worker_loop(self):
         """Main worker loop that processes frames."""
         logger.info(f"Worker thread started for pipeline: {self.pipeline_id}")
 
+        if self.batch_mode:
+            self._worker_loop_batch()
+            return
+
         while self.running and not self.shutdown_event.is_set():
             try:
                 self.process_chunk()
@@ -267,6 +280,69 @@ def worker_loop(self):
 
         logger.info(f"Worker thread stopped for pipeline: {self.pipeline_id}")
 
+    def _worker_loop_batch(self):
+        """Batch-mode worker loop: processes chunk kwargs dicts from queue."""
+        while self.running and not self.shutdown_event.is_set():
+            try:
+                item = self.input_queue.get(timeout=1.0)
+            except queue.Empty:
+                continue
+            if item is _SENTINEL:
+                if self.next_processor:
+                    self.next_processor.input_queue.put(_SENTINEL)
+                break
+            try:
+                self.process_chunk_batch(item)
+            except Exception as e:
+                logger.error(
+                    f"Error in batch processing for {self.pipeline_id}: {e}",
+                    exc_info=True,
+                )
+                if not self._is_recoverable(e):
+                    break
+        logger.info(f"Batch worker thread stopped for pipeline: {self.pipeline_id}")
+
+    def process_chunk_batch(self, chunk_kwargs: dict):
+        """Process a single chunk in batch mode.
+
+        Args:
+            chunk_kwargs: Pre-built kwargs dict for the pipeline call.
+        """
+        dtype = torch.bfloat16
+        with torch.amp.autocast("cuda", dtype=dtype):
+            result = self.pipeline(**chunk_kwargs)
+
+        # Forward extra params to downstream processor
+        extra_params = {k: v for k, v in result.items() if k != "video"}
+        if extra_params and self.next_processor is not None:
+            self.next_processor.update_parameters(extra_params)
+
+        if self.next_processor is not None:
+            # Convert video output to list-of-frames format for next pipeline.
+            # Pipeline __call__ expects video as list of [1, H, W, C] uint8 tensors
+            # (same format as real-time path: process_chunk converts to uint8
+            # before putting on output queue, and preprocess_chunk expects [0, 255]).
+            video = result.get("video")
+            if video is not None:
+                video_uint8 = (
+                    (video * 255.0)
+                    .clamp(0, 255)
+                    .to(dtype=torch.uint8)
+                    .contiguous()
+                    .detach()
+                )
+                next_kwargs = dict(chunk_kwargs)
+                next_kwargs["video"] = [f.unsqueeze(0) for f in video_uint8]
+                # Remove keys that are only valid for the original pipeline
+                for key in ("init_cache", "num_frames"):
+                    next_kwargs.pop(key, None)
+                self.output_queue.put(next_kwargs)
+            else:
+                self.output_queue.put(chunk_kwargs)
+        else:
+            # Last processor: put raw result for collection
+            self.output_queue.put(result)
+
     def prepare_chunk(
         self, input_queue_ref: queue.Queue, chunk_size: int
     ) -> list[torch.Tensor]:
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index 07aa9c223..6115564be 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -994,6 +994,14 @@ class GenerateRequest(BaseModel):
         default=None,
         description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).",
     )
+    pre_processor_id: str | None = Field(
+        default=None,
+        description="Pipeline ID for pre-processing each chunk before the main pipeline.",
+    )
+    post_processor_id: str | None = Field(
+        default=None,
+        description="Pipeline ID for post-processing each chunk after the main pipeline.",
+    )
 
 
 class GenerateResponse(BaseModel):

From c61a9a19a59d839e5b8957abc1d9100ac7698b32 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Wed, 18 Feb 2026 06:47:18 -0500
Subject: [PATCH 12/16] per chunk vace spec

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 76 ++++++++++++++++++++++++++++++------
 src/scope/server/schema.py   | 28 ++++++++++++-
 2 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index ec6ecf545..54cd10633 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -106,6 +106,7 @@ class DecodedInputs:
     ref_images: dict[int, list[str]] = field(default_factory=dict)
     prompts: dict[int, list[dict]] = field(default_factory=dict)
     transitions: dict[int, dict] = field(default_factory=dict)
+    vace_chunk_specs: dict[int, dict] = field(default_factory=dict)
 
 
 def load_video_from_file(file_path: str) -> np.ndarray:
@@ -170,6 +171,37 @@ def decode_inputs(
                     {"text": spec.text, "weight": PROMPT_WEIGHT}
                 ]
 
+    # Per-chunk VACE specs
+    if request.vace_chunk_specs:
+        logger.info(
+            f"decode_inputs: Found {len(request.vace_chunk_specs)} vace_chunk_specs"
+        )
+        for spec in request.vace_chunk_specs:
+            logger.info(
+                f"decode_inputs: vace_chunk_spec chunk={spec.chunk}, has_frames={spec.frames is not None}, has_masks={spec.masks is not None}, context_scale={spec.context_scale}, temporally_locked={spec.vace_temporally_locked}"
+            )
+            decoded_spec: dict = {
+                "vace_temporally_locked": spec.vace_temporally_locked,
+            }
+            if spec.frames is not None:
+                decoded_spec["frames"] = decode_array(spec.frames, np.float32)
+                logger.info(
+                    f"decode_inputs: chunk {spec.chunk} decoded frames shape={decoded_spec['frames'].shape}"
+                )
+            if spec.masks is not None:
+                decoded_spec["masks"] = decode_array(spec.masks, np.float32)
+                logger.info(
+                    f"decode_inputs: chunk {spec.chunk} decoded masks shape={decoded_spec['masks'].shape}"
+                )
+            if spec.context_scale is not None:
+                decoded_spec["context_scale"] = spec.context_scale
+            inputs.vace_chunk_specs[spec.chunk] = decoded_spec
+        logger.info(
+            f"decode_inputs: vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}"
+        )
+    else:
+        logger.info("decode_inputs: No vace_chunk_specs in request")
+
     # Build transitions lookup
     if request.transitions:
         for t in request.transitions:
@@ -281,17 +313,39 @@ def build_chunk_kwargs(
     if chunk_idx in inputs.ref_images:
         kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx]
 
-    # VACE conditioning frames [1, C, T, H, W]
-    if inputs.vace_frames is not None:
-        chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :]
-        chunk = pad_chunk(chunk, chunk_size, axis=2)
-        kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype)
-
-    # VACE masks [1, 1, T, H, W]
-    if inputs.vace_masks is not None:
-        chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :]
-        chunk = pad_chunk(chunk, chunk_size, axis=2)
-        kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype)
+    # VACE conditioning: per-chunk spec takes priority over global
+    logger.info(
+        f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}, has_global_frames={inputs.vace_frames is not None}, has_global_masks={inputs.vace_masks is not None}"
+    )
+    if chunk_idx in inputs.vace_chunk_specs:
+        logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC")
+        spec = inputs.vace_chunk_specs[chunk_idx]
+
+        if "frames" in spec:
+            frames = spec["frames"]
+            frames = pad_chunk(frames, chunk_size, axis=2)
+            kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype)
+
+        if "masks" in spec:
+            masks = spec["masks"]
+            masks = pad_chunk(masks, chunk_size, axis=2)
+            kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype)
+
+        if "context_scale" in spec:
+            kwargs["vace_context_scale"] = spec["context_scale"]
+    else:
+        logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING GLOBAL VACE FALLBACK")
+        # Global VACE conditioning frames [1, C, T, H, W]
+        if inputs.vace_frames is not None:
+            chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :]
+            chunk = pad_chunk(chunk, chunk_size, axis=2)
+            kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype)
+
+        # Global VACE masks [1, 1, T, H, W]
+        if inputs.vace_masks is not None:
+            chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :]
+            chunk = pad_chunk(chunk, chunk_size, axis=2)
+            kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype)
 
     return kwargs
 
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index 6115564be..d450ba44e 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -867,6 +867,28 @@ class ChunkRefImagesSpec(BaseModel):
     images: list[str] = Field(..., description="List of reference image paths")
 
 
+class ChunkVACESpec(BaseModel):
+    """Per-chunk VACE conditioning specification."""
+
+    chunk: int = Field(..., ge=0, description="Chunk index")
+    frames: "EncodedArray | None" = Field(
+        default=None,
+        description="VACE conditioning frames for this chunk ([1, C, T, H, W] float32 [-1, 1])",
+    )
+    masks: "EncodedArray | None" = Field(
+        default=None,
+        description="VACE masks for this chunk ([1, 1, T, H, W] float32 {0, 1})",
+    )
+    context_scale: float | None = Field(
+        default=None,
+        description="VACE context scale override for this chunk. If None, uses global vace_context_scale.",
+    )
+    vace_temporally_locked: bool = Field(
+        default=True,
+        description="When True, frames/masks are sliced temporally to match chunk position. When False, used as-is and padded.",
+    )
+
+
 class EncodedArray(BaseModel):
     """Base64-encoded numpy array with shape metadata."""
 
@@ -903,7 +925,7 @@ class GenerateRequest(BaseModel):
     num_frames: int = Field(
         default=64,
         ge=1,
-        le=1024,
+        le=10000,
         description="Total number of frames to generate",
     )
     height: int | None = Field(
@@ -994,6 +1016,10 @@ class GenerateRequest(BaseModel):
         default=None,
         description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).",
     )
+    vace_chunk_specs: list[ChunkVACESpec] | None = Field(
+        default=None,
+        description="Per-chunk VACE conditioning. Each specifies frames/masks for a specific chunk. Overrides global vace_frames/vace_masks for that chunk.",
+    )
     pre_processor_id: str | None = Field(
         default=None,
         description="Pipeline ID for pre-processing each chunk before the main pipeline.",

From 8f61a2c7c8c6336d60f12da662f13f677947d28b Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Thu, 19 Feb 2026 13:46:59 -0500
Subject: [PATCH 13/16] wip: replace base64 EncodedArray inputs with uploaded data blob and unified ChunkSpec

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 scripts/test_generate_endpoint.py | 231 ++++++++++++++++------
 src/scope/server/app.py           |  44 +++++
 src/scope/server/generate.py      | 310 +++++++++++++++++-------------
 src/scope/server/recording.py     |   2 +
 src/scope/server/schema.py        | 229 +++++++++++-----------
 5 files changed, 508 insertions(+), 308 deletions(-)

diff --git a/scripts/test_generate_endpoint.py b/scripts/test_generate_endpoint.py
index 986133100..122dcb4c6 100644
--- a/scripts/test_generate_endpoint.py
+++ b/scripts/test_generate_endpoint.py
@@ -5,7 +5,6 @@
     python test_generate_endpoint.py --list
 """
 
-import base64
 import json
 import sys
 import time
@@ -92,33 +91,87 @@
 # =============================================================================
 
 
-def encode_array(arr: np.ndarray) -> dict:
-    """Encode numpy array as EncodedArray dict."""
-    return {
-        "base64": base64.b64encode(arr.tobytes()).decode("utf-8"),
-        "shape": list(arr.shape),
-    }
-
-
-def load_video_for_v2v(path: str, height: int, width: int) -> dict:
-    """Load video as [T, H, W, C] uint8 for video-to-video mode."""
+def upload_video_for_v2v(path: str, height: int, width: int) -> str:
+    """Load and upload video for video-to-video mode. Returns input_path."""
     tensor = load_video(path, resize_hw=(height, width), normalize=False)
     arr = tensor.permute(1, 2, 3, 0).numpy().astype(np.uint8)
-    return encode_array(arr)
-
-
-def load_video_for_vace(path: str, height: int, width: int) -> dict:
-    """Load video as [1, C, T, H, W] float32 for VACE conditioning."""
-    tensor = load_video(path, resize_hw=(height, width))
-    arr = tensor.unsqueeze(0).numpy().astype(np.float32)
-    return encode_array(arr)
-
+    num_frames, h, w, c = arr.shape
+
+    response = requests.post(
+        f"{SERVER_URL}/api/v1/generate/upload",
+        data=arr.tobytes(),
+        headers={
+            "Content-Type": "application/octet-stream",
+            "X-Video-Frames": str(num_frames),
+            "X-Video-Height": str(h),
+            "X-Video-Width": str(w),
+            "X-Video-Channels": str(c),
+        },
+        timeout=300,
+    )
+    response.raise_for_status()
+    return response.json()["input_path"]
+
+
+def upload_vace_data(
+    vace_frames_path: str | None,
+    vace_masks_path: str | None,
+    height: int,
+    width: int,
+    num_frames: int,
+    chunk_size: int,
+    vace_context_scale: float = 1.0,
+) -> tuple[str, list[dict]]:
+    """Load VACE frames/masks, pack into blob, upload, return (data_blob_path, chunk_specs)."""
+    blob = bytearray()
+    num_chunks = (num_frames + chunk_size - 1) // chunk_size
+    chunk_specs = []
+
+    # Load tensors
+    vace_frames_tensor = None
+    vace_masks_tensor = None
+    if vace_frames_path:
+        vace_frames_tensor = load_video(vace_frames_path, resize_hw=(height, width))
+        vace_frames_tensor = vace_frames_tensor.unsqueeze(0).numpy().astype(np.float32)
+    if vace_masks_path:
+        masks_tensor = load_video(vace_masks_path, resize_hw=(height, width))
+        vace_masks_tensor = (masks_tensor[0:1].unsqueeze(0).numpy() > 0.0).astype(
+            np.float32
+        )
+
+    for chunk_idx in range(num_chunks):
+        spec = {"chunk": chunk_idx, "vace_temporally_locked": True}
+        start = chunk_idx * chunk_size
+        end = start + chunk_size
+
+        if vace_frames_tensor is not None:
+            sliced = vace_frames_tensor[:, :, start:end, :, :]
+            spec["vace_frames_offset"] = len(blob)
+            spec["vace_frames_shape"] = list(sliced.shape)
+            blob.extend(sliced.tobytes())
+
+        if vace_masks_tensor is not None:
+            sliced_masks = vace_masks_tensor[:, :, start:end, :, :]
+            spec["vace_masks_offset"] = len(blob)
+            spec["vace_masks_shape"] = list(sliced_masks.shape)
+            blob.extend(sliced_masks.tobytes())
+
+        if vace_context_scale != 1.0:
+            spec["vace_context_scale"] = vace_context_scale
+
+        chunk_specs.append(spec)
+
+    # Upload blob
+    response = requests.post(
+        f"{SERVER_URL}/api/v1/generate/upload-data",
+        data=bytes(blob),
+        headers={"Content-Type": "application/octet-stream"},
+        timeout=300,
+    )
+    response.raise_for_status()
+    data_blob_path = response.json()["data_blob_path"]
 
-def load_mask_for_vace(path: str, height: int, width: int) -> dict:
-    """Load video as [1, 1, T, H, W] binary mask for VACE inpainting."""
-    tensor = load_video(path, resize_hw=(height, width))
-    arr = (tensor[0:1].unsqueeze(0).numpy() > 0.0).astype(np.float32)
-    return encode_array(arr)
+    return data_blob_path, chunk_specs
 
 
 def parse_sse_events(response):
@@ -158,6 +211,30 @@ def wait_for_pipeline(timeout: int = 300):
     raise TimeoutError(f"Pipeline did not load within {timeout}s")
 
 
+def download_video(output_path: str) -> np.ndarray:
+    """Download generated video from server."""
+    response = requests.get(
+        f"{SERVER_URL}/api/v1/generate/download",
+        params={"path": output_path},
+        timeout=300,
+    )
+    response.raise_for_status()
+
+    num_frames = int(response.headers.get("X-Video-Frames", 0))
+    height = int(response.headers.get("X-Video-Height", 0))
+    width = int(response.headers.get("X-Video-Width", 0))
+    channels = int(response.headers.get("X-Video-Channels", 3))
+
+    # Skip header (ndim + shape)
+    content = response.content
+    header_size = 4 + 4 * 4
+    video_bytes = content[header_size:]
+
+    return np.frombuffer(video_bytes, dtype=np.uint8).reshape(
+        (num_frames, height, width, channels)
+    )
+
+
 # =============================================================================
 # Test Runner
 # =============================================================================
@@ -189,8 +266,8 @@ def run_test(name: str):
             )
         ]
         if "lora_ramp" in cfg:
-            lora_scales = {cfg["lora"]: cfg["lora_ramp"]}
-            print(f"LoRA ramp: {cfg['lora_ramp']}")
+            lora_scales = cfg["lora_ramp"]
+            print(f"LoRA ramp: {lora_scales}")
 
     # Load pipeline
     print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...")
@@ -205,37 +282,65 @@ def run_test(name: str):
     load_time = wait_for_pipeline()
     print(f"Pipeline loaded in {load_time:.1f}s")
 
-    # Load input video if specified
-    input_video = None
+    # Build request kwargs
+    request_kwargs = {
+        "pipeline_id": pipeline_id,
+        "prompt": cfg["prompt"],
+        "num_frames": cfg["num_frames"],
+        "noise_scale": cfg.get("noise_scale", 0.7),
+        "vace_context_scale": cfg.get("vace_context_scale", 1.0),
+        "manage_cache": cfg.get("manage_cache", True),
+    }
+
+    # Upload input video if specified
     if "input_video" in cfg:
-        input_video = load_video_for_v2v(cfg["input_video"], height, width)
-        print(f"Input video: {input_video['shape']}")
-
-    # Load VACE frames if specified
-    vace_frames = None
-    if "vace_frames" in cfg:
-        vace_frames = load_video_for_vace(cfg["vace_frames"], height, width)
-        print(f"VACE frames: {vace_frames['shape']}")
-
-    # Load VACE masks if specified
-    vace_masks = None
-    if "vace_masks" in cfg:
-        vace_masks = load_mask_for_vace(cfg["vace_masks"], height, width)
-        print(f"VACE masks: {vace_masks['shape']}")
-
-    # Build and send request
-    gen_request = GenerateRequest(
-        pipeline_id=pipeline_id,
-        prompt=cfg["prompt"],
-        num_frames=cfg["num_frames"],
-        input_video=input_video,
-        noise_scale=cfg.get("noise_scale", 0.7),
-        vace_frames=vace_frames,
-        vace_masks=vace_masks,
-        vace_context_scale=cfg.get("vace_context_scale", 1.0),
-        lora_scales=lora_scales,
-        manage_cache=cfg.get("manage_cache", True),
-    )
+        input_path = upload_video_for_v2v(cfg["input_video"], height, width)
+        request_kwargs["input_path"] = input_path
+        print(f"Input video uploaded: {input_path}")
+
+    # Build chunk_specs for LoRA ramp
+    chunk_specs = []
+    if lora_scales and "lora" in cfg:
+        for i, scale in enumerate(lora_scales):
+            chunk_specs.append(
+                {
+                    "chunk": i,
+                    "lora_scales": {cfg["lora"]: scale},
+                }
+            )
+
+    # Handle VACE data
+    if "vace_frames" in cfg or "vace_masks" in cfg:
+        # Assume chunk_size=12 (default for longlive)
+        chunk_size = 12
+        data_blob_path, vace_specs = upload_vace_data(
+            vace_frames_path=cfg.get("vace_frames"),
+            vace_masks_path=cfg.get("vace_masks"),
+            height=height,
+            width=width,
+            num_frames=cfg["num_frames"],
+            chunk_size=chunk_size,
+            vace_context_scale=cfg.get("vace_context_scale", 1.0),
+        )
+        request_kwargs["data_blob_path"] = data_blob_path
+        # Merge VACE specs into chunk_specs
+        existing_chunks = {s["chunk"] for s in chunk_specs}
+        for vs in vace_specs:
+            if vs["chunk"] in existing_chunks:
+                # Merge into existing spec
+                for cs in chunk_specs:
+                    if cs["chunk"] == vs["chunk"]:
+                        cs.update(vs)
+                        break
+            else:
+                chunk_specs.append(vs)
+        print(f"VACE data uploaded: {data_blob_path}")
+
+    if chunk_specs:
+        chunk_specs.sort(key=lambda s: s["chunk"])
+        request_kwargs["chunk_specs"] = chunk_specs
+
+    gen_request = GenerateRequest(**request_kwargs)
 
     print(f"Generating {cfg['num_frames']} frames...")
     start = time.time()
@@ -262,13 +367,15 @@ def run_test(name: str):
     if result is None:
         raise RuntimeError("No complete event received")
 
-    # Decode and save
-    video = np.frombuffer(
-        base64.b64decode(result["video_base64"]), dtype=np.float32
-    ).reshape(result["video_shape"])
+    # Download and save
+    if "output_path" in result:
+        video = download_video(result["output_path"])
+        video_float = video.astype(np.float32) / 255.0
+    else:
+        raise RuntimeError("No output_path in result")
 
     output_path = f"test_{name}.mp4"
-    export_to_video(video, output_path, fps=16)
+    export_to_video(video_float, output_path, fps=16)
 
     print(f"\nComplete in {time.time() - start:.1f}s")
     print(f"Output: {output_path} ({result['video_shape']})")
diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index 61e79d1d8..75cb79e88 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -1234,6 +1234,50 @@ async def upload_video_for_generate(request: Request):
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
+@app.post("/api/v1/generate/upload-data")
+async def upload_data_blob(request: Request):
+    """Upload binary data blob for batch generation.
+
+    Accepts raw binary data containing VACE frames/masks, input video, or other
+    array data referenced by ChunkSpec offsets in the generate request.
+
+    Returns data_blob_path to use in the generate request.
+    """
+
+    from .recording import TEMP_FILE_PREFIXES, RecordingManager
+    from .schema import DataUploadResponse
+
+    try:
+        # Create temp file
+        file_path = RecordingManager._create_temp_file(
+            ".bin", TEMP_FILE_PREFIXES["generate_data"]
+        )
+
+        # Stream body to file
+        bytes_written = 0
+        with open(file_path, "wb") as f:
+            async for chunk in request.stream():
+                f.write(chunk)
+                bytes_written += len(chunk)
+
+        if bytes_written == 0:
+            Path(file_path).unlink(missing_ok=True)
+            raise HTTPException(status_code=400, detail="Empty request body")
+
+        logger.info(f"Uploaded data blob: {file_path} ({bytes_written} bytes)")
+
+        return DataUploadResponse(
+            data_blob_path=file_path,
+            size_bytes=bytes_written,
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error uploading data blob: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
 @app.get("/api/v1/generate/download")
 async def download_generated_video(
     path: str = Query(..., description="Path to output video file"),
diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 54cd10633..3251738ab 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -1,6 +1,5 @@
 """Video generation service for batch mode with chunked processing."""
 
-import base64
 import gc
 import json
 import queue
@@ -40,13 +39,7 @@ def is_generation_cancelled() -> bool:
     from logging import Logger
 
     from .pipeline_manager import PipelineManager
-    from .schema import EncodedArray, GenerateRequest
-
-
-def decode_array(encoded: "EncodedArray", dtype: np.dtype) -> np.ndarray:
-    """Decode EncodedArray to numpy array."""
-    data = base64.b64decode(encoded.base64)
-    return np.frombuffer(data, dtype=dtype).reshape(encoded.shape)
+    from .schema import ChunkSpec, GenerateRequest
 
 
 def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray:
@@ -73,22 +66,6 @@ def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray:
     return np.concatenate([arr, padding], axis=axis)
 
 
-def build_lookup(specs: list | None, value_attr: str = "image") -> dict:
-    """Build chunk -> value lookup from list of specs."""
-    if not specs:
-        return {}
-    return {spec.chunk: getattr(spec, value_attr) for spec in specs}
-
-
-def get_chunk_value(value, chunk_idx: int, default=None):
-    """Get per-chunk value from scalar or list."""
-    if value is None:
-        return default
-    if isinstance(value, list):
-        return value[chunk_idx] if chunk_idx < len(value) else value[-1]
-    return value
-
-
 def sse_event(event_type: str, data: dict) -> str:
     """Format a server-sent event."""
     return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
@@ -99,14 +76,14 @@ class DecodedInputs:
     """Decoded and preprocessed inputs for generation."""
 
     input_video: np.ndarray | None = None
-    vace_frames: np.ndarray | None = None
-    vace_masks: np.ndarray | None = None
     first_frames: dict[int, str] = field(default_factory=dict)
     last_frames: dict[int, str] = field(default_factory=dict)
     ref_images: dict[int, list[str]] = field(default_factory=dict)
     prompts: dict[int, list[dict]] = field(default_factory=dict)
     transitions: dict[int, dict] = field(default_factory=dict)
     vace_chunk_specs: dict[int, dict] = field(default_factory=dict)
+    input_video_chunks: dict[int, np.ndarray] = field(default_factory=dict)
+    chunk_specs_map: "dict[int, ChunkSpec]" = field(default_factory=dict)
 
 
 def load_video_from_file(file_path: str) -> np.ndarray:
@@ -128,30 +105,16 @@ def load_video_from_file(file_path: str) -> np.ndarray:
 def decode_inputs(
     request: "GenerateRequest", num_frames: int, logger: "Logger"
 ) -> DecodedInputs:
-    """Decode all inputs from request (base64 or file-based)."""
+    """Decode all inputs from request using unified ChunkSpec."""
     inputs = DecodedInputs()
 
-    # Handle input video - either from file path or base64
+    # Input video from file path
     if request.input_path:
         logger.info(f"Loading input video from file: {request.input_path}")
         inputs.input_video = load_video_from_file(request.input_path)
         inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0)
-    elif request.input_video:
-        inputs.input_video = decode_array(request.input_video, np.uint8)
-        inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0)
-
-    if request.vace_frames:
-        inputs.vace_frames = decode_array(request.vace_frames, np.float32)
-        inputs.vace_frames = loop_to_length(inputs.vace_frames, num_frames, axis=2)
 
-    if request.vace_masks:
-        inputs.vace_masks = decode_array(request.vace_masks, np.float32)
-        inputs.vace_masks = loop_to_length(inputs.vace_masks, num_frames, axis=2)
-
-    inputs.first_frames = build_lookup(request.first_frames, "image")
-    inputs.last_frames = build_lookup(request.last_frames, "image")
-    inputs.ref_images = build_lookup(request.vace_ref_images, "images")
-    # Normalize prompt to weighted list format
+    # Default prompt
     if isinstance(request.prompt, str):
         inputs.prompts = {0: [{"text": request.prompt, "weight": PROMPT_WEIGHT}]}
     else:
@@ -159,59 +122,110 @@ def decode_inputs(
             0: [{"text": p.text, "weight": p.weight} for p in request.prompt]
         }
 
-    # Chunk prompts: support both text and weighted prompt lists
-    if request.chunk_prompts:
-        for spec in request.chunk_prompts:
-            if spec.prompts:
-                inputs.prompts[spec.chunk] = [
-                    {"text": p.text, "weight": p.weight} for p in spec.prompts
-                ]
-            elif spec.text:
-                inputs.prompts[spec.chunk] = [
-                    {"text": spec.text, "weight": PROMPT_WEIGHT}
-                ]
-
-    # Per-chunk VACE specs
-    if request.vace_chunk_specs:
+    # Load binary blob if provided
+    blob: bytes | None = None
+    if request.data_blob_path:
+        import tempfile
+
+        from .recording import TEMP_FILE_PREFIXES
+
+        # Security: validate path prefix and temp dir
+        blob_path = Path(request.data_blob_path)
+        temp_dir = Path(tempfile.gettempdir())
+        if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith(
+            TEMP_FILE_PREFIXES["generate_data"]
+        ):
+            raise ValueError(
+                f"Invalid data_blob_path: must be a temp file with prefix {TEMP_FILE_PREFIXES['generate_data']}"
+            )
+        with open(blob_path, "rb") as f:
+            blob = f.read()
         logger.info(
-            f"decode_inputs: Found {len(request.vace_chunk_specs)} vace_chunk_specs"
+            f"decode_inputs: Loaded data blob from {request.data_blob_path} ({len(blob)} bytes)"
         )
-        for spec in request.vace_chunk_specs:
-            logger.info(
-                f"decode_inputs: vace_chunk_spec chunk={spec.chunk}, has_frames={spec.frames is not None}, has_masks={spec.masks is not None}, context_scale={spec.context_scale}, temporally_locked={spec.vace_temporally_locked}"
-            )
-            decoded_spec: dict = {
-                "vace_temporally_locked": spec.vace_temporally_locked,
+
+    # Process chunk specs — single loop, single source of truth
+    for spec in request.chunk_specs or []:
+        # Store spec for build_chunk_kwargs
+        inputs.chunk_specs_map[spec.chunk] = spec
+
+        # Prompts
+        if spec.prompts:
+            inputs.prompts[spec.chunk] = [
+                {"text": p.text, "weight": p.weight} for p in spec.prompts
+            ]
+        elif spec.text:
+            inputs.prompts[spec.chunk] = [{"text": spec.text, "weight": PROMPT_WEIGHT}]
+
+        # Transitions
+        if spec.transition_target_prompts:
+            inputs.transitions[spec.chunk] = {
+                "target_prompts": [
+                    {"text": p.text, "weight": p.weight}
+                    for p in spec.transition_target_prompts
+                ],
+                "num_steps": spec.transition_num_steps or 4,
+                "temporal_interpolation_method": spec.transition_method or "linear",
             }
-            if spec.frames is not None:
-                decoded_spec["frames"] = decode_array(spec.frames, np.float32)
+
+        # Keyframes
+        if spec.first_frame_image:
+            inputs.first_frames[spec.chunk] = spec.first_frame_image
+        if spec.last_frame_image:
+            inputs.last_frames[spec.chunk] = spec.last_frame_image
+        if spec.vace_ref_images:
+            inputs.ref_images[spec.chunk] = spec.vace_ref_images
+
+        # VACE from blob
+        if blob is not None and spec.vace_frames_offset is not None:
+            decoded: dict = {"vace_temporally_locked": spec.vace_temporally_locked}
+            if spec.vace_frames_shape and spec.vace_frames_offset is not None:
+                count = 1
+                for d in spec.vace_frames_shape:
+                    count *= d
+                arr = np.frombuffer(
+                    blob, dtype=np.float32, count=count, offset=spec.vace_frames_offset
+                ).reshape(spec.vace_frames_shape)
+                decoded["frames"] = arr
                 logger.info(
-                    f"decode_inputs: chunk {spec.chunk} decoded frames shape={decoded_spec['frames'].shape}"
+                    f"decode_inputs: chunk {spec.chunk} VACE frames shape={arr.shape}"
                 )
-            if spec.masks is not None:
-                decoded_spec["masks"] = decode_array(spec.masks, np.float32)
+            if spec.vace_masks_shape and spec.vace_masks_offset is not None:
+                count = 1
+                for d in spec.vace_masks_shape:
+                    count *= d
+                arr = np.frombuffer(
+                    blob, dtype=np.float32, count=count, offset=spec.vace_masks_offset
+                ).reshape(spec.vace_masks_shape)
+                decoded["masks"] = arr
                 logger.info(
-                    f"decode_inputs: chunk {spec.chunk} decoded masks shape={decoded_spec['masks'].shape}"
+                    f"decode_inputs: chunk {spec.chunk} VACE masks shape={arr.shape}"
                 )
-            if spec.context_scale is not None:
-                decoded_spec["context_scale"] = spec.context_scale
-            inputs.vace_chunk_specs[spec.chunk] = decoded_spec
-        logger.info(
-            f"decode_inputs: vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}"
-        )
-    else:
-        logger.info("decode_inputs: No vace_chunk_specs in request")
+            if spec.vace_context_scale is not None:
+                decoded["context_scale"] = spec.vace_context_scale
+            inputs.vace_chunk_specs[spec.chunk] = decoded
+
+        # Input video from blob (per-chunk video-to-video)
+        if (
+            blob is not None
+            and spec.input_video_offset is not None
+            and spec.input_video_shape is not None
+        ):
+            count = 1
+            for d in spec.input_video_shape:
+                count *= d
+            inputs.input_video_chunks[spec.chunk] = np.frombuffer(
+                blob, dtype=np.uint8, count=count, offset=spec.input_video_offset
+            ).reshape(spec.input_video_shape)
 
-    # Build transitions lookup
-    if request.transitions:
-        for t in request.transitions:
-            inputs.transitions[t.chunk] = {
-                "target_prompts": [
-                    {"text": p.text, "weight": p.weight} for p in t.target_prompts
-                ],
-                "num_steps": t.num_steps,
-                "temporal_interpolation_method": t.temporal_interpolation_method,
-            }
+    logger.info(
+        f"decode_inputs: prompts={list(inputs.prompts.keys())}, "
+        f"transitions={list(inputs.transitions.keys())}, "
+        f"vace_specs={list(inputs.vace_chunk_specs.keys())}, "
+        f"input_video_chunks={list(inputs.input_video_chunks.keys())}, "
+        f"first_frames={list(inputs.first_frames.keys())}, "
+        f"last_frames={list(inputs.last_frames.keys())}"
+    )
 
     return inputs
 
@@ -228,19 +242,25 @@ def build_chunk_kwargs(
     dtype: torch.dtype,
     logger: "Logger",
 ) -> dict:
-    """Build pipeline kwargs for a single chunk."""
+    """Build pipeline kwargs for a single chunk.
+
+    Per-chunk ChunkSpec values override request-level globals.
+    """
+    # Get per-chunk spec (if any)
+    spec = inputs.chunk_specs_map.get(chunk_idx)
+
     kwargs = {
         "height": request.height
         or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT),
         "width": request.width
         or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH),
-        "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED),
-        "init_cache": chunk_idx == 0
-        or (
-            request.cache_reset_chunks is not None
-            and chunk_idx in request.cache_reset_chunks
+        "base_seed": spec.seed if spec and spec.seed is not None else request.seed,
+        "init_cache": chunk_idx == 0 or (spec is not None and spec.reset_cache),
+        "manage_cache": (
+            spec.manage_cache
+            if spec and spec.manage_cache is not None
+            else request.manage_cache
         ),
-        "manage_cache": request.manage_cache,
     }
 
     # Prompt (sticky behavior - only send when it changes)
@@ -254,43 +274,73 @@ def build_chunk_kwargs(
     if request.denoising_steps:
         kwargs["denoising_step_list"] = request.denoising_steps
 
-    # Video-to-video
-    if inputs.input_video is not None:
+    # Video-to-video: per-chunk input video takes priority over global input video
+    if chunk_idx in inputs.input_video_chunks:
+        # Per-chunk input video from blob (enables v2v/t2v switching per chunk)
+        chunk_frames = inputs.input_video_chunks[chunk_idx]
+        chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0)
+        kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames]
+        kwargs["noise_scale"] = (
+            spec.noise_scale
+            if spec and spec.noise_scale is not None
+            else request.noise_scale
+        )
+        logger.info(
+            f"Chunk {chunk_idx}: Using per-chunk input video ({chunk_frames.shape[0]} frames)"
+        )
+    elif inputs.input_video is not None:
         chunk_frames = inputs.input_video[start_frame:end_frame]
         chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0)
         kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames]
-        kwargs["noise_scale"] = get_chunk_value(
-            request.noise_scale, chunk_idx, DEFAULT_NOISE_SCALE
+        kwargs["noise_scale"] = (
+            spec.noise_scale
+            if spec and spec.noise_scale is not None
+            else request.noise_scale
         )
     else:
         kwargs["num_frames"] = chunk_size
 
     # VACE context scale
-    kwargs["vace_context_scale"] = get_chunk_value(
-        request.vace_context_scale, chunk_idx, 1.0
+    kwargs["vace_context_scale"] = (
+        spec.vace_context_scale
+        if spec and spec.vace_context_scale is not None
+        else request.vace_context_scale
     )
 
     # Noise controller
-    if request.noise_controller is not None:
-        kwargs["noise_controller"] = request.noise_controller
+    noise_ctrl = (
+        spec.noise_controller
+        if spec and spec.noise_controller is not None
+        else request.noise_controller
+    )
+    if noise_ctrl is not None:
+        kwargs["noise_controller"] = noise_ctrl
 
     # KV cache attention bias
-    kv_bias = get_chunk_value(request.kv_cache_attention_bias, chunk_idx)
+    kv_bias = (
+        spec.kv_cache_attention_bias
+        if spec and spec.kv_cache_attention_bias is not None
+        else request.kv_cache_attention_bias
+    )
     if kv_bias is not None:
         kwargs["kv_cache_attention_bias"] = kv_bias
 
     # Prompt interpolation method
-    kwargs["prompt_interpolation_method"] = request.prompt_interpolation_method
+    kwargs["prompt_interpolation_method"] = (
+        spec.prompt_interpolation_method
+        if spec and spec.prompt_interpolation_method is not None
+        else request.prompt_interpolation_method
+    )
 
     # VACE use input video
     if request.vace_use_input_video is not None:
         kwargs["vace_use_input_video"] = request.vace_use_input_video
 
-    # LoRA scales
-    if request.lora_scales:
+    # LoRA scales: per-chunk spec overrides global
+    lora_scales = spec.lora_scales if spec and spec.lora_scales else request.lora_scales
+    if lora_scales:
         lora_scale_updates = []
-        for path, scale_value in request.lora_scales.items():
-            scale = get_chunk_value(scale_value, chunk_idx, 1.0)
+        for path, scale in lora_scales.items():
             lora_scale_updates.append({"path": path, "scale": scale})
             logger.info(
                 f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}"
@@ -313,40 +363,26 @@ def build_chunk_kwargs(
     if chunk_idx in inputs.ref_images:
         kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx]
 
-    # VACE conditioning: per-chunk spec takes priority over global
+    # VACE conditioning from blob
     logger.info(
-        f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}, has_global_frames={inputs.vace_frames is not None}, has_global_masks={inputs.vace_masks is not None}"
+        f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}"
     )
     if chunk_idx in inputs.vace_chunk_specs:
         logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC")
-        spec = inputs.vace_chunk_specs[chunk_idx]
+        vace_spec = inputs.vace_chunk_specs[chunk_idx]
 
-        if "frames" in spec:
-            frames = spec["frames"]
+        if "frames" in vace_spec:
+            frames = vace_spec["frames"]
             frames = pad_chunk(frames, chunk_size, axis=2)
             kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype)
 
-        if "masks" in spec:
-            masks = spec["masks"]
+        if "masks" in vace_spec:
+            masks = vace_spec["masks"]
             masks = pad_chunk(masks, chunk_size, axis=2)
             kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype)
 
-        if "context_scale" in spec:
-            kwargs["vace_context_scale"] = spec["context_scale"]
-    else:
-        logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING GLOBAL VACE FALLBACK")
-        # Global VACE conditioning frames [1, C, T, H, W]
-        if inputs.vace_frames is not None:
-            chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :]
-            chunk = pad_chunk(chunk, chunk_size, axis=2)
-            kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype)
-
-        # Global VACE masks [1, 1, T, H, W]
-        if inputs.vace_masks is not None:
-            chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :]
-            chunk = pad_chunk(chunk, chunk_size, axis=2)
-            kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype)
-
+        if "context_scale" in vace_spec:
+            kwargs["vace_context_scale"] = vace_spec["context_scale"]
     return kwargs
 
 
@@ -694,7 +730,7 @@ def generate_video_stream(
         pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id)
 
         # Determine chunk size from pipeline
-        has_video = request.input_video is not None or request.input_path is not None
+        has_video = request.input_path is not None
         requirements = pipeline.prepare(video=[] if has_video else None)
         chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE
         num_chunks = (request.num_frames + chunk_size - 1) // chunk_size
@@ -817,6 +853,14 @@ def generate_video_stream(
         yield sse_event("error", {"error": str(e)})
 
     finally:
+        # Clean up uploaded data blob file
+        if request.data_blob_path:
+            try:
+                Path(request.data_blob_path).unlink(missing_ok=True)
+                logger.info(f"Cleaned up data blob file: {request.data_blob_path}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up data blob file: {e}")
+
         # Clean up uploaded input file
         if request.input_path:
             try:
diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py
index bd06a3bca..280ba9fe1 100644
--- a/src/scope/server/recording.py
+++ b/src/scope/server/recording.py
@@ -19,6 +19,7 @@
     "download": "scope_download_",
     "generate_input": "scope_gen_input_",
     "generate_output": "scope_gen_output_",
+    "generate_data": "scope_gen_data_",
 }
 
 # Environment variables
@@ -441,6 +442,7 @@ def cleanup_recording_files():
         f"{TEMP_FILE_PREFIXES['download']}*.mp4",
         f"{TEMP_FILE_PREFIXES['generate_input']}*.bin",
         f"{TEMP_FILE_PREFIXES['generate_output']}*.bin",
+        f"{TEMP_FILE_PREFIXES['generate_data']}*.bin",
     ]
 
     deleted_count = 0
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index d450ba44e..ad61c6a32 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -818,82 +818,100 @@ class ApiKeyDeleteResponse(BaseModel):
     message: str
 
 
-class ChunkFrameSpec(BaseModel):
-    """Specification for a frame image at a specific chunk."""
+class ChunkSpec(BaseModel):
+    """Unified per-chunk specification. All fields optional — only set what changes."""
 
-    chunk: int = Field(..., ge=0, description="Chunk index")
-    image: str = Field(..., description="Path to image file")
+    chunk: int = Field(..., ge=0, description="Chunk index (required)")
 
-
-class ChunkPromptSpec(BaseModel):
-    """Specification for a prompt at a specific chunk.
-
-    Supports both simple text and weighted prompt lists for spatial blending.
-    """
-
-    chunk: int = Field(..., ge=0, description="Chunk index")
+    # Prompt
     text: str | None = Field(
         default=None,
-        description="Simple prompt text for this chunk (mutually exclusive with prompts)",
+        description="Simple prompt text (mutually exclusive with prompts)",
     )
     prompts: list[PromptItem] | None = Field(
         default=None,
-        description="Weighted prompt list for spatial blending at this chunk (mutually exclusive with text)",
+        description="Weighted prompt list for spatial blending (mutually exclusive with text)",
+    )
+    prompt_interpolation_method: Literal["linear", "slerp"] | None = Field(
+        default=None,
+        description="Spatial interpolation method override for this chunk",
     )
 
-
-class ChunkTransitionSpec(BaseModel):
-    """Specification for a temporal transition starting at a specific chunk."""
-
-    chunk: int = Field(..., ge=0, description="Chunk index where transition starts")
-    target_prompts: list[PromptItem] = Field(
-        ..., description="Target prompt blend to interpolate to"
+    # Temporal transition
+    transition_target_prompts: list[PromptItem] | None = Field(
+        default=None,
+        description="Target prompt blend to interpolate to",
     )
-    num_steps: int = Field(
-        default=4,
+    transition_num_steps: int | None = Field(
+        default=None,
         ge=0,
         description="Number of generation calls to transition over (0 = instant)",
     )
-    temporal_interpolation_method: Literal["linear", "slerp"] = Field(
-        default="linear",
-        description="Method for temporal interpolation between blends across frames",
+    transition_method: Literal["linear", "slerp"] | None = Field(
+        default=None,
+        description="Method for temporal interpolation between blends",
     )
 
+    # Keyframe images (paths)
+    first_frame_image: str | None = Field(
+        default=None, description="Path to first frame reference image"
+    )
+    last_frame_image: str | None = Field(
+        default=None, description="Path to last frame reference image"
+    )
+    vace_ref_images: list[str] | None = Field(
+        default=None, description="List of reference image paths for VACE conditioning"
+    )
 
-class ChunkRefImagesSpec(BaseModel):
-    """Specification for reference images at a specific chunk."""
-
-    chunk: int = Field(default=0, ge=0, description="Chunk index (default: 0)")
-    images: list[str] = Field(..., description="List of reference image paths")
-
-
-class ChunkVACESpec(BaseModel):
-    """Per-chunk VACE conditioning specification."""
-
-    chunk: int = Field(..., ge=0, description="Chunk index")
-    frames: "EncodedArray | None" = Field(
-        default=None,
-        description="VACE conditioning frames for this chunk ([1, C, T, H, W] float32 [-1, 1])",
+    # Generation parameters
+    seed: int | None = Field(default=None, description="Random seed override")
+    noise_scale: float | None = Field(default=None, description="Noise scale override")
+    kv_cache_attention_bias: float | None = Field(
+        default=None, description="KV cache attention bias override"
     )
-    masks: "EncodedArray | None" = Field(
-        default=None,
-        description="VACE masks for this chunk ([1, 1, T, H, W] float32 {0, 1})",
+    reset_cache: bool = Field(
+        default=False, description="Force cache reset at this chunk"
     )
-    context_scale: float | None = Field(
-        default=None,
-        description="VACE context scale override for this chunk. If None, uses global vace_context_scale.",
+    noise_controller: bool | None = Field(
+        default=None, description="Noise controller override"
     )
-    vace_temporally_locked: bool = Field(
-        default=True,
-        description="When True, frames/masks are sliced temporally to match chunk position. When False, used as-is and padded.",
+    manage_cache: bool | None = Field(
+        default=None, description="Cache management override"
     )
 
+    # LoRA scales: {path: scale}
+    lora_scales: dict[str, float] | None = Field(
+        default=None, description="LoRA scales by path for this chunk"
+    )
 
-class EncodedArray(BaseModel):
-    """Base64-encoded numpy array with shape metadata."""
+    # VACE conditioning (offsets into binary blob)
+    vace_context_scale: float | None = Field(
+        default=None, description="VACE context scale override"
+    )
+    vace_temporally_locked: bool = Field(
+        default=True,
+        description="When True, frames/masks are sliced temporally. When False, used as-is.",
+    )
+    vace_frames_shape: list[int] | None = Field(
+        default=None, description="Shape of VACE frames ([1, C, T, H, W] float32)"
+    )
+    vace_frames_offset: int | None = Field(
+        default=None, description="Byte offset into blob for VACE frames"
+    )
+    vace_masks_shape: list[int] | None = Field(
+        default=None, description="Shape of VACE masks ([1, 1, T, H, W] float32)"
+    )
+    vace_masks_offset: int | None = Field(
+        default=None, description="Byte offset into blob for VACE masks"
+    )
 
-    base64: str = Field(..., description="Base64-encoded numpy array bytes")
-    shape: list[int] = Field(..., description="Array shape for decoding")
+    # Input video for this chunk (offset into binary blob)
+    input_video_shape: list[int] | None = Field(
+        default=None, description="Shape of per-chunk input video [T, H, W, C] uint8"
+    )
+    input_video_offset: int | None = Field(
+        default=None, description="Byte offset into blob for per-chunk input video"
+    )
 
 
 class VideoUploadResponse(BaseModel):
@@ -914,14 +932,6 @@ class GenerateRequest(BaseModel):
         ...,
         description="Text prompt for generation (sent on chunk 0). Can be a simple string or a list of weighted prompts for spatial blending.",
     )
-    chunk_prompts: list[ChunkPromptSpec] | None = Field(
-        default=None,
-        description="Prompt changes at later chunks (sticky behavior). Each entry supports simple text or weighted prompt lists.",
-    )
-    transitions: list[ChunkTransitionSpec] | None = Field(
-        default=None,
-        description="Temporal transitions at specific chunks. Each specifies a target prompt blend and number of interpolation steps.",
-    )
     num_frames: int = Field(
         default=64,
         ge=1,
@@ -940,86 +950,70 @@ class GenerateRequest(BaseModel):
         le=2048,
         description="Output width (defaults to pipeline's native resolution)",
     )
-    seed: int | list[int] = Field(
-        default=42,
-        description="Random seed. Single int applies to all chunks; list applies per-chunk.",
-    )
-    # Video-to-video input (optional) - two mutually exclusive options
-    input_video: EncodedArray | None = Field(
+
+    # Per-chunk specs (replaces all scattered per-chunk lists)
+    chunk_specs: list[ChunkSpec] | None = Field(
         default=None,
-        description="Input video frames (THWC, uint8). If provided, enables video-to-video mode. For large videos, use input_path instead.",
+        description="Unified per-chunk specifications. Each entry can override prompt, transition, "
+        "keyframes, generation parameters, LoRA scales, and VACE conditioning for a specific chunk.",
     )
-    input_path: str | None = Field(
+
+    # Binary blob path (from /generate/upload-data)
+    data_blob_path: str | None = Field(
         default=None,
-        description="Path to uploaded video file (from /generate/upload). Alternative to input_video for large files.",
+        description="Path to uploaded binary data blob (from /generate/upload-data). "
+        "Contains raw arrays referenced by chunk_specs offsets (VACE frames/masks, input video).",
     )
-    noise_scale: float | list[float] = Field(
-        default=0.7,
-        description="Noise scale for video-to-video mode. Single float applies to all chunks; list applies per-chunk.",
+
+    # Global defaults (applied to chunks without per-chunk override)
+    seed: int = Field(
+        default=42,
+        description="Random seed (default for all chunks).",
     )
-    denoising_steps: list[int] | None = Field(
-        default=None,
-        description="Denoising timesteps (e.g., [1000, 750, 500, 250])",
+    noise_scale: float = Field(
+        default=0.7,
+        description="Noise scale for video-to-video mode (default for all chunks).",
     )
     manage_cache: bool = Field(
         default=True,
-        description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).",
-    )
-    cache_reset_chunks: list[int] | None = Field(
-        default=None,
-        description="List of chunk indices where the KV cache should be forcibly reset (init_cache=True). Chunk 0 always resets.",
+        description="Enable automatic cache management.",
     )
     noise_controller: bool | None = Field(
         default=None,
         description="Enable automatic noise scale adjustment based on motion detection.",
     )
-    kv_cache_attention_bias: float | list[float] | None = Field(
+    kv_cache_attention_bias: float | None = Field(
         default=None,
-        description="Controls reliance on past frames in cache. Lower values mitigate error accumulation. Single float applies to all chunks; list applies per-chunk. Typical values: 0.3-0.7 moderate, 0.1-0.2 strong.",
+        description="Controls reliance on past frames in cache. Lower values mitigate error accumulation.",
     )
     prompt_interpolation_method: Literal["linear", "slerp"] = Field(
         default="linear",
-        description="Spatial interpolation method for blending multiple prompts: linear (weighted average) or slerp (spherical).",
-    )
-    vace_use_input_video: bool | None = Field(
-        default=None,
-        description="When enabled in video-to-video mode, input video is used for VACE conditioning instead of latent initialization.",
-    )
-    # Per-chunk parameters
-    lora_scales: dict[str, float | list[float]] | None = Field(
-        default=None,
-        description="LoRA scales by path. Single float applies to all chunks; list applies per-chunk. Example: {'path/to/lora.pt': 0.8} or {'path/to/lora.pt': [0.5, 0.7, 0.9]}",
+        description="Spatial interpolation method for blending multiple prompts.",
     )
-    vace_context_scale: float | list[float] = Field(
+    vace_context_scale: float = Field(
         default=1.0,
-        description="VACE context scale. Single float applies to all chunks; list applies per-chunk.",
-    )
-    # Keyframe specifications (chunk, image) pairs
-    first_frames: list[ChunkFrameSpec] | None = Field(
-        default=None,
-        description="First frame anchors. Each specifies a chunk index and image path to use as that chunk's first frame.",
-    )
-    last_frames: list[ChunkFrameSpec] | None = Field(
-        default=None,
-        description="Last frame anchors. Each specifies a chunk index and image path to use as that chunk's last frame.",
+        description="VACE context scale (default for all chunks).",
     )
-    vace_ref_images: list[ChunkRefImagesSpec] | None = Field(
+    vace_use_input_video: bool | None = Field(
         default=None,
-        description="Reference images for VACE conditioning. Each specifies a chunk index and list of image paths.",
+        description="When enabled in video-to-video mode, input video is used for VACE conditioning.",
     )
-    # VACE conditioning frames/masks (for depth guidance, inpainting, etc.)
-    vace_frames: EncodedArray | None = Field(
+    denoising_steps: list[int] | None = Field(
         default=None,
-        description="VACE conditioning frames ([1, C, T, H, W] float32 [-1, 1]). Used for depth guidance, structural control, etc.",
+        description="Denoising timesteps (e.g., [1000, 750, 500, 250])",
     )
-    vace_masks: EncodedArray | None = Field(
+    lora_scales: dict[str, float] | None = Field(
         default=None,
-        description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).",
+        description="Global LoRA scales by path (default for all chunks).",
     )
-    vace_chunk_specs: list[ChunkVACESpec] | None = Field(
+
+    # Video-to-video input (file-based upload)
+    input_path: str | None = Field(
         default=None,
-        description="Per-chunk VACE conditioning. Each specifies frames/masks for a specific chunk. Overrides global vace_frames/vace_masks for that chunk.",
+        description="Path to uploaded video file (from /generate/upload).",
     )
+
+    # Processors
     pre_processor_id: str | None = Field(
         default=None,
         description="Pipeline ID for pre-processing each chunk before the main pipeline.",
@@ -1030,6 +1024,15 @@ class GenerateRequest(BaseModel):
     )
 
 
+class DataUploadResponse(BaseModel):
+    """Response after uploading binary data blob for generate request."""
+
+    data_blob_path: str = Field(
+        ..., description="Path to uploaded data blob file for generate request"
+    )
+    size_bytes: int = Field(..., description="Size of the uploaded blob in bytes")
+
+
 class GenerateResponse(BaseModel):
     """Response from batch video generation.
 

From d2c66b9022be9f585114fdf5d034fa72af65c7a4 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 20 Feb 2026 11:48:08 -0500
Subject: [PATCH 14/16] cleanup

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 docs/api/generate.md         | 102 ++++++
 src/scope/server/app.py      |  21 +-
 src/scope/server/generate.py | 682 +++++++++++++++--------------------
 src/scope/server/schema.py   |  19 +-
 4 files changed, 408 insertions(+), 416 deletions(-)
 create mode 100644 docs/api/generate.md

diff --git a/docs/api/generate.md b/docs/api/generate.md
new file mode 100644
index 000000000..3f22d4d44
--- /dev/null
+++ b/docs/api/generate.md
@@ -0,0 +1,102 @@
+# Generate Endpoint
+
+Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the generate endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events.
+
+Primary consumer: ComfyUI custom nodes (`comfyui-scope`).
+
+## Endpoints
+
+| Endpoint | Method | Purpose |
+|---|---|---|
+| `/api/v1/generate` | POST | Generate video (SSE stream) |
+| `/api/v1/generate/cancel` | POST | Cancel after current chunk |
+| `/api/v1/generate/upload` | POST | Upload input video for v2v |
+| `/api/v1/generate/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) |
+| `/api/v1/generate/download` | GET | Download output video |
+
+Only one generation can run at a time (409 if busy).
+
+## Flow
+
+```
+1. [optional] POST /generate/upload      → input_path
+2. [optional] POST /generate/upload-data  → data_blob_path
+3. POST /generate (JSON body, references paths from steps 1-2)
+   ← SSE: event: progress  {chunk, total_chunks, frames, latency, fps}
+   ← SSE: event: complete  {output_path, video_shape, num_frames, ...}
+4. GET /generate/download?path=<output_path>
+   ← binary video data
+```
+
+## Binary Protocol
+
+### Video Upload (`/generate/upload`)
+
+**Request**: Raw uint8 bytes in THWC order (frames × height × width × channels).
+
+**Headers** (required):
+- `X-Video-Frames`: T
+- `X-Video-Height`: H
+- `X-Video-Width`: W
+- `X-Video-Channels`: C (default 3)
+
+**Stored format**: header (20 bytes for 4-D THWC video: 4 + 4 × 4) + raw data.
+```
+[4 bytes: ndim (little-endian u32)]
+[4 bytes × ndim: shape dimensions (little-endian u32 each)]
+[raw uint8 video bytes]
+```
+
+### Data Blob Upload (`/generate/upload-data`)
+
+**Request**: Raw binary blob containing packed arrays. Max size: 2 GB.
+
+The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request reference regions of this blob by offset:
+
+```json
+{
+  "chunk": 0,
+  "vace_frames_offset": 0,
+  "vace_frames_shape": [1, 3, 12, 320, 576],
+  "vace_masks_offset": 26542080,
+  "vace_masks_shape": [1, 1, 12, 320, 576]
+}
+```
+
+Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input video). All `*_offset` values are byte offsets into the blob; the client is responsible for computing them when packing the blob.
+
+### Video Download (`/generate/download`)
+
+**Response**: Same binary format as upload (20-byte header + raw uint8 THWC data).
+
+**Response headers**:
+- `X-Video-Frames`, `X-Video-Height`, `X-Video-Width`, `X-Video-Channels`
+
+## GenerateRequest
+
+```json
+{
+  "pipeline_id": "longlive",
+  "prompt": "a cat walking",
+  "num_frames": 48,
+  "seed": 42,
+  "noise_scale": 0.7,
+  "input_path": "<from /generate/upload>",
+  "data_blob_path": "<from /generate/upload-data>",
+  "chunk_specs": [
+    {
+      "chunk": 0,
+      "text": "override prompt for chunk 0",
+      "lora_scales": {"path/to/lora.safetensors": 0.5},
+      "vace_frames_offset": 0,
+      "vace_frames_shape": [1, 3, 12, 320, 576]
+    }
+  ],
+  "pre_processor_id": null,
+  "post_processor_id": null
+}
+```
+
+Request-level fields are global defaults. `chunk_specs` entries override any field for a specific chunk index. Only fields that change need to be specified — prompts are sticky (last-set persists).
+
+See `schema.py` for the full `GenerateRequest` and `ChunkSpec` field definitions.
diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index 75cb79e88..10fe79c39 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -1134,6 +1134,14 @@ async def generate_video(
     pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager),
 ):
     """Generate video frames in batch mode with SSE progress streaming."""
+    from .generate import is_generation_active
+
+    if is_generation_active():
+        raise HTTPException(
+            status_code=409,
+            detail="A generation is already in progress. Cancel it first or wait for completion.",
+        )
+
     status_info = await pipeline_manager.get_status_info_async()
     if status_info["status"] != "loaded":
         raise HTTPException(
@@ -1253,12 +1261,21 @@ async def upload_data_blob(request: Request):
             ".bin", TEMP_FILE_PREFIXES["generate_data"]
         )
 
-        # Stream body to file
+        from .generate import MAX_DATA_BLOB_BYTES
+
+        # Stream body to file with size limit
         bytes_written = 0
         with open(file_path, "wb") as f:
             async for chunk in request.stream():
-                f.write(chunk)
                 bytes_written += len(chunk)
+                if bytes_written > MAX_DATA_BLOB_BYTES:
+                    f.close()
+                    Path(file_path).unlink(missing_ok=True)
+                    raise HTTPException(
+                        status_code=413,
+                        detail=f"Data blob exceeds maximum size of {MAX_DATA_BLOB_BYTES} bytes",
+                    )
+                f.write(chunk)
 
         if bytes_written == 0:
             Path(file_path).unlink(missing_ok=True)
diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 3251738ab..93d91b25c 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -8,7 +8,7 @@
 from collections.abc import Iterator
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import IO, TYPE_CHECKING
 
 import numpy as np
 import torch
@@ -16,6 +16,12 @@
 # Cancellation support (single-client, so one event suffices)
 _cancel_event = threading.Event()
 
+# Generation lock (single-client: only one generation at a time)
+_generation_lock = threading.Lock()
+
+# Max data blob upload size (2 GB)
+MAX_DATA_BLOB_BYTES = 2 * 1024 * 1024 * 1024
+
 
 def cancel_generation():
     """Signal the current generation to stop after the current chunk."""
@@ -27,6 +33,11 @@ def is_generation_cancelled() -> bool:
     return _cancel_event.is_set()
 
 
+def is_generation_active() -> bool:
+    """Check if a generation is currently in progress."""
+    return _generation_lock.locked()
+
+
 # Defaults
 DEFAULT_HEIGHT = 320
 DEFAULT_WIDTH = 576
@@ -42,6 +53,11 @@ def is_generation_cancelled() -> bool:
     from .schema import ChunkSpec, GenerateRequest
 
 
+# ---------------------------------------------------------------------------
+# Array utilities
+# ---------------------------------------------------------------------------
+
+
 def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray:
     """Tile array along axis to reach target length."""
     current = arr.shape[axis]
@@ -66,11 +82,21 @@ def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray:
     return np.concatenate([arr, padding], axis=axis)
 
 
+# ---------------------------------------------------------------------------
+# SSE helpers
+# ---------------------------------------------------------------------------
+
+
 def sse_event(event_type: str, data: dict) -> str:
     """Format a server-sent event."""
     return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
 
 
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+
 @dataclass
 class DecodedInputs:
     """Decoded and preprocessed inputs for generation."""
@@ -86,15 +112,81 @@ class DecodedInputs:
     chunk_specs_map: "dict[int, ChunkSpec]" = field(default_factory=dict)
 
 
-def load_video_from_file(file_path: str) -> np.ndarray:
-    """Load video from temp file.
+@dataclass
+class GenerationState:
+    """Mutable state accumulated during chunk-by-chunk generation."""
+
+    output_file: IO[bytes]
+    num_chunks: int
+    logger: "Logger"
+    total_frames: int = 0
+    height: int | None = None
+    width: int | None = None
+    channels: int | None = None
+    latencies: list[float] = field(default_factory=list)
+    fps_measures: list[float] = field(default_factory=list)
+
+    def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str:
+        """Write chunk output to file and return SSE progress event."""
+        chunk_output = result["video"]
+        num_output_frames = chunk_output.shape[0]
+        chunk_fps = num_output_frames / chunk_latency
+
+        self.latencies.append(chunk_latency)
+        self.fps_measures.append(chunk_fps)
+
+        self.logger.info(
+            f"Chunk {chunk_idx + 1}/{self.num_chunks}: "
+            f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
+        )
 
-    Args:
-        file_path: Path to video file with header
+        chunk_np = chunk_output.detach().cpu().numpy()
+        chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8)
+        self.output_file.write(chunk_uint8.tobytes())
 
-    Returns:
-        Video array [T, H, W, C] uint8
-    """
+        self.total_frames += num_output_frames
+        if self.height is None:
+            self.height = chunk_np.shape[1]
+            self.width = chunk_np.shape[2]
+            self.channels = chunk_np.shape[3]
+
+        return sse_event(
+            "progress",
+            {
+                "chunk": chunk_idx + 1,
+                "total_chunks": self.num_chunks,
+                "frames": num_output_frames,
+                "latency": round(chunk_latency, 3),
+                "fps": round(chunk_fps, 2),
+            },
+        )
+
+    @property
+    def output_shape(self) -> list[int]:
+        return [self.total_frames, self.height, self.width, self.channels]
+
+    def log_summary(self):
+        """Log performance summary."""
+        if not self.latencies:
+            return
+        avg_lat = sum(self.latencies) / len(self.latencies)
+        avg_fps = sum(self.fps_measures) / len(self.fps_measures)
+        self.logger.info(
+            f"=== Performance Summary ({self.num_chunks} chunks) ===\n"
+            f"  Latency - Avg: {avg_lat:.2f}s, "
+            f"Max: {max(self.latencies):.2f}s, Min: {min(self.latencies):.2f}s\n"
+            f"  FPS - Avg: {avg_fps:.2f}, "
+            f"Max: {max(self.fps_measures):.2f}, Min: {min(self.fps_measures):.2f}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Input decoding
+# ---------------------------------------------------------------------------
+
+
+def load_video_from_file(file_path: str) -> np.ndarray:
+    """Load video from temp file with header (ndim + shape + raw uint8)."""
     with open(file_path, "rb") as f:
         ndim = int.from_bytes(f.read(4), "little")
         shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim))
@@ -102,6 +194,16 @@ def load_video_from_file(file_path: str) -> np.ndarray:
     return data
 
 
+def _read_blob_array(
+    blob: bytes, offset: int, shape: list[int], dtype=np.float32
+) -> np.ndarray:
+    """Read a contiguous array from a binary blob at a given offset."""
+    count = 1
+    for d in shape:
+        count *= d
+    return np.frombuffer(blob, dtype=dtype, count=count, offset=offset).reshape(shape)
+
+
 def decode_inputs(
     request: "GenerateRequest", num_frames: int, logger: "Logger"
 ) -> DecodedInputs:
@@ -129,7 +231,6 @@ def decode_inputs(
 
         from .recording import TEMP_FILE_PREFIXES
 
-        # Security: validate path prefix and temp dir
         blob_path = Path(request.data_blob_path)
         temp_dir = Path(tempfile.gettempdir())
         if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith(
@@ -146,7 +247,6 @@ def decode_inputs(
 
     # Process chunk specs — single loop, single source of truth
     for spec in request.chunk_specs or []:
-        # Store spec for build_chunk_kwargs
         inputs.chunk_specs_map[spec.chunk] = spec
 
         # Prompts
@@ -180,23 +280,17 @@ def decode_inputs(
         if blob is not None and spec.vace_frames_offset is not None:
             decoded: dict = {"vace_temporally_locked": spec.vace_temporally_locked}
             if spec.vace_frames_shape and spec.vace_frames_offset is not None:
-                count = 1
-                for d in spec.vace_frames_shape:
-                    count *= d
-                arr = np.frombuffer(
-                    blob, dtype=np.float32, count=count, offset=spec.vace_frames_offset
-                ).reshape(spec.vace_frames_shape)
+                arr = _read_blob_array(
+                    blob, spec.vace_frames_offset, spec.vace_frames_shape
+                )
                 decoded["frames"] = arr
                 logger.info(
                     f"decode_inputs: chunk {spec.chunk} VACE frames shape={arr.shape}"
                 )
             if spec.vace_masks_shape and spec.vace_masks_offset is not None:
-                count = 1
-                for d in spec.vace_masks_shape:
-                    count *= d
-                arr = np.frombuffer(
-                    blob, dtype=np.float32, count=count, offset=spec.vace_masks_offset
-                ).reshape(spec.vace_masks_shape)
+                arr = _read_blob_array(
+                    blob, spec.vace_masks_offset, spec.vace_masks_shape
+                )
                 decoded["masks"] = arr
                 logger.info(
                     f"decode_inputs: chunk {spec.chunk} VACE masks shape={arr.shape}"
@@ -211,12 +305,9 @@ def decode_inputs(
             and spec.input_video_offset is not None
             and spec.input_video_shape is not None
         ):
-            count = 1
-            for d in spec.input_video_shape:
-                count *= d
-            inputs.input_video_chunks[spec.chunk] = np.frombuffer(
-                blob, dtype=np.uint8, count=count, offset=spec.input_video_offset
-            ).reshape(spec.input_video_shape)
+            inputs.input_video_chunks[spec.chunk] = _read_blob_array(
+                blob, spec.input_video_offset, spec.input_video_shape, dtype=np.uint8
+            )
 
     logger.info(
         f"decode_inputs: prompts={list(inputs.prompts.keys())}, "
@@ -230,6 +321,21 @@ def decode_inputs(
     return inputs
 
 
+# ---------------------------------------------------------------------------
+# Chunk kwargs builder
+# ---------------------------------------------------------------------------
+
+
+def _resolve(spec, attr: str, request, fallback=None):
+    """Return per-chunk spec value if set, else request-level value, else fallback."""
+    if spec is not None:
+        val = getattr(spec, attr, None)
+        if val is not None:
+            return val
+    val = getattr(request, attr, None)
+    return val if val is not None else fallback
+
+
 def build_chunk_kwargs(
     request: "GenerateRequest",
     inputs: DecodedInputs,
@@ -246,24 +352,22 @@ def build_chunk_kwargs(
 
     Per-chunk ChunkSpec values override request-level globals.
     """
-    # Get per-chunk spec (if any)
     spec = inputs.chunk_specs_map.get(chunk_idx)
+    load_params = status_info.get("load_params", {})
 
-    kwargs = {
+    kwargs: dict = {
         "height": request.height
-        or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT),
+        if request.height is not None
+        else load_params.get("height", DEFAULT_HEIGHT),
         "width": request.width
-        or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH),
-        "base_seed": spec.seed if spec and spec.seed is not None else request.seed,
+        if request.width is not None
+        else load_params.get("width", DEFAULT_WIDTH),
+        "base_seed": _resolve(spec, "seed", request, DEFAULT_SEED),
         "init_cache": chunk_idx == 0 or (spec is not None and spec.reset_cache),
-        "manage_cache": (
-            spec.manage_cache
-            if spec and spec.manage_cache is not None
-            else request.manage_cache
-        ),
+        "manage_cache": _resolve(spec, "manage_cache", request, True),
     }
 
-    # Prompt (sticky behavior - only send when it changes)
+    # Prompt (sticky — only send when it changes)
     if chunk_idx in inputs.prompts:
         kwargs["prompts"] = inputs.prompts[chunk_idx]
 
@@ -274,79 +378,54 @@ def build_chunk_kwargs(
     if request.denoising_steps:
         kwargs["denoising_step_list"] = request.denoising_steps
 
-    # Video-to-video: per-chunk input video takes priority over global input video
+    # Video-to-video: per-chunk input video takes priority over global
     if chunk_idx in inputs.input_video_chunks:
-        # Per-chunk input video from blob (enables v2v/t2v switching per chunk)
-        chunk_frames = inputs.input_video_chunks[chunk_idx]
-        chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0)
+        chunk_frames = pad_chunk(
+            inputs.input_video_chunks[chunk_idx], chunk_size, axis=0
+        )
         kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames]
-        kwargs["noise_scale"] = (
-            spec.noise_scale
-            if spec and spec.noise_scale is not None
-            else request.noise_scale
+        kwargs["noise_scale"] = _resolve(
+            spec, "noise_scale", request, DEFAULT_NOISE_SCALE
         )
         logger.info(
             f"Chunk {chunk_idx}: Using per-chunk input video ({chunk_frames.shape[0]} frames)"
         )
     elif inputs.input_video is not None:
-        chunk_frames = inputs.input_video[start_frame:end_frame]
-        chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0)
+        chunk_frames = pad_chunk(
+            inputs.input_video[start_frame:end_frame], chunk_size, axis=0
+        )
         kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames]
-        kwargs["noise_scale"] = (
-            spec.noise_scale
-            if spec and spec.noise_scale is not None
-            else request.noise_scale
+        kwargs["noise_scale"] = _resolve(
+            spec, "noise_scale", request, DEFAULT_NOISE_SCALE
         )
     else:
         kwargs["num_frames"] = chunk_size
 
-    # VACE context scale
-    kwargs["vace_context_scale"] = (
-        spec.vace_context_scale
-        if spec and spec.vace_context_scale is not None
-        else request.vace_context_scale
+    kwargs["vace_context_scale"] = _resolve(spec, "vace_context_scale", request, 1.0)
+    kwargs["prompt_interpolation_method"] = _resolve(
+        spec, "prompt_interpolation_method", request, "linear"
     )
 
-    # Noise controller
-    noise_ctrl = (
-        spec.noise_controller
-        if spec and spec.noise_controller is not None
-        else request.noise_controller
-    )
+    # Optional overrides (only include in kwargs when non-None)
+    noise_ctrl = _resolve(spec, "noise_controller", request)
     if noise_ctrl is not None:
         kwargs["noise_controller"] = noise_ctrl
 
-    # KV cache attention bias
-    kv_bias = (
-        spec.kv_cache_attention_bias
-        if spec and spec.kv_cache_attention_bias is not None
-        else request.kv_cache_attention_bias
-    )
+    kv_bias = _resolve(spec, "kv_cache_attention_bias", request)
     if kv_bias is not None:
         kwargs["kv_cache_attention_bias"] = kv_bias
 
-    # Prompt interpolation method
-    kwargs["prompt_interpolation_method"] = (
-        spec.prompt_interpolation_method
-        if spec and spec.prompt_interpolation_method is not None
-        else request.prompt_interpolation_method
-    )
-
-    # VACE use input video
     if request.vace_use_input_video is not None:
         kwargs["vace_use_input_video"] = request.vace_use_input_video
 
     # LoRA scales: per-chunk spec overrides global
     lora_scales = spec.lora_scales if spec and spec.lora_scales else request.lora_scales
     if lora_scales:
-        lora_scale_updates = []
-        for path, scale in lora_scales.items():
-            lora_scale_updates.append({"path": path, "scale": scale})
-            logger.info(
-                f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}"
-            )
-        if lora_scale_updates:
-            kwargs["lora_scales"] = lora_scale_updates
+        kwargs["lora_scales"] = [
+            {"path": p, "scale": s} for p, s in lora_scales.items()
+        ]
+        for p, s in lora_scales.items():
+            logger.info(f"Chunk {chunk_idx}: LoRA scale={s:.3f} for {Path(p).name}")
 
     # Keyframes
     if chunk_idx in inputs.first_frames:
@@ -354,229 +433,96 @@ def build_chunk_kwargs(
         kwargs["extension_mode"] = (
             "firstlastframe" if chunk_idx in inputs.last_frames else "firstframe"
         )
-
     if chunk_idx in inputs.last_frames:
         kwargs["last_frame_image"] = inputs.last_frames[chunk_idx]
         if chunk_idx not in inputs.first_frames:
             kwargs["extension_mode"] = "lastframe"
-
     if chunk_idx in inputs.ref_images:
         kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx]
 
     # VACE conditioning from blob
-    logger.info(
-        f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}"
-    )
     if chunk_idx in inputs.vace_chunk_specs:
-        logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC")
         vace_spec = inputs.vace_chunk_specs[chunk_idx]
-
         if "frames" in vace_spec:
-            frames = vace_spec["frames"]
-            frames = pad_chunk(frames, chunk_size, axis=2)
+            frames = pad_chunk(vace_spec["frames"], chunk_size, axis=2)
             kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype)
-
         if "masks" in vace_spec:
-            masks = vace_spec["masks"]
-            masks = pad_chunk(masks, chunk_size, axis=2)
+            masks = pad_chunk(vace_spec["masks"], chunk_size, axis=2)
             kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype)
-
         if "context_scale" in vace_spec:
             kwargs["vace_context_scale"] = vace_spec["context_scale"]
+
     return kwargs
 
 
+# ---------------------------------------------------------------------------
+# Chunk logging
+# ---------------------------------------------------------------------------
+
+# (key, format_string, condition) — format_string uses {v} for the value; condition is None (always log) or a predicate on the value
+_CHUNK_LOG_ENTRIES = [
+    ("init_cache", "Resetting cache (init_cache=True)", lambda v: v),
+    ("extension_mode", "Extension mode: {v}", None),
+    ("vace_context_scale", "VACE context scale: {v}", lambda v: v != 1.0),
+    ("vace_use_input_video", "VACE use input video: {v}", None),
+    ("denoising_step_list", "Denoising steps: {v}", None),
+    ("noise_controller", "Using noise controller: {v}", None),
+    ("kv_cache_attention_bias", "KV cache attention bias: {v}", None),
+]
+
+
 def _log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logger"):
     """Log detailed chunk information."""
-    logger.info(f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}")
-    if kwargs.get("init_cache"):
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)"
-        )
+    prefix = f"generate: Chunk {chunk_idx}"
+    logger.info(f"generate: Starting chunk {chunk_idx + 1}/{num_chunks}")
+
+    # Structured entries
     if "prompts" in kwargs:
-        prompt_texts = [p["text"] for p in kwargs["prompts"]]
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}"
-        )
+        logger.info(f"{prefix}: Prompt → {[p['text'] for p in kwargs['prompts']]}")
     if "transition" in kwargs:
-        target_texts = [p["text"] for p in kwargs["transition"]["target_prompts"]]
+        t = kwargs["transition"]
         logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} "
-            f"over {kwargs['transition']['num_steps']} steps "
-            f"(method: {kwargs['transition']['temporal_interpolation_method']})"
+            f"{prefix}: Transition → {[p['text'] for p in t['target_prompts']]} "
+            f"over {t['num_steps']} steps ({t['temporal_interpolation_method']})"
         )
     if "first_frame_image" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe"
-        )
+        logger.info(f"{prefix}: Using first frame keyframe")
     if "last_frame_image" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe"
-        )
-    if "extension_mode" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}"
-        )
+        logger.info(f"{prefix}: Using last frame keyframe")
     if "vace_ref_images" in kwargs:
         logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Using {len(kwargs['vace_ref_images'])} VACE reference images"
+            f"{prefix}: Using {len(kwargs['vace_ref_images'])} VACE reference images"
         )
     if "vace_input_frames" in kwargs:
         logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {kwargs['vace_input_frames'].shape}"
+            f"{prefix}: VACE input frames shape: {kwargs['vace_input_frames'].shape}"
         )
     if "vace_input_masks" in kwargs:
         logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {kwargs['vace_input_masks'].shape}"
-        )
-    if "vace_context_scale" in kwargs and kwargs["vace_context_scale"] != 1.0:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}"
-        )
-    if "vace_use_input_video" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}"
+            f"{prefix}: VACE input masks shape: {kwargs['vace_input_masks'].shape}"
         )
     if "video" in kwargs:
         logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}"
+            f"{prefix}: Video-to-video ({len(kwargs['video'])} frames, "
+            f"noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)})"
         )
     elif "num_frames" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames"
-        )
-    if "denoising_step_list" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}"
-        )
-    if "noise_controller" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}"
-        )
-    if "kv_cache_attention_bias" in kwargs:
-        logger.info(
-            f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}"
-        )
-
+        logger.info(f"{prefix}: Text-to-video ({kwargs['num_frames']} frames)")
 
-def _write_chunk_output(
-    result: dict,
-    chunk_idx: int,
-    num_chunks: int,
-    chunk_latency: float,
-    output_file,
-    latency_measures: list,
-    fps_measures: list,
-    logger: "Logger",
-    total_frames_ref: list,
-    dimensions_ref: list,
-) -> str:
-    """Write chunk output to file and return SSE progress event."""
-    chunk_output = result["video"]
-    num_output_frames = chunk_output.shape[0]
-    chunk_fps = num_output_frames / chunk_latency
+    # Table-driven simple entries
+    for key, msg, condition in _CHUNK_LOG_ENTRIES:
+        if key in kwargs:
+            v = kwargs[key]
+            if condition is None or condition(v):
+                logger.info(f"{prefix}: {msg.format(v=v)}")
 
-    latency_measures.append(chunk_latency)
-    fps_measures.append(chunk_fps)
 
-    logger.info(
-        f"Chunk {chunk_idx + 1}/{num_chunks}: "
-        f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}"
-    )
-
-    chunk_np = chunk_output.detach().cpu().numpy()
-    chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8)
-    output_file.write(chunk_uint8.tobytes())
-
-    total_frames_ref[0] += num_output_frames
-    if dimensions_ref[0] is None:
-        dimensions_ref[0] = chunk_np.shape[1]
-        dimensions_ref[1] = chunk_np.shape[2]
-        dimensions_ref[2] = chunk_np.shape[3]
-
-    return sse_event(
-        "progress",
-        {
-            "chunk": chunk_idx + 1,
-            "total_chunks": num_chunks,
-            "frames": num_output_frames,
-            "latency": round(chunk_latency, 3),
-            "fps": round(chunk_fps, 2),
-        },
-    )
+# ---------------------------------------------------------------------------
+# Generation engine
+# ---------------------------------------------------------------------------
 
 
-def _generate_sequential(
-    request: "GenerateRequest",
-    pipeline,
-    inputs: DecodedInputs,
-    num_chunks: int,
-    chunk_size: int,
-    status_info: dict,
-    device: torch.device,
-    dtype: torch.dtype,
-    output_file,
-    latency_measures: list,
-    fps_measures: list,
-    logger: "Logger",
-    total_frames_ref: list,
-    dimensions_ref: list,
-) -> Iterator[str]:
-    """Sequential chunk processing (original code path, no processors)."""
-    for chunk_idx in range(num_chunks):
-        if _cancel_event.is_set():
-            logger.info("Generation cancelled by user")
-            yield sse_event(
-                "cancelled",
-                {
-                    "chunk": chunk_idx,
-                    "total_chunks": num_chunks,
-                    "frames_completed": total_frames_ref[0],
-                },
-            )
-            return
-
-        start_frame = chunk_idx * chunk_size
-        end_frame = min(start_frame + chunk_size, request.num_frames)
-
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-        kwargs = build_chunk_kwargs(
-            request,
-            inputs,
-            chunk_idx,
-            chunk_size,
-            start_frame,
-            end_frame,
-            status_info,
-            device,
-            dtype,
-            logger,
-        )
-        _log_chunk_info(kwargs, chunk_idx, num_chunks, logger)
-
-        chunk_start = time.time()
-        with torch.amp.autocast("cuda", dtype=dtype):
-            result = pipeline(**kwargs)
-        chunk_latency = time.time() - chunk_start
-
-        yield _write_chunk_output(
-            result,
-            chunk_idx,
-            num_chunks,
-            chunk_latency,
-            output_file,
-            latency_measures,
-            fps_measures,
-            logger,
-            total_frames_ref,
-            dimensions_ref,
-        )
-
-
-def _generate_with_processors(
+def _generate_chunks(
     request: "GenerateRequest",
     pipeline,
     pipeline_manager: "PipelineManager",
@@ -586,51 +532,50 @@ def _generate_with_processors(
     status_info: dict,
     device: torch.device,
     dtype: torch.dtype,
-    output_file,
-    latency_measures: list,
-    fps_measures: list,
+    state: GenerationState,
     logger: "Logger",
-    total_frames_ref: list,
-    dimensions_ref: list,
 ) -> Iterator[str]:
-    """Chunk processing with pre/post processor pipeline chaining."""
+    """Process chunks through a processor chain, yielding SSE events.
+
+    Always uses PipelineProcessor — when there are no pre/post processors
+    the chain is just [main_pipeline].
+    """
     from .pipeline_processor import _SENTINEL, PipelineProcessor
 
-    # Build the processor chain
+    # Build processor chain: [pre?] → main → [post?]
     processors: list[PipelineProcessor] = []
 
     if request.pre_processor_id:
         pre_pipeline = pipeline_manager.get_pipeline_by_id(request.pre_processor_id)
-        pre_proc = PipelineProcessor(
-            pipeline=pre_pipeline,
-            pipeline_id=request.pre_processor_id,
-            batch_mode=True,
+        processors.append(
+            PipelineProcessor(
+                pipeline=pre_pipeline,
+                pipeline_id=request.pre_processor_id,
+                batch_mode=True,
+            )
         )
-        processors.append(pre_proc)
         logger.info(f"Pre-processor: {request.pre_processor_id}")
 
-    main_proc = PipelineProcessor(
-        pipeline=pipeline,
-        pipeline_id=request.pipeline_id,
-        batch_mode=True,
+    processors.append(
+        PipelineProcessor(
+            pipeline=pipeline, pipeline_id=request.pipeline_id, batch_mode=True
+        )
     )
-    processors.append(main_proc)
 
     if request.post_processor_id:
         post_pipeline = pipeline_manager.get_pipeline_by_id(request.post_processor_id)
-        post_proc = PipelineProcessor(
-            pipeline=post_pipeline,
-            pipeline_id=request.post_processor_id,
-            batch_mode=True,
+        processors.append(
+            PipelineProcessor(
+                pipeline=post_pipeline,
+                pipeline_id=request.post_processor_id,
+                batch_mode=True,
+            )
         )
-        processors.append(post_proc)
         logger.info(f"Post-processor: {request.post_processor_id}")
 
-    # Chain processors
+    # Chain and start
     for i in range(len(processors) - 1):
         processors[i].set_next_processor(processors[i + 1])
-
-    # Start all processors
     for proc in processors:
         proc.start()
 
@@ -638,7 +583,6 @@ def _generate_with_processors(
     last_proc = processors[-1]
 
     try:
-        # Feed chunks into the first processor's input queue
         for chunk_idx in range(num_chunks):
             if _cancel_event.is_set():
                 logger.info("Generation cancelled by user")
@@ -647,7 +591,7 @@ def _generate_with_processors(
                     {
                         "chunk": chunk_idx,
                         "total_chunks": num_chunks,
-                        "frames_completed": total_frames_ref[0],
+                        "frames_completed": state.total_frames,
                     },
                 )
                 return
@@ -675,10 +619,9 @@ def _generate_with_processors(
 
             chunk_start = time.time()
 
-            # Feed kwargs into chain (blocking put)
             first_proc.input_queue.put(kwargs)
 
-            # Collect result from last processor (blocking get)
+            # Collect result from last processor
             while True:
                 try:
                     result = last_proc.output_queue.get(timeout=1.0)
@@ -689,29 +632,21 @@ def _generate_with_processors(
                     continue
 
             chunk_latency = time.time() - chunk_start
-
-            yield _write_chunk_output(
-                result,
-                chunk_idx,
-                num_chunks,
-                chunk_latency,
-                output_file,
-                latency_measures,
-                fps_measures,
-                logger,
-                total_frames_ref,
-                dimensions_ref,
-            )
+            yield state.write_chunk(result, chunk_idx, chunk_latency)
 
         # Signal end of input
         first_proc.input_queue.put(_SENTINEL)
 
     finally:
-        # Stop all processors
         for proc in processors:
             proc.stop()
 
 
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
 def generate_video_stream(
     request: "GenerateRequest",
     pipeline_manager: "PipelineManager",
@@ -721,7 +656,12 @@ def generate_video_stream(
     """Generate video frames, yielding SSE events.
 
     Writes output to temp file incrementally, returns output_path for download.
+    Only one generation can run at a time (single-client).
     """
+    if not _generation_lock.acquire(blocking=False):
+        yield sse_event("error", {"error": "A generation is already in progress"})
+        return
+
     _cancel_event.clear()
     output_file_path = None
     completed = False
@@ -735,16 +675,12 @@ def generate_video_stream(
         chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE
         num_chunks = (request.num_frames + chunk_size - 1) // chunk_size
 
-        # Decode inputs (supports both file-based and base64)
         inputs = decode_inputs(request, request.num_frames, logger)
 
-        # Setup
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         dtype = torch.bfloat16
-        latency_measures = []
-        fps_measures = []
 
-        # Create output file for incremental writing (reuse recording pattern)
+        # Create output file with placeholder header
         from .recording import TEMP_FILE_PREFIXES, RecordingManager
 
         output_file_path = RecordingManager._create_temp_file(
@@ -752,67 +688,32 @@ def generate_video_stream(
         )
         output_file = open(output_file_path, "wb")
 
-        # We'll write a placeholder header, then update it at the end
-        # Header format: ndim (4 bytes) + shape (4 * ndim bytes)
-        # For video [T, H, W, C], that's 4 + 16 = 20 bytes
-        header_size = 4 + 4 * 4  # ndim + 4 dimensions
-        output_file.write(b"\x00" * header_size)  # Placeholder
-
-        total_frames = 0
-        video_height = None
-        video_width = None
-        video_channels = None
-
-        # Determine if we need processor chaining
-        use_processors = (
-            request.pre_processor_id is not None
-            or request.post_processor_id is not None
+        # Header: ndim (4 bytes) + shape (4 * ndim bytes) = 20 bytes for [T, H, W, C]
+        header_size = 4 + 4 * 4
+        output_file.write(b"\x00" * header_size)
+
+        state = GenerationState(
+            output_file=output_file, num_chunks=num_chunks, logger=logger
         )
 
         try:
-            if use_processors:
-                yield from _generate_with_processors(
-                    request,
-                    pipeline,
-                    pipeline_manager,
-                    inputs,
-                    num_chunks,
-                    chunk_size,
-                    status_info,
-                    device,
-                    dtype,
-                    output_file,
-                    latency_measures,
-                    fps_measures,
-                    logger,
-                    _total_frames_ref := [0],
-                    _dimensions_ref := [None, None, None],
-                )
-                total_frames = _total_frames_ref[0]
-                video_height, video_width, video_channels = _dimensions_ref
-            else:
-                yield from _generate_sequential(
-                    request,
-                    pipeline,
-                    inputs,
-                    num_chunks,
-                    chunk_size,
-                    status_info,
-                    device,
-                    dtype,
-                    output_file,
-                    latency_measures,
-                    fps_measures,
-                    logger,
-                    _total_frames_ref := [0],
-                    _dimensions_ref := [None, None, None],
-                )
-                total_frames = _total_frames_ref[0]
-                video_height, video_width, video_channels = _dimensions_ref
+            yield from _generate_chunks(
+                request,
+                pipeline,
+                pipeline_manager,
+                inputs,
+                num_chunks,
+                chunk_size,
+                status_info,
+                device,
+                dtype,
+                state,
+                logger,
+            )
 
             # Update header with actual shape
             output_file.seek(0)
-            shape = (total_frames, video_height, video_width, video_channels)
+            shape = tuple(state.output_shape)
             output_file.write(len(shape).to_bytes(4, "little"))
             for dim in shape:
                 output_file.write(dim.to_bytes(4, "little"))
@@ -821,27 +722,14 @@ def generate_video_stream(
             output_file.close()
 
         logger.info(f"Output video saved: {output_file_path}")
-
-        # Log performance summary
-        if latency_measures:
-            avg_latency = sum(latency_measures) / len(latency_measures)
-            avg_fps = sum(fps_measures) / len(fps_measures)
-            logger.info(
-                f"=== Performance Summary ({num_chunks} chunks) ===\n"
-                f"  Latency - Avg: {avg_latency:.2f}s, "
-                f"Max: {max(latency_measures):.2f}s, Min: {min(latency_measures):.2f}s\n"
-                f"  FPS - Avg: {avg_fps:.2f}, "
-                f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}"
-            )
-
-        output_shape = [total_frames, video_height, video_width, video_channels]
+        state.log_summary()
 
         yield sse_event(
             "complete",
             {
                 "output_path": output_file_path,
-                "video_shape": output_shape,
-                "num_frames": total_frames,
+                "video_shape": state.output_shape,
+                "num_frames": state.total_frames,
                 "num_chunks": num_chunks,
                 "chunk_size": chunk_size,
             },
@@ -853,26 +741,22 @@ def generate_video_stream(
         yield sse_event("error", {"error": str(e)})
 
     finally:
-        # Clean up uploaded data blob file
-        if request.data_blob_path:
-            try:
-                Path(request.data_blob_path).unlink(missing_ok=True)
-                logger.info(f"Cleaned up data blob file: {request.data_blob_path}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up data blob file: {e}")
-
-        # Clean up uploaded input file
-        if request.input_path:
-            try:
-                Path(request.input_path).unlink(missing_ok=True)
-                logger.info(f"Cleaned up input file: {request.input_path}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up input file: {e}")
+        # Clean up uploaded files
+        for path_attr in ("data_blob_path", "input_path"):
+            path = getattr(request, path_attr, None)
+            if path:
+                try:
+                    Path(path).unlink(missing_ok=True)
+                    logger.info(f"Cleaned up {path_attr}: {path}")
+                except Exception as e:
+                    logger.warning(f"Failed to clean up {path_attr}: {e}")
 
-        # Clean up output file if generation didn't complete successfully
+        # Clean up output file if generation didn't complete
         if not completed and output_file_path:
             try:
                 Path(output_file_path).unlink(missing_ok=True)
                 logger.info(f"Cleaned up orphaned output file: {output_file_path}")
             except Exception as e:
                 logger.warning(f"Failed to clean up output file: {e}")
+
+        _generation_lock.release()
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index ad61c6a32..2c8b8848b 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -1034,22 +1034,11 @@ class DataUploadResponse(BaseModel):
 
 
 class GenerateResponse(BaseModel):
-    """Response from batch video generation.
+    """Response from batch video generation."""
 
-    Supports two modes:
-    - Legacy: video_base64 contains the full video (for small videos)
-    - File-based: output_path references a downloadable file (for large videos)
-    """
-
-    # File-based output (preferred for large videos)
-    output_path: str | None = Field(
-        default=None,
-        description="Path to output video file for download via /generate/download. Preferred for large videos.",
-    )
-    # Legacy base64 output (kept for backwards compatibility)
-    video_base64: str | None = Field(
-        default=None,
-        description="Base64-encoded output video frames (THWC, uint8). Deprecated for large videos, use output_path.",
+    output_path: str = Field(
+        ...,
+        description="Path to output video file for download via /generate/download.",
     )
     video_shape: list[int] = Field(
         ...,

From 59eb2aba98683c03d85606d4673c52d73a30240e Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 20 Feb 2026 15:06:50 -0500
Subject: [PATCH 15/16] rm per-chunk gc; write chunks on a background thread

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 src/scope/server/generate.py | 56 ++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 19 deletions(-)

diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py
index 93d91b25c..d332ff715 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/generate.py
@@ -1,6 +1,6 @@
 """Video generation service for batch mode with chunked processing."""
 
-import gc
+import concurrent.futures
 import json
 import queue
 import threading
@@ -126,8 +126,19 @@ class GenerationState:
     latencies: list[float] = field(default_factory=list)
     fps_measures: list[float] = field(default_factory=list)
 
-    def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str:
-        """Write chunk output to file and return SSE progress event."""
+    def build_chunk_sse(self, chunk_idx: int, chunk_latency: float) -> str:
+        """Build SSE progress event (call from main thread before write)."""
+        return sse_event(
+            "progress",
+            {
+                "chunk": chunk_idx + 1,
+                "total_chunks": self.num_chunks,
+                "latency": round(chunk_latency, 3),
+            },
+        )
+
+    def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> None:
+        """Write chunk output to file (safe to call from background thread)."""
         chunk_output = result["video"]
         num_output_frames = chunk_output.shape[0]
         chunk_fps = num_output_frames / chunk_latency
@@ -150,17 +161,6 @@ def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str
             self.width = chunk_np.shape[2]
             self.channels = chunk_np.shape[3]
 
-        return sse_event(
-            "progress",
-            {
-                "chunk": chunk_idx + 1,
-                "total_chunks": self.num_chunks,
-                "frames": num_output_frames,
-                "latency": round(chunk_latency, 3),
-                "fps": round(chunk_fps, 2),
-            },
-        )
-
     @property
     def output_shape(self) -> list[int]:
         return [self.total_frames, self.height, self.width, self.channels]
@@ -582,6 +582,10 @@ def _generate_chunks(
     first_proc = processors[0]
     last_proc = processors[-1]
 
+    write_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+    write_future: concurrent.futures.Future | None = None
+    pending_sse: str | None = None
+
     try:
         for chunk_idx in range(num_chunks):
             if _cancel_event.is_set():
@@ -599,10 +603,6 @@ def _generate_chunks(
             start_frame = chunk_idx * chunk_size
             end_frame = min(start_frame + chunk_size, request.num_frames)
 
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
             kwargs = build_chunk_kwargs(
                 request,
                 inputs,
@@ -632,12 +632,30 @@ def _generate_chunks(
                     continue
 
             chunk_latency = time.time() - chunk_start
-            yield state.write_chunk(result, chunk_idx, chunk_latency)
+
+            # Wait for previous async write before starting a new one
+            if write_future is not None:
+                write_future.result()
+            if pending_sse is not None:
+                yield pending_sse
+
+            # Offload CPU transfer + disk I/O to background thread
+            pending_sse = state.build_chunk_sse(chunk_idx, chunk_latency)
+            write_future = write_executor.submit(
+                state.write_chunk, result, chunk_idx, chunk_latency
+            )
+
+        # Wait for final write
+        if write_future is not None:
+            write_future.result()
+        if pending_sse is not None:
+            yield pending_sse
 
         # Signal end of input
         first_proc.input_queue.put(_SENTINEL)
 
     finally:
+        write_executor.shutdown(wait=True)
         for proc in processors:
             proc.stop()
 

From ad34a65712193f9a90ad706a3489811b2b2b7c63 Mon Sep 17 00:00:00 2001
From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
Date: Fri, 20 Feb 2026 15:27:31 -0500
Subject: [PATCH 16/16] generate -> batch

Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com>
---
 docs/api/{generate.md => batch.md}         | 36 ++++++++++----------
 scripts/test_generate_endpoint.py          | 10 +++---
 src/scope/server/app.py                    | 39 +++++++++++-----------
 src/scope/server/{generate.py => batch.py} | 32 +++++++++---------
 src/scope/server/recording.py              | 12 +++----
 5 files changed, 64 insertions(+), 65 deletions(-)
 rename docs/api/{generate.md => batch.md} (65%)
 rename src/scope/server/{generate.py => batch.py} (97%)

diff --git a/docs/api/generate.md b/docs/api/batch.md
similarity index 65%
rename from docs/api/generate.md
rename to docs/api/batch.md
index 3f22d4d44..7b8ba3d10 100644
--- a/docs/api/generate.md
+++ b/docs/api/batch.md
@@ -1,6 +1,6 @@
-# Generate Endpoint
+# Batch Endpoint
 
-Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the generate endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events.
+Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the batch endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events.
 
 Primary consumer: ComfyUI custom nodes (`comfyui-scope`).
 
@@ -8,29 +8,29 @@ Primary consumer: ComfyUI custom nodes (`comfyui-scope`).
 
 | Endpoint | Method | Purpose |
 |---|---|---|
-| `/api/v1/generate` | POST | Generate video (SSE stream) |
-| `/api/v1/generate/cancel` | POST | Cancel after current chunk |
-| `/api/v1/generate/upload` | POST | Upload input video for v2v |
-| `/api/v1/generate/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) |
-| `/api/v1/generate/download` | GET | Download output video |
+| `/api/v1/batch` | POST | Generate video (SSE stream) |
+| `/api/v1/batch/cancel` | POST | Cancel after current chunk |
+| `/api/v1/batch/upload` | POST | Upload input video for v2v |
+| `/api/v1/batch/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) |
+| `/api/v1/batch/download` | GET | Download output video |
 
 Only one generation can run at a time (409 if busy).
 
 ## Flow
 
 ```
-1. [optional] POST /generate/upload      → input_path
-2. [optional] POST /generate/upload-data  → data_blob_path
-3. POST /generate (JSON body, references paths from steps 1-2)
+1. [optional] POST /batch/upload      → input_path
+2. [optional] POST /batch/upload-data  → data_blob_path
+3. POST /batch (JSON body, references paths from steps 1-2)
    ← SSE: event: progress  {chunk, total_chunks, frames, latency, fps}
    ← SSE: event: complete  {output_path, video_shape, num_frames, ...}
-4. GET /generate/download?path=<output_path>
+4. GET /batch/download?path=<output_path>
    ← binary video data
 ```
 
 ## Binary Protocol
 
-### Video Upload (`/generate/upload`)
+### Video Upload (`/batch/upload`)
 
 **Request**: Raw uint8 bytes in THWC order (frames × height × width × channels).
 
@@ -47,11 +47,11 @@ Only one generation can run at a time (409 if busy).
 [raw uint8 video bytes]
 ```
 
-### Data Blob Upload (`/generate/upload-data`)
+### Data Blob Upload (`/batch/upload-data`)
 
 **Request**: Raw binary blob containing packed arrays. Max size: 2 GB.
 
-The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request reference regions of this blob by offset:
+The blob is an opaque byte buffer. `ChunkSpec` entries in the batch request reference regions of this blob by offset:
 
 ```json
 {
@@ -65,14 +65,14 @@ The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request r
 
 Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input video). The client is responsible for computing offsets when packing the blob.
 
-### Video Download (`/generate/download`)
+### Video Download (`/batch/download`)
 
 **Response**: Same binary format as upload (20-byte header + raw uint8 THWC data).
 
 **Response headers**:
 - `X-Video-Frames`, `X-Video-Height`, `X-Video-Width`, `X-Video-Channels`
 
-## GenerateRequest
+## Batch Request (`GenerateRequest` schema)
 
 ```json
 {
@@ -81,8 +81,8 @@ Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input vide
   "num_frames": 48,
   "seed": 42,
   "noise_scale": 0.7,
-  "input_path": "<from /generate/upload>",
-  "data_blob_path": "<from /generate/upload-data>",
+  "input_path": "<from /batch/upload>",
+  "data_blob_path": "<from /batch/upload-data>",
   "chunk_specs": [
     {
       "chunk": 0,
diff --git a/scripts/test_generate_endpoint.py b/scripts/test_generate_endpoint.py
index 122dcb4c6..753df0e8e 100644
--- a/scripts/test_generate_endpoint.py
+++ b/scripts/test_generate_endpoint.py
@@ -1,4 +1,4 @@
-"""Test script for the /api/v1/generate endpoint.
+"""Test script for the /api/v1/batch endpoint.
 
 Usage:
     python test_generate_endpoint.py <test_name>
@@ -98,7 +98,7 @@ def upload_video_for_v2v(path: str, height: int, width: int) -> str:
     num_frames, h, w, c = arr.shape
 
     response = requests.post(
-        f"{SERVER_URL}/api/v1/generate/upload",
+        f"{SERVER_URL}/api/v1/batch/upload",
         data=arr.tobytes(),
         headers={
             "Content-Type": "application/octet-stream",
@@ -163,7 +163,7 @@ def upload_vace_data(
 
     # Upload blob
     response = requests.post(
-        f"{SERVER_URL}/api/v1/generate/upload-data",
+        f"{SERVER_URL}/api/v1/batch/upload-data",
         data=bytes(blob),
         headers={"Content-Type": "application/octet-stream"},
         timeout=300,
@@ -214,7 +214,7 @@ def wait_for_pipeline(timeout: int = 300):
 def download_video(output_path: str) -> np.ndarray:
     """Download generated video from server."""
     response = requests.get(
-        f"{SERVER_URL}/api/v1/generate/download",
+        f"{SERVER_URL}/api/v1/batch/download",
         params={"path": output_path},
         timeout=300,
     )
@@ -346,7 +346,7 @@ def run_test(name: str):
     start = time.time()
 
     with requests.post(
-        f"{SERVER_URL}/api/v1/generate",
+        f"{SERVER_URL}/api/v1/batch",
         json=gen_request.model_dump(exclude_none=True),
         stream=True,
         headers={"Accept": "text/event-stream"},
diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index 10fe79c39..272852f84 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -45,7 +45,6 @@
     VIDEO_EXTENSIONS,
     iter_files,
 )
-from .generate import generate_video_stream
 from .kafka_publisher import (
     KafkaPublisher,
     is_kafka_enabled,
@@ -1128,15 +1127,15 @@ def download_in_background():
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
-@app.post("/api/v1/generate")
-async def generate_video(
+@app.post("/api/v1/batch")
+async def batch_video(
     request: "GenerateRequest",
     pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager),
 ):
     """Generate video frames in batch mode with SSE progress streaming."""
-    from .generate import is_generation_active
+    from .batch import batch_video_stream, is_batch_active
 
-    if is_generation_active():
+    if is_batch_active():
         raise HTTPException(
             status_code=409,
             detail="A generation is already in progress. Cancel it first or wait for completion.",
@@ -1150,7 +1149,7 @@ async def generate_video(
         )
 
     return StreamingResponse(
-        generate_video_stream(request, pipeline_manager, status_info, logger),
+        batch_video_stream(request, pipeline_manager, status_info, logger),
         media_type="text/event-stream",
         headers={
             "Cache-Control": "no-cache",
@@ -1160,17 +1159,17 @@ async def generate_video(
     )
 
 
-@app.post("/api/v1/generate/cancel")
-async def cancel_generate():
+@app.post("/api/v1/batch/cancel")
+async def cancel_batch():
     """Cancel the current video generation after the current chunk completes."""
-    from .generate import cancel_generation
+    from .batch import cancel_batch as _cancel_batch
 
-    cancel_generation()
+    _cancel_batch()
     return {"status": "cancelling"}
 
 
-@app.post("/api/v1/generate/upload")
-async def upload_video_for_generate(request: Request):
+@app.post("/api/v1/batch/upload")
+async def upload_video_for_batch(request: Request):
     """Upload a video for batch generation (file-based transfer for large videos).
 
     Accepts raw binary video data with metadata headers:
@@ -1181,7 +1180,7 @@ async def upload_video_for_generate(request: Request):
 
     Video data should be raw uint8 bytes in THWC order.
 
-    Returns input_path to use in the generate request.
+    Returns input_path to use in the batch request.
     """
     from .recording import TEMP_FILE_PREFIXES, RecordingManager
     from .schema import VideoUploadResponse
@@ -1204,7 +1203,7 @@ async def upload_video_for_generate(request: Request):
 
         # Create temp file (reuse recording pattern)
         file_path = RecordingManager._create_temp_file(
-            ".bin", TEMP_FILE_PREFIXES["generate_input"]
+            ".bin", TEMP_FILE_PREFIXES["batch_input"]
         )
 
         # Stream body to file
@@ -1242,14 +1241,14 @@ async def upload_video_for_generate(request: Request):
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
-@app.post("/api/v1/generate/upload-data")
+@app.post("/api/v1/batch/upload-data")
 async def upload_data_blob(request: Request):
     """Upload binary data blob for batch generation.
 
     Accepts raw binary data containing VACE frames/masks, input video, or other
     array data referenced by ChunkSpec offsets in the generate request.
 
-    Returns data_blob_path to use in the generate request.
+    Returns data_blob_path to use in the batch request.
     """
 
     from .recording import TEMP_FILE_PREFIXES, RecordingManager
@@ -1258,10 +1257,10 @@ async def upload_data_blob(request: Request):
     try:
         # Create temp file
         file_path = RecordingManager._create_temp_file(
-            ".bin", TEMP_FILE_PREFIXES["generate_data"]
+            ".bin", TEMP_FILE_PREFIXES["batch_data"]
         )
 
-        from .generate import MAX_DATA_BLOB_BYTES
+        from .batch import MAX_DATA_BLOB_BYTES
 
         # Stream body to file with size limit
         bytes_written = 0
@@ -1295,7 +1294,7 @@ async def upload_data_blob(request: Request):
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
-@app.get("/api/v1/generate/download")
+@app.get("/api/v1/batch/download")
 async def download_generated_video(
     path: str = Query(..., description="Path to output video file"),
     background_tasks: BackgroundTasks = None,
@@ -1321,7 +1320,7 @@ async def download_generated_video(
         temp_dir = Path(tempfile.gettempdir())
         if not file_path.is_relative_to(temp_dir):
             raise HTTPException(status_code=403, detail="Invalid file path")
-        if not file_path.name.startswith(TEMP_FILE_PREFIXES["generate_output"]):
+        if not file_path.name.startswith(TEMP_FILE_PREFIXES["batch_output"]):
             raise HTTPException(status_code=403, detail="Invalid file path")
 
         if not file_path.exists():
diff --git a/src/scope/server/generate.py b/src/scope/server/batch.py
similarity index 97%
rename from src/scope/server/generate.py
rename to src/scope/server/batch.py
index d332ff715..a5cfbd802 100644
--- a/src/scope/server/generate.py
+++ b/src/scope/server/batch.py
@@ -17,25 +17,25 @@
 _cancel_event = threading.Event()
 
 # Generation lock (single-client: only one generation at a time)
-_generation_lock = threading.Lock()
+_batch_lock = threading.Lock()
 
 # Max data blob upload size (2 GB)
 MAX_DATA_BLOB_BYTES = 2 * 1024 * 1024 * 1024
 
 
-def cancel_generation():
+def cancel_batch():
     """Signal the current generation to stop after the current chunk."""
     _cancel_event.set()
 
 
-def is_generation_cancelled() -> bool:
+def is_batch_cancelled() -> bool:
     """Check if cancellation has been requested."""
     return _cancel_event.is_set()
 
 
-def is_generation_active() -> bool:
+def is_batch_active() -> bool:
     """Check if a generation is currently in progress."""
-    return _generation_lock.locked()
+    return _batch_lock.locked()
 
 
 # Defaults
@@ -113,7 +113,7 @@ class DecodedInputs:
 
 
 @dataclass
-class GenerationState:
+class BatchState:
     """Mutable state accumulated during chunk-by-chunk generation."""
 
     output_file: IO[bytes]
@@ -234,10 +234,10 @@ def decode_inputs(
         blob_path = Path(request.data_blob_path)
         temp_dir = Path(tempfile.gettempdir())
         if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith(
-            TEMP_FILE_PREFIXES["generate_data"]
+            TEMP_FILE_PREFIXES["batch_data"]
         ):
             raise ValueError(
-                f"Invalid data_blob_path: must be a temp file with prefix {TEMP_FILE_PREFIXES['generate_data']}"
+                f"Invalid data_blob_path: must be a temp file with prefix {TEMP_FILE_PREFIXES['batch_data']}"
             )
         with open(blob_path, "rb") as f:
             blob = f.read()
@@ -522,7 +522,7 @@ def _log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logg
 # ---------------------------------------------------------------------------
 
 
-def _generate_chunks(
+def _batch_chunks(
     request: "GenerateRequest",
     pipeline,
     pipeline_manager: "PipelineManager",
@@ -532,7 +532,7 @@ def _generate_chunks(
     status_info: dict,
     device: torch.device,
     dtype: torch.dtype,
-    state: GenerationState,
+    state: BatchState,
     logger: "Logger",
 ) -> Iterator[str]:
     """Process chunks through a processor chain, yielding SSE events.
@@ -665,7 +665,7 @@ def _generate_chunks(
 # ---------------------------------------------------------------------------
 
 
-def generate_video_stream(
+def batch_video_stream(
     request: "GenerateRequest",
     pipeline_manager: "PipelineManager",
     status_info: dict,
@@ -676,7 +676,7 @@ def generate_video_stream(
     Writes output to temp file incrementally, returns output_path for download.
     Only one generation can run at a time (single-client).
     """
-    if not _generation_lock.acquire(blocking=False):
+    if not _batch_lock.acquire(blocking=False):
         yield sse_event("error", {"error": "A generation is already in progress"})
         return
 
@@ -702,7 +702,7 @@ def generate_video_stream(
         from .recording import TEMP_FILE_PREFIXES, RecordingManager
 
         output_file_path = RecordingManager._create_temp_file(
-            ".bin", TEMP_FILE_PREFIXES["generate_output"]
+            ".bin", TEMP_FILE_PREFIXES["batch_output"]
         )
         output_file = open(output_file_path, "wb")
 
@@ -710,12 +710,12 @@ def generate_video_stream(
         header_size = 4 + 4 * 4
         output_file.write(b"\x00" * header_size)
 
-        state = GenerationState(
+        state = BatchState(
             output_file=output_file, num_chunks=num_chunks, logger=logger
         )
 
         try:
-            yield from _generate_chunks(
+            yield from _batch_chunks(
                 request,
                 pipeline,
                 pipeline_manager,
@@ -777,4 +777,4 @@ def generate_video_stream(
             except Exception as e:
                 logger.warning(f"Failed to clean up output file: {e}")
 
-        _generation_lock.release()
+        _batch_lock.release()
diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py
index 280ba9fe1..568239314 100644
--- a/src/scope/server/recording.py
+++ b/src/scope/server/recording.py
@@ -17,9 +17,9 @@
 TEMP_FILE_PREFIXES = {
     "recording": "scope_recording_",
     "download": "scope_download_",
-    "generate_input": "scope_gen_input_",
-    "generate_output": "scope_gen_output_",
-    "generate_data": "scope_gen_data_",
+    "batch_input": "scope_gen_input_",
+    "batch_output": "scope_gen_output_",
+    "batch_data": "scope_gen_data_",
 }
 
 # Environment variables
@@ -440,9 +440,9 @@ def cleanup_recording_files():
     patterns = [
         f"{TEMP_FILE_PREFIXES['recording']}*.mp4",
         f"{TEMP_FILE_PREFIXES['download']}*.mp4",
-        f"{TEMP_FILE_PREFIXES['generate_input']}*.bin",
-        f"{TEMP_FILE_PREFIXES['generate_output']}*.bin",
-        f"{TEMP_FILE_PREFIXES['generate_data']}*.bin",
+        f"{TEMP_FILE_PREFIXES['batch_input']}*.bin",
+        f"{TEMP_FILE_PREFIXES['batch_output']}*.bin",
+        f"{TEMP_FILE_PREFIXES['batch_data']}*.bin",
     ]
 
     deleted_count = 0