From ce1ed48a77435c41209a524f61826ffd5ffb5507 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:16:06 -0500 Subject: [PATCH 01/16] feat: generate endpoint with SSE streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Add batch video generation endpoint with SSE streaming ## Summary Adds `/api/v1/generate` endpoint for batch video generation with server-side chunking and SSE progress streaming. Supports text-to-video, video-to-video, VACE conditioning, and comprehensive per-chunk parameter scheduling. This is important for the ComfyUI node wrapper for Scope. It could also conceivably replace test.py/test_vace.py, or at least their boilerplate code. ## Changes - **`schema.py`**: Add `GenerateRequest`/`GenerateResponse` models with `EncodedArray` for binary data - **`generate.py`**: New module handling chunked generation with SSE progress events - **`app.py`**: Wire up the endpoint - **`test_generate_endpoint.py`**: Integration tests for v2v, depth, inpainting, LoRA ramps - **ComfyUI nodes**: Update `ScopeSampler` to use new schema ## Features ### Generation modes - **Text-to-video**: Generate from prompt alone - **Video-to-video**: Transform input video with configurable noise scale ### VACE conditioning - **Reference images**: Style/identity conditioning via image paths - **Depth/structure guidance**: Pass conditioning frames for structural control - **Inpainting**: Binary masks specify regions to regenerate vs preserve ### Per-chunk parameter scheduling All scheduling parameters accept either a single value (applied to all chunks) or a list (applied per-chunk, last value repeats if list is shorter than chunk count). 
| Parameter | Type | Description | |-----------|------|-------------| | `seed` | `int \| list[int]` | Random seed per chunk | | `noise_scale` | `float \| list[float]` | V2V noise injection strength | | `vace_context_scale` | `float \| list[float]` | VACE conditioning influence | | `lora_scales` | `dict[str, float \| list[float]]` | Per-LoRA strength scheduling | ### Sparse keyframe updates These parameters use a chunk-indexed specification, only sending updates when values change (sticky behavior). | Parameter | Type | Description | |-----------|------|-------------| | `chunk_prompts` | `list[{chunk, text}]` | Prompt changes at specific chunks | | `first_frames` | `list[{chunk, image}]` | First frame anchors for extension mode | | `last_frames` | `list[{chunk, image}]` | Last frame anchors for extension mode | | `vace_ref_images` | `list[{chunk, images}]` | Reference images at specific chunks | ## Design decisions Some features were left out of this PR for simplicity (e.g., prompt spatial/temporal blending). They can be added in a follow-up PR. ### SSE streaming Clients, like test files or ComfyUI nodes, need performance and progress updates. SSE provides per-chunk progress updates without requiring WebSocket infrastructure: ``` event: progress data: {"chunk": 1, "total_chunks": 8, "fps": 4.2, "latency": 2.85} event: progress data: {"chunk": 2, "total_chunks": 8, "fps": 4.5, "latency": 2.67} event: complete data: {"video_base64": "...", "video_shape": [96, 320, 576, 3], ...} ``` ### Server-side chunking The server determines chunk size from the pipeline, handles frame padding, and manages KV cache initialization. Callers specify total frames and per-chunk parameters—the server handles the rest. 
## Example usage ### LoRA strength ramp (dissolve effect) ```python request = GenerateRequest( pipeline_id="longlive", prompt="a woman dissolving into particles", num_frames=96, # 8 chunks × 12 frames lora_scales={ "path/to/dissolve.safetensors": [0.0, 0.15, 0.3, 0.5, 0.7, 0.85, 1.0, 1.0] }, ) ``` ### Video-to-video with prompt changes ```python request = GenerateRequest( pipeline_id="longlive", prompt="a cat sitting calmly", chunk_prompts=[ {"chunk": 3, "text": "a cat jumping"}, {"chunk": 6, "text": "a cat landing gracefully"}, ], input_video=EncodedArray(base64="...", shape=[96, 512, 512, 3]), noise_scale=0.6, ) ``` ### Depth-guided generation ```python request = GenerateRequest( pipeline_id="longlive", prompt="a robot walking through a forest", vace_frames=EncodedArray(base64="...", shape=[1, 3, 48, 320, 576]), vace_context_scale=1.5, ) ``` ## Test plan - [x] `uv run daydream-scope` starts without errors - [x] V2V generation produces correct output - [x] VACE depth conditioning works - [x] VACE inpainting with masks works - [x] LoRA scale ramping works across chunks - [x] Per-chunk noise scale scheduling works - [x] Prompt keyframing updates at correct chunks - [x] ComfyUI ScopeSampler node works (WIP) - [x] Test with Longlive - [x] Same test with StreamDiffusionv2 Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/app.py | 26 +++ src/scope/server/generate.py | 324 ++++++++++++++++++++++++++++++++ src/scope/server/schema.py | 123 ++++++++++++ tests/test_generate_endpoint.py | 301 +++++++++++++++++++++++++++++ 4 files changed, 774 insertions(+) create mode 100644 src/scope/server/generate.py create mode 100644 tests/test_generate_endpoint.py diff --git a/src/scope/server/app.py b/src/scope/server/app.py index ac8080f58..c5a79d485 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -50,6 +50,7 @@ is_kafka_enabled, set_kafka_publisher, ) +from .generate import generate_video_stream from 
.logs_config import ( cleanup_old_logs, ensure_logs_dir, @@ -78,6 +79,7 @@ AssetsResponse, CloudConnectRequest, CloudStatusResponse, + GenerateRequest, HardwareInfoResponse, HealthResponse, IceCandidateRequest, @@ -1126,6 +1128,30 @@ def download_in_background(): raise HTTPException(status_code=500, detail=str(e)) from e +@app.post("/api/v1/generate") +async def generate_video( + request: "GenerateRequest", + pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager), +): + """Generate video frames in batch mode with SSE progress streaming.""" + status_info = await pipeline_manager.get_status_info_async() + if status_info["status"] != "loaded": + raise HTTPException( + status_code=400, + detail="Pipeline not loaded. Please load pipeline first.", + ) + + return StreamingResponse( + generate_video_stream(request, pipeline_manager, status_info, logger), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + def is_spout_available() -> bool: """Check if Spout is available (native Windows only, not WSL).""" return sys.platform == "win32" diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py new file mode 100644 index 000000000..9ce59d808 --- /dev/null +++ b/src/scope/server/generate.py @@ -0,0 +1,324 @@ +"""Video generation service for batch mode with chunked processing.""" + +import base64 +import gc +import json +import time +from collections.abc import Iterator +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING + +import numpy as np +import torch + +# Defaults +DEFAULT_HEIGHT = 320 +DEFAULT_WIDTH = 576 +DEFAULT_CHUNK_SIZE = 12 +DEFAULT_SEED = 42 +DEFAULT_NOISE_SCALE = 0.7 +PROMPT_WEIGHT = 100 + +if TYPE_CHECKING: + from logging import Logger + + from .pipeline_manager import PipelineManager + from .schema import EncodedArray, GenerateRequest + + +def decode_array(encoded: "EncodedArray", dtype: 
np.dtype) -> np.ndarray: + """Decode EncodedArray to numpy array.""" + data = base64.b64decode(encoded.base64) + return np.frombuffer(data, dtype=dtype).reshape(encoded.shape) + + +def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray: + """Tile array along axis to reach target length.""" + current = arr.shape[axis] + if current >= target: + return arr + repeats = (target + current - 1) // current + tiled = np.concatenate([arr] * repeats, axis=axis) + slices = [slice(None)] * arr.ndim + slices[axis] = slice(0, target) + return tiled[tuple(slices)] + + +def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray: + """Pad array with last frame along axis to reach target size.""" + current = arr.shape[axis] + if current >= target_size: + return arr + slices = [slice(None)] * arr.ndim + slices[axis] = slice(-1, None) + last_frame = arr[tuple(slices)] + padding = np.repeat(last_frame, target_size - current, axis=axis) + return np.concatenate([arr, padding], axis=axis) + + +def build_lookup(specs: list | None, value_attr: str = "image") -> dict: + """Build chunk -> value lookup from list of specs.""" + if not specs: + return {} + return {spec.chunk: getattr(spec, value_attr) for spec in specs} + + +def get_chunk_value(value, chunk_idx: int, default=None): + """Get per-chunk value from scalar or list.""" + if value is None: + return default + if isinstance(value, list): + return value[chunk_idx] if chunk_idx < len(value) else value[-1] + return value + + +def sse_event(event_type: str, data: dict) -> str: + """Format a server-sent event.""" + return f"event: {event_type}\ndata: {json.dumps(data)}\n\n" + + +@dataclass +class DecodedInputs: + """Decoded and preprocessed inputs for generation.""" + + input_video: np.ndarray | None = None + vace_frames: np.ndarray | None = None + vace_masks: np.ndarray | None = None + first_frames: dict[int, str] = field(default_factory=dict) + last_frames: dict[int, str] = field(default_factory=dict) + 
ref_images: dict[int, list[str]] = field(default_factory=dict) + prompts: dict[int, str] = field(default_factory=dict) + + +def decode_inputs(request: "GenerateRequest", num_frames: int) -> DecodedInputs: + """Decode all base64 inputs from request.""" + inputs = DecodedInputs() + + if request.input_video: + inputs.input_video = decode_array(request.input_video, np.uint8) + inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0) + + if request.vace_frames: + inputs.vace_frames = decode_array(request.vace_frames, np.float32) + inputs.vace_frames = loop_to_length(inputs.vace_frames, num_frames, axis=2) + + if request.vace_masks: + inputs.vace_masks = decode_array(request.vace_masks, np.float32) + inputs.vace_masks = loop_to_length(inputs.vace_masks, num_frames, axis=2) + + inputs.first_frames = build_lookup(request.first_frames, "image") + inputs.last_frames = build_lookup(request.last_frames, "image") + inputs.ref_images = build_lookup(request.vace_ref_images, "images") + inputs.prompts = {0: request.prompt} + inputs.prompts.update(build_lookup(request.chunk_prompts, "text")) + + return inputs + + +def build_chunk_kwargs( + request: "GenerateRequest", + inputs: DecodedInputs, + chunk_idx: int, + chunk_size: int, + start_frame: int, + end_frame: int, + status_info: dict, + device: torch.device, + dtype: torch.dtype, + logger: "Logger", +) -> dict: + """Build pipeline kwargs for a single chunk.""" + kwargs = { + "height": request.height + or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT), + "width": request.width + or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH), + "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED), + "init_cache": chunk_idx == 0, + "manage_cache": request.manage_cache, + } + + # Prompt (sticky behavior - only send when it changes) + if chunk_idx in inputs.prompts: + kwargs["prompts"] = [ + {"text": inputs.prompts[chunk_idx], "weight": PROMPT_WEIGHT} + ] + + if 
request.denoising_steps: + kwargs["denoising_step_list"] = request.denoising_steps + + # Video-to-video + if inputs.input_video is not None: + chunk_frames = inputs.input_video[start_frame:end_frame] + chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0) + kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames] + kwargs["noise_scale"] = get_chunk_value( + request.noise_scale, chunk_idx, DEFAULT_NOISE_SCALE + ) + else: + kwargs["num_frames"] = chunk_size + + # VACE context scale + kwargs["vace_context_scale"] = get_chunk_value( + request.vace_context_scale, chunk_idx, 1.0 + ) + + # LoRA scales + if request.lora_scales: + lora_scale_updates = [] + for path, scale_value in request.lora_scales.items(): + scale = get_chunk_value(scale_value, chunk_idx, 1.0) + lora_scale_updates.append({"path": path, "scale": scale}) + logger.info( + f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}" + ) + if lora_scale_updates: + kwargs["lora_scales"] = lora_scale_updates + + # Keyframes + if chunk_idx in inputs.first_frames: + kwargs["first_frame_image"] = inputs.first_frames[chunk_idx] + kwargs["extension_mode"] = ( + "firstlastframe" if chunk_idx in inputs.last_frames else "firstframe" + ) + + if chunk_idx in inputs.last_frames: + kwargs["last_frame_image"] = inputs.last_frames[chunk_idx] + if chunk_idx not in inputs.first_frames: + kwargs["extension_mode"] = "lastframe" + + if chunk_idx in inputs.ref_images: + kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx] + + # VACE conditioning frames [1, C, T, H, W] + if inputs.vace_frames is not None: + chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :] + chunk = pad_chunk(chunk, chunk_size, axis=2) + kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype) + + # VACE masks [1, 1, T, H, W] + if inputs.vace_masks is not None: + chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :] + chunk = pad_chunk(chunk, chunk_size, axis=2) + kwargs["vace_input_masks"] = 
torch.from_numpy(chunk).to(device, dtype) + + return kwargs + + +def generate_video_stream( + request: "GenerateRequest", + pipeline_manager: "PipelineManager", + status_info: dict, + logger: "Logger", +) -> Iterator[str]: + """Generate video frames, yielding SSE events.""" + try: + pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id) + + # Determine chunk size from pipeline + has_video = request.input_video is not None + requirements = pipeline.prepare(video=[] if has_video else None) + chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE + num_chunks = (request.num_frames + chunk_size - 1) // chunk_size + + # Decode inputs + inputs = decode_inputs(request, request.num_frames) + + # Setup + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dtype = torch.bfloat16 + output_chunks = [] + latency_measures = [] + fps_measures = [] + + for chunk_idx in range(num_chunks): + start_frame = chunk_idx * chunk_size + end_frame = min(start_frame + chunk_size, request.num_frames) + actual_frames = end_frame - start_frame + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + kwargs = build_chunk_kwargs( + request, + inputs, + chunk_idx, + chunk_size, + start_frame, + end_frame, + status_info, + device, + dtype, + logger, + ) + + # Run pipeline + chunk_start = time.time() + with torch.amp.autocast("cuda", dtype=dtype): + result = pipeline(**kwargs) + chunk_latency = time.time() - chunk_start + + chunk_output = result["video"] + num_output_frames = chunk_output.shape[0] + chunk_fps = num_output_frames / chunk_latency + + latency_measures.append(chunk_latency) + fps_measures.append(chunk_fps) + + logger.info( + f"Chunk {chunk_idx + 1}/{num_chunks}: " + f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" + ) + + # Trim padding from output + if chunk_output.shape[0] > actual_frames: + chunk_output = chunk_output[:actual_frames] + + 
output_chunks.append(chunk_output.detach().cpu()) + + yield sse_event( + "progress", + { + "chunk": chunk_idx + 1, + "total_chunks": num_chunks, + "frames": num_output_frames, + "latency": round(chunk_latency, 3), + "fps": round(chunk_fps, 2), + }, + ) + + # Concatenate and encode output + output_video = torch.cat(output_chunks, dim=0) + output_np = output_video.numpy() + + # Log performance summary + if latency_measures: + avg_latency = sum(latency_measures) / len(latency_measures) + avg_fps = sum(fps_measures) / len(fps_measures) + logger.info( + f"=== Performance Summary ({num_chunks} chunks) ===\n" + f" Latency - Avg: {avg_latency:.2f}s, " + f"Max: {max(latency_measures):.2f}s, Min: {min(latency_measures):.2f}s\n" + f" FPS - Avg: {avg_fps:.2f}, " + f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}" + ) + + video_bytes = output_np.astype(np.float32).tobytes() + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + + yield sse_event( + "complete", + { + "video_base64": video_base64, + "video_shape": list(output_np.shape), + "num_frames": output_np.shape[0], + "num_chunks": num_chunks, + "chunk_size": chunk_size, + }, + ) + + except Exception as e: + logger.exception("Error generating video") + yield sse_event("error", {"error": str(e)}) diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index c50127aab..51c201345 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -816,3 +816,126 @@ class ApiKeySetResponse(BaseModel): class ApiKeyDeleteResponse(BaseModel): success: bool message: str +class ChunkFrameSpec(BaseModel): + """Specification for a frame image at a specific chunk.""" + + chunk: int = Field(..., ge=0, description="Chunk index") + image: str = Field(..., description="Path to image file") + + +class ChunkPromptSpec(BaseModel): + """Specification for a prompt at a specific chunk.""" + + chunk: int = Field(..., ge=0, description="Chunk index") + text: str = Field(..., description="Prompt text for 
this chunk") + + +class ChunkRefImagesSpec(BaseModel): + """Specification for reference images at a specific chunk.""" + + chunk: int = Field(default=0, ge=0, description="Chunk index (default: 0)") + images: list[str] = Field(..., description="List of reference image paths") + + +class EncodedArray(BaseModel): + """Base64-encoded numpy array with shape metadata.""" + + base64: str = Field(..., description="Base64-encoded numpy array bytes") + shape: list[int] = Field(..., description="Array shape for decoding") + + +class GenerateRequest(BaseModel): + """Request for batch video generation.""" + + pipeline_id: str = Field(..., description="Pipeline ID to use for generation") + prompt: str = Field(..., description="Text prompt for generation (sent on chunk 0)") + chunk_prompts: list[ChunkPromptSpec] | None = Field( + default=None, + description="Prompt changes at later chunks (sticky behavior). The prompt persists until the next specified chunk.", + ) + num_frames: int = Field( + default=64, + ge=1, + le=1024, + description="Total number of frames to generate", + ) + height: int | None = Field( + default=None, + ge=64, + le=2048, + description="Output height (defaults to pipeline's native resolution)", + ) + width: int | None = Field( + default=None, + ge=64, + le=2048, + description="Output width (defaults to pipeline's native resolution)", + ) + seed: int | list[int] = Field( + default=42, + description="Random seed. Single int applies to all chunks; list applies per-chunk.", + ) + # Video-to-video input (optional) + input_video: EncodedArray | None = Field( + default=None, + description="Input video frames (THWC, uint8). If provided, enables video-to-video mode.", + ) + noise_scale: float | list[float] = Field( + default=0.7, + description="Noise scale for video-to-video mode. 
Single float applies to all chunks; list applies per-chunk.", + ) + denoising_steps: list[int] | None = Field( + default=None, + description="Denoising timesteps (e.g., [1000, 750, 500, 250])", + ) + manage_cache: bool = Field( + default=True, + description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).", + ) + # Per-chunk parameters + lora_scales: dict[str, float | list[float]] | None = Field( + default=None, + description="LoRA scales by path. Single float applies to all chunks; list applies per-chunk. Example: {'path/to/lora.pt': 0.8} or {'path/to/lora.pt': [0.5, 0.7, 0.9]}", + ) + vace_context_scale: float | list[float] = Field( + default=1.0, + description="VACE context scale. Single float applies to all chunks; list applies per-chunk.", + ) + # Keyframe specifications (chunk, image) pairs + first_frames: list[ChunkFrameSpec] | None = Field( + default=None, + description="First frame anchors. Each specifies a chunk index and image path to use as that chunk's first frame.", + ) + last_frames: list[ChunkFrameSpec] | None = Field( + default=None, + description="Last frame anchors. Each specifies a chunk index and image path to use as that chunk's last frame.", + ) + vace_ref_images: list[ChunkRefImagesSpec] | None = Field( + default=None, + description="Reference images for VACE conditioning. Each specifies a chunk index and list of image paths.", + ) + # VACE conditioning frames/masks (for depth guidance, inpainting, etc.) + vace_frames: EncodedArray | None = Field( + default=None, + description="VACE conditioning frames ([1, C, T, H, W] float32 [-1, 1]). Used for depth guidance, structural control, etc.", + ) + vace_masks: EncodedArray | None = Field( + default=None, + description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). 
Used for inpainting (1 = regenerate, 0 = keep).", + ) + + +class GenerateResponse(BaseModel): + """Response from batch video generation.""" + + video_base64: str = Field( + ..., + description="Base64-encoded output video frames as numpy array bytes (THWC, float32, [0,1] range)", + ) + video_shape: list[int] = Field( + ..., + description="Shape of output video [T, H, W, C]", + ) + num_frames: int = Field(..., description="Number of frames generated") + num_chunks: int = Field(..., description="Number of chunks processed") + chunk_size: int = Field(..., description="Frames per chunk") diff --git a/tests/test_generate_endpoint.py b/tests/test_generate_endpoint.py new file mode 100644 index 000000000..327db105b --- /dev/null +++ b/tests/test_generate_endpoint.py @@ -0,0 +1,301 @@ +"""Test script for the /api/v1/generate endpoint. + +Usage: + python test_generate_endpoint.py + python test_generate_endpoint.py --list +""" + +import base64 +import json +import sys +import time + +import numpy as np +import requests +from diffusers.utils import export_to_video + +from scope.core.pipelines.video import load_video +from scope.server.schema import ( + GenerateRequest, + LongLiveLoadParams, + LoRAConfig, + LoRAMergeMode, + PipelineLoadRequest, + PipelineStatusResponse, +) + +# ============================================================================= +# Configuration +# ============================================================================= + +SERVER_URL = "http://localhost:8000" +DEFAULT_PIPELINE = "longlive" + +# Asset paths (tests skip gracefully if missing) +LORA = "path/to/a/lora.safetensors" +TEST_VIDEO = "path/to/test_video.mp4" +VACE_CONDITIONING_VIDEO = "path/to/depth_video.mp4" +MASK_VIDEO = "path/to/mask_video.mp4" + +# ============================================================================= +# Test Definitions +# ============================================================================= + +TESTS = { + "lora": { + "description": "LoRA strength 
ramping over chunks", + "pipeline": "longlive", + "resolution": (576, 320), + "num_frames": 96, + "prompt": "a woman dissolving into particles, ethereal, magical transformation", + "lora": LORA, + "lora_ramp": [0.0, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.0], + "manage_cache": False, + }, + "v2v": { + "description": "Video-to-video transformation", + "resolution": (512, 512), + "num_frames": 48, + "prompt": "A 3D animated scene. A **panda** sitting in the grass, looking around.", + "input_video": TEST_VIDEO, + "noise_scale": 0.6, + }, + "v2v_lora": { + "description": "Video-to-video with LoRA ramp (0 -> 1.5 -> 0)", + "resolution": (512, 512), + "num_frames": 120, + "prompt": "a woman made of ral-dissolve, dissolving into particles", + "input_video": TEST_VIDEO, + "noise_scale": 0.7, + "lora": LORA, + "lora_ramp": [0.0, 0.3, 0.6, 1.0, 1.5, 1.5, 1.0, 0.6, 0.3, 0.0], + }, + "vace_conditioning": { + "description": "VACE structural conditioning (depth, pose, etc.)", + "resolution": (576, 320), + "num_frames": 48, + "prompt": "a cat walking towards the camera", + "vace_frames": VACE_CONDITIONING_VIDEO, + "vace_context_scale": 1.5, + }, + "inpainting": { + "description": "VACE inpainting with mask", + "resolution": (512, 512), + "num_frames": 48, + "prompt": "fireball doom flames", + "vace_frames": TEST_VIDEO, + "vace_masks": MASK_VIDEO, + }, +} + +# ============================================================================= +# Helpers +# ============================================================================= + + +def encode_array(arr: np.ndarray) -> dict: + """Encode numpy array as EncodedArray dict.""" + return { + "base64": base64.b64encode(arr.tobytes()).decode("utf-8"), + "shape": list(arr.shape), + } + + +def load_video_for_v2v(path: str, height: int, width: int) -> dict: + """Load video as [T, H, W, C] uint8 for video-to-video mode.""" + tensor = load_video(path, resize_hw=(height, width), normalize=False) + arr = tensor.permute(1, 2, 3, 
0).numpy().astype(np.uint8) + return encode_array(arr) + + +def load_video_for_vace(path: str, height: int, width: int) -> dict: + """Load video as [1, C, T, H, W] float32 for VACE conditioning.""" + tensor = load_video(path, resize_hw=(height, width)) + arr = tensor.unsqueeze(0).numpy().astype(np.float32) + return encode_array(arr) + + +def load_mask_for_vace(path: str, height: int, width: int) -> dict: + """Load video as [1, 1, T, H, W] binary mask for VACE inpainting.""" + tensor = load_video(path, resize_hw=(height, width)) + arr = (tensor[0:1].unsqueeze(0).numpy() > 0.0).astype(np.float32) + return encode_array(arr) + + +def parse_sse_events(response): + """Parse SSE events using iter_content (handles large payloads).""" + buffer = "" + event_type = None + data_lines = [] + + for chunk in response.iter_content(chunk_size=None, decode_unicode=True): + buffer += chunk + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.rstrip("\r") + + if line.startswith("event:"): + event_type = line[6:].strip() + elif line.startswith("data:"): + data_lines.append(line[5:].strip()) + elif line == "": + if data_lines: + yield (event_type or "message", json.loads("\n".join(data_lines))) + event_type = None + data_lines = [] + + +def wait_for_pipeline(timeout: int = 300): + """Wait for pipeline to finish loading.""" + start = time.time() + while time.time() - start < timeout: + resp = requests.get(f"{SERVER_URL}/api/v1/pipeline/status") + status = PipelineStatusResponse.model_validate(resp.json()) + if status.status.value == "loaded": + return time.time() - start + if status.status.value == "error": + raise RuntimeError(f"Pipeline failed: {status.error}") + time.sleep(1) + raise TimeoutError(f"Pipeline did not load within {timeout}s") + + +# ============================================================================= +# Test Runner +# ============================================================================= + + +def run_test(name: str): + """Run a 
single test by name.""" + if name not in TESTS: + print(f"Unknown test: {name}") + print(f"Available: {', '.join(TESTS.keys())}") + return + + cfg = TESTS[name] + width, height = cfg.get("resolution", (576, 320)) + pipeline_id = cfg.get("pipeline", DEFAULT_PIPELINE) + + print(f"\n{'=' * 60}") + print(f"Test: {name}") + print(f"Description: {cfg['description']}") + print(f"{'=' * 60}") + + # Build LoRA config if specified + loras = None + lora_scales = None + if "lora" in cfg: + loras = [ + LoRAConfig( + path=cfg["lora"], scale=0.0, merge_mode=LoRAMergeMode.RUNTIME_PEFT + ) + ] + if "lora_ramp" in cfg: + lora_scales = {cfg["lora"]: cfg["lora_ramp"]} + print(f"LoRA ramp: {cfg['lora_ramp']}") + + # Load pipeline + print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...") + request = PipelineLoadRequest( + pipeline_ids=[pipeline_id], + load_params=LongLiveLoadParams( + height=height, + width=width, + loras=loras, + lora_merge_mode=LoRAMergeMode.RUNTIME_PEFT + if loras + else LoRAMergeMode.PERMANENT_MERGE, + ), + ) + requests.post( + f"{SERVER_URL}/api/v1/pipeline/load", json=request.model_dump(mode="json") + ).raise_for_status() + load_time = wait_for_pipeline() + print(f"Pipeline loaded in {load_time:.1f}s") + + # Load input video if specified + input_video = None + if "input_video" in cfg: + input_video = load_video_for_v2v(cfg["input_video"], height, width) + print(f"Input video: {input_video['shape']}") + + # Load VACE frames if specified + vace_frames = None + if "vace_frames" in cfg: + vace_frames = load_video_for_vace(cfg["vace_frames"], height, width) + print(f"VACE frames: {vace_frames['shape']}") + + # Load VACE masks if specified + vace_masks = None + if "vace_masks" in cfg: + vace_masks = load_mask_for_vace(cfg["vace_masks"], height, width) + print(f"VACE masks: {vace_masks['shape']}") + + # Build and send request + gen_request = GenerateRequest( + pipeline_id=pipeline_id, + prompt=cfg["prompt"], + num_frames=cfg["num_frames"], + 
input_video=input_video, + noise_scale=cfg.get("noise_scale", 0.7), + vace_frames=vace_frames, + vace_masks=vace_masks, + vace_context_scale=cfg.get("vace_context_scale", 1.0), + lora_scales=lora_scales, + manage_cache=cfg.get("manage_cache", True), + ) + + print(f"Generating {cfg['num_frames']} frames...") + start = time.time() + + with requests.post( + f"{SERVER_URL}/api/v1/generate", + json=gen_request.model_dump(exclude_none=True), + stream=True, + headers={"Accept": "text/event-stream"}, + ) as resp: + resp.raise_for_status() + result = None + for event_type, data in parse_sse_events(resp): + if event_type == "progress": + print( + f" Chunk {data['chunk']}/{data['total_chunks']}: {data['fps']:.1f} fps" + ) + elif event_type == "complete": + result = data + break + elif event_type == "error": + raise RuntimeError(f"Generation failed: {data['error']}") + + if result is None: + raise RuntimeError("No complete event received") + + # Decode and save + video = np.frombuffer( + base64.b64decode(result["video_base64"]), dtype=np.float32 + ).reshape(result["video_shape"]) + + output_path = f"test_{name}.mp4" + export_to_video(video, output_path, fps=16) + + print(f"\nComplete in {time.time() - start:.1f}s") + print(f"Output: {output_path} ({result['video_shape']})") + + +def main(): + if len(sys.argv) < 2 or sys.argv[1] == "--list": + print("Available tests:") + for name, cfg in TESTS.items(): + print(f" {name:20} - {cfg['description']}") + print("\nUsage: python test_generate_endpoint.py ") + print(" python test_generate_endpoint.py all") + return + + if sys.argv[1] == "all": + for name in TESTS: + run_test(name) + else: + run_test(sys.argv[1]) + + +if __name__ == "__main__": + main() From f626b5b440e9501f1d7b496fe8f5e1697401f701 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:46:46 -0500 Subject: [PATCH 02/16] remove edge case padding enables rife Signed-off-by: RyanOnTheInside 
<7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 9ce59d808..0e270ee01 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -235,7 +235,6 @@ def generate_video_stream( for chunk_idx in range(num_chunks): start_frame = chunk_idx * chunk_size end_frame = min(start_frame + chunk_size, request.num_frames) - actual_frames = end_frame - start_frame gc.collect() if torch.cuda.is_available(): @@ -272,10 +271,6 @@ def generate_video_stream( f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" ) - # Trim padding from output - if chunk_output.shape[0] > actual_frames: - chunk_output = chunk_output[:actual_frames] - output_chunks.append(chunk_output.detach().cpu()) yield sse_event( From 4e38e70722b8afaf640b15990fd3c4f62daeeb18 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Wed, 4 Feb 2026 15:21:57 -0500 Subject: [PATCH 03/16] rm longliveloadparams Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- tests/test_generate_endpoint.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/test_generate_endpoint.py b/tests/test_generate_endpoint.py index 327db105b..c184e5da8 100644 --- a/tests/test_generate_endpoint.py +++ b/tests/test_generate_endpoint.py @@ -17,7 +17,6 @@ from scope.core.pipelines.video import load_video from scope.server.schema import ( GenerateRequest, - LongLiveLoadParams, LoRAConfig, LoRAMergeMode, PipelineLoadRequest, @@ -195,17 +194,11 @@ def run_test(name: str): # Load pipeline print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...") - request = PipelineLoadRequest( - pipeline_ids=[pipeline_id], - load_params=LongLiveLoadParams( - height=height, - width=width, - loras=loras, - lora_merge_mode=LoRAMergeMode.RUNTIME_PEFT 
- if loras - else LoRAMergeMode.PERMANENT_MERGE, - ), - ) + load_params = {"height": height, "width": width} + if loras: + load_params["loras"] = [lora.model_dump() for lora in loras] + load_params["lora_merge_mode"] = "runtime_peft" + request = PipelineLoadRequest(pipeline_ids=[pipeline_id], load_params=load_params) requests.post( f"{SERVER_URL}/api/v1/pipeline/load", json=request.model_dump(mode="json") ).raise_for_status() From 7f5c9fd390dd60424b2ce79b1bd363617e2e8524 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Wed, 4 Feb 2026 15:22:13 -0500 Subject: [PATCH 04/16] move scripts Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- {tests => scripts}/test_generate_endpoint.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename {tests => scripts}/test_generate_endpoint.py (96%) diff --git a/tests/test_generate_endpoint.py b/scripts/test_generate_endpoint.py similarity index 96% rename from tests/test_generate_endpoint.py rename to scripts/test_generate_endpoint.py index c184e5da8..986133100 100644 --- a/tests/test_generate_endpoint.py +++ b/scripts/test_generate_endpoint.py @@ -31,10 +31,10 @@ DEFAULT_PIPELINE = "longlive" # Asset paths (tests skip gracefully if missing) -LORA = "path/to/a/lora.safetensors" -TEST_VIDEO = "path/to/test_video.mp4" -VACE_CONDITIONING_VIDEO = "path/to/depth_video.mp4" -MASK_VIDEO = "path/to/mask_video.mp4" +LORA = r"C:\Users\ryanf\.daydream-scope\models\lora\lora\output\model_245889_dissolve_imgvid\dissolve-000064.safetensors" +TEST_VIDEO = r"frontend\public\assets\test.mp4" +VACE_CONDITIONING_VIDEO = r"controlnet_test\control_frames_depth.mp4" +MASK_VIDEO = r"src\scope\core\pipelines\longlive\vace_tests\static_mask_half_white_half_black.mp4" # ============================================================================= # Test Definitions From 3da2f2f40dd47e1a93c51c4442946cff339aa7ff Mon Sep 17 00:00:00 2001 From: 
RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Thu, 5 Feb 2026 15:55:41 -0500 Subject: [PATCH 05/16] Add file-based transfer for generate endpoint - Reuse RecordingManager temp file pattern for large video I/O - Add POST /generate/upload and GET /generate/download endpoints - Write output chunks incrementally to disk (constant memory) - Add generate_input/generate_output prefixes to TEMP_FILE_PREFIXES Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/app.py | 133 +++++++++++++++++++++++ src/scope/server/generate.py | 196 ++++++++++++++++++++++------------ src/scope/server/recording.py | 2 + src/scope/server/schema.py | 37 +++++-- 4 files changed, 295 insertions(+), 73 deletions(-) diff --git a/src/scope/server/app.py b/src/scope/server/app.py index c5a79d485..916ce9c0c 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -1152,6 +1152,139 @@ async def generate_video( ) +@app.post("/api/v1/generate/upload") +async def upload_video_for_generate(request: Request): + """Upload a video for batch generation (file-based transfer for large videos). + + Accepts raw binary video data with metadata headers: + - X-Video-Frames: number of frames (T) + - X-Video-Height: frame height (H) + - X-Video-Width: frame width (W) + - X-Video-Channels: number of channels (C), typically 3 for RGB + + Video data should be raw uint8 bytes in THWC order. + + Returns input_path to use in the generate request. 
+ """ + from .recording import TEMP_FILE_PREFIXES, RecordingManager + from .schema import VideoUploadResponse + + try: + # Get video dimensions from headers + num_frames = int(request.headers.get("X-Video-Frames", 0)) + height = int(request.headers.get("X-Video-Height", 0)) + width = int(request.headers.get("X-Video-Width", 0)) + channels = int(request.headers.get("X-Video-Channels", 3)) + + if not all([num_frames, height, width]): + raise HTTPException( + status_code=400, + detail="Missing required headers: X-Video-Frames, X-Video-Height, X-Video-Width", + ) + + expected_size = num_frames * height * width * channels + shape = (num_frames, height, width, channels) + + # Create temp file (reuse recording pattern) + file_path = RecordingManager._create_temp_file( + ".bin", TEMP_FILE_PREFIXES["generate_input"] + ) + + # Stream body to file + with open(file_path, "wb") as f: + # Write header: ndim (4 bytes) + shape (ndim * 4 bytes) + f.write(len(shape).to_bytes(4, "little")) + for dim in shape: + f.write(dim.to_bytes(4, "little")) + + # Stream video data + bytes_written = 0 + async for chunk in request.stream(): + f.write(chunk) + bytes_written += len(chunk) + + if bytes_written != expected_size: + Path(file_path).unlink(missing_ok=True) + raise HTTPException( + status_code=400, + detail=f"Video data size mismatch: expected {expected_size}, got {bytes_written}", + ) + + logger.info(f"Uploaded video: {file_path} (shape: {shape})") + + return VideoUploadResponse( + input_path=file_path, + num_frames=num_frames, + shape=list(shape), + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error uploading video: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + + +@app.get("/api/v1/generate/download") +async def download_generated_video( + path: str = Query(..., description="Path to output video file"), + background_tasks: BackgroundTasks = None, +): + """Download a generated video by path. 
+ + Returns raw binary video data with metadata headers: + - X-Video-Frames: number of frames (T) + - X-Video-Height: frame height (H) + - X-Video-Width: frame width (W) + - X-Video-Channels: number of channels (C) + + Video data is raw uint8 bytes in THWC order. + """ + import tempfile + + from .recording import TEMP_FILE_PREFIXES, cleanup_temp_file + + try: + file_path = Path(path) + + # Security: only allow files in temp dir with our prefix + temp_dir = Path(tempfile.gettempdir()) + if not file_path.is_relative_to(temp_dir): + raise HTTPException(status_code=403, detail="Invalid file path") + if not file_path.name.startswith(TEMP_FILE_PREFIXES["generate_output"]): + raise HTTPException(status_code=403, detail="Invalid file path") + + if not file_path.exists(): + raise HTTPException(status_code=404, detail="Output video not found") + + # Read header to get shape + with open(file_path, "rb") as f: + ndim = int.from_bytes(f.read(4), "little") + shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim)) + + # Schedule cleanup after download + if background_tasks: + background_tasks.add_task(cleanup_temp_file, str(file_path)) + + # Return file with metadata headers + return FileResponse( + file_path, + media_type="application/octet-stream", + headers={ + "X-Video-Frames": str(shape[0]), + "X-Video-Height": str(shape[1]), + "X-Video-Width": str(shape[2]), + "X-Video-Channels": str(shape[3]) if len(shape) > 3 else "3", + }, + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error downloading generated video: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + + def is_spout_available() -> bool: """Check if Spout is available (native Windows only, not WSL).""" return sys.platform == "win32" diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 0e270ee01..64c71dc9c 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -91,11 +91,34 @@ class DecodedInputs: 
prompts: dict[int, str] = field(default_factory=dict) -def decode_inputs(request: "GenerateRequest", num_frames: int) -> DecodedInputs: - """Decode all base64 inputs from request.""" +def load_video_from_file(file_path: str) -> np.ndarray: + """Load video from temp file. + + Args: + file_path: Path to video file with header + + Returns: + Video array [T, H, W, C] uint8 + """ + with open(file_path, "rb") as f: + ndim = int.from_bytes(f.read(4), "little") + shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim)) + data = np.frombuffer(f.read(), dtype=np.uint8).reshape(shape) + return data + + +def decode_inputs( + request: "GenerateRequest", num_frames: int, logger: "Logger" +) -> DecodedInputs: + """Decode all inputs from request (base64 or file-based).""" inputs = DecodedInputs() - if request.input_video: + # Handle input video - either from file path or base64 + if request.input_path: + logger.info(f"Loading input video from file: {request.input_path}") + inputs.input_video = load_video_from_file(request.input_path) + inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0) + elif request.input_video: inputs.input_video = decode_array(request.input_video, np.uint8) inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0) @@ -212,81 +235,121 @@ def generate_video_stream( status_info: dict, logger: "Logger", ) -> Iterator[str]: - """Generate video frames, yielding SSE events.""" + """Generate video frames, yielding SSE events. + + Writes output to temp file incrementally, returns output_path for download. 
+ """ try: pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id) # Determine chunk size from pipeline - has_video = request.input_video is not None + has_video = request.input_video is not None or request.input_path is not None requirements = pipeline.prepare(video=[] if has_video else None) chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE num_chunks = (request.num_frames + chunk_size - 1) // chunk_size - # Decode inputs - inputs = decode_inputs(request, request.num_frames) + # Decode inputs (supports both file-based and base64) + inputs = decode_inputs(request, request.num_frames, logger) # Setup device = torch.device("cuda" if torch.cuda.is_available() else "cpu") dtype = torch.bfloat16 - output_chunks = [] latency_measures = [] fps_measures = [] - for chunk_idx in range(num_chunks): - start_frame = chunk_idx * chunk_size - end_frame = min(start_frame + chunk_size, request.num_frames) - - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - kwargs = build_chunk_kwargs( - request, - inputs, - chunk_idx, - chunk_size, - start_frame, - end_frame, - status_info, - device, - dtype, - logger, - ) - - # Run pipeline - chunk_start = time.time() - with torch.amp.autocast("cuda", dtype=dtype): - result = pipeline(**kwargs) - chunk_latency = time.time() - chunk_start - - chunk_output = result["video"] - num_output_frames = chunk_output.shape[0] - chunk_fps = num_output_frames / chunk_latency - - latency_measures.append(chunk_latency) - fps_measures.append(chunk_fps) + # Create output file for incremental writing (reuse recording pattern) + from .recording import TEMP_FILE_PREFIXES, RecordingManager - logger.info( - f"Chunk {chunk_idx + 1}/{num_chunks}: " - f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" - ) - - output_chunks.append(chunk_output.detach().cpu()) - - yield sse_event( - "progress", - { - "chunk": chunk_idx + 1, - "total_chunks": num_chunks, - "frames": 
num_output_frames, - "latency": round(chunk_latency, 3), - "fps": round(chunk_fps, 2), - }, - ) - - # Concatenate and encode output - output_video = torch.cat(output_chunks, dim=0) - output_np = output_video.numpy() + output_file_path = RecordingManager._create_temp_file( + ".bin", TEMP_FILE_PREFIXES["generate_output"] + ) + output_file = open(output_file_path, "wb") + + # We'll write a placeholder header, then update it at the end + # Header format: ndim (4 bytes) + shape (4 * ndim bytes) + # For video [T, H, W, C], that's 4 + 16 = 20 bytes + header_size = 4 + 4 * 4 # ndim + 4 dimensions + output_file.write(b"\x00" * header_size) # Placeholder + + total_frames = 0 + video_height = None + video_width = None + video_channels = None + + try: + for chunk_idx in range(num_chunks): + start_frame = chunk_idx * chunk_size + end_frame = min(start_frame + chunk_size, request.num_frames) + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + kwargs = build_chunk_kwargs( + request, + inputs, + chunk_idx, + chunk_size, + start_frame, + end_frame, + status_info, + device, + dtype, + logger, + ) + + # Run pipeline + chunk_start = time.time() + with torch.amp.autocast("cuda", dtype=dtype): + result = pipeline(**kwargs) + chunk_latency = time.time() - chunk_start + + chunk_output = result["video"] + num_output_frames = chunk_output.shape[0] + chunk_fps = num_output_frames / chunk_latency + + latency_measures.append(chunk_latency) + fps_measures.append(chunk_fps) + + logger.info( + f"Chunk {chunk_idx + 1}/{num_chunks}: " + f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" + ) + + # Write chunk to file immediately (convert to uint8) + chunk_np = chunk_output.detach().cpu().numpy() + chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8) + output_file.write(chunk_uint8.tobytes()) + + # Track dimensions + total_frames += num_output_frames + if video_height is None: + video_height = chunk_np.shape[1] + video_width = 
chunk_np.shape[2] + video_channels = chunk_np.shape[3] + + yield sse_event( + "progress", + { + "chunk": chunk_idx + 1, + "total_chunks": num_chunks, + "frames": num_output_frames, + "latency": round(chunk_latency, 3), + "fps": round(chunk_fps, 2), + }, + ) + + # Update header with actual shape + output_file.seek(0) + shape = (total_frames, video_height, video_width, video_channels) + output_file.write(len(shape).to_bytes(4, "little")) + for dim in shape: + output_file.write(dim.to_bytes(4, "little")) + + finally: + output_file.close() + + logger.info(f"Output video saved: {output_file_path}") # Log performance summary if latency_measures: @@ -300,15 +363,14 @@ def generate_video_stream( f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}" ) - video_bytes = output_np.astype(np.float32).tobytes() - video_base64 = base64.b64encode(video_bytes).decode("utf-8") + output_shape = [total_frames, video_height, video_width, video_channels] yield sse_event( "complete", { - "video_base64": video_base64, - "video_shape": list(output_np.shape), - "num_frames": output_np.shape[0], + "output_path": output_file_path, + "video_shape": output_shape, + "num_frames": total_frames, "num_chunks": num_chunks, "chunk_size": chunk_size, }, diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py index 5ac39fc2b..839109a05 100644 --- a/src/scope/server/recording.py +++ b/src/scope/server/recording.py @@ -17,6 +17,8 @@ TEMP_FILE_PREFIXES = { "recording": "scope_recording_", "download": "scope_download_", + "generate_input": "scope_gen_input_", + "generate_output": "scope_gen_output_", } # Environment variables diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index 51c201345..3e785f685 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -844,6 +844,16 @@ class EncodedArray(BaseModel): shape: list[int] = Field(..., description="Array shape for decoding") +class VideoUploadResponse(BaseModel): + """Response after uploading a 
video for generation.""" + + input_path: str = Field( + ..., description="Path to uploaded video file for generate request" + ) + num_frames: int = Field(..., description="Number of frames in uploaded video") + shape: list[int] = Field(..., description="Video shape [T, H, W, C]") + + class GenerateRequest(BaseModel): """Request for batch video generation.""" @@ -875,10 +885,14 @@ class GenerateRequest(BaseModel): default=42, description="Random seed. Single int applies to all chunks; list applies per-chunk.", ) - # Video-to-video input (optional) + # Video-to-video input (optional) - two mutually exclusive options input_video: EncodedArray | None = Field( default=None, - description="Input video frames (THWC, uint8). If provided, enables video-to-video mode.", + description="Input video frames (THWC, uint8). If provided, enables video-to-video mode. For large videos, use input_path instead.", + ) + input_path: str | None = Field( + default=None, + description="Path to uploaded video file (from /generate/upload). Alternative to input_video for large files.", ) noise_scale: float | list[float] = Field( default=0.7, @@ -926,11 +940,22 @@ class GenerateRequest(BaseModel): class GenerateResponse(BaseModel): - """Response from batch video generation.""" + """Response from batch video generation. - video_base64: str = Field( - ..., - description="Base64-encoded output video frames as numpy array bytes (THWC, float32, [0,1] range)", + Supports two modes: + - Legacy: video_base64 contains the full video (for small videos) + - File-based: output_path references a downloadable file (for large videos) + """ + + # File-based output (preferred for large videos) + output_path: str | None = Field( + default=None, + description="Path to output video file for download via /generate/download. 
Preferred for large videos.", + ) + # Legacy base64 output (kept for backwards compatibility) + video_base64: str | None = Field( + default=None, + description="Base64-encoded output video frames (THWC, uint8). Deprecated for large videos, use output_path.", ) video_shape: list[int] = Field( ..., From 4791f8efe1ed4c055916d0456c12ae422aa4ae5d Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 13 Feb 2026 14:55:40 -0500 Subject: [PATCH 06/16] add noise controller, bias, use vace input, interpolation method Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 16 ++++++++++++++++ src/scope/server/schema.py | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 64c71dc9c..b7f980592 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -187,6 +187,22 @@ def build_chunk_kwargs( request.vace_context_scale, chunk_idx, 1.0 ) + # Noise controller + if request.noise_controller is not None: + kwargs["noise_controller"] = request.noise_controller + + # KV cache attention bias + kv_bias = get_chunk_value(request.kv_cache_attention_bias, chunk_idx) + if kv_bias is not None: + kwargs["kv_cache_attention_bias"] = kv_bias + + # Prompt interpolation method + kwargs["prompt_interpolation_method"] = request.prompt_interpolation_method + + # VACE use input video + if request.vace_use_input_video is not None: + kwargs["vace_use_input_video"] = request.vace_use_input_video + # LoRA scales if request.lora_scales: lora_scale_updates = [] diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index 3e785f685..4b590a53a 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -816,6 +816,8 @@ class ApiKeySetResponse(BaseModel): class ApiKeyDeleteResponse(BaseModel): success: bool message: str + + class ChunkFrameSpec(BaseModel): 
"""Specification for a frame image at a specific chunk.""" @@ -906,6 +908,22 @@ class GenerateRequest(BaseModel): default=True, description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).", ) + noise_controller: bool | None = Field( + default=None, + description="Enable automatic noise scale adjustment based on motion detection.", + ) + kv_cache_attention_bias: float | list[float] | None = Field( + default=None, + description="Controls reliance on past frames in cache. Lower values mitigate error accumulation. Single float applies to all chunks; list applies per-chunk. Typical values: 0.3-0.7 moderate, 0.1-0.2 strong.", + ) + prompt_interpolation_method: Literal["linear", "slerp"] = Field( + default="linear", + description="Spatial interpolation method for blending multiple prompts: linear (weighted average) or slerp (spherical).", + ) + vace_use_input_video: bool | None = Field( + default=None, + description="When enabled in video-to-video mode, input video is used for VACE conditioning instead of latent initialization.", + ) # Per-chunk parameters lora_scales: dict[str, float | list[float]] | None = Field( default=None, From c23446dd99e7f18bf0ca6127b039fa44ff3caea7 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:48:44 -0500 Subject: [PATCH 07/16] cancellation Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/app.py | 11 ++++++++++- src/scope/server/generate.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/scope/server/app.py b/src/scope/server/app.py index 916ce9c0c..61e79d1d8 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -45,12 +45,12 @@ VIDEO_EXTENSIONS, iter_files, ) +from .generate import generate_video_stream from .kafka_publisher import ( KafkaPublisher, is_kafka_enabled, 
set_kafka_publisher, ) -from .generate import generate_video_stream from .logs_config import ( cleanup_old_logs, ensure_logs_dir, @@ -1152,6 +1152,15 @@ async def generate_video( ) +@app.post("/api/v1/generate/cancel") +async def cancel_generate(): + """Cancel the current video generation after the current chunk completes.""" + from .generate import cancel_generation + + cancel_generation() + return {"status": "cancelling"} + + @app.post("/api/v1/generate/upload") async def upload_video_for_generate(request: Request): """Upload a video for batch generation (file-based transfer for large videos). diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index b7f980592..55d6d4a4c 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -3,6 +3,7 @@ import base64 import gc import json +import threading import time from collections.abc import Iterator from dataclasses import dataclass, field @@ -12,6 +13,20 @@ import numpy as np import torch +# Cancellation support (single-client, so one event suffices) +_cancel_event = threading.Event() + + +def cancel_generation(): + """Signal the current generation to stop after the current chunk.""" + _cancel_event.set() + + +def is_generation_cancelled() -> bool: + """Check if cancellation has been requested.""" + return _cancel_event.is_set() + + # Defaults DEFAULT_HEIGHT = 320 DEFAULT_WIDTH = 576 @@ -255,6 +270,8 @@ def generate_video_stream( Writes output to temp file incrementally, returns output_path for download. 
""" + _cancel_event.clear() + try: pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id) @@ -294,6 +311,18 @@ def generate_video_stream( try: for chunk_idx in range(num_chunks): + if _cancel_event.is_set(): + logger.info("Generation cancelled by user") + yield sse_event( + "cancelled", + { + "chunk": chunk_idx, + "total_chunks": num_chunks, + "frames_completed": total_frames, + }, + ) + return + start_frame = chunk_idx * chunk_size end_frame = min(start_frame + chunk_size, request.num_frames) From 5e6a9653fcdb9681e049c6da2861d72a190d2c90 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:54:56 -0500 Subject: [PATCH 08/16] temp file cleanup Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 9 +++++++++ src/scope/server/recording.py | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 55d6d4a4c..28dee1ca2 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -424,3 +424,12 @@ def generate_video_stream( except Exception as e: logger.exception("Error generating video") yield sse_event("error", {"error": str(e)}) + + finally: + # Clean up uploaded input file + if request.input_path: + try: + Path(request.input_path).unlink(missing_ok=True) + logger.info(f"Cleaned up input file: {request.input_path}") + except Exception as e: + logger.warning(f"Failed to clean up input file: {e}") diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py index 839109a05..bd06a3bca 100644 --- a/src/scope/server/recording.py +++ b/src/scope/server/recording.py @@ -439,6 +439,8 @@ def cleanup_recording_files(): patterns = [ f"{TEMP_FILE_PREFIXES['recording']}*.mp4", f"{TEMP_FILE_PREFIXES['download']}*.mp4", + f"{TEMP_FILE_PREFIXES['generate_input']}*.bin", + f"{TEMP_FILE_PREFIXES['generate_output']}*.bin", ] deleted_count = 0 From 
090a0ea07c86a110ee0d29d2e75256497285e9ff Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Mon, 16 Feb 2026 10:21:21 -0500 Subject: [PATCH 09/16] prompt blending Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 49 ++++++++++++++++++++++++++++++------ src/scope/server/schema.py | 47 +++++++++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 11 deletions(-) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 28dee1ca2..d97494f5f 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -103,7 +103,8 @@ class DecodedInputs: first_frames: dict[int, str] = field(default_factory=dict) last_frames: dict[int, str] = field(default_factory=dict) ref_images: dict[int, list[str]] = field(default_factory=dict) - prompts: dict[int, str] = field(default_factory=dict) + prompts: dict[int, list[dict]] = field(default_factory=dict) + transitions: dict[int, dict] = field(default_factory=dict) def load_video_from_file(file_path: str) -> np.ndarray: @@ -148,8 +149,36 @@ def decode_inputs( inputs.first_frames = build_lookup(request.first_frames, "image") inputs.last_frames = build_lookup(request.last_frames, "image") inputs.ref_images = build_lookup(request.vace_ref_images, "images") - inputs.prompts = {0: request.prompt} - inputs.prompts.update(build_lookup(request.chunk_prompts, "text")) + # Normalize prompt to weighted list format + if isinstance(request.prompt, str): + inputs.prompts = {0: [{"text": request.prompt, "weight": PROMPT_WEIGHT}]} + else: + inputs.prompts = { + 0: [{"text": p.text, "weight": p.weight} for p in request.prompt] + } + + # Chunk prompts: support both text and weighted prompt lists + if request.chunk_prompts: + for spec in request.chunk_prompts: + if spec.prompts: + inputs.prompts[spec.chunk] = [ + {"text": p.text, "weight": p.weight} for p in spec.prompts + ] + elif spec.text: + 
inputs.prompts[spec.chunk] = [ + {"text": spec.text, "weight": PROMPT_WEIGHT} + ] + + # Build transitions lookup + if request.transitions: + for t in request.transitions: + inputs.transitions[t.chunk] = { + "target_prompts": [ + {"text": p.text, "weight": p.weight} for p in t.target_prompts + ], + "num_steps": t.num_steps, + "temporal_interpolation_method": t.temporal_interpolation_method, + } return inputs @@ -173,15 +202,21 @@ def build_chunk_kwargs( "width": request.width or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH), "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED), - "init_cache": chunk_idx == 0, + "init_cache": chunk_idx == 0 + or ( + request.cache_reset_chunks is not None + and chunk_idx in request.cache_reset_chunks + ), "manage_cache": request.manage_cache, } # Prompt (sticky behavior - only send when it changes) if chunk_idx in inputs.prompts: - kwargs["prompts"] = [ - {"text": inputs.prompts[chunk_idx], "weight": PROMPT_WEIGHT} - ] + kwargs["prompts"] = inputs.prompts[chunk_idx] + + # Temporal transition + if chunk_idx in inputs.transitions: + kwargs["transition"] = inputs.transitions[chunk_idx] if request.denoising_steps: kwargs["denoising_step_list"] = request.denoising_steps diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index 4b590a53a..07aa9c223 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -826,10 +826,38 @@ class ChunkFrameSpec(BaseModel): class ChunkPromptSpec(BaseModel): - """Specification for a prompt at a specific chunk.""" + """Specification for a prompt at a specific chunk. + + Supports both simple text and weighted prompt lists for spatial blending. 
+ """ chunk: int = Field(..., ge=0, description="Chunk index") - text: str = Field(..., description="Prompt text for this chunk") + text: str | None = Field( + default=None, + description="Simple prompt text for this chunk (mutually exclusive with prompts)", + ) + prompts: list[PromptItem] | None = Field( + default=None, + description="Weighted prompt list for spatial blending at this chunk (mutually exclusive with text)", + ) + + +class ChunkTransitionSpec(BaseModel): + """Specification for a temporal transition starting at a specific chunk.""" + + chunk: int = Field(..., ge=0, description="Chunk index where transition starts") + target_prompts: list[PromptItem] = Field( + ..., description="Target prompt blend to interpolate to" + ) + num_steps: int = Field( + default=4, + ge=0, + description="Number of generation calls to transition over (0 = instant)", + ) + temporal_interpolation_method: Literal["linear", "slerp"] = Field( + default="linear", + description="Method for temporal interpolation between blends across frames", + ) class ChunkRefImagesSpec(BaseModel): @@ -860,10 +888,17 @@ class GenerateRequest(BaseModel): """Request for batch video generation.""" pipeline_id: str = Field(..., description="Pipeline ID to use for generation") - prompt: str = Field(..., description="Text prompt for generation (sent on chunk 0)") + prompt: str | list[PromptItem] = Field( + ..., + description="Text prompt for generation (sent on chunk 0). Can be a simple string or a list of weighted prompts for spatial blending.", + ) chunk_prompts: list[ChunkPromptSpec] | None = Field( default=None, - description="Prompt changes at later chunks (sticky behavior). The prompt persists until the next specified chunk.", + description="Prompt changes at later chunks (sticky behavior). Each entry supports simple text or weighted prompt lists.", + ) + transitions: list[ChunkTransitionSpec] | None = Field( + default=None, + description="Temporal transitions at specific chunks. 
Each specifies a target prompt blend and number of interpolation steps.", ) num_frames: int = Field( default=64, @@ -908,6 +943,10 @@ class GenerateRequest(BaseModel): default=True, description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).", ) + cache_reset_chunks: list[int] | None = Field( + default=None, + description="List of chunk indices where the KV cache should be forcibly reset (init_cache=True). Chunk 0 always resets.", + ) noise_controller: bool | None = Field( default=None, description="Enable automatic noise scale adjustment based on motion detection.", From 7741692ef7a3cc773125330fb7134918ff6046f7 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Mon, 16 Feb 2026 10:27:49 -0500 Subject: [PATCH 10/16] additional per chunk logging and tmp file cleanup Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index d97494f5f..4b3a268a2 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -306,6 +306,8 @@ def generate_video_stream( Writes output to temp file incrementally, returns output_path for download. 
""" _cancel_event.clear() + output_file_path = None + completed = False try: pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id) @@ -378,6 +380,101 @@ def generate_video_stream( logger, ) + # Log chunk operations + logger.info( + f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}" + ) + + # Cache management + if kwargs.get("init_cache"): + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)" + ) + + # Prompt updates + if "prompts" in kwargs: + prompt_texts = [p["text"] for p in kwargs["prompts"]] + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}" + ) + + # Temporal transitions + if "transition" in kwargs: + target_texts = [ + p["text"] for p in kwargs["transition"]["target_prompts"] + ] + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} " + f"over {kwargs['transition']['num_steps']} steps " + f"(method: {kwargs['transition']['temporal_interpolation_method']})" + ) + + # Keyframes + if "first_frame_image" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe" + ) + if "last_frame_image" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe" + ) + if "extension_mode" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}" + ) + + # VACE + if "vace_ref_images" in kwargs: + num_refs = len(kwargs["vace_ref_images"]) + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using {num_refs} VACE reference images" + ) + if "vace_input_frames" in kwargs: + vace_shape = kwargs["vace_input_frames"].shape + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {vace_shape}" + ) + if "vace_input_masks" in kwargs: + mask_shape = kwargs["vace_input_masks"].shape + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE input 
masks shape: {mask_shape}" + ) + if ( + "vace_context_scale" in kwargs + and kwargs["vace_context_scale"] != 1.0 + ): + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}" + ) + if "vace_use_input_video" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}" + ) + + # Video-to-video + if "video" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}" + ) + elif "num_frames" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames" + ) + + # Other parameters + if "denoising_step_list" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}" + ) + if "noise_controller" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}" + ) + if "kv_cache_attention_bias" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}" + ) + # Run pipeline chunk_start = time.time() with torch.amp.autocast("cuda", dtype=dtype): @@ -455,6 +552,7 @@ def generate_video_stream( "chunk_size": chunk_size, }, ) + completed = True except Exception as e: logger.exception("Error generating video") @@ -468,3 +566,11 @@ def generate_video_stream( logger.info(f"Cleaned up input file: {request.input_path}") except Exception as e: logger.warning(f"Failed to clean up input file: {e}") + + # Clean up output file if generation didn't complete successfully + if not completed and output_file_path: + try: + Path(output_file_path).unlink(missing_ok=True) + logger.info(f"Cleaned up orphaned output file: {output_file_path}") + except Exception as e: + 
logger.warning(f"Failed to clean up output file: {e}") From 67b0a95d3315375d547c578b7a9855f3523e0ab3 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Mon, 16 Feb 2026 17:47:20 -0500 Subject: [PATCH 11/16] pre and post processors Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 522 +++++++++++++++++-------- src/scope/server/pipeline_processor.py | 80 +++- src/scope/server/schema.py | 8 + 3 files changed, 449 insertions(+), 161 deletions(-) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 4b3a268a2..ec6ecf545 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -3,6 +3,7 @@ import base64 import gc import json +import queue import threading import time from collections.abc import Iterator @@ -295,6 +296,332 @@ def build_chunk_kwargs( return kwargs +def _log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logger"): + """Log detailed chunk information.""" + logger.info(f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}") + if kwargs.get("init_cache"): + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)" + ) + if "prompts" in kwargs: + prompt_texts = [p["text"] for p in kwargs["prompts"]] + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}" + ) + if "transition" in kwargs: + target_texts = [p["text"] for p in kwargs["transition"]["target_prompts"]] + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} " + f"over {kwargs['transition']['num_steps']} steps " + f"(method: {kwargs['transition']['temporal_interpolation_method']})" + ) + if "first_frame_image" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe" + ) + if "last_frame_image" in kwargs: + logger.info( + 
f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe" + ) + if "extension_mode" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}" + ) + if "vace_ref_images" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using {len(kwargs['vace_ref_images'])} VACE reference images" + ) + if "vace_input_frames" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {kwargs['vace_input_frames'].shape}" + ) + if "vace_input_masks" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {kwargs['vace_input_masks'].shape}" + ) + if "vace_context_scale" in kwargs and kwargs["vace_context_scale"] != 1.0: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}" + ) + if "vace_use_input_video" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}" + ) + if "video" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}" + ) + elif "num_frames" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames" + ) + if "denoising_step_list" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}" + ) + if "noise_controller" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}" + ) + if "kv_cache_attention_bias" in kwargs: + logger.info( + f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}" + ) + + +def _write_chunk_output( + result: dict, + chunk_idx: int, + num_chunks: int, + chunk_latency: 
float, + output_file, + latency_measures: list, + fps_measures: list, + logger: "Logger", + total_frames_ref: list, + dimensions_ref: list, +) -> str: + """Write chunk output to file and return SSE progress event.""" + chunk_output = result["video"] + num_output_frames = chunk_output.shape[0] + chunk_fps = num_output_frames / chunk_latency + + latency_measures.append(chunk_latency) + fps_measures.append(chunk_fps) + + logger.info( + f"Chunk {chunk_idx + 1}/{num_chunks}: " + f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" + ) + + chunk_np = chunk_output.detach().cpu().numpy() + chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8) + output_file.write(chunk_uint8.tobytes()) + + total_frames_ref[0] += num_output_frames + if dimensions_ref[0] is None: + dimensions_ref[0] = chunk_np.shape[1] + dimensions_ref[1] = chunk_np.shape[2] + dimensions_ref[2] = chunk_np.shape[3] + + return sse_event( + "progress", + { + "chunk": chunk_idx + 1, + "total_chunks": num_chunks, + "frames": num_output_frames, + "latency": round(chunk_latency, 3), + "fps": round(chunk_fps, 2), + }, + ) + + +def _generate_sequential( + request: "GenerateRequest", + pipeline, + inputs: DecodedInputs, + num_chunks: int, + chunk_size: int, + status_info: dict, + device: torch.device, + dtype: torch.dtype, + output_file, + latency_measures: list, + fps_measures: list, + logger: "Logger", + total_frames_ref: list, + dimensions_ref: list, +) -> Iterator[str]: + """Sequential chunk processing (original code path, no processors).""" + for chunk_idx in range(num_chunks): + if _cancel_event.is_set(): + logger.info("Generation cancelled by user") + yield sse_event( + "cancelled", + { + "chunk": chunk_idx, + "total_chunks": num_chunks, + "frames_completed": total_frames_ref[0], + }, + ) + return + + start_frame = chunk_idx * chunk_size + end_frame = min(start_frame + chunk_size, request.num_frames) + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + 
+ kwargs = build_chunk_kwargs( + request, + inputs, + chunk_idx, + chunk_size, + start_frame, + end_frame, + status_info, + device, + dtype, + logger, + ) + _log_chunk_info(kwargs, chunk_idx, num_chunks, logger) + + chunk_start = time.time() + with torch.amp.autocast("cuda", dtype=dtype): + result = pipeline(**kwargs) + chunk_latency = time.time() - chunk_start + + yield _write_chunk_output( + result, + chunk_idx, + num_chunks, + chunk_latency, + output_file, + latency_measures, + fps_measures, + logger, + total_frames_ref, + dimensions_ref, + ) + + +def _generate_with_processors( + request: "GenerateRequest", + pipeline, + pipeline_manager: "PipelineManager", + inputs: DecodedInputs, + num_chunks: int, + chunk_size: int, + status_info: dict, + device: torch.device, + dtype: torch.dtype, + output_file, + latency_measures: list, + fps_measures: list, + logger: "Logger", + total_frames_ref: list, + dimensions_ref: list, +) -> Iterator[str]: + """Chunk processing with pre/post processor pipeline chaining.""" + from .pipeline_processor import _SENTINEL, PipelineProcessor + + # Build the processor chain + processors: list[PipelineProcessor] = [] + + if request.pre_processor_id: + pre_pipeline = pipeline_manager.get_pipeline_by_id(request.pre_processor_id) + pre_proc = PipelineProcessor( + pipeline=pre_pipeline, + pipeline_id=request.pre_processor_id, + batch_mode=True, + ) + processors.append(pre_proc) + logger.info(f"Pre-processor: {request.pre_processor_id}") + + main_proc = PipelineProcessor( + pipeline=pipeline, + pipeline_id=request.pipeline_id, + batch_mode=True, + ) + processors.append(main_proc) + + if request.post_processor_id: + post_pipeline = pipeline_manager.get_pipeline_by_id(request.post_processor_id) + post_proc = PipelineProcessor( + pipeline=post_pipeline, + pipeline_id=request.post_processor_id, + batch_mode=True, + ) + processors.append(post_proc) + logger.info(f"Post-processor: {request.post_processor_id}") + + # Chain processors + for i in 
range(len(processors) - 1): + processors[i].set_next_processor(processors[i + 1]) + + # Start all processors + for proc in processors: + proc.start() + + first_proc = processors[0] + last_proc = processors[-1] + + try: + # Feed chunks into the first processor's input queue + for chunk_idx in range(num_chunks): + if _cancel_event.is_set(): + logger.info("Generation cancelled by user") + yield sse_event( + "cancelled", + { + "chunk": chunk_idx, + "total_chunks": num_chunks, + "frames_completed": total_frames_ref[0], + }, + ) + return + + start_frame = chunk_idx * chunk_size + end_frame = min(start_frame + chunk_size, request.num_frames) + + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + kwargs = build_chunk_kwargs( + request, + inputs, + chunk_idx, + chunk_size, + start_frame, + end_frame, + status_info, + device, + dtype, + logger, + ) + _log_chunk_info(kwargs, chunk_idx, num_chunks, logger) + + chunk_start = time.time() + + # Feed kwargs into chain (blocking put) + first_proc.input_queue.put(kwargs) + + # Collect result from last processor (blocking get) + while True: + try: + result = last_proc.output_queue.get(timeout=1.0) + break + except queue.Empty: + if _cancel_event.is_set(): + return + continue + + chunk_latency = time.time() - chunk_start + + yield _write_chunk_output( + result, + chunk_idx, + num_chunks, + chunk_latency, + output_file, + latency_measures, + fps_measures, + logger, + total_frames_ref, + dimensions_ref, + ) + + # Signal end of input + first_proc.input_queue.put(_SENTINEL) + + finally: + # Stop all processors + for proc in processors: + proc.stop() + + def generate_video_stream( request: "GenerateRequest", pipeline_manager: "PipelineManager", @@ -346,175 +673,52 @@ def generate_video_stream( video_width = None video_channels = None + # Determine if we need processor chaining + use_processors = ( + request.pre_processor_id is not None + or request.post_processor_id is not None + ) + try: - for chunk_idx in 
range(num_chunks): - if _cancel_event.is_set(): - logger.info("Generation cancelled by user") - yield sse_event( - "cancelled", - { - "chunk": chunk_idx, - "total_chunks": num_chunks, - "frames_completed": total_frames, - }, - ) - return - - start_frame = chunk_idx * chunk_size - end_frame = min(start_frame + chunk_size, request.num_frames) - - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - kwargs = build_chunk_kwargs( + if use_processors: + yield from _generate_with_processors( request, + pipeline, + pipeline_manager, inputs, - chunk_idx, + num_chunks, chunk_size, - start_frame, - end_frame, status_info, device, dtype, + output_file, + latency_measures, + fps_measures, logger, + _total_frames_ref := [0], + _dimensions_ref := [None, None, None], ) - - # Log chunk operations - logger.info( - f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}" - ) - - # Cache management - if kwargs.get("init_cache"): - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)" - ) - - # Prompt updates - if "prompts" in kwargs: - prompt_texts = [p["text"] for p in kwargs["prompts"]] - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}" - ) - - # Temporal transitions - if "transition" in kwargs: - target_texts = [ - p["text"] for p in kwargs["transition"]["target_prompts"] - ] - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} " - f"over {kwargs['transition']['num_steps']} steps " - f"(method: {kwargs['transition']['temporal_interpolation_method']})" - ) - - # Keyframes - if "first_frame_image" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe" - ) - if "last_frame_image" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe" - ) - if "extension_mode" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: 
Extension mode: {kwargs['extension_mode']}" - ) - - # VACE - if "vace_ref_images" in kwargs: - num_refs = len(kwargs["vace_ref_images"]) - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using {num_refs} VACE reference images" - ) - if "vace_input_frames" in kwargs: - vace_shape = kwargs["vace_input_frames"].shape - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {vace_shape}" - ) - if "vace_input_masks" in kwargs: - mask_shape = kwargs["vace_input_masks"].shape - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {mask_shape}" - ) - if ( - "vace_context_scale" in kwargs - and kwargs["vace_context_scale"] != 1.0 - ): - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}" - ) - if "vace_use_input_video" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}" - ) - - # Video-to-video - if "video" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}" - ) - elif "num_frames" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating {kwargs['num_frames']} frames" - ) - - # Other parameters - if "denoising_step_list" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}" - ) - if "noise_controller" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}" - ) - if "kv_cache_attention_bias" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}" - ) - - # Run pipeline - chunk_start = time.time() - with torch.amp.autocast("cuda", dtype=dtype): - result = pipeline(**kwargs) - 
chunk_latency = time.time() - chunk_start - - chunk_output = result["video"] - num_output_frames = chunk_output.shape[0] - chunk_fps = num_output_frames / chunk_latency - - latency_measures.append(chunk_latency) - fps_measures.append(chunk_fps) - - logger.info( - f"Chunk {chunk_idx + 1}/{num_chunks}: " - f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" - ) - - # Write chunk to file immediately (convert to uint8) - chunk_np = chunk_output.detach().cpu().numpy() - chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8) - output_file.write(chunk_uint8.tobytes()) - - # Track dimensions - total_frames += num_output_frames - if video_height is None: - video_height = chunk_np.shape[1] - video_width = chunk_np.shape[2] - video_channels = chunk_np.shape[3] - - yield sse_event( - "progress", - { - "chunk": chunk_idx + 1, - "total_chunks": num_chunks, - "frames": num_output_frames, - "latency": round(chunk_latency, 3), - "fps": round(chunk_fps, 2), - }, + total_frames = _total_frames_ref[0] + video_height, video_width, video_channels = _dimensions_ref + else: + yield from _generate_sequential( + request, + pipeline, + inputs, + num_chunks, + chunk_size, + status_info, + device, + dtype, + output_file, + latency_measures, + fps_measures, + logger, + _total_frames_ref := [0], + _dimensions_ref := [None, None, None], ) + total_frames = _total_frames_ref[0] + video_height, video_width, video_channels = _dimensions_ref # Update header with actual shape output_file.seek(0) diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py index b11638996..1ac243a91 100644 --- a/src/scope/server/pipeline_processor.py +++ b/src/scope/server/pipeline_processor.py @@ -23,6 +23,9 @@ SLEEP_TIME = 0.01 +# Sentinel value to signal end of batch input +_SENTINEL = object() + # FPS calculation constants MIN_FPS = 1.0 # Minimum FPS to prevent division by zero MAX_FPS = 60.0 # Maximum FPS cap @@ -42,6 +45,7 @@ def __init__( user_id: 
str | None = None, connection_id: str | None = None, connection_info: dict | None = None, + batch_mode: bool = False, ): """Initialize a pipeline processor. @@ -60,10 +64,15 @@ def __init__( self.user_id = user_id self.connection_id = connection_id self.connection_info = connection_info + self.batch_mode = batch_mode # Each processor creates its own queues - self.input_queue = queue.Queue(maxsize=30) - self.output_queue = queue.Queue(maxsize=8) + if batch_mode: + self.input_queue = queue.Queue(maxsize=2) + self.output_queue = queue.Queue(maxsize=2) + else: + self.input_queue = queue.Queue(maxsize=30) + self.output_queue = queue.Queue(maxsize=8) # Lock to protect input_queue assignment for thread-safe reference swapping self.input_queue_lock = threading.Lock() @@ -226,6 +235,10 @@ def worker_loop(self): """Main worker loop that processes frames.""" logger.info(f"Worker thread started for pipeline: {self.pipeline_id}") + if self.batch_mode: + self._worker_loop_batch() + return + while self.running and not self.shutdown_event.is_set(): try: self.process_chunk() @@ -267,6 +280,69 @@ def worker_loop(self): logger.info(f"Worker thread stopped for pipeline: {self.pipeline_id}") + def _worker_loop_batch(self): + """Batch-mode worker loop: processes chunk kwargs dicts from queue.""" + while self.running and not self.shutdown_event.is_set(): + try: + item = self.input_queue.get(timeout=1.0) + except queue.Empty: + continue + if item is _SENTINEL: + if self.next_processor: + self.next_processor.input_queue.put(_SENTINEL) + break + try: + self.process_chunk_batch(item) + except Exception as e: + logger.error( + f"Error in batch processing for {self.pipeline_id}: {e}", + exc_info=True, + ) + if not self._is_recoverable(e): + break + logger.info(f"Batch worker thread stopped for pipeline: {self.pipeline_id}") + + def process_chunk_batch(self, chunk_kwargs: dict): + """Process a single chunk in batch mode. + + Args: + chunk_kwargs: Pre-built kwargs dict for the pipeline call. 
+ """ + dtype = torch.bfloat16 + with torch.amp.autocast("cuda", dtype=dtype): + result = self.pipeline(**chunk_kwargs) + + # Forward extra params to downstream processor + extra_params = {k: v for k, v in result.items() if k != "video"} + if extra_params and self.next_processor is not None: + self.next_processor.update_parameters(extra_params) + + if self.next_processor is not None: + # Convert video output to list-of-frames format for next pipeline. + # Pipeline __call__ expects video as list of [1, H, W, C] uint8 tensors + # (same format as real-time path: process_chunk converts to uint8 + # before putting on output queue, and preprocess_chunk expects [0, 255]). + video = result.get("video") + if video is not None: + video_uint8 = ( + (video * 255.0) + .clamp(0, 255) + .to(dtype=torch.uint8) + .contiguous() + .detach() + ) + next_kwargs = dict(chunk_kwargs) + next_kwargs["video"] = [f.unsqueeze(0) for f in video_uint8] + # Remove keys that are only valid for the original pipeline + for key in ("init_cache", "num_frames"): + next_kwargs.pop(key, None) + self.output_queue.put(next_kwargs) + else: + self.output_queue.put(chunk_kwargs) + else: + # Last processor: put raw result for collection + self.output_queue.put(result) + def prepare_chunk( self, input_queue_ref: queue.Queue, chunk_size: int ) -> list[torch.Tensor]: diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index 07aa9c223..6115564be 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -994,6 +994,14 @@ class GenerateRequest(BaseModel): default=None, description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). 
Used for inpainting (1 = regenerate, 0 = keep).", ) + pre_processor_id: str | None = Field( + default=None, + description="Pipeline ID for pre-processing each chunk before the main pipeline.", + ) + post_processor_id: str | None = Field( + default=None, + description="Pipeline ID for post-processing each chunk after the main pipeline.", + ) class GenerateResponse(BaseModel): From c61a9a19a59d839e5b8957abc1d9100ac7698b32 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Wed, 18 Feb 2026 06:47:18 -0500 Subject: [PATCH 12/16] per chunk vace spec Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 76 ++++++++++++++++++++++++++++++------ src/scope/server/schema.py | 28 ++++++++++++- 2 files changed, 92 insertions(+), 12 deletions(-) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index ec6ecf545..54cd10633 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -106,6 +106,7 @@ class DecodedInputs: ref_images: dict[int, list[str]] = field(default_factory=dict) prompts: dict[int, list[dict]] = field(default_factory=dict) transitions: dict[int, dict] = field(default_factory=dict) + vace_chunk_specs: dict[int, dict] = field(default_factory=dict) def load_video_from_file(file_path: str) -> np.ndarray: @@ -170,6 +171,37 @@ def decode_inputs( {"text": spec.text, "weight": PROMPT_WEIGHT} ] + # Per-chunk VACE specs + if request.vace_chunk_specs: + logger.info( + f"decode_inputs: Found {len(request.vace_chunk_specs)} vace_chunk_specs" + ) + for spec in request.vace_chunk_specs: + logger.info( + f"decode_inputs: vace_chunk_spec chunk={spec.chunk}, has_frames={spec.frames is not None}, has_masks={spec.masks is not None}, context_scale={spec.context_scale}, temporally_locked={spec.vace_temporally_locked}" + ) + decoded_spec: dict = { + "vace_temporally_locked": spec.vace_temporally_locked, + } + if spec.frames is not 
None: + decoded_spec["frames"] = decode_array(spec.frames, np.float32) + logger.info( + f"decode_inputs: chunk {spec.chunk} decoded frames shape={decoded_spec['frames'].shape}" + ) + if spec.masks is not None: + decoded_spec["masks"] = decode_array(spec.masks, np.float32) + logger.info( + f"decode_inputs: chunk {spec.chunk} decoded masks shape={decoded_spec['masks'].shape}" + ) + if spec.context_scale is not None: + decoded_spec["context_scale"] = spec.context_scale + inputs.vace_chunk_specs[spec.chunk] = decoded_spec + logger.info( + f"decode_inputs: vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}" + ) + else: + logger.info("decode_inputs: No vace_chunk_specs in request") + # Build transitions lookup if request.transitions: for t in request.transitions: @@ -281,17 +313,39 @@ def build_chunk_kwargs( if chunk_idx in inputs.ref_images: kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx] - # VACE conditioning frames [1, C, T, H, W] - if inputs.vace_frames is not None: - chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :] - chunk = pad_chunk(chunk, chunk_size, axis=2) - kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype) - - # VACE masks [1, 1, T, H, W] - if inputs.vace_masks is not None: - chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :] - chunk = pad_chunk(chunk, chunk_size, axis=2) - kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype) + # VACE conditioning: per-chunk spec takes priority over global + logger.info( + f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}, has_global_frames={inputs.vace_frames is not None}, has_global_masks={inputs.vace_masks is not None}" + ) + if chunk_idx in inputs.vace_chunk_specs: + logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC") + spec = inputs.vace_chunk_specs[chunk_idx] + + if "frames" in spec: + frames = spec["frames"] + frames = pad_chunk(frames, chunk_size, axis=2) + 
kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype) + + if "masks" in spec: + masks = spec["masks"] + masks = pad_chunk(masks, chunk_size, axis=2) + kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype) + + if "context_scale" in spec: + kwargs["vace_context_scale"] = spec["context_scale"] + else: + logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING GLOBAL VACE FALLBACK") + # Global VACE conditioning frames [1, C, T, H, W] + if inputs.vace_frames is not None: + chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :] + chunk = pad_chunk(chunk, chunk_size, axis=2) + kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype) + + # Global VACE masks [1, 1, T, H, W] + if inputs.vace_masks is not None: + chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :] + chunk = pad_chunk(chunk, chunk_size, axis=2) + kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype) return kwargs diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index 6115564be..d450ba44e 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -867,6 +867,28 @@ class ChunkRefImagesSpec(BaseModel): images: list[str] = Field(..., description="List of reference image paths") +class ChunkVACESpec(BaseModel): + """Per-chunk VACE conditioning specification.""" + + chunk: int = Field(..., ge=0, description="Chunk index") + frames: "EncodedArray | None" = Field( + default=None, + description="VACE conditioning frames for this chunk ([1, C, T, H, W] float32 [-1, 1])", + ) + masks: "EncodedArray | None" = Field( + default=None, + description="VACE masks for this chunk ([1, 1, T, H, W] float32 {0, 1})", + ) + context_scale: float | None = Field( + default=None, + description="VACE context scale override for this chunk. 
If None, uses global vace_context_scale.", + ) + vace_temporally_locked: bool = Field( + default=True, + description="When True, frames/masks are sliced temporally to match chunk position. When False, used as-is and padded.", + ) + + class EncodedArray(BaseModel): """Base64-encoded numpy array with shape metadata.""" @@ -903,7 +925,7 @@ class GenerateRequest(BaseModel): num_frames: int = Field( default=64, ge=1, - le=1024, + le=10000, description="Total number of frames to generate", ) height: int | None = Field( @@ -994,6 +1016,10 @@ class GenerateRequest(BaseModel): default=None, description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).", ) + vace_chunk_specs: list[ChunkVACESpec] | None = Field( + default=None, + description="Per-chunk VACE conditioning. Each specifies frames/masks for a specific chunk. Overrides global vace_frames/vace_masks for that chunk.", + ) pre_processor_id: str | None = Field( default=None, description="Pipeline ID for pre-processing each chunk before the main pipeline.", From 8f61a2c7c8c6336d60f12da662f13f677947d28b Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:46:59 -0500 Subject: [PATCH 13/16] wip Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- scripts/test_generate_endpoint.py | 231 ++++++++++++++++------ src/scope/server/app.py | 44 +++++ src/scope/server/generate.py | 310 +++++++++++++++++------------- src/scope/server/recording.py | 2 + src/scope/server/schema.py | 229 +++++++++++----------- 5 files changed, 508 insertions(+), 308 deletions(-) diff --git a/scripts/test_generate_endpoint.py b/scripts/test_generate_endpoint.py index 986133100..122dcb4c6 100644 --- a/scripts/test_generate_endpoint.py +++ b/scripts/test_generate_endpoint.py @@ -5,7 +5,6 @@ python test_generate_endpoint.py --list """ -import base64 import json import sys import time @@ -92,33 +91,87 @@ # 
============================================================================= -def encode_array(arr: np.ndarray) -> dict: - """Encode numpy array as EncodedArray dict.""" - return { - "base64": base64.b64encode(arr.tobytes()).decode("utf-8"), - "shape": list(arr.shape), - } - - -def load_video_for_v2v(path: str, height: int, width: int) -> dict: - """Load video as [T, H, W, C] uint8 for video-to-video mode.""" +def upload_video_for_v2v(path: str, height: int, width: int) -> str: + """Load and upload video for video-to-video mode. Returns input_path.""" tensor = load_video(path, resize_hw=(height, width), normalize=False) arr = tensor.permute(1, 2, 3, 0).numpy().astype(np.uint8) - return encode_array(arr) - - -def load_video_for_vace(path: str, height: int, width: int) -> dict: - """Load video as [1, C, T, H, W] float32 for VACE conditioning.""" - tensor = load_video(path, resize_hw=(height, width)) - arr = tensor.unsqueeze(0).numpy().astype(np.float32) - return encode_array(arr) - + num_frames, h, w, c = arr.shape + + response = requests.post( + f"{SERVER_URL}/api/v1/generate/upload", + data=arr.tobytes(), + headers={ + "Content-Type": "application/octet-stream", + "X-Video-Frames": str(num_frames), + "X-Video-Height": str(h), + "X-Video-Width": str(w), + "X-Video-Channels": str(c), + }, + timeout=300, + ) + response.raise_for_status() + return response.json()["input_path"] + + +def upload_vace_data( + vace_frames_path: str | None, + vace_masks_path: str | None, + height: int, + width: int, + num_frames: int, + chunk_size: int, + vace_context_scale: float = 1.0, +) -> tuple[str, list[dict]]: + """Load VACE frames/masks, pack into blob, upload, return (data_blob_path, chunk_specs).""" + blob = bytearray() + num_chunks = (num_frames + chunk_size - 1) // chunk_size + chunk_specs = [] + + # Load tensors + vace_frames_tensor = None + vace_masks_tensor = None + if vace_frames_path: + vace_frames_tensor = load_video(vace_frames_path, resize_hw=(height, width)) + 
vace_frames_tensor = vace_frames_tensor.unsqueeze(0).numpy().astype(np.float32) + if vace_masks_path: + masks_tensor = load_video(vace_masks_path, resize_hw=(height, width)) + vace_masks_tensor = (masks_tensor[0:1].unsqueeze(0).numpy() > 0.0).astype( + np.float32 + ) + + for chunk_idx in range(num_chunks): + spec = {"chunk": chunk_idx, "vace_temporally_locked": True} + start = chunk_idx * chunk_size + end = start + chunk_size + + if vace_frames_tensor is not None: + sliced = vace_frames_tensor[:, :, start:end, :, :] + spec["vace_frames_offset"] = len(blob) + spec["vace_frames_shape"] = list(sliced.shape) + blob.extend(sliced.tobytes()) + + if vace_masks_tensor is not None: + sliced_masks = vace_masks_tensor[:, :, start:end, :, :] + spec["vace_masks_offset"] = len(blob) + spec["vace_masks_shape"] = list(sliced_masks.shape) + blob.extend(sliced_masks.tobytes()) + + if vace_context_scale != 1.0: + spec["vace_context_scale"] = vace_context_scale + + chunk_specs.append(spec) + + # Upload blob + response = requests.post( + f"{SERVER_URL}/api/v1/generate/upload-data", + data=bytes(blob), + headers={"Content-Type": "application/octet-stream"}, + timeout=300, + ) + response.raise_for_status() + data_blob_path = response.json()["data_blob_path"] -def load_mask_for_vace(path: str, height: int, width: int) -> dict: - """Load video as [1, 1, T, H, W] binary mask for VACE inpainting.""" - tensor = load_video(path, resize_hw=(height, width)) - arr = (tensor[0:1].unsqueeze(0).numpy() > 0.0).astype(np.float32) - return encode_array(arr) + return data_blob_path, chunk_specs def parse_sse_events(response): @@ -158,6 +211,30 @@ def wait_for_pipeline(timeout: int = 300): raise TimeoutError(f"Pipeline did not load within {timeout}s") +def download_video(output_path: str) -> np.ndarray: + """Download generated video from server.""" + response = requests.get( + f"{SERVER_URL}/api/v1/generate/download", + params={"path": output_path}, + timeout=300, + ) + response.raise_for_status() + + 
num_frames = int(response.headers.get("X-Video-Frames", 0)) + height = int(response.headers.get("X-Video-Height", 0)) + width = int(response.headers.get("X-Video-Width", 0)) + channels = int(response.headers.get("X-Video-Channels", 3)) + + # Skip header (ndim + shape) + content = response.content + header_size = 4 + 4 * 4 + video_bytes = content[header_size:] + + return np.frombuffer(video_bytes, dtype=np.uint8).reshape( + (num_frames, height, width, channels) + ) + + # ============================================================================= # Test Runner # ============================================================================= @@ -189,8 +266,8 @@ def run_test(name: str): ) ] if "lora_ramp" in cfg: - lora_scales = {cfg["lora"]: cfg["lora_ramp"]} - print(f"LoRA ramp: {cfg['lora_ramp']}") + lora_scales = cfg["lora_ramp"] + print(f"LoRA ramp: {lora_scales}") # Load pipeline print(f"Loading pipeline '{pipeline_id}' at {width}x{height}...") @@ -205,37 +282,65 @@ def run_test(name: str): load_time = wait_for_pipeline() print(f"Pipeline loaded in {load_time:.1f}s") - # Load input video if specified - input_video = None + # Build request kwargs + request_kwargs = { + "pipeline_id": pipeline_id, + "prompt": cfg["prompt"], + "num_frames": cfg["num_frames"], + "noise_scale": cfg.get("noise_scale", 0.7), + "vace_context_scale": cfg.get("vace_context_scale", 1.0), + "manage_cache": cfg.get("manage_cache", True), + } + + # Upload input video if specified if "input_video" in cfg: - input_video = load_video_for_v2v(cfg["input_video"], height, width) - print(f"Input video: {input_video['shape']}") - - # Load VACE frames if specified - vace_frames = None - if "vace_frames" in cfg: - vace_frames = load_video_for_vace(cfg["vace_frames"], height, width) - print(f"VACE frames: {vace_frames['shape']}") - - # Load VACE masks if specified - vace_masks = None - if "vace_masks" in cfg: - vace_masks = load_mask_for_vace(cfg["vace_masks"], height, width) - print(f"VACE masks: 
{vace_masks['shape']}") - - # Build and send request - gen_request = GenerateRequest( - pipeline_id=pipeline_id, - prompt=cfg["prompt"], - num_frames=cfg["num_frames"], - input_video=input_video, - noise_scale=cfg.get("noise_scale", 0.7), - vace_frames=vace_frames, - vace_masks=vace_masks, - vace_context_scale=cfg.get("vace_context_scale", 1.0), - lora_scales=lora_scales, - manage_cache=cfg.get("manage_cache", True), - ) + input_path = upload_video_for_v2v(cfg["input_video"], height, width) + request_kwargs["input_path"] = input_path + print(f"Input video uploaded: {input_path}") + + # Build chunk_specs for LoRA ramp + chunk_specs = [] + if lora_scales and "lora" in cfg: + for i, scale in enumerate(lora_scales): + chunk_specs.append( + { + "chunk": i, + "lora_scales": {cfg["lora"]: scale}, + } + ) + + # Handle VACE data + if "vace_frames" in cfg or "vace_masks" in cfg: + # Assume chunk_size=12 (default for longlive) + chunk_size = 12 + data_blob_path, vace_specs = upload_vace_data( + vace_frames_path=cfg.get("vace_frames"), + vace_masks_path=cfg.get("vace_masks"), + height=height, + width=width, + num_frames=cfg["num_frames"], + chunk_size=chunk_size, + vace_context_scale=cfg.get("vace_context_scale", 1.0), + ) + request_kwargs["data_blob_path"] = data_blob_path + # Merge VACE specs into chunk_specs + existing_chunks = {s["chunk"] for s in chunk_specs} + for vs in vace_specs: + if vs["chunk"] in existing_chunks: + # Merge into existing spec + for cs in chunk_specs: + if cs["chunk"] == vs["chunk"]: + cs.update(vs) + break + else: + chunk_specs.append(vs) + print(f"VACE data uploaded: {data_blob_path}") + + if chunk_specs: + chunk_specs.sort(key=lambda s: s["chunk"]) + request_kwargs["chunk_specs"] = chunk_specs + + gen_request = GenerateRequest(**request_kwargs) print(f"Generating {cfg['num_frames']} frames...") start = time.time() @@ -262,13 +367,15 @@ def run_test(name: str): if result is None: raise RuntimeError("No complete event received") - # Decode and save - 
video = np.frombuffer( - base64.b64decode(result["video_base64"]), dtype=np.float32 - ).reshape(result["video_shape"]) + # Download and save + if "output_path" in result: + video = download_video(result["output_path"]) + video_float = video.astype(np.float32) / 255.0 + else: + raise RuntimeError("No output_path in result") output_path = f"test_{name}.mp4" - export_to_video(video, output_path, fps=16) + export_to_video(video_float, output_path, fps=16) print(f"\nComplete in {time.time() - start:.1f}s") print(f"Output: {output_path} ({result['video_shape']})") diff --git a/src/scope/server/app.py b/src/scope/server/app.py index 61e79d1d8..75cb79e88 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -1234,6 +1234,50 @@ async def upload_video_for_generate(request: Request): raise HTTPException(status_code=500, detail=str(e)) from e +@app.post("/api/v1/generate/upload-data") +async def upload_data_blob(request: Request): + """Upload binary data blob for batch generation. + + Accepts raw binary data containing VACE frames/masks, input video, or other + array data referenced by ChunkSpec offsets in the generate request. + + Returns data_blob_path to use in the generate request. 
+ """ + + from .recording import TEMP_FILE_PREFIXES, RecordingManager + from .schema import DataUploadResponse + + try: + # Create temp file + file_path = RecordingManager._create_temp_file( + ".bin", TEMP_FILE_PREFIXES["generate_data"] + ) + + # Stream body to file + bytes_written = 0 + with open(file_path, "wb") as f: + async for chunk in request.stream(): + f.write(chunk) + bytes_written += len(chunk) + + if bytes_written == 0: + Path(file_path).unlink(missing_ok=True) + raise HTTPException(status_code=400, detail="Empty request body") + + logger.info(f"Uploaded data blob: {file_path} ({bytes_written} bytes)") + + return DataUploadResponse( + data_blob_path=file_path, + size_bytes=bytes_written, + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error uploading data blob: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + + @app.get("/api/v1/generate/download") async def download_generated_video( path: str = Query(..., description="Path to output video file"), diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 54cd10633..3251738ab 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -1,6 +1,5 @@ """Video generation service for batch mode with chunked processing.""" -import base64 import gc import json import queue @@ -40,13 +39,7 @@ def is_generation_cancelled() -> bool: from logging import Logger from .pipeline_manager import PipelineManager - from .schema import EncodedArray, GenerateRequest - - -def decode_array(encoded: "EncodedArray", dtype: np.dtype) -> np.ndarray: - """Decode EncodedArray to numpy array.""" - data = base64.b64decode(encoded.base64) - return np.frombuffer(data, dtype=dtype).reshape(encoded.shape) + from .schema import ChunkSpec, GenerateRequest def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray: @@ -73,22 +66,6 @@ def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray: return np.concatenate([arr, 
padding], axis=axis) -def build_lookup(specs: list | None, value_attr: str = "image") -> dict: - """Build chunk -> value lookup from list of specs.""" - if not specs: - return {} - return {spec.chunk: getattr(spec, value_attr) for spec in specs} - - -def get_chunk_value(value, chunk_idx: int, default=None): - """Get per-chunk value from scalar or list.""" - if value is None: - return default - if isinstance(value, list): - return value[chunk_idx] if chunk_idx < len(value) else value[-1] - return value - - def sse_event(event_type: str, data: dict) -> str: """Format a server-sent event.""" return f"event: {event_type}\ndata: {json.dumps(data)}\n\n" @@ -99,14 +76,14 @@ class DecodedInputs: """Decoded and preprocessed inputs for generation.""" input_video: np.ndarray | None = None - vace_frames: np.ndarray | None = None - vace_masks: np.ndarray | None = None first_frames: dict[int, str] = field(default_factory=dict) last_frames: dict[int, str] = field(default_factory=dict) ref_images: dict[int, list[str]] = field(default_factory=dict) prompts: dict[int, list[dict]] = field(default_factory=dict) transitions: dict[int, dict] = field(default_factory=dict) vace_chunk_specs: dict[int, dict] = field(default_factory=dict) + input_video_chunks: dict[int, np.ndarray] = field(default_factory=dict) + chunk_specs_map: "dict[int, ChunkSpec]" = field(default_factory=dict) def load_video_from_file(file_path: str) -> np.ndarray: @@ -128,30 +105,16 @@ def load_video_from_file(file_path: str) -> np.ndarray: def decode_inputs( request: "GenerateRequest", num_frames: int, logger: "Logger" ) -> DecodedInputs: - """Decode all inputs from request (base64 or file-based).""" + """Decode all inputs from request using unified ChunkSpec.""" inputs = DecodedInputs() - # Handle input video - either from file path or base64 + # Input video from file path if request.input_path: logger.info(f"Loading input video from file: {request.input_path}") inputs.input_video = 
load_video_from_file(request.input_path) inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0) - elif request.input_video: - inputs.input_video = decode_array(request.input_video, np.uint8) - inputs.input_video = loop_to_length(inputs.input_video, num_frames, axis=0) - - if request.vace_frames: - inputs.vace_frames = decode_array(request.vace_frames, np.float32) - inputs.vace_frames = loop_to_length(inputs.vace_frames, num_frames, axis=2) - if request.vace_masks: - inputs.vace_masks = decode_array(request.vace_masks, np.float32) - inputs.vace_masks = loop_to_length(inputs.vace_masks, num_frames, axis=2) - - inputs.first_frames = build_lookup(request.first_frames, "image") - inputs.last_frames = build_lookup(request.last_frames, "image") - inputs.ref_images = build_lookup(request.vace_ref_images, "images") - # Normalize prompt to weighted list format + # Default prompt if isinstance(request.prompt, str): inputs.prompts = {0: [{"text": request.prompt, "weight": PROMPT_WEIGHT}]} else: @@ -159,59 +122,110 @@ def decode_inputs( 0: [{"text": p.text, "weight": p.weight} for p in request.prompt] } - # Chunk prompts: support both text and weighted prompt lists - if request.chunk_prompts: - for spec in request.chunk_prompts: - if spec.prompts: - inputs.prompts[spec.chunk] = [ - {"text": p.text, "weight": p.weight} for p in spec.prompts - ] - elif spec.text: - inputs.prompts[spec.chunk] = [ - {"text": spec.text, "weight": PROMPT_WEIGHT} - ] - - # Per-chunk VACE specs - if request.vace_chunk_specs: + # Load binary blob if provided + blob: bytes | None = None + if request.data_blob_path: + import tempfile + + from .recording import TEMP_FILE_PREFIXES + + # Security: validate path prefix and temp dir + blob_path = Path(request.data_blob_path) + temp_dir = Path(tempfile.gettempdir()) + if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith( + TEMP_FILE_PREFIXES["generate_data"] + ): + raise ValueError( + f"Invalid data_blob_path: must be 
a temp file with prefix {TEMP_FILE_PREFIXES['generate_data']}" + ) + with open(blob_path, "rb") as f: + blob = f.read() logger.info( - f"decode_inputs: Found {len(request.vace_chunk_specs)} vace_chunk_specs" + f"decode_inputs: Loaded data blob from {request.data_blob_path} ({len(blob)} bytes)" ) - for spec in request.vace_chunk_specs: - logger.info( - f"decode_inputs: vace_chunk_spec chunk={spec.chunk}, has_frames={spec.frames is not None}, has_masks={spec.masks is not None}, context_scale={spec.context_scale}, temporally_locked={spec.vace_temporally_locked}" - ) - decoded_spec: dict = { - "vace_temporally_locked": spec.vace_temporally_locked, + + # Process chunk specs — single loop, single source of truth + for spec in request.chunk_specs or []: + # Store spec for build_chunk_kwargs + inputs.chunk_specs_map[spec.chunk] = spec + + # Prompts + if spec.prompts: + inputs.prompts[spec.chunk] = [ + {"text": p.text, "weight": p.weight} for p in spec.prompts + ] + elif spec.text: + inputs.prompts[spec.chunk] = [{"text": spec.text, "weight": PROMPT_WEIGHT}] + + # Transitions + if spec.transition_target_prompts: + inputs.transitions[spec.chunk] = { + "target_prompts": [ + {"text": p.text, "weight": p.weight} + for p in spec.transition_target_prompts + ], + "num_steps": spec.transition_num_steps or 4, + "temporal_interpolation_method": spec.transition_method or "linear", } - if spec.frames is not None: - decoded_spec["frames"] = decode_array(spec.frames, np.float32) + + # Keyframes + if spec.first_frame_image: + inputs.first_frames[spec.chunk] = spec.first_frame_image + if spec.last_frame_image: + inputs.last_frames[spec.chunk] = spec.last_frame_image + if spec.vace_ref_images: + inputs.ref_images[spec.chunk] = spec.vace_ref_images + + # VACE from blob + if blob is not None and spec.vace_frames_offset is not None: + decoded: dict = {"vace_temporally_locked": spec.vace_temporally_locked} + if spec.vace_frames_shape and spec.vace_frames_offset is not None: + count = 1 + for d 
in spec.vace_frames_shape: + count *= d + arr = np.frombuffer( + blob, dtype=np.float32, count=count, offset=spec.vace_frames_offset + ).reshape(spec.vace_frames_shape) + decoded["frames"] = arr logger.info( - f"decode_inputs: chunk {spec.chunk} decoded frames shape={decoded_spec['frames'].shape}" + f"decode_inputs: chunk {spec.chunk} VACE frames shape={arr.shape}" ) - if spec.masks is not None: - decoded_spec["masks"] = decode_array(spec.masks, np.float32) + if spec.vace_masks_shape and spec.vace_masks_offset is not None: + count = 1 + for d in spec.vace_masks_shape: + count *= d + arr = np.frombuffer( + blob, dtype=np.float32, count=count, offset=spec.vace_masks_offset + ).reshape(spec.vace_masks_shape) + decoded["masks"] = arr logger.info( - f"decode_inputs: chunk {spec.chunk} decoded masks shape={decoded_spec['masks'].shape}" + f"decode_inputs: chunk {spec.chunk} VACE masks shape={arr.shape}" ) - if spec.context_scale is not None: - decoded_spec["context_scale"] = spec.context_scale - inputs.vace_chunk_specs[spec.chunk] = decoded_spec - logger.info( - f"decode_inputs: vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}" - ) - else: - logger.info("decode_inputs: No vace_chunk_specs in request") + if spec.vace_context_scale is not None: + decoded["context_scale"] = spec.vace_context_scale + inputs.vace_chunk_specs[spec.chunk] = decoded + + # Input video from blob (per-chunk video-to-video) + if ( + blob is not None + and spec.input_video_offset is not None + and spec.input_video_shape is not None + ): + count = 1 + for d in spec.input_video_shape: + count *= d + inputs.input_video_chunks[spec.chunk] = np.frombuffer( + blob, dtype=np.uint8, count=count, offset=spec.input_video_offset + ).reshape(spec.input_video_shape) - # Build transitions lookup - if request.transitions: - for t in request.transitions: - inputs.transitions[t.chunk] = { - "target_prompts": [ - {"text": p.text, "weight": p.weight} for p in t.target_prompts - ], - "num_steps": t.num_steps, 
- "temporal_interpolation_method": t.temporal_interpolation_method, - } + logger.info( + f"decode_inputs: prompts={list(inputs.prompts.keys())}, " + f"transitions={list(inputs.transitions.keys())}, " + f"vace_specs={list(inputs.vace_chunk_specs.keys())}, " + f"input_video_chunks={list(inputs.input_video_chunks.keys())}, " + f"first_frames={list(inputs.first_frames.keys())}, " + f"last_frames={list(inputs.last_frames.keys())}" + ) return inputs @@ -228,19 +242,25 @@ def build_chunk_kwargs( dtype: torch.dtype, logger: "Logger", ) -> dict: - """Build pipeline kwargs for a single chunk.""" + """Build pipeline kwargs for a single chunk. + + Per-chunk ChunkSpec values override request-level globals. + """ + # Get per-chunk spec (if any) + spec = inputs.chunk_specs_map.get(chunk_idx) + kwargs = { "height": request.height or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT), "width": request.width or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH), - "base_seed": get_chunk_value(request.seed, chunk_idx, DEFAULT_SEED), - "init_cache": chunk_idx == 0 - or ( - request.cache_reset_chunks is not None - and chunk_idx in request.cache_reset_chunks + "base_seed": spec.seed if spec and spec.seed is not None else request.seed, + "init_cache": chunk_idx == 0 or (spec is not None and spec.reset_cache), + "manage_cache": ( + spec.manage_cache + if spec and spec.manage_cache is not None + else request.manage_cache ), - "manage_cache": request.manage_cache, } # Prompt (sticky behavior - only send when it changes) @@ -254,43 +274,73 @@ def build_chunk_kwargs( if request.denoising_steps: kwargs["denoising_step_list"] = request.denoising_steps - # Video-to-video - if inputs.input_video is not None: + # Video-to-video: per-chunk input video takes priority over global input video + if chunk_idx in inputs.input_video_chunks: + # Per-chunk input video from blob (enables v2v/t2v switching per chunk) + chunk_frames = inputs.input_video_chunks[chunk_idx] + 
chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0) + kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames] + kwargs["noise_scale"] = ( + spec.noise_scale + if spec and spec.noise_scale is not None + else request.noise_scale + ) + logger.info( + f"Chunk {chunk_idx}: Using per-chunk input video ({chunk_frames.shape[0]} frames)" + ) + elif inputs.input_video is not None: chunk_frames = inputs.input_video[start_frame:end_frame] chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0) kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames] - kwargs["noise_scale"] = get_chunk_value( - request.noise_scale, chunk_idx, DEFAULT_NOISE_SCALE + kwargs["noise_scale"] = ( + spec.noise_scale + if spec and spec.noise_scale is not None + else request.noise_scale ) else: kwargs["num_frames"] = chunk_size # VACE context scale - kwargs["vace_context_scale"] = get_chunk_value( - request.vace_context_scale, chunk_idx, 1.0 + kwargs["vace_context_scale"] = ( + spec.vace_context_scale + if spec and spec.vace_context_scale is not None + else request.vace_context_scale ) # Noise controller - if request.noise_controller is not None: - kwargs["noise_controller"] = request.noise_controller + noise_ctrl = ( + spec.noise_controller + if spec and spec.noise_controller is not None + else request.noise_controller + ) + if noise_ctrl is not None: + kwargs["noise_controller"] = noise_ctrl # KV cache attention bias - kv_bias = get_chunk_value(request.kv_cache_attention_bias, chunk_idx) + kv_bias = ( + spec.kv_cache_attention_bias + if spec and spec.kv_cache_attention_bias is not None + else request.kv_cache_attention_bias + ) if kv_bias is not None: kwargs["kv_cache_attention_bias"] = kv_bias # Prompt interpolation method - kwargs["prompt_interpolation_method"] = request.prompt_interpolation_method + kwargs["prompt_interpolation_method"] = ( + spec.prompt_interpolation_method + if spec and spec.prompt_interpolation_method is not None + else 
request.prompt_interpolation_method + ) # VACE use input video if request.vace_use_input_video is not None: kwargs["vace_use_input_video"] = request.vace_use_input_video - # LoRA scales - if request.lora_scales: + # LoRA scales: per-chunk spec overrides global + lora_scales = spec.lora_scales if spec and spec.lora_scales else request.lora_scales + if lora_scales: lora_scale_updates = [] - for path, scale_value in request.lora_scales.items(): - scale = get_chunk_value(scale_value, chunk_idx, 1.0) + for path, scale in lora_scales.items(): lora_scale_updates.append({"path": path, "scale": scale}) logger.info( f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}" @@ -313,40 +363,26 @@ def build_chunk_kwargs( if chunk_idx in inputs.ref_images: kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx] - # VACE conditioning: per-chunk spec takes priority over global + # VACE conditioning from blob logger.info( - f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}, has_global_frames={inputs.vace_frames is not None}, has_global_masks={inputs.vace_masks is not None}" + f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}" ) if chunk_idx in inputs.vace_chunk_specs: logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC") - spec = inputs.vace_chunk_specs[chunk_idx] + vace_spec = inputs.vace_chunk_specs[chunk_idx] - if "frames" in spec: - frames = spec["frames"] + if "frames" in vace_spec: + frames = vace_spec["frames"] frames = pad_chunk(frames, chunk_size, axis=2) kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype) - if "masks" in spec: - masks = spec["masks"] + if "masks" in vace_spec: + masks = vace_spec["masks"] masks = pad_chunk(masks, chunk_size, axis=2) kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype) - if "context_scale" in spec: - kwargs["vace_context_scale"] = spec["context_scale"] - else: - 
logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING GLOBAL VACE FALLBACK") - # Global VACE conditioning frames [1, C, T, H, W] - if inputs.vace_frames is not None: - chunk = inputs.vace_frames[:, :, start_frame:end_frame, :, :] - chunk = pad_chunk(chunk, chunk_size, axis=2) - kwargs["vace_input_frames"] = torch.from_numpy(chunk).to(device, dtype) - - # Global VACE masks [1, 1, T, H, W] - if inputs.vace_masks is not None: - chunk = inputs.vace_masks[:, :, start_frame:end_frame, :, :] - chunk = pad_chunk(chunk, chunk_size, axis=2) - kwargs["vace_input_masks"] = torch.from_numpy(chunk).to(device, dtype) - + if "context_scale" in vace_spec: + kwargs["vace_context_scale"] = vace_spec["context_scale"] return kwargs @@ -694,7 +730,7 @@ def generate_video_stream( pipeline = pipeline_manager.get_pipeline_by_id(request.pipeline_id) # Determine chunk size from pipeline - has_video = request.input_video is not None or request.input_path is not None + has_video = request.input_path is not None requirements = pipeline.prepare(video=[] if has_video else None) chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE num_chunks = (request.num_frames + chunk_size - 1) // chunk_size @@ -817,6 +853,14 @@ def generate_video_stream( yield sse_event("error", {"error": str(e)}) finally: + # Clean up uploaded data blob file + if request.data_blob_path: + try: + Path(request.data_blob_path).unlink(missing_ok=True) + logger.info(f"Cleaned up data blob file: {request.data_blob_path}") + except Exception as e: + logger.warning(f"Failed to clean up data blob file: {e}") + # Clean up uploaded input file if request.input_path: try: diff --git a/src/scope/server/recording.py b/src/scope/server/recording.py index bd06a3bca..280ba9fe1 100644 --- a/src/scope/server/recording.py +++ b/src/scope/server/recording.py @@ -19,6 +19,7 @@ "download": "scope_download_", "generate_input": "scope_gen_input_", "generate_output": "scope_gen_output_", + "generate_data": "scope_gen_data_", 
} # Environment variables @@ -441,6 +442,7 @@ def cleanup_recording_files(): f"{TEMP_FILE_PREFIXES['download']}*.mp4", f"{TEMP_FILE_PREFIXES['generate_input']}*.bin", f"{TEMP_FILE_PREFIXES['generate_output']}*.bin", + f"{TEMP_FILE_PREFIXES['generate_data']}*.bin", ] deleted_count = 0 diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index d450ba44e..ad61c6a32 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -818,82 +818,100 @@ class ApiKeyDeleteResponse(BaseModel): message: str -class ChunkFrameSpec(BaseModel): - """Specification for a frame image at a specific chunk.""" +class ChunkSpec(BaseModel): + """Unified per-chunk specification. All fields optional — only set what changes.""" - chunk: int = Field(..., ge=0, description="Chunk index") - image: str = Field(..., description="Path to image file") + chunk: int = Field(..., ge=0, description="Chunk index (required)") - -class ChunkPromptSpec(BaseModel): - """Specification for a prompt at a specific chunk. - - Supports both simple text and weighted prompt lists for spatial blending. 
- """ - - chunk: int = Field(..., ge=0, description="Chunk index") + # Prompt text: str | None = Field( default=None, - description="Simple prompt text for this chunk (mutually exclusive with prompts)", + description="Simple prompt text (mutually exclusive with prompts)", ) prompts: list[PromptItem] | None = Field( default=None, - description="Weighted prompt list for spatial blending at this chunk (mutually exclusive with text)", + description="Weighted prompt list for spatial blending (mutually exclusive with text)", + ) + prompt_interpolation_method: Literal["linear", "slerp"] | None = Field( + default=None, + description="Spatial interpolation method override for this chunk", ) - -class ChunkTransitionSpec(BaseModel): - """Specification for a temporal transition starting at a specific chunk.""" - - chunk: int = Field(..., ge=0, description="Chunk index where transition starts") - target_prompts: list[PromptItem] = Field( - ..., description="Target prompt blend to interpolate to" + # Temporal transition + transition_target_prompts: list[PromptItem] | None = Field( + default=None, + description="Target prompt blend to interpolate to", ) - num_steps: int = Field( - default=4, + transition_num_steps: int | None = Field( + default=None, ge=0, description="Number of generation calls to transition over (0 = instant)", ) - temporal_interpolation_method: Literal["linear", "slerp"] = Field( - default="linear", - description="Method for temporal interpolation between blends across frames", + transition_method: Literal["linear", "slerp"] | None = Field( + default=None, + description="Method for temporal interpolation between blends", ) + # Keyframe images (paths) + first_frame_image: str | None = Field( + default=None, description="Path to first frame reference image" + ) + last_frame_image: str | None = Field( + default=None, description="Path to last frame reference image" + ) + vace_ref_images: list[str] | None = Field( + default=None, description="List of reference 
image paths for VACE conditioning" + ) -class ChunkRefImagesSpec(BaseModel): - """Specification for reference images at a specific chunk.""" - - chunk: int = Field(default=0, ge=0, description="Chunk index (default: 0)") - images: list[str] = Field(..., description="List of reference image paths") - - -class ChunkVACESpec(BaseModel): - """Per-chunk VACE conditioning specification.""" - - chunk: int = Field(..., ge=0, description="Chunk index") - frames: "EncodedArray | None" = Field( - default=None, - description="VACE conditioning frames for this chunk ([1, C, T, H, W] float32 [-1, 1])", + # Generation parameters + seed: int | None = Field(default=None, description="Random seed override") + noise_scale: float | None = Field(default=None, description="Noise scale override") + kv_cache_attention_bias: float | None = Field( + default=None, description="KV cache attention bias override" ) - masks: "EncodedArray | None" = Field( - default=None, - description="VACE masks for this chunk ([1, 1, T, H, W] float32 {0, 1})", + reset_cache: bool = Field( + default=False, description="Force cache reset at this chunk" ) - context_scale: float | None = Field( - default=None, - description="VACE context scale override for this chunk. If None, uses global vace_context_scale.", + noise_controller: bool | None = Field( + default=None, description="Noise controller override" ) - vace_temporally_locked: bool = Field( - default=True, - description="When True, frames/masks are sliced temporally to match chunk position. 
When False, used as-is and padded.", + manage_cache: bool | None = Field( + default=None, description="Cache management override" ) + # LoRA scales: {path: scale} + lora_scales: dict[str, float] | None = Field( + default=None, description="LoRA scales by path for this chunk" + ) -class EncodedArray(BaseModel): - """Base64-encoded numpy array with shape metadata.""" + # VACE conditioning (offsets into binary blob) + vace_context_scale: float | None = Field( + default=None, description="VACE context scale override" + ) + vace_temporally_locked: bool = Field( + default=True, + description="When True, frames/masks are sliced temporally. When False, used as-is.", + ) + vace_frames_shape: list[int] | None = Field( + default=None, description="Shape of VACE frames ([1, C, T, H, W] float32)" + ) + vace_frames_offset: int | None = Field( + default=None, description="Byte offset into blob for VACE frames" + ) + vace_masks_shape: list[int] | None = Field( + default=None, description="Shape of VACE masks ([1, 1, T, H, W] float32)" + ) + vace_masks_offset: int | None = Field( + default=None, description="Byte offset into blob for VACE masks" + ) - base64: str = Field(..., description="Base64-encoded numpy array bytes") - shape: list[int] = Field(..., description="Array shape for decoding") + # Input video for this chunk (offset into binary blob) + input_video_shape: list[int] | None = Field( + default=None, description="Shape of per-chunk input video [T, H, W, C] uint8" + ) + input_video_offset: int | None = Field( + default=None, description="Byte offset into blob for per-chunk input video" + ) class VideoUploadResponse(BaseModel): @@ -914,14 +932,6 @@ class GenerateRequest(BaseModel): ..., description="Text prompt for generation (sent on chunk 0). Can be a simple string or a list of weighted prompts for spatial blending.", ) - chunk_prompts: list[ChunkPromptSpec] | None = Field( - default=None, - description="Prompt changes at later chunks (sticky behavior). 
Each entry supports simple text or weighted prompt lists.", - ) - transitions: list[ChunkTransitionSpec] | None = Field( - default=None, - description="Temporal transitions at specific chunks. Each specifies a target prompt blend and number of interpolation steps.", - ) num_frames: int = Field( default=64, ge=1, @@ -940,86 +950,70 @@ class GenerateRequest(BaseModel): le=2048, description="Output width (defaults to pipeline's native resolution)", ) - seed: int | list[int] = Field( - default=42, - description="Random seed. Single int applies to all chunks; list applies per-chunk.", - ) - # Video-to-video input (optional) - two mutually exclusive options - input_video: EncodedArray | None = Field( + + # Per-chunk specs (replaces all scattered per-chunk lists) + chunk_specs: list[ChunkSpec] | None = Field( default=None, - description="Input video frames (THWC, uint8). If provided, enables video-to-video mode. For large videos, use input_path instead.", + description="Unified per-chunk specifications. Each entry can override prompt, transition, " + "keyframes, generation parameters, LoRA scales, and VACE conditioning for a specific chunk.", ) - input_path: str | None = Field( + + # Binary blob path (from /generate/upload-data) + data_blob_path: str | None = Field( default=None, - description="Path to uploaded video file (from /generate/upload). Alternative to input_video for large files.", + description="Path to uploaded binary data blob (from /generate/upload-data). " + "Contains raw arrays referenced by chunk_specs offsets (VACE frames/masks, input video).", ) - noise_scale: float | list[float] = Field( - default=0.7, - description="Noise scale for video-to-video mode. 
Single float applies to all chunks; list applies per-chunk.", + + # Global defaults (applied to chunks without per-chunk override) + seed: int = Field( + default=42, + description="Random seed (default for all chunks).", ) - denoising_steps: list[int] | None = Field( - default=None, - description="Denoising timesteps (e.g., [1000, 750, 500, 250])", + noise_scale: float = Field( + default=0.7, + description="Noise scale for video-to-video mode (default for all chunks).", ) manage_cache: bool = Field( default=True, - description="Enable automatic cache management. Set to False to prevent cache resets when parameters change (e.g., LoRA scales).", - ) - cache_reset_chunks: list[int] | None = Field( - default=None, - description="List of chunk indices where the KV cache should be forcibly reset (init_cache=True). Chunk 0 always resets.", + description="Enable automatic cache management.", ) noise_controller: bool | None = Field( default=None, description="Enable automatic noise scale adjustment based on motion detection.", ) - kv_cache_attention_bias: float | list[float] | None = Field( + kv_cache_attention_bias: float | None = Field( default=None, - description="Controls reliance on past frames in cache. Lower values mitigate error accumulation. Single float applies to all chunks; list applies per-chunk. Typical values: 0.3-0.7 moderate, 0.1-0.2 strong.", + description="Controls reliance on past frames in cache. 
Lower values mitigate error accumulation.", ) prompt_interpolation_method: Literal["linear", "slerp"] = Field( default="linear", - description="Spatial interpolation method for blending multiple prompts: linear (weighted average) or slerp (spherical).", - ) - vace_use_input_video: bool | None = Field( - default=None, - description="When enabled in video-to-video mode, input video is used for VACE conditioning instead of latent initialization.", - ) - # Per-chunk parameters - lora_scales: dict[str, float | list[float]] | None = Field( - default=None, - description="LoRA scales by path. Single float applies to all chunks; list applies per-chunk. Example: {'path/to/lora.pt': 0.8} or {'path/to/lora.pt': [0.5, 0.7, 0.9]}", + description="Spatial interpolation method for blending multiple prompts.", ) - vace_context_scale: float | list[float] = Field( + vace_context_scale: float = Field( default=1.0, - description="VACE context scale. Single float applies to all chunks; list applies per-chunk.", - ) - # Keyframe specifications (chunk, image) pairs - first_frames: list[ChunkFrameSpec] | None = Field( - default=None, - description="First frame anchors. Each specifies a chunk index and image path to use as that chunk's first frame.", - ) - last_frames: list[ChunkFrameSpec] | None = Field( - default=None, - description="Last frame anchors. Each specifies a chunk index and image path to use as that chunk's last frame.", + description="VACE context scale (default for all chunks).", ) - vace_ref_images: list[ChunkRefImagesSpec] | None = Field( + vace_use_input_video: bool | None = Field( default=None, - description="Reference images for VACE conditioning. Each specifies a chunk index and list of image paths.", + description="When enabled in video-to-video mode, input video is used for VACE conditioning.", ) - # VACE conditioning frames/masks (for depth guidance, inpainting, etc.) 
- vace_frames: EncodedArray | None = Field( + denoising_steps: list[int] | None = Field( default=None, - description="VACE conditioning frames ([1, C, T, H, W] float32 [-1, 1]). Used for depth guidance, structural control, etc.", + description="Denoising timesteps (e.g., [1000, 750, 500, 250])", ) - vace_masks: EncodedArray | None = Field( + lora_scales: dict[str, float] | None = Field( default=None, - description="VACE masks ([1, 1, T, H, W] float32 {0, 1}). Used for inpainting (1 = regenerate, 0 = keep).", + description="Global LoRA scales by path (default for all chunks).", ) - vace_chunk_specs: list[ChunkVACESpec] | None = Field( + + # Video-to-video input (file-based upload) + input_path: str | None = Field( default=None, - description="Per-chunk VACE conditioning. Each specifies frames/masks for a specific chunk. Overrides global vace_frames/vace_masks for that chunk.", + description="Path to uploaded video file (from /generate/upload).", ) + + # Processors pre_processor_id: str | None = Field( default=None, description="Pipeline ID for pre-processing each chunk before the main pipeline.", @@ -1030,6 +1024,15 @@ class GenerateRequest(BaseModel): ) +class DataUploadResponse(BaseModel): + """Response after uploading binary data blob for generate request.""" + + data_blob_path: str = Field( + ..., description="Path to uploaded data blob file for generate request" + ) + size_bytes: int = Field(..., description="Size of the uploaded blob in bytes") + + class GenerateResponse(BaseModel): """Response from batch video generation. 
From d2c66b9022be9f585114fdf5d034fa72af65c7a4 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 20 Feb 2026 11:48:08 -0500 Subject: [PATCH 14/16] cleanup Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- docs/api/generate.md | 102 ++++++ src/scope/server/app.py | 21 +- src/scope/server/generate.py | 682 +++++++++++++++-------------------- src/scope/server/schema.py | 19 +- 4 files changed, 408 insertions(+), 416 deletions(-) create mode 100644 docs/api/generate.md diff --git a/docs/api/generate.md b/docs/api/generate.md new file mode 100644 index 000000000..3f22d4d44 --- /dev/null +++ b/docs/api/generate.md @@ -0,0 +1,102 @@ +# Generate Endpoint + +Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the generate endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events. + +Primary consumer: ComfyUI custom nodes (`comfyui-scope`). + +## Endpoints + +| Endpoint | Method | Purpose | +|---|---|---| +| `/api/v1/generate` | POST | Generate video (SSE stream) | +| `/api/v1/generate/cancel` | POST | Cancel after current chunk | +| `/api/v1/generate/upload` | POST | Upload input video for v2v | +| `/api/v1/generate/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) | +| `/api/v1/generate/download` | GET | Download output video | + +Only one generation can run at a time (409 if busy). + +## Flow + +``` +1. [optional] POST /generate/upload → input_path +2. [optional] POST /generate/upload-data → data_blob_path +3. POST /generate (JSON body, references paths from steps 1-2) + ← SSE: event: progress {chunk, total_chunks, frames, latency, fps} + ← SSE: event: complete {output_path, video_shape, num_frames, ...} +4. 
GET /generate/download?path=<output_path> + ← binary video data +``` + +## Binary Protocol + +### Video Upload (`/generate/upload`) + +**Request**: Raw uint8 bytes in THWC order (frames × height × width × channels). + +**Headers** (required unless a default is noted): +- `X-Video-Frames`: T +- `X-Video-Height`: H +- `X-Video-Width`: W +- `X-Video-Channels`: C (default 3) + +**Stored format**: 20-byte header + raw data. +``` +[4 bytes: ndim (little-endian u32)] +[4 bytes × ndim: shape dimensions (little-endian u32 each)] +[raw uint8 video bytes] +``` + +### Data Blob Upload (`/generate/upload-data`) + +**Request**: Raw binary blob containing packed arrays. Max size: 2 GB. + +The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request reference regions of this blob by offset: + +```json +{ + "chunk": 0, + "vace_frames_offset": 0, + "vace_frames_shape": [1, 3, 12, 320, 576], + "vace_masks_offset": 26542080, + "vace_masks_shape": [1, 1, 12, 320, 576] +} +``` + +Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input video). The client is responsible for computing offsets when packing the blob. + +### Video Download (`/generate/download`) + +**Response**: Same binary format as upload (20-byte header + raw uint8 THWC data). + +**Response headers**: +- `X-Video-Frames`, `X-Video-Height`, `X-Video-Width`, `X-Video-Channels` + +## GenerateRequest + +```json +{ + "pipeline_id": "longlive", + "prompt": "a cat walking", + "num_frames": 48, + "seed": 42, + "noise_scale": 0.7, + "input_path": "", + "data_blob_path": "", + "chunk_specs": [ + { + "chunk": 0, + "text": "override prompt for chunk 0", + "lora_scales": {"path/to/lora.safetensors": 0.5}, + "vace_frames_offset": 0, + "vace_frames_shape": [1, 3, 12, 320, 576] + } + ], + "pre_processor_id": null, + "post_processor_id": null +} +``` + +Request-level fields are global defaults. `chunk_specs` entries override any field for a specific chunk index.
Only fields that change need to be specified — prompts are sticky (last-set persists). + +See `schema.py` for the full `GenerateRequest` and `ChunkSpec` field definitions. diff --git a/src/scope/server/app.py b/src/scope/server/app.py index 75cb79e88..10fe79c39 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -1134,6 +1134,14 @@ async def generate_video( pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager), ): """Generate video frames in batch mode with SSE progress streaming.""" + from .generate import is_generation_active + + if is_generation_active(): + raise HTTPException( + status_code=409, + detail="A generation is already in progress. Cancel it first or wait for completion.", + ) + status_info = await pipeline_manager.get_status_info_async() if status_info["status"] != "loaded": raise HTTPException( @@ -1253,12 +1261,21 @@ async def upload_data_blob(request: Request): ".bin", TEMP_FILE_PREFIXES["generate_data"] ) - # Stream body to file + from .generate import MAX_DATA_BLOB_BYTES + + # Stream body to file with size limit bytes_written = 0 with open(file_path, "wb") as f: async for chunk in request.stream(): - f.write(chunk) bytes_written += len(chunk) + if bytes_written > MAX_DATA_BLOB_BYTES: + f.close() + Path(file_path).unlink(missing_ok=True) + raise HTTPException( + status_code=413, + detail=f"Data blob exceeds maximum size of {MAX_DATA_BLOB_BYTES} bytes", + ) + f.write(chunk) if bytes_written == 0: Path(file_path).unlink(missing_ok=True) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 3251738ab..93d91b25c 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -8,7 +8,7 @@ from collections.abc import Iterator from dataclasses import dataclass, field from pathlib import Path -from typing import TYPE_CHECKING +from typing import IO, TYPE_CHECKING import numpy as np import torch @@ -16,6 +16,12 @@ # Cancellation support (single-client, so one event suffices) 
_cancel_event = threading.Event() +# Generation lock (single-client: only one generation at a time) +_generation_lock = threading.Lock() + +# Max data blob upload size (2 GB) +MAX_DATA_BLOB_BYTES = 2 * 1024 * 1024 * 1024 + def cancel_generation(): """Signal the current generation to stop after the current chunk.""" @@ -27,6 +33,11 @@ def is_generation_cancelled() -> bool: return _cancel_event.is_set() +def is_generation_active() -> bool: + """Check if a generation is currently in progress.""" + return _generation_lock.locked() + + # Defaults DEFAULT_HEIGHT = 320 DEFAULT_WIDTH = 576 @@ -42,6 +53,11 @@ def is_generation_cancelled() -> bool: from .schema import ChunkSpec, GenerateRequest +# --------------------------------------------------------------------------- +# Array utilities +# --------------------------------------------------------------------------- + + def loop_to_length(arr: np.ndarray, target: int, axis: int) -> np.ndarray: """Tile array along axis to reach target length.""" current = arr.shape[axis] @@ -66,11 +82,21 @@ def pad_chunk(arr: np.ndarray, target_size: int, axis: int) -> np.ndarray: return np.concatenate([arr, padding], axis=axis) +# --------------------------------------------------------------------------- +# SSE helpers +# --------------------------------------------------------------------------- + + def sse_event(event_type: str, data: dict) -> str: """Format a server-sent event.""" return f"event: {event_type}\ndata: {json.dumps(data)}\n\n" +# --------------------------------------------------------------------------- +# Dataclasses +# --------------------------------------------------------------------------- + + @dataclass class DecodedInputs: """Decoded and preprocessed inputs for generation.""" @@ -86,15 +112,81 @@ class DecodedInputs: chunk_specs_map: "dict[int, ChunkSpec]" = field(default_factory=dict) -def load_video_from_file(file_path: str) -> np.ndarray: - """Load video from temp file. 
+@dataclass +class GenerationState: + """Mutable state accumulated during chunk-by-chunk generation.""" + + output_file: IO[bytes] + num_chunks: int + logger: "Logger" + total_frames: int = 0 + height: int | None = None + width: int | None = None + channels: int | None = None + latencies: list[float] = field(default_factory=list) + fps_measures: list[float] = field(default_factory=list) + + def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str: + """Write chunk output to file and return SSE progress event.""" + chunk_output = result["video"] + num_output_frames = chunk_output.shape[0] + chunk_fps = num_output_frames / chunk_latency + + self.latencies.append(chunk_latency) + self.fps_measures.append(chunk_fps) + + self.logger.info( + f"Chunk {chunk_idx + 1}/{self.num_chunks}: " + f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" + ) - Args: - file_path: Path to video file with header + chunk_np = chunk_output.detach().cpu().numpy() + chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8) + self.output_file.write(chunk_uint8.tobytes()) - Returns: - Video array [T, H, W, C] uint8 - """ + self.total_frames += num_output_frames + if self.height is None: + self.height = chunk_np.shape[1] + self.width = chunk_np.shape[2] + self.channels = chunk_np.shape[3] + + return sse_event( + "progress", + { + "chunk": chunk_idx + 1, + "total_chunks": self.num_chunks, + "frames": num_output_frames, + "latency": round(chunk_latency, 3), + "fps": round(chunk_fps, 2), + }, + ) + + @property + def output_shape(self) -> list[int]: + return [self.total_frames, self.height, self.width, self.channels] + + def log_summary(self): + """Log performance summary.""" + if not self.latencies: + return + avg_lat = sum(self.latencies) / len(self.latencies) + avg_fps = sum(self.fps_measures) / len(self.fps_measures) + self.logger.info( + f"=== Performance Summary ({self.num_chunks} chunks) ===\n" + f" Latency - Avg: {avg_lat:.2f}s, " + 
f"Max: {max(self.latencies):.2f}s, Min: {min(self.latencies):.2f}s\n" + f" FPS - Avg: {avg_fps:.2f}, " + f"Max: {max(self.fps_measures):.2f}, Min: {min(self.fps_measures):.2f}" + ) + + +# --------------------------------------------------------------------------- +# Input decoding +# --------------------------------------------------------------------------- + + +def load_video_from_file(file_path: str) -> np.ndarray: + """Load video from temp file with header (ndim + shape + raw uint8).""" with open(file_path, "rb") as f: ndim = int.from_bytes(f.read(4), "little") shape = tuple(int.from_bytes(f.read(4), "little") for _ in range(ndim)) @@ -102,6 +194,16 @@ def load_video_from_file(file_path: str) -> np.ndarray: return data +def _read_blob_array( + blob: bytes, offset: int, shape: list[int], dtype=np.float32 +) -> np.ndarray: + """Read a contiguous array from a binary blob at a given offset.""" + count = 1 + for d in shape: + count *= d + return np.frombuffer(blob, dtype=dtype, count=count, offset=offset).reshape(shape) + + def decode_inputs( request: "GenerateRequest", num_frames: int, logger: "Logger" ) -> DecodedInputs: @@ -129,7 +231,6 @@ def decode_inputs( from .recording import TEMP_FILE_PREFIXES - # Security: validate path prefix and temp dir blob_path = Path(request.data_blob_path) temp_dir = Path(tempfile.gettempdir()) if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith( @@ -146,7 +247,6 @@ def decode_inputs( # Process chunk specs — single loop, single source of truth for spec in request.chunk_specs or []: - # Store spec for build_chunk_kwargs inputs.chunk_specs_map[spec.chunk] = spec # Prompts @@ -180,23 +280,17 @@ def decode_inputs( if blob is not None and spec.vace_frames_offset is not None: decoded: dict = {"vace_temporally_locked": spec.vace_temporally_locked} if spec.vace_frames_shape and spec.vace_frames_offset is not None: - count = 1 - for d in spec.vace_frames_shape: - count *= d - arr = np.frombuffer( - blob, 
dtype=np.float32, count=count, offset=spec.vace_frames_offset - ).reshape(spec.vace_frames_shape) + arr = _read_blob_array( + blob, spec.vace_frames_offset, spec.vace_frames_shape + ) decoded["frames"] = arr logger.info( f"decode_inputs: chunk {spec.chunk} VACE frames shape={arr.shape}" ) if spec.vace_masks_shape and spec.vace_masks_offset is not None: - count = 1 - for d in spec.vace_masks_shape: - count *= d - arr = np.frombuffer( - blob, dtype=np.float32, count=count, offset=spec.vace_masks_offset - ).reshape(spec.vace_masks_shape) + arr = _read_blob_array( + blob, spec.vace_masks_offset, spec.vace_masks_shape + ) decoded["masks"] = arr logger.info( f"decode_inputs: chunk {spec.chunk} VACE masks shape={arr.shape}" @@ -211,12 +305,9 @@ def decode_inputs( and spec.input_video_offset is not None and spec.input_video_shape is not None ): - count = 1 - for d in spec.input_video_shape: - count *= d - inputs.input_video_chunks[spec.chunk] = np.frombuffer( - blob, dtype=np.uint8, count=count, offset=spec.input_video_offset - ).reshape(spec.input_video_shape) + inputs.input_video_chunks[spec.chunk] = _read_blob_array( + blob, spec.input_video_offset, spec.input_video_shape, dtype=np.uint8 + ) logger.info( f"decode_inputs: prompts={list(inputs.prompts.keys())}, " @@ -230,6 +321,21 @@ def decode_inputs( return inputs +# --------------------------------------------------------------------------- +# Chunk kwargs builder +# --------------------------------------------------------------------------- + + +def _resolve(spec, attr: str, request, fallback=None): + """Return per-chunk spec value if set, else request-level value, else fallback.""" + if spec is not None: + val = getattr(spec, attr, None) + if val is not None: + return val + val = getattr(request, attr, None) + return val if val is not None else fallback + + def build_chunk_kwargs( request: "GenerateRequest", inputs: DecodedInputs, @@ -246,24 +352,22 @@ def build_chunk_kwargs( Per-chunk ChunkSpec values override 
request-level globals. """ - # Get per-chunk spec (if any) spec = inputs.chunk_specs_map.get(chunk_idx) + load_params = status_info.get("load_params", {}) - kwargs = { + kwargs: dict = { "height": request.height - or status_info.get("load_params", {}).get("height", DEFAULT_HEIGHT), + if request.height is not None + else load_params.get("height", DEFAULT_HEIGHT), "width": request.width - or status_info.get("load_params", {}).get("width", DEFAULT_WIDTH), - "base_seed": spec.seed if spec and spec.seed is not None else request.seed, + if request.width is not None + else load_params.get("width", DEFAULT_WIDTH), + "base_seed": _resolve(spec, "seed", request, DEFAULT_SEED), "init_cache": chunk_idx == 0 or (spec is not None and spec.reset_cache), - "manage_cache": ( - spec.manage_cache - if spec and spec.manage_cache is not None - else request.manage_cache - ), + "manage_cache": _resolve(spec, "manage_cache", request, True), } - # Prompt (sticky behavior - only send when it changes) + # Prompt (sticky — only send when it changes) if chunk_idx in inputs.prompts: kwargs["prompts"] = inputs.prompts[chunk_idx] @@ -274,79 +378,54 @@ def build_chunk_kwargs( if request.denoising_steps: kwargs["denoising_step_list"] = request.denoising_steps - # Video-to-video: per-chunk input video takes priority over global input video + # Video-to-video: per-chunk input video takes priority over global if chunk_idx in inputs.input_video_chunks: - # Per-chunk input video from blob (enables v2v/t2v switching per chunk) - chunk_frames = inputs.input_video_chunks[chunk_idx] - chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0) + chunk_frames = pad_chunk( + inputs.input_video_chunks[chunk_idx], chunk_size, axis=0 + ) kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames] - kwargs["noise_scale"] = ( - spec.noise_scale - if spec and spec.noise_scale is not None - else request.noise_scale + kwargs["noise_scale"] = _resolve( + spec, "noise_scale", request, DEFAULT_NOISE_SCALE 
) logger.info( f"Chunk {chunk_idx}: Using per-chunk input video ({chunk_frames.shape[0]} frames)" ) elif inputs.input_video is not None: - chunk_frames = inputs.input_video[start_frame:end_frame] - chunk_frames = pad_chunk(chunk_frames, chunk_size, axis=0) + chunk_frames = pad_chunk( + inputs.input_video[start_frame:end_frame], chunk_size, axis=0 + ) kwargs["video"] = [torch.from_numpy(f).unsqueeze(0) for f in chunk_frames] - kwargs["noise_scale"] = ( - spec.noise_scale - if spec and spec.noise_scale is not None - else request.noise_scale + kwargs["noise_scale"] = _resolve( + spec, "noise_scale", request, DEFAULT_NOISE_SCALE ) else: kwargs["num_frames"] = chunk_size - # VACE context scale - kwargs["vace_context_scale"] = ( - spec.vace_context_scale - if spec and spec.vace_context_scale is not None - else request.vace_context_scale + kwargs["vace_context_scale"] = _resolve(spec, "vace_context_scale", request, 1.0) + kwargs["prompt_interpolation_method"] = _resolve( + spec, "prompt_interpolation_method", request, "linear" ) - # Noise controller - noise_ctrl = ( - spec.noise_controller - if spec and spec.noise_controller is not None - else request.noise_controller - ) + # Optional overrides (only include in kwargs when non-None) + noise_ctrl = _resolve(spec, "noise_controller", request) if noise_ctrl is not None: kwargs["noise_controller"] = noise_ctrl - # KV cache attention bias - kv_bias = ( - spec.kv_cache_attention_bias - if spec and spec.kv_cache_attention_bias is not None - else request.kv_cache_attention_bias - ) + kv_bias = _resolve(spec, "kv_cache_attention_bias", request) if kv_bias is not None: kwargs["kv_cache_attention_bias"] = kv_bias - # Prompt interpolation method - kwargs["prompt_interpolation_method"] = ( - spec.prompt_interpolation_method - if spec and spec.prompt_interpolation_method is not None - else request.prompt_interpolation_method - ) - - # VACE use input video if request.vace_use_input_video is not None: kwargs["vace_use_input_video"] = 
request.vace_use_input_video # LoRA scales: per-chunk spec overrides global lora_scales = spec.lora_scales if spec and spec.lora_scales else request.lora_scales if lora_scales: - lora_scale_updates = [] - for path, scale in lora_scales.items(): - lora_scale_updates.append({"path": path, "scale": scale}) - logger.info( - f"Chunk {chunk_idx}: LoRA scale={scale:.3f} for {Path(path).name}" - ) - if lora_scale_updates: - kwargs["lora_scales"] = lora_scale_updates + kwargs["lora_scales"] = [ + {"path": p, "scale": s} for p, s in lora_scales.items() + ] + for p, s in lora_scales.items(): + logger.info(f"Chunk {chunk_idx}: LoRA scale={s:.3f} for {Path(p).name}") # Keyframes if chunk_idx in inputs.first_frames: @@ -354,229 +433,96 @@ def build_chunk_kwargs( kwargs["extension_mode"] = ( "firstlastframe" if chunk_idx in inputs.last_frames else "firstframe" ) - if chunk_idx in inputs.last_frames: kwargs["last_frame_image"] = inputs.last_frames[chunk_idx] if chunk_idx not in inputs.first_frames: kwargs["extension_mode"] = "lastframe" - if chunk_idx in inputs.ref_images: kwargs["vace_ref_images"] = inputs.ref_images[chunk_idx] # VACE conditioning from blob - logger.info( - f"build_chunk_kwargs: chunk {chunk_idx}, vace_chunk_specs keys={list(inputs.vace_chunk_specs.keys())}" - ) if chunk_idx in inputs.vace_chunk_specs: - logger.info(f"build_chunk_kwargs: chunk {chunk_idx} USING PER-CHUNK VACE SPEC") vace_spec = inputs.vace_chunk_specs[chunk_idx] - if "frames" in vace_spec: - frames = vace_spec["frames"] - frames = pad_chunk(frames, chunk_size, axis=2) + frames = pad_chunk(vace_spec["frames"], chunk_size, axis=2) kwargs["vace_input_frames"] = torch.from_numpy(frames).to(device, dtype) - if "masks" in vace_spec: - masks = vace_spec["masks"] - masks = pad_chunk(masks, chunk_size, axis=2) + masks = pad_chunk(vace_spec["masks"], chunk_size, axis=2) kwargs["vace_input_masks"] = torch.from_numpy(masks).to(device, dtype) - if "context_scale" in vace_spec: kwargs["vace_context_scale"] = 
vace_spec["context_scale"] + return kwargs +# --------------------------------------------------------------------------- +# Chunk logging +# --------------------------------------------------------------------------- + +# (key, message, condition) — message uses {v} for the value; condition is an optional predicate on the value (None = always log) +_CHUNK_LOG_ENTRIES = [ + ("init_cache", "Resetting cache (init_cache=True)", lambda v: v), + ("extension_mode", "Extension mode: {v}", None), + ("vace_context_scale", "VACE context scale: {v}", lambda v: v != 1.0), + ("vace_use_input_video", "VACE use input video: {v}", None), + ("denoising_step_list", "Denoising steps: {v}", None), + ("noise_controller", "Using noise controller: {v}", None), + ("kv_cache_attention_bias", "KV cache attention bias: {v}", None), +] + + def _log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logger"): """Log detailed chunk information.""" - logger.info(f"generate_video_stream: Starting chunk {chunk_idx + 1}/{num_chunks}") - if kwargs.get("init_cache"): - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Resetting cache (init_cache=True)" - ) + prefix = f"generate: Chunk {chunk_idx}" + logger.info(f"generate: Starting chunk {chunk_idx + 1}/{num_chunks}") + + # Structured entries if "prompts" in kwargs: - prompt_texts = [p["text"] for p in kwargs["prompts"]] - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Updating prompt to {prompt_texts}" - ) + logger.info(f"{prefix}: Prompt → {[p['text'] for p in kwargs['prompts']]}") if "transition" in kwargs: - target_texts = [p["text"] for p in kwargs["transition"]["target_prompts"]] + t = kwargs["transition"] logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Temporal transition to {target_texts} " - f"over {kwargs['transition']['num_steps']} steps " - f"(method: {kwargs['transition']['temporal_interpolation_method']})" + f"{prefix}: Transition → {[p['text'] for p in t['target_prompts']]} " + f"over {t['num_steps']} steps ({t['temporal_interpolation_method']})" ) if
"first_frame_image" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using first frame keyframe" - ) + logger.info(f"{prefix}: Using first frame keyframe") if "last_frame_image" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using last frame keyframe" - ) - if "extension_mode" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Extension mode: {kwargs['extension_mode']}" - ) + logger.info(f"{prefix}: Using last frame keyframe") if "vace_ref_images" in kwargs: logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using {len(kwargs['vace_ref_images'])} VACE reference images" + f"{prefix}: Using {len(kwargs['vace_ref_images'])} VACE reference images" ) if "vace_input_frames" in kwargs: logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE input frames shape: {kwargs['vace_input_frames'].shape}" + f"{prefix}: VACE input frames shape: {kwargs['vace_input_frames'].shape}" ) if "vace_input_masks" in kwargs: logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE input masks shape: {kwargs['vace_input_masks'].shape}" - ) - if "vace_context_scale" in kwargs and kwargs["vace_context_scale"] != 1.0: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE context scale: {kwargs['vace_context_scale']}" - ) - if "vace_use_input_video" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: VACE use input video: {kwargs['vace_use_input_video']}" + f"{prefix}: VACE input masks shape: {kwargs['vace_input_masks'].shape}" ) if "video" in kwargs: logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Video-to-video mode with {len(kwargs['video'])} frames, noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)}" + f"{prefix}: Video-to-video ({len(kwargs['video'])} frames, " + f"noise_scale={kwargs.get('noise_scale', DEFAULT_NOISE_SCALE)})" ) elif "num_frames" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Text-to-video mode generating 
{kwargs['num_frames']} frames" - ) - if "denoising_step_list" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Denoising steps: {kwargs['denoising_step_list']}" - ) - if "noise_controller" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: Using noise controller: {kwargs['noise_controller']}" - ) - if "kv_cache_attention_bias" in kwargs: - logger.info( - f"generate_video_stream: Chunk {chunk_idx}: KV cache attention bias: {kwargs['kv_cache_attention_bias']}" - ) - + logger.info(f"{prefix}: Text-to-video ({kwargs['num_frames']} frames)") -def _write_chunk_output( - result: dict, - chunk_idx: int, - num_chunks: int, - chunk_latency: float, - output_file, - latency_measures: list, - fps_measures: list, - logger: "Logger", - total_frames_ref: list, - dimensions_ref: list, -) -> str: - """Write chunk output to file and return SSE progress event.""" - chunk_output = result["video"] - num_output_frames = chunk_output.shape[0] - chunk_fps = num_output_frames / chunk_latency + # Table-driven simple entries + for key, msg, condition in _CHUNK_LOG_ENTRIES: + if key in kwargs: + v = kwargs[key] + if condition is None or condition(v): + logger.info(f"{prefix}: {msg.format(v=v)}") - latency_measures.append(chunk_latency) - fps_measures.append(chunk_fps) - logger.info( - f"Chunk {chunk_idx + 1}/{num_chunks}: " - f"{num_output_frames} frames, latency={chunk_latency:.2f}s, fps={chunk_fps:.2f}" - ) - - chunk_np = chunk_output.detach().cpu().numpy() - chunk_uint8 = (chunk_np * 255).clip(0, 255).astype(np.uint8) - output_file.write(chunk_uint8.tobytes()) - - total_frames_ref[0] += num_output_frames - if dimensions_ref[0] is None: - dimensions_ref[0] = chunk_np.shape[1] - dimensions_ref[1] = chunk_np.shape[2] - dimensions_ref[2] = chunk_np.shape[3] - - return sse_event( - "progress", - { - "chunk": chunk_idx + 1, - "total_chunks": num_chunks, - "frames": num_output_frames, - "latency": round(chunk_latency, 3), - "fps": round(chunk_fps, 2), - 
}, - ) +# --------------------------------------------------------------------------- +# Generation engine +# --------------------------------------------------------------------------- -def _generate_sequential( - request: "GenerateRequest", - pipeline, - inputs: DecodedInputs, - num_chunks: int, - chunk_size: int, - status_info: dict, - device: torch.device, - dtype: torch.dtype, - output_file, - latency_measures: list, - fps_measures: list, - logger: "Logger", - total_frames_ref: list, - dimensions_ref: list, -) -> Iterator[str]: - """Sequential chunk processing (original code path, no processors).""" - for chunk_idx in range(num_chunks): - if _cancel_event.is_set(): - logger.info("Generation cancelled by user") - yield sse_event( - "cancelled", - { - "chunk": chunk_idx, - "total_chunks": num_chunks, - "frames_completed": total_frames_ref[0], - }, - ) - return - - start_frame = chunk_idx * chunk_size - end_frame = min(start_frame + chunk_size, request.num_frames) - - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - kwargs = build_chunk_kwargs( - request, - inputs, - chunk_idx, - chunk_size, - start_frame, - end_frame, - status_info, - device, - dtype, - logger, - ) - _log_chunk_info(kwargs, chunk_idx, num_chunks, logger) - - chunk_start = time.time() - with torch.amp.autocast("cuda", dtype=dtype): - result = pipeline(**kwargs) - chunk_latency = time.time() - chunk_start - - yield _write_chunk_output( - result, - chunk_idx, - num_chunks, - chunk_latency, - output_file, - latency_measures, - fps_measures, - logger, - total_frames_ref, - dimensions_ref, - ) - - -def _generate_with_processors( +def _generate_chunks( request: "GenerateRequest", pipeline, pipeline_manager: "PipelineManager", @@ -586,51 +532,50 @@ def _generate_with_processors( status_info: dict, device: torch.device, dtype: torch.dtype, - output_file, - latency_measures: list, - fps_measures: list, + state: GenerationState, logger: "Logger", - total_frames_ref: list, - 
dimensions_ref: list, ) -> Iterator[str]: - """Chunk processing with pre/post processor pipeline chaining.""" + """Process chunks through a processor chain, yielding SSE events. + + Always uses PipelineProcessor — when there are no pre/post processors + the chain is just [main_pipeline]. + """ from .pipeline_processor import _SENTINEL, PipelineProcessor - # Build the processor chain + # Build processor chain: [pre?] → main → [post?] processors: list[PipelineProcessor] = [] if request.pre_processor_id: pre_pipeline = pipeline_manager.get_pipeline_by_id(request.pre_processor_id) - pre_proc = PipelineProcessor( - pipeline=pre_pipeline, - pipeline_id=request.pre_processor_id, - batch_mode=True, + processors.append( + PipelineProcessor( + pipeline=pre_pipeline, + pipeline_id=request.pre_processor_id, + batch_mode=True, + ) ) - processors.append(pre_proc) logger.info(f"Pre-processor: {request.pre_processor_id}") - main_proc = PipelineProcessor( - pipeline=pipeline, - pipeline_id=request.pipeline_id, - batch_mode=True, + processors.append( + PipelineProcessor( + pipeline=pipeline, pipeline_id=request.pipeline_id, batch_mode=True + ) ) - processors.append(main_proc) if request.post_processor_id: post_pipeline = pipeline_manager.get_pipeline_by_id(request.post_processor_id) - post_proc = PipelineProcessor( - pipeline=post_pipeline, - pipeline_id=request.post_processor_id, - batch_mode=True, + processors.append( + PipelineProcessor( + pipeline=post_pipeline, + pipeline_id=request.post_processor_id, + batch_mode=True, + ) ) - processors.append(post_proc) logger.info(f"Post-processor: {request.post_processor_id}") - # Chain processors + # Chain and start for i in range(len(processors) - 1): processors[i].set_next_processor(processors[i + 1]) - - # Start all processors for proc in processors: proc.start() @@ -638,7 +583,6 @@ def _generate_with_processors( last_proc = processors[-1] try: - # Feed chunks into the first processor's input queue for chunk_idx in range(num_chunks): 
if _cancel_event.is_set(): logger.info("Generation cancelled by user") @@ -647,7 +591,7 @@ def _generate_with_processors( { "chunk": chunk_idx, "total_chunks": num_chunks, - "frames_completed": total_frames_ref[0], + "frames_completed": state.total_frames, }, ) return @@ -675,10 +619,9 @@ def _generate_with_processors( chunk_start = time.time() - # Feed kwargs into chain (blocking put) first_proc.input_queue.put(kwargs) - # Collect result from last processor (blocking get) + # Collect result from last processor while True: try: result = last_proc.output_queue.get(timeout=1.0) @@ -689,29 +632,21 @@ def _generate_with_processors( continue chunk_latency = time.time() - chunk_start - - yield _write_chunk_output( - result, - chunk_idx, - num_chunks, - chunk_latency, - output_file, - latency_measures, - fps_measures, - logger, - total_frames_ref, - dimensions_ref, - ) + yield state.write_chunk(result, chunk_idx, chunk_latency) # Signal end of input first_proc.input_queue.put(_SENTINEL) finally: - # Stop all processors for proc in processors: proc.stop() +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + def generate_video_stream( request: "GenerateRequest", pipeline_manager: "PipelineManager", @@ -721,7 +656,12 @@ def generate_video_stream( """Generate video frames, yielding SSE events. Writes output to temp file incrementally, returns output_path for download. + Only one generation can run at a time (single-client). 
""" + if not _generation_lock.acquire(blocking=False): + yield sse_event("error", {"error": "A generation is already in progress"}) + return + _cancel_event.clear() output_file_path = None completed = False @@ -735,16 +675,12 @@ def generate_video_stream( chunk_size = requirements.input_size if requirements else DEFAULT_CHUNK_SIZE num_chunks = (request.num_frames + chunk_size - 1) // chunk_size - # Decode inputs (supports both file-based and base64) inputs = decode_inputs(request, request.num_frames, logger) - # Setup device = torch.device("cuda" if torch.cuda.is_available() else "cpu") dtype = torch.bfloat16 - latency_measures = [] - fps_measures = [] - # Create output file for incremental writing (reuse recording pattern) + # Create output file with placeholder header from .recording import TEMP_FILE_PREFIXES, RecordingManager output_file_path = RecordingManager._create_temp_file( @@ -752,67 +688,32 @@ def generate_video_stream( ) output_file = open(output_file_path, "wb") - # We'll write a placeholder header, then update it at the end - # Header format: ndim (4 bytes) + shape (4 * ndim bytes) - # For video [T, H, W, C], that's 4 + 16 = 20 bytes - header_size = 4 + 4 * 4 # ndim + 4 dimensions - output_file.write(b"\x00" * header_size) # Placeholder - - total_frames = 0 - video_height = None - video_width = None - video_channels = None - - # Determine if we need processor chaining - use_processors = ( - request.pre_processor_id is not None - or request.post_processor_id is not None + # Header: ndim (4 bytes) + shape (4 * ndim bytes) = 20 bytes for [T, H, W, C] + header_size = 4 + 4 * 4 + output_file.write(b"\x00" * header_size) + + state = GenerationState( + output_file=output_file, num_chunks=num_chunks, logger=logger ) try: - if use_processors: - yield from _generate_with_processors( - request, - pipeline, - pipeline_manager, - inputs, - num_chunks, - chunk_size, - status_info, - device, - dtype, - output_file, - latency_measures, - fps_measures, - logger, - 
_total_frames_ref := [0], - _dimensions_ref := [None, None, None], - ) - total_frames = _total_frames_ref[0] - video_height, video_width, video_channels = _dimensions_ref - else: - yield from _generate_sequential( - request, - pipeline, - inputs, - num_chunks, - chunk_size, - status_info, - device, - dtype, - output_file, - latency_measures, - fps_measures, - logger, - _total_frames_ref := [0], - _dimensions_ref := [None, None, None], - ) - total_frames = _total_frames_ref[0] - video_height, video_width, video_channels = _dimensions_ref + yield from _generate_chunks( + request, + pipeline, + pipeline_manager, + inputs, + num_chunks, + chunk_size, + status_info, + device, + dtype, + state, + logger, + ) # Update header with actual shape output_file.seek(0) - shape = (total_frames, video_height, video_width, video_channels) + shape = tuple(state.output_shape) output_file.write(len(shape).to_bytes(4, "little")) for dim in shape: output_file.write(dim.to_bytes(4, "little")) @@ -821,27 +722,14 @@ def generate_video_stream( output_file.close() logger.info(f"Output video saved: {output_file_path}") - - # Log performance summary - if latency_measures: - avg_latency = sum(latency_measures) / len(latency_measures) - avg_fps = sum(fps_measures) / len(fps_measures) - logger.info( - f"=== Performance Summary ({num_chunks} chunks) ===\n" - f" Latency - Avg: {avg_latency:.2f}s, " - f"Max: {max(latency_measures):.2f}s, Min: {min(latency_measures):.2f}s\n" - f" FPS - Avg: {avg_fps:.2f}, " - f"Max: {max(fps_measures):.2f}, Min: {min(fps_measures):.2f}" - ) - - output_shape = [total_frames, video_height, video_width, video_channels] + state.log_summary() yield sse_event( "complete", { "output_path": output_file_path, - "video_shape": output_shape, - "num_frames": total_frames, + "video_shape": state.output_shape, + "num_frames": state.total_frames, "num_chunks": num_chunks, "chunk_size": chunk_size, }, @@ -853,26 +741,22 @@ def generate_video_stream( yield sse_event("error", 
{"error": str(e)}) finally: - # Clean up uploaded data blob file - if request.data_blob_path: - try: - Path(request.data_blob_path).unlink(missing_ok=True) - logger.info(f"Cleaned up data blob file: {request.data_blob_path}") - except Exception as e: - logger.warning(f"Failed to clean up data blob file: {e}") - - # Clean up uploaded input file - if request.input_path: - try: - Path(request.input_path).unlink(missing_ok=True) - logger.info(f"Cleaned up input file: {request.input_path}") - except Exception as e: - logger.warning(f"Failed to clean up input file: {e}") + # Clean up uploaded files + for path_attr in ("data_blob_path", "input_path"): + path = getattr(request, path_attr, None) + if path: + try: + Path(path).unlink(missing_ok=True) + logger.info(f"Cleaned up {path_attr}: {path}") + except Exception as e: + logger.warning(f"Failed to clean up {path_attr}: {e}") - # Clean up output file if generation didn't complete successfully + # Clean up output file if generation didn't complete if not completed and output_file_path: try: Path(output_file_path).unlink(missing_ok=True) logger.info(f"Cleaned up orphaned output file: {output_file_path}") except Exception as e: logger.warning(f"Failed to clean up output file: {e}") + + _generation_lock.release() diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py index ad61c6a32..2c8b8848b 100644 --- a/src/scope/server/schema.py +++ b/src/scope/server/schema.py @@ -1034,22 +1034,11 @@ class DataUploadResponse(BaseModel): class GenerateResponse(BaseModel): - """Response from batch video generation. + """Response from batch video generation.""" - Supports two modes: - - Legacy: video_base64 contains the full video (for small videos) - - File-based: output_path references a downloadable file (for large videos) - """ - - # File-based output (preferred for large videos) - output_path: str | None = Field( - default=None, - description="Path to output video file for download via /generate/download. 
Preferred for large videos.", - ) - # Legacy base64 output (kept for backwards compatibility) - video_base64: str | None = Field( - default=None, - description="Base64-encoded output video frames (THWC, uint8). Deprecated for large videos, use output_path.", + output_path: str = Field( + ..., + description="Path to output video file for download via /generate/download.", ) video_shape: list[int] = Field( ..., From 59eb2aba98683c03d85606d4673c52d73a30240e Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:06:50 -0500 Subject: [PATCH 15/16] rm gc Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- src/scope/server/generate.py | 56 ++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/scope/server/generate.py b/src/scope/server/generate.py index 93d91b25c..d332ff715 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/generate.py @@ -1,6 +1,6 @@ """Video generation service for batch mode with chunked processing.""" -import gc +import concurrent.futures import json import queue import threading @@ -126,8 +126,19 @@ class GenerationState: latencies: list[float] = field(default_factory=list) fps_measures: list[float] = field(default_factory=list) - def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str: - """Write chunk output to file and return SSE progress event.""" + def build_chunk_sse(self, chunk_idx: int, chunk_latency: float) -> str: + """Build SSE progress event (call from main thread before write).""" + return sse_event( + "progress", + { + "chunk": chunk_idx + 1, + "total_chunks": self.num_chunks, + "latency": round(chunk_latency, 3), + }, + ) + + def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> None: + """Write chunk output to file (safe to call from background thread).""" chunk_output = result["video"] num_output_frames = 
chunk_output.shape[0] chunk_fps = num_output_frames / chunk_latency @@ -150,17 +161,6 @@ def write_chunk(self, result: dict, chunk_idx: int, chunk_latency: float) -> str self.width = chunk_np.shape[2] self.channels = chunk_np.shape[3] - return sse_event( - "progress", - { - "chunk": chunk_idx + 1, - "total_chunks": self.num_chunks, - "frames": num_output_frames, - "latency": round(chunk_latency, 3), - "fps": round(chunk_fps, 2), - }, - ) - @property def output_shape(self) -> list[int]: return [self.total_frames, self.height, self.width, self.channels] @@ -582,6 +582,10 @@ def _generate_chunks( first_proc = processors[0] last_proc = processors[-1] + write_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + write_future: concurrent.futures.Future | None = None + pending_sse: str | None = None + try: for chunk_idx in range(num_chunks): if _cancel_event.is_set(): @@ -599,10 +603,6 @@ def _generate_chunks( start_frame = chunk_idx * chunk_size end_frame = min(start_frame + chunk_size, request.num_frames) - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - kwargs = build_chunk_kwargs( request, inputs, @@ -632,12 +632,30 @@ def _generate_chunks( continue chunk_latency = time.time() - chunk_start - yield state.write_chunk(result, chunk_idx, chunk_latency) + + # Wait for previous async write before starting a new one + if write_future is not None: + write_future.result() + if pending_sse is not None: + yield pending_sse + + # Offload CPU transfer + disk I/O to background thread + pending_sse = state.build_chunk_sse(chunk_idx, chunk_latency) + write_future = write_executor.submit( + state.write_chunk, result, chunk_idx, chunk_latency + ) + + # Wait for final write + if write_future is not None: + write_future.result() + if pending_sse is not None: + yield pending_sse # Signal end of input first_proc.input_queue.put(_SENTINEL) finally: + write_executor.shutdown(wait=True) for proc in processors: proc.stop() From 
ad34a65712193f9a90ad706a3489811b2b2b7c63 Mon Sep 17 00:00:00 2001 From: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> Date: Fri, 20 Feb 2026 15:27:31 -0500 Subject: [PATCH 16/16] generate -> batch Signed-off-by: RyanOnTheInside <7623207+ryanontheinside@users.noreply.github.com> --- docs/api/{generate.md => batch.md} | 36 ++++++++++---------- scripts/test_generate_endpoint.py | 10 +++--- src/scope/server/app.py | 39 +++++++++++----------- src/scope/server/{generate.py => batch.py} | 32 +++++++++--------- src/scope/server/recording.py | 12 +++---- 5 files changed, 64 insertions(+), 65 deletions(-) rename docs/api/{generate.md => batch.md} (65%) rename src/scope/server/{generate.py => batch.py} (97%) diff --git a/docs/api/generate.md b/docs/api/batch.md similarity index 65% rename from docs/api/generate.md rename to docs/api/batch.md index 3f22d4d44..7b8ba3d10 100644 --- a/docs/api/generate.md +++ b/docs/api/batch.md @@ -1,6 +1,6 @@ -# Generate Endpoint +# Batch Endpoint -Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the generate endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events. +Batch video generation via HTTP. Unlike the WebRTC streaming path (real-time, interactive), the batch endpoint produces a complete video in one request, processing it chunk-by-chunk with SSE progress events. Primary consumer: ComfyUI custom nodes (`comfyui-scope`). @@ -8,29 +8,29 @@ Primary consumer: ComfyUI custom nodes (`comfyui-scope`). 
| Endpoint | Method | Purpose | |---|---|---| -| `/api/v1/generate` | POST | Generate video (SSE stream) | -| `/api/v1/generate/cancel` | POST | Cancel after current chunk | -| `/api/v1/generate/upload` | POST | Upload input video for v2v | -| `/api/v1/generate/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) | -| `/api/v1/generate/download` | GET | Download output video | +| `/api/v1/batch` | POST | Generate video (SSE stream) | +| `/api/v1/batch/cancel` | POST | Cancel after current chunk | +| `/api/v1/batch/upload` | POST | Upload input video for v2v | +| `/api/v1/batch/upload-data` | POST | Upload binary data blob (VACE, per-chunk video) | +| `/api/v1/batch/download` | GET | Download output video | Only one generation can run at a time (409 if busy). ## Flow ``` -1. [optional] POST /generate/upload → input_path -2. [optional] POST /generate/upload-data → data_blob_path -3. POST /generate (JSON body, references paths from steps 1-2) +1. [optional] POST /batch/upload → input_path +2. [optional] POST /batch/upload-data → data_blob_path +3. POST /batch (JSON body, references paths from steps 1-2) ← SSE: event: progress {chunk, total_chunks, frames, latency, fps} ← SSE: event: complete {output_path, video_shape, num_frames, ...} -4. GET /generate/download?path= +4. GET /batch/download?path= ← binary video data ``` ## Binary Protocol -### Video Upload (`/generate/upload`) +### Video Upload (`/batch/upload`) **Request**: Raw uint8 bytes in THWC order (frames × height × width × channels). @@ -47,11 +47,11 @@ Only one generation can run at a time (409 if busy). [raw uint8 video bytes] ``` -### Data Blob Upload (`/generate/upload-data`) +### Data Blob Upload (`/batch/upload-data`) **Request**: Raw binary blob containing packed arrays. Max size: 2 GB. -The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request reference regions of this blob by offset: +The blob is an opaque byte buffer. 
`ChunkSpec` entries in the batch request reference regions of this blob by offset: ```json { @@ -65,14 +65,14 @@ The blob is an opaque byte buffer. `ChunkSpec` entries in the generate request r Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input video). The client is responsible for computing offsets when packing the blob. -### Video Download (`/generate/download`) +### Video Download (`/batch/download`) **Response**: Same binary format as upload (20-byte header + raw uint8 THWC data). **Response headers**: - `X-Video-Frames`, `X-Video-Height`, `X-Video-Width`, `X-Video-Channels` -## GenerateRequest +## BatchRequest ```json { @@ -81,8 +81,8 @@ Arrays are packed as contiguous float32 (VACE frames/masks) or uint8 (input vide "num_frames": 48, "seed": 42, "noise_scale": 0.7, - "input_path": "", - "data_blob_path": "", + "input_path": "", + "data_blob_path": "", "chunk_specs": [ { "chunk": 0, diff --git a/scripts/test_generate_endpoint.py b/scripts/test_generate_endpoint.py index 122dcb4c6..753df0e8e 100644 --- a/scripts/test_generate_endpoint.py +++ b/scripts/test_generate_endpoint.py @@ -1,4 +1,4 @@ -"""Test script for the /api/v1/generate endpoint. +"""Test script for the /api/v1/batch endpoint. 
Usage: python test_generate_endpoint.py @@ -98,7 +98,7 @@ def upload_video_for_v2v(path: str, height: int, width: int) -> str: num_frames, h, w, c = arr.shape response = requests.post( - f"{SERVER_URL}/api/v1/generate/upload", + f"{SERVER_URL}/api/v1/batch/upload", data=arr.tobytes(), headers={ "Content-Type": "application/octet-stream", @@ -163,7 +163,7 @@ def upload_vace_data( # Upload blob response = requests.post( - f"{SERVER_URL}/api/v1/generate/upload-data", + f"{SERVER_URL}/api/v1/batch/upload-data", data=bytes(blob), headers={"Content-Type": "application/octet-stream"}, timeout=300, @@ -214,7 +214,7 @@ def wait_for_pipeline(timeout: int = 300): def download_video(output_path: str) -> np.ndarray: """Download generated video from server.""" response = requests.get( - f"{SERVER_URL}/api/v1/generate/download", + f"{SERVER_URL}/api/v1/batch/download", params={"path": output_path}, timeout=300, ) @@ -346,7 +346,7 @@ def run_test(name: str): start = time.time() with requests.post( - f"{SERVER_URL}/api/v1/generate", + f"{SERVER_URL}/api/v1/batch", json=gen_request.model_dump(exclude_none=True), stream=True, headers={"Accept": "text/event-stream"}, diff --git a/src/scope/server/app.py b/src/scope/server/app.py index 10fe79c39..272852f84 100644 --- a/src/scope/server/app.py +++ b/src/scope/server/app.py @@ -45,7 +45,6 @@ VIDEO_EXTENSIONS, iter_files, ) -from .generate import generate_video_stream from .kafka_publisher import ( KafkaPublisher, is_kafka_enabled, @@ -1128,15 +1127,15 @@ def download_in_background(): raise HTTPException(status_code=500, detail=str(e)) from e -@app.post("/api/v1/generate") -async def generate_video( +@app.post("/api/v1/batch") +async def batch_video( request: "GenerateRequest", pipeline_manager: "PipelineManager" = Depends(get_pipeline_manager), ): """Generate video frames in batch mode with SSE progress streaming.""" - from .generate import is_generation_active + from .batch import batch_video_stream, is_batch_active - if 
is_generation_active(): + if is_batch_active(): raise HTTPException( status_code=409, detail="A generation is already in progress. Cancel it first or wait for completion.", @@ -1150,7 +1149,7 @@ async def generate_video( ) return StreamingResponse( - generate_video_stream(request, pipeline_manager, status_info, logger), + batch_video_stream(request, pipeline_manager, status_info, logger), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", @@ -1160,17 +1159,17 @@ async def generate_video( ) -@app.post("/api/v1/generate/cancel") -async def cancel_generate(): +@app.post("/api/v1/batch/cancel") +async def cancel_batch(): """Cancel the current video generation after the current chunk completes.""" - from .generate import cancel_generation + from .batch import cancel_batch as _cancel_batch - cancel_generation() + _cancel_batch() return {"status": "cancelling"} -@app.post("/api/v1/generate/upload") -async def upload_video_for_generate(request: Request): +@app.post("/api/v1/batch/upload") +async def upload_video_for_batch(request: Request): """Upload a video for batch generation (file-based transfer for large videos). Accepts raw binary video data with metadata headers: @@ -1181,7 +1180,7 @@ async def upload_video_for_generate(request: Request): Video data should be raw uint8 bytes in THWC order. - Returns input_path to use in the generate request. + Returns input_path to use in the batch request. 
""" from .recording import TEMP_FILE_PREFIXES, RecordingManager from .schema import VideoUploadResponse @@ -1204,7 +1203,7 @@ async def upload_video_for_generate(request: Request): # Create temp file (reuse recording pattern) file_path = RecordingManager._create_temp_file( - ".bin", TEMP_FILE_PREFIXES["generate_input"] + ".bin", TEMP_FILE_PREFIXES["batch_input"] ) # Stream body to file @@ -1242,14 +1241,14 @@ async def upload_video_for_generate(request: Request): raise HTTPException(status_code=500, detail=str(e)) from e -@app.post("/api/v1/generate/upload-data") +@app.post("/api/v1/batch/upload-data") async def upload_data_blob(request: Request): """Upload binary data blob for batch generation. Accepts raw binary data containing VACE frames/masks, input video, or other array data referenced by ChunkSpec offsets in the generate request. - Returns data_blob_path to use in the generate request. + Returns data_blob_path to use in the batch request. """ from .recording import TEMP_FILE_PREFIXES, RecordingManager @@ -1258,10 +1257,10 @@ async def upload_data_blob(request: Request): try: # Create temp file file_path = RecordingManager._create_temp_file( - ".bin", TEMP_FILE_PREFIXES["generate_data"] + ".bin", TEMP_FILE_PREFIXES["batch_data"] ) - from .generate import MAX_DATA_BLOB_BYTES + from .batch import MAX_DATA_BLOB_BYTES # Stream body to file with size limit bytes_written = 0 @@ -1295,7 +1294,7 @@ async def upload_data_blob(request: Request): raise HTTPException(status_code=500, detail=str(e)) from e -@app.get("/api/v1/generate/download") +@app.get("/api/v1/batch/download") async def download_generated_video( path: str = Query(..., description="Path to output video file"), background_tasks: BackgroundTasks = None, @@ -1321,7 +1320,7 @@ async def download_generated_video( temp_dir = Path(tempfile.gettempdir()) if not file_path.is_relative_to(temp_dir): raise HTTPException(status_code=403, detail="Invalid file path") - if not 
file_path.name.startswith(TEMP_FILE_PREFIXES["generate_output"]): + if not file_path.name.startswith(TEMP_FILE_PREFIXES["batch_output"]): raise HTTPException(status_code=403, detail="Invalid file path") if not file_path.exists(): diff --git a/src/scope/server/generate.py b/src/scope/server/batch.py similarity index 97% rename from src/scope/server/generate.py rename to src/scope/server/batch.py index d332ff715..a5cfbd802 100644 --- a/src/scope/server/generate.py +++ b/src/scope/server/batch.py @@ -17,25 +17,25 @@ _cancel_event = threading.Event() # Generation lock (single-client: only one generation at a time) -_generation_lock = threading.Lock() +_batch_lock = threading.Lock() # Max data blob upload size (2 GB) MAX_DATA_BLOB_BYTES = 2 * 1024 * 1024 * 1024 -def cancel_generation(): +def cancel_batch(): """Signal the current generation to stop after the current chunk.""" _cancel_event.set() -def is_generation_cancelled() -> bool: +def is_batch_cancelled() -> bool: """Check if cancellation has been requested.""" return _cancel_event.is_set() -def is_generation_active() -> bool: +def is_batch_active() -> bool: """Check if a generation is currently in progress.""" - return _generation_lock.locked() + return _batch_lock.locked() # Defaults @@ -113,7 +113,7 @@ class DecodedInputs: @dataclass -class GenerationState: +class BatchState: """Mutable state accumulated during chunk-by-chunk generation.""" output_file: IO[bytes] @@ -234,10 +234,10 @@ def decode_inputs( blob_path = Path(request.data_blob_path) temp_dir = Path(tempfile.gettempdir()) if not blob_path.is_relative_to(temp_dir) or not blob_path.name.startswith( - TEMP_FILE_PREFIXES["generate_data"] + TEMP_FILE_PREFIXES["batch_data"] ): raise ValueError( - f"Invalid data_blob_path: must be a temp file with prefix {TEMP_FILE_PREFIXES['generate_data']}" + f"Invalid data_blob_path: must be a temp file with prefix {TEMP_FILE_PREFIXES['batch_data']}" ) with open(blob_path, "rb") as f: blob = f.read() @@ -522,7 +522,7 @@ def 
_log_chunk_info(kwargs: dict, chunk_idx: int, num_chunks: int, logger: "Logg # --------------------------------------------------------------------------- -def _generate_chunks( +def _batch_chunks( request: "GenerateRequest", pipeline, pipeline_manager: "PipelineManager", @@ -532,7 +532,7 @@ def _generate_chunks( status_info: dict, device: torch.device, dtype: torch.dtype, - state: GenerationState, + state: BatchState, logger: "Logger", ) -> Iterator[str]: """Process chunks through a processor chain, yielding SSE events. @@ -665,7 +665,7 @@ def _generate_chunks( # --------------------------------------------------------------------------- -def generate_video_stream( +def batch_video_stream( request: "GenerateRequest", pipeline_manager: "PipelineManager", status_info: dict, @@ -676,7 +676,7 @@ def generate_video_stream( Writes output to temp file incrementally, returns output_path for download. Only one generation can run at a time (single-client). """ - if not _generation_lock.acquire(blocking=False): + if not _batch_lock.acquire(blocking=False): yield sse_event("error", {"error": "A generation is already in progress"}) return @@ -702,7 +702,7 @@ def generate_video_stream( from .recording import TEMP_FILE_PREFIXES, RecordingManager output_file_path = RecordingManager._create_temp_file( - ".bin", TEMP_FILE_PREFIXES["generate_output"] + ".bin", TEMP_FILE_PREFIXES["batch_output"] ) output_file = open(output_file_path, "wb") @@ -710,12 +710,12 @@ def generate_video_stream( header_size = 4 + 4 * 4 output_file.write(b"\x00" * header_size) - state = GenerationState( + state = BatchState( output_file=output_file, num_chunks=num_chunks, logger=logger ) try: - yield from _generate_chunks( + yield from _batch_chunks( request, pipeline, pipeline_manager, @@ -777,4 +777,4 @@ def generate_video_stream( except Exception as e: logger.warning(f"Failed to clean up output file: {e}") - _generation_lock.release() + _batch_lock.release() diff --git a/src/scope/server/recording.py 
b/src/scope/server/recording.py index 280ba9fe1..568239314 100644 --- a/src/scope/server/recording.py +++ b/src/scope/server/recording.py @@ -17,9 +17,9 @@ TEMP_FILE_PREFIXES = { "recording": "scope_recording_", "download": "scope_download_", - "generate_input": "scope_gen_input_", - "generate_output": "scope_gen_output_", - "generate_data": "scope_gen_data_", + "batch_input": "scope_gen_input_", + "batch_output": "scope_gen_output_", + "batch_data": "scope_gen_data_", } # Environment variables @@ -440,9 +440,9 @@ def cleanup_recording_files(): patterns = [ f"{TEMP_FILE_PREFIXES['recording']}*.mp4", f"{TEMP_FILE_PREFIXES['download']}*.mp4", - f"{TEMP_FILE_PREFIXES['generate_input']}*.bin", - f"{TEMP_FILE_PREFIXES['generate_output']}*.bin", - f"{TEMP_FILE_PREFIXES['generate_data']}*.bin", + f"{TEMP_FILE_PREFIXES['batch_input']}*.bin", + f"{TEMP_FILE_PREFIXES['batch_output']}*.bin", + f"{TEMP_FILE_PREFIXES['batch_data']}*.bin", ] deleted_count = 0