From b5ad03c8d8ed1770d0958c97c716429b74aab21a Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 10 Feb 2026 19:44:33 -0500
Subject: [PATCH 1/5] Added nvfp4 official support.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/components/ComplexFields.tsx     |  15 +-
 frontend/src/components/SettingsPanel.tsx     |  15 +-
 frontend/src/hooks/useStreamState.ts          |   4 +-
 frontend/src/pages/StreamPage.tsx             |   2 +-
 frontend/src/types/index.ts                   |   2 +-
 src/scope/core/pipelines/enums.py             |   1 +
 .../pipelines/krea_realtime_video/pipeline.py |  25 +-
 src/scope/core/pipelines/longlive/pipeline.py |  25 +-
 src/scope/core/pipelines/memflow/pipeline.py  |  25 +-
 .../core/pipelines/quantization_utils.py      | 474 ++++++++++++++++++
 .../core/pipelines/reward_forcing/pipeline.py |  25 +-
 .../pipelines/streamdiffusionv2/pipeline.py   |  25 +-
 src/scope/core/pipelines/utils.py             |   3 +
 src/scope/core/pipelines/wan2_1/vace/mixin.py |  57 +--
 14 files changed, 534 insertions(+), 164 deletions(-)
 create mode 100644 src/scope/core/pipelines/quantization_utils.py
diff --git a/frontend/src/components/ComplexFields.tsx b/frontend/src/components/ComplexFields.tsx
index 77e162d32..61860e571 100644
--- a/frontend/src/components/ComplexFields.tsx
+++ b/frontend/src/components/ComplexFields.tsx
@@ -49,7 +49,7 @@ export interface SchemaComplexFieldContext {
   vaceUseInputVideo?: boolean;
   onVaceUseInputVideoChange?: (enabled: boolean) => void;
   vaceContextScaleSlider?: SliderState;
-  quantization?: "fp8_e4m3fn" | null;
+  quantization?: "fp8_e4m3fn" | "nvfp4" | null;
   loras?: LoRAConfig[];
   onLorasChange?: (loras: LoRAConfig[]) => void;
   loraMergeStrategy?: LoraMergeStrategy;
@@ -63,7 +63,7 @@ export interface SchemaComplexFieldContext {
   noiseScaleSlider?: SliderState;
   noiseController?: boolean;
   onNoiseControllerChange?: (enabled: boolean) => void;
-  onQuantizationChange?: (q: "fp8_e4m3fn" | null) => void;
+  onQuantizationChange?: (q: "fp8_e4m3fn" | "nvfp4" | null) => void;
   inputMode?: "text" | "video";
   supportsNoiseControls?: boolean;
   supportsQuantization?: boolean;
@@ -156,7 +156,7 @@ export function SchemaComplexField({
             <div className="flex items-start gap-1.5 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
               <Info className="h-3.5 w-3.5 mt-0.5 shrink-0 text-amber-600 dark:text-amber-500" />
               <p className="text-xs text-amber-600 dark:text-amber-500">
-                VACE is incompatible with FP8 quantization. Please disable
+                VACE is incompatible with quantization. Please disable
                 quantization to use VACE.
               </p>
             </div>
@@ -482,14 +482,14 @@ export function SchemaComplexField({
                 value={ctx.quantization ?? "none"}
                 onValueChange={v =>
                   ctx.onQuantizationChange?.(
-                    v === "none" ? null : (v as "fp8_e4m3fn")
+                    v === "none" ? null : (v as "fp8_e4m3fn" | "nvfp4")
                   )
                 }
                 disabled={
                   (ctx.isStreaming ?? false) || (ctx.vaceEnabled ?? false)
                 }
               >
-                <SelectTrigger className="w-[140px] h-7">
+                <SelectTrigger className="w-[180px] h-7">
                   <SelectValue />
                 </SelectTrigger>
                 <SelectContent>
@@ -497,12 +497,15 @@ export function SchemaComplexField({
                   <SelectItem value="fp8_e4m3fn">
                     fp8_e4m3fn (Dynamic)
                   </SelectItem>
+                  <SelectItem value="nvfp4">
+                    nvfp4 (Blackwell)
+                  </SelectItem>
                 </SelectContent>
               </Select>
             </div>
             {ctx.vaceEnabled && (
               <p className="text-xs text-muted-foreground">
-                Disabled because VACE is enabled. Disable VACE to use FP8
+                Disabled because VACE is enabled. Disable VACE to use
                 quantization.
               </p>
             )}
diff --git a/frontend/src/components/SettingsPanel.tsx b/frontend/src/components/SettingsPanel.tsx
index 3125e22ca..e58e7aa75 100644
--- a/frontend/src/components/SettingsPanel.tsx
+++ b/frontend/src/components/SettingsPanel.tsx
@@ -75,8 +75,8 @@ interface SettingsPanelProps {
   onNoiseControllerChange?: (enabled: boolean) => void;
   manageCache?: boolean;
   onManageCacheChange?: (enabled: boolean) => void;
-  quantization?: "fp8_e4m3fn" | null;
-  onQuantizationChange?: (quantization: "fp8_e4m3fn" | null) => void;
+  quantization?: "fp8_e4m3fn" | "nvfp4" | null;
+  onQuantizationChange?: (quantization: "fp8_e4m3fn" | "nvfp4" | null) => void;
   kvCacheAttentionBias?: number;
   onKvCacheAttentionBiasChange?: (bias: number) => void;
   onResetCache?: () => void;
@@ -617,7 +617,7 @@ export function SettingsPanel({
                     <div className="flex items-start gap-1.5 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
                       <Info className="h-3.5 w-3.5 mt-0.5 shrink-0 text-amber-600 dark:text-amber-500" />
                       <p className="text-xs text-amber-600 dark:text-amber-500">
-                        VACE is incompatible with FP8 quantization. Please
+                        VACE is incompatible with quantization. Please
                         disable quantization to use VACE.
                       </p>
                     </div>
@@ -951,12 +951,12 @@ export function SettingsPanel({
                           value={quantization || "none"}
                           onValueChange={value => {
                             onQuantizationChange?.(
-                              value === "none" ? null : (value as "fp8_e4m3fn")
+                              value === "none" ? null : (value as "fp8_e4m3fn" | "nvfp4")
                             );
                           }}
                           disabled={isStreaming || vaceEnabled}
                         >
-                          <SelectTrigger className="w-[140px] h-7">
+                          <SelectTrigger className="w-[180px] h-7">
                             <SelectValue />
                           </SelectTrigger>
                           <SelectContent>
@@ -964,6 +964,9 @@ export function SettingsPanel({
                             <SelectItem value="fp8_e4m3fn">
                               fp8_e4m3fn (Dynamic)
                             </SelectItem>
+                            <SelectItem value="nvfp4">
+                              nvfp4 (Blackwell)
+                            </SelectItem>
                           </SelectContent>
                         </Select>
                       </div>
@@ -971,7 +974,7 @@ export function SettingsPanel({
                       {vaceEnabled && (
                         <p className="text-xs text-muted-foreground">
                           Disabled because VACE is enabled. Disable VACE to use
-                          FP8 quantization.
+                          quantization.
                         </p>
                       )}
                     </div>
diff --git a/frontend/src/hooks/useStreamState.ts b/frontend/src/hooks/useStreamState.ts
index 3f76c424d..d4e827005 100644
--- a/frontend/src/hooks/useStreamState.ts
+++ b/frontend/src/hooks/useStreamState.ts
@@ -38,7 +38,7 @@ function getFallbackDefaults(mode?: InputMode) {
     noiseController: isVideoMode ? true : undefined,
     defaultTemporalInterpolationSteps: undefined as number | undefined,
     inputMode: effectiveMode,
-    quantization: undefined as "fp8_e4m3fn" | undefined,
+    quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
   };
 }
 
@@ -125,7 +125,7 @@ export function useStreamState() {
           noiseController,
           defaultTemporalInterpolationSteps,
           inputMode: effectiveMode,
-          quantization: undefined as "fp8_e4m3fn" | undefined,
+          quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
         };
       }
       // Fallback to derived defaults if schemas not loaded
diff --git a/frontend/src/pages/StreamPage.tsx b/frontend/src/pages/StreamPage.tsx
index a35f5da09..beea3352b 100644
--- a/frontend/src/pages/StreamPage.tsx
+++ b/frontend/src/pages/StreamPage.tsx
@@ -588,7 +588,7 @@ export function StreamPage() {
     });
   };
 
-  const handleQuantizationChange = (quantization: "fp8_e4m3fn" | null) => {
+  const handleQuantizationChange = (quantization: "fp8_e4m3fn" | "nvfp4" | null) => {
     updateSettings({ quantization });
     // Note: This setting requires pipeline reload, so we don't send parameter update here
   };
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index 35d01686d..ac0074f3c 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -55,7 +55,7 @@ export interface SettingsState {
   noiseScale?: number;
   noiseController?: boolean;
   manageCache?: boolean;
-  quantization?: "fp8_e4m3fn" | null;
+  quantization?: "fp8_e4m3fn" | "nvfp4" | null;
   kvCacheAttentionBias?: number;
   paused?: boolean;
   loras?: LoRAConfig[];
diff --git a/src/scope/core/pipelines/enums.py b/src/scope/core/pipelines/enums.py
index 9de333da1..ca33e4a34 100644
--- a/src/scope/core/pipelines/enums.py
+++ b/src/scope/core/pipelines/enums.py
@@ -12,6 +12,7 @@ class Quantization(str, Enum):
     """Quantization method enumeration."""
 
     FP8_E4M3FN = "fp8_e4m3fn"
+    NVFP4 = "nvfp4"
 
 
 class VaeType(str, Enum):
diff --git a/src/scope/core/pipelines/krea_realtime_video/pipeline.py b/src/scope/core/pipelines/krea_realtime_video/pipeline.py
index c411db2c9..603db1cee 100644
--- a/src/scope/core/pipelines/krea_realtime_video/pipeline.py
+++ b/src/scope/core/pipelines/krea_realtime_video/pipeline.py
@@ -15,6 +15,7 @@
 )
 from ..interface import Pipeline, Requirements
 from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
 from ..utils import Quantization, load_model_config, validate_resolution
 from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
 from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -111,29 +112,7 @@ def __init__(
         # Initialize optional LoRA adapters on the underlying model AFTER VACE.
         generator.model = self._init_loras(config, generator.model)
 
-        if quantization == Quantization.FP8_E4M3FN:
-            # Cast before optional quantization
-            generator = generator.to(dtype=dtype)
-
-            start = time.time()
-
-            from torchao.quantization.quant_api import (
-                Float8DynamicActivationFloat8WeightConfig,
-                PerTensor,
-                quantize_,
-            )
-
-            # Move to target device during quantization
-            # Defaults to using fp8_e4m3fn for both weights and activations
-            quantize_(
-                generator,
-                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-                device=device,
-            )
-
-            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
-        else:
-            generator = generator.to(device=device, dtype=dtype)
+        generator = apply_quantization(generator, quantization, device, dtype)
 
         if compile:
             # Only compile the attention blocks
diff --git a/src/scope/core/pipelines/longlive/pipeline.py b/src/scope/core/pipelines/longlive/pipeline.py
index f0b53b55d..dc2e80c9e 100644
--- a/src/scope/core/pipelines/longlive/pipeline.py
+++ b/src/scope/core/pipelines/longlive/pipeline.py
@@ -15,6 +15,7 @@
 )
 from ..interface import Pipeline, Requirements
 from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
 from ..utils import Quantization, load_model_config, validate_resolution
 from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
 from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
         # This is additive and does not replace the original LongLive performance LoRA.
         generator.model = self._init_loras(config, generator.model)
 
-        if quantization == Quantization.FP8_E4M3FN:
-            # Cast before optional quantization
-            generator = generator.to(dtype=dtype)
-
-            start = time.time()
-
-            from torchao.quantization.quant_api import (
-                Float8DynamicActivationFloat8WeightConfig,
-                PerTensor,
-                quantize_,
-            )
-
-            # Move to target device during quantization
-            # Defaults to using fp8_e4m3fn for both weights and activations
-            quantize_(
-                generator,
-                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-                device=device,
-            )
-
-            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
-        else:
-            generator = generator.to(device=device, dtype=dtype)
+        generator = apply_quantization(generator, quantization, device, dtype)
 
         start = time.time()
         text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/memflow/pipeline.py b/src/scope/core/pipelines/memflow/pipeline.py
index e4e5b9d78..5f1efb90a 100644
--- a/src/scope/core/pipelines/memflow/pipeline.py
+++ b/src/scope/core/pipelines/memflow/pipeline.py
@@ -15,6 +15,7 @@
 )
 from ..interface import Pipeline, Requirements
 from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
 from ..utils import Quantization, load_model_config, validate_resolution
 from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
 from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
         # This is additive and does not replace the original MemFlow performance LoRA.
         generator.model = self._init_loras(config, generator.model)
 
-        if quantization == Quantization.FP8_E4M3FN:
-            # Cast before optional quantization
-            generator = generator.to(dtype=dtype)
-
-            start = time.time()
-
-            from torchao.quantization.quant_api import (
-                Float8DynamicActivationFloat8WeightConfig,
-                PerTensor,
-                quantize_,
-            )
-
-            # Move to target device during quantization
-            # Defaults to using fp8_e4m3fn for both weights and activations
-            quantize_(
-                generator,
-                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-                device=device,
-            )
-
-            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
-        else:
-            generator = generator.to(device=device, dtype=dtype)
+        generator = apply_quantization(generator, quantization, device, dtype)
 
         start = time.time()
         text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/quantization_utils.py b/src/scope/core/pipelines/quantization_utils.py
new file mode 100644
index 000000000..be40ddcfe
--- /dev/null
+++ b/src/scope/core/pipelines/quantization_utils.py
@@ -0,0 +1,474 @@
+"""Quantization utilities for pipeline models.
+
+Provides shared quantization functions used across all pipelines that support
+quantization (FP8 via torchao, NVFP4 via comfy-kitchen).
+
+NVFP4 (E2M1) provides ~4x weight memory reduction on Blackwell GPUs (SM >= 10.0)
+using comfy-kitchen's QuantizedTensor and optimized CUDA kernels.
+
+FP8 (E4M3FN) provides ~2x weight memory reduction on Ada+ GPUs (SM >= 8.9)
+using torchao's dynamic activation quantization.
+"""
+
+from __future__ import annotations
+
+import gc
+import logging
+import time
+from collections.abc import Callable
+
+import torch
+
+from .enums import Quantization
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# NVFP4 Support
+# ============================================================================
+
+# Minimum SM version for NVFP4 hardware acceleration
+MIN_NVFP4_SM_VERSION = (10, 0)  # Blackwell
+
+# Layout name for comfy-kitchen's NVFP4 layout
+NVFP4_LAYOUT = "TensorCoreNVFP4Layout"
+
+
+def check_nvfp4_support() -> tuple[bool, str]:
+    """Report whether NVFP4 can be used: CUDA, SM >= MIN_NVFP4_SM_VERSION, comfy-kitchen.
+
+    Returns:
+        Tuple of (is_supported, reason_if_not); the reason is empty when supported.
+    """
+    if not torch.cuda.is_available():
+        return False, "CUDA not available"
+
+    cap = torch.cuda.get_device_capability()  # (major, minor) compute capability
+    if cap < MIN_NVFP4_SM_VERSION:  # tuples compare major first, then minor
+        return (
+            False,
+            f"Requires SM >= {MIN_NVFP4_SM_VERSION[0]}.{MIN_NVFP4_SM_VERSION[1]} (Blackwell), "
+            f"current: SM {cap[0]}.{cap[1]}",
+        )
+
+    # Check if comfy-kitchen is importable at all
+    try:
+        import comfy_kitchen  # noqa: F401
+    except ImportError:
+        return (
+            False,
+            "comfy-kitchen package not installed. Install with: pip install comfy-kitchen[cublas]",
+        )
+
+    # Check that the installed comfy-kitchen exposes QuantizedTensor and the NVFP4 layout
+    try:
+        from comfy_kitchen.tensor import (  # noqa: F401
+            QuantizedTensor,
+            TensorCoreNVFP4Layout,
+        )
+    except ImportError:
+        return False, "comfy-kitchen QuantizedTensor not available"
+
+    return True, ""
+
+
+class NVFP4Linear(torch.nn.Module):
+    """Linear layer with NVFP4 quantized weights using comfy-kitchen.
+
+    The weight is an nn.Parameter wrapping a comfy-kitchen QuantizedTensor; it is
+    None until from_linear() installs it. The bias, if any, stays unquantized.
+
+    QuantizedTensor's __torch_dispatch__ is expected to route F.linear to optimized
+    NVFP4 kernels (comfy-kitchen behavior; confirm against its documentation).
+    Inputs with 2 or 3 dims are quantized too; other ranks dequantize the weight.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self._orig_dtype = dtype or torch.bfloat16  # stored only; not read elsewhere in this module
+        self._layout_type = NVFP4_LAYOUT
+
+        self.register_parameter("weight", None)  # placeholder; from_linear() installs the weight
+
+        if bias:
+            self.bias = torch.nn.Parameter(
+                torch.zeros(out_features, device=device, dtype=dtype or torch.bfloat16)
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    @classmethod
+    def from_linear(cls, linear: torch.nn.Linear) -> NVFP4Linear:
+        """Build an NVFP4Linear on linear's device/dtype with its weight quantized to NVFP4.
+
+        Note: Does NOT free the original linear layer's memory; the bias is cloned.
+        The caller is responsible for cleanup after this returns.
+        """
+        from comfy_kitchen.tensor import QuantizedTensor
+
+        in_features = linear.in_features
+        out_features = linear.out_features
+        has_bias = linear.bias is not None
+        device = linear.weight.device
+        dtype = linear.weight.dtype
+
+        nvfp4_linear = cls(
+            in_features=in_features,
+            out_features=out_features,
+            bias=has_bias,
+            device=device,
+            dtype=dtype,
+        )
+
+        weight_2d = linear.weight.data.detach()
+        quantized_weight = QuantizedTensor.from_float(weight_2d, NVFP4_LAYOUT)
+        nvfp4_linear.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
+
+        if has_bias:  # overwrite the zero bias that __init__ allocated
+            nvfp4_linear.bias = torch.nn.Parameter(
+                linear.bias.data.detach().clone().to(dtype), requires_grad=False
+            )
+
+        return nvfp4_linear
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run the linear op; 2-D and 3-D inputs are quantized, other ranks use a dequantized weight."""
+        from comfy_kitchen.tensor import QuantizedTensor
+
+        orig_shape = x.shape
+        reshaped_3d = x.dim() == 3  # 3-D input is flattened to 2-D and restored after the matmul
+
+        if reshaped_3d:
+            x = x.reshape(-1, orig_shape[2])
+
+        if x.dim() == 2:
+            x_qt = QuantizedTensor.from_float(x, self._layout_type)
+            out = torch.nn.functional.linear(x_qt, self.weight, self.bias)
+        else:  # any rank other than 2 or 3: plain F.linear on the dequantized weight
+            weight_dq = (
+                self.weight.dequantize()
+                if hasattr(self.weight, "dequantize")
+                else self.weight
+            )
+            out = torch.nn.functional.linear(x, weight_dq, self.bias)
+
+        if reshaped_3d:
+            out = out.reshape(orig_shape[0], orig_shape[1], self.weight.shape[0])
+
+        return out
+
+    def extra_repr(self) -> str:
+        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
+
+
+def _default_layer_filter(name: str, module: torch.nn.Module) -> bool:
+    """Default filter for selecting transformer block linear layers for quantization.
+
+    Matches case-insensitive substrings of the dotted module name. Non-Linear modules,
+    LoRA adapters and skip-pattern names are rejected before any include pattern is tried.
+    """
+    if not isinstance(module, torch.nn.Linear):
+        return False
+
+    name_lower = name.lower()
+
+    # Skip LoRA adapter layers (any dotted-name component starting with "lora_")
+    name_parts = name.split(".")
+    is_lora_layer = any(
+        part.lower().startswith("lora_") or part in ("lora_A", "lora_B")
+        for part in name_parts
+    )
+    if is_lora_layer:
+        return False
+
+    # Skip embeddings, norms, and input/output head projections (checked before includes)
+    skip_patterns = [
+        "embed",
+        "lm_head",
+        "output_proj",
+        "final",
+        "norm",
+        "ln_",
+        "layernorm",
+        "patchify",
+        "caption_projection",
+    ]
+    for pattern in skip_patterns:
+        if pattern in name_lower:
+            return False
+
+    # Include attention and MLP layers (also any name containing "dense", "linear" or "proj")
+    include_patterns = [
+        "attn",
+        "attention",
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "out_proj",
+        "qkv",
+        "mlp",
+        "ffn",
+        "fc1",
+        "fc2",
+        "gate",
+        "up_proj",
+        "down_proj",
+        "dense",
+        "linear",
+        "proj",
+    ]
+
+    for pattern in include_patterns:
+        if pattern in name_lower:
+            return True
+
+    # Fallback: any other Linear whose name mentions a block/layer/transformer
+    block_patterns = ["block", "layer", "transformer"]
+    for pattern in block_patterns:
+        if pattern in name_lower:
+            return True
+
+    return False
+
+
+def quantize_model_nvfp4(
+    model: torch.nn.Module,
+    layer_filter: Callable[[str, torch.nn.Module], bool] | None = None,
+    streaming: bool = False,
+    target_device: torch.device | None = None,
+) -> None:
+    """Quantize Linear layers in a model to NVFP4 in-place.
+
+    Replaces nn.Linear layers with NVFP4Linear layers for ~4x weight memory
+    reduction and hardware-accelerated matmul on Blackwell GPUs.
+
+    Args:
+        model: PyTorch model to quantize; matching nn.Linear children are replaced.
+        layer_filter: Optional function (name, module) -> bool to filter layers.
+                     If None, uses _default_layer_filter. LoRA layers are always skipped.
+        streaming: If True, quantize each layer on target_device, then move it to CPU.
+        target_device: Streaming-only device; defaults to CUDA if available, else CPU.
+    """
+    if layer_filter is None:
+        layer_filter = _default_layer_filter
+
+    # Only store layer names to avoid keeping module references alive
+    layer_names_to_replace: list[str] = []
+    skipped_lora: list[str] = []
+
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.Linear):
+            name_parts = name.split(".")
+            is_lora_layer = any(  # case-sensitive here, unlike _default_layer_filter
+                part.startswith("lora_") or part in ("lora_A", "lora_B")
+                for part in name_parts
+            )
+
+            if is_lora_layer:  # skipped before layer_filter is even consulted
+                skipped_lora.append(name)
+                continue
+
+            if layer_filter(name, module):
+                layer_names_to_replace.append(name)
+
+    if skipped_lora:
+        logger.info(f"Skipped {len(skipped_lora)} LoRA adapter layers")
+
+    num_layers = len(layer_names_to_replace)
+    logger.info(f"Quantizing {num_layers} Linear layers to NVFP4")
+
+    if streaming and target_device is None:
+        target_device = (
+            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        )
+
+    if streaming:
+        logger.info(
+            f"Using streaming quantization mode (target device: {target_device})"
+        )
+
+    # Record allocated GPU memory before quantization (stays 0.0 without CUDA)
+    mem_before = 0.0
+    if torch.cuda.is_available():
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        mem_before = torch.cuda.memory_allocated() / 1024**3
+        logger.info(f"GPU memory before NVFP4 quantization: {mem_before:.2f} GB")
+
+    for i, name in enumerate(layer_names_to_replace):  # re-resolve each layer by dotted name
+        parts = name.split(".")
+        parent = model
+        for part in parts[:-1]:
+            parent = getattr(parent, part)
+
+        module = getattr(parent, parts[-1])
+
+        if not isinstance(module, torch.nn.Linear):  # no longer a plain Linear; leave it alone
+            continue
+
+        if streaming:
+            original_device = module.weight.device
+            if original_device != target_device:
+                module = module.to(target_device)
+                setattr(parent, parts[-1], module)
+
+            nvfp4_module = NVFP4Linear.from_linear(module)
+            nvfp4_module = nvfp4_module.to("cpu")  # streaming mode leaves the quantized layer on CPU
+            setattr(parent, parts[-1], nvfp4_module)
+
+            if module.weight is not None:  # swap in empty tensors to release the original storage
+                module.weight.data = torch.empty(0, device="cpu", dtype=torch.float32)
+            if module.bias is not None:
+                module.bias.data = torch.empty(0, device="cpu", dtype=torch.float32)
+            del module
+
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        else:
+            nvfp4_module = NVFP4Linear.from_linear(module)
+            setattr(parent, parts[-1], nvfp4_module)
+
+            if module.weight is not None:  # swap in empty tensors to release the original storage
+                module.weight.data = torch.empty(0, device="cpu", dtype=torch.float32)
+            if module.bias is not None:
+                module.bias.data = torch.empty(0, device="cpu", dtype=torch.float32)
+            del module
+
+            if (i + 1) % 25 == 0:  # periodic cleanup; note empty_cache() is not guarded here
+                gc.collect()
+                torch.cuda.empty_cache()
+
+        if (i + 1) % 100 == 0 or (streaming and (i + 1) % 50 == 0):  # progress log cadence
+            if torch.cuda.is_available():
+                current_mem = torch.cuda.memory_allocated() / 1024**3
+                logger.info(
+                    f"Quantized {i + 1}/{num_layers} layers, "
+                    f"GPU memory: {current_mem:.2f} GB"
+                )
+
+    gc.collect()
+    torch.cuda.empty_cache()  # called without an is_available() guard, unlike the block below
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        mem_after = torch.cuda.memory_allocated() / 1024**3
+        mem_saved = mem_before - mem_after
+        if mem_saved > 0:  # only report when allocated memory actually dropped
+            logger.info(f"NVFP4 quantization saved {mem_saved:.2f} GB GPU memory")
+
+
+# ============================================================================
+# Unified Quantization API
+# ============================================================================
+
+
+def apply_quantization(
+    model: torch.nn.Module,
+    quantization: Quantization | None,
+    device: torch.device | str,
+    dtype: torch.dtype,
+) -> torch.nn.Module:
+    """Apply quantization to a model and move it to the target device.
+
+    Shared pipeline entry point for FP8 and NVFP4; any other value (e.g. None) is a plain cast.
+
+    Args:
+        model: The model to quantize (typically the diffusion generator)
+        quantization: Quantization method to apply, or None for no quantization
+        device: Target device
+        dtype: Target dtype (typically torch.bfloat16)
+
+    Returns:
+        The model on the target device (quantized when a method was requested)
+    Raises:
+        RuntimeError: If NVFP4 is requested but check_nvfp4_support() reports no support
+    """
+    if quantization == Quantization.FP8_E4M3FN:
+        # Cast only; `device` is handed to quantize_ below, which is expected to do the move
+        model = model.to(dtype=dtype)
+
+        start = time.time()
+
+        from torchao.quantization.quant_api import (
+            Float8DynamicActivationFloat8WeightConfig,
+            PerTensor,
+            quantize_,
+        )
+
+        quantize_(
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
+            device=device,
+        )
+
+        print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
+
+    elif quantization == Quantization.NVFP4:
+        supported, reason = check_nvfp4_support()
+        if not supported:
+            raise RuntimeError(f"NVFP4 quantization not supported: {reason}")
+
+        # Cast and move in a single .to() call; layers are then quantized where they sit
+        model = model.to(dtype=dtype, device=device)
+
+        start = time.time()
+        quantize_model_nvfp4(model, layer_filter=_default_layer_filter)
+        print(f"Quantized diffusion model to nvfp4 in {time.time() - start:.3f}s")
+
+    else:  # None or any unhandled value: plain cast and move, no quantization
+        model = model.to(device=device, dtype=dtype)
+
+    return model
+
+
+def apply_quantization_to_module(
+    module: torch.nn.Module,
+    quantization: Quantization | None,
+    device: torch.device | str,
+    dtype: torch.dtype,
+) -> None:
+    """Apply quantization to a specific module (e.g., VACE components).
+
+    Unlike apply_quantization, this expects a sub-module already on the correct device,
+    quantizes it in place without returning it, and only warns if NVFP4 is unsupported.
+
+    Args:
+        module: The module to quantize
+        quantization: Quantization method to apply; None is a no-op
+        device: Target device (passed to torchao's quantize_ for FP8; not used for NVFP4)
+        dtype: Target dtype (currently unused; the module is not cast here)
+    """
+    if quantization is None:
+        return
+
+    if quantization == Quantization.FP8_E4M3FN:
+        from torchao.quantization.quant_api import (
+            Float8DynamicActivationFloat8WeightConfig,
+            PerTensor,
+            quantize_,
+        )
+
+        quantize_(
+            module,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
+            device=device,
+        )
+
+    elif quantization == Quantization.NVFP4:
+        supported, reason = check_nvfp4_support()
+        if not supported:  # best-effort: leave the sub-module unquantized instead of raising
+            logger.warning(f"NVFP4 not supported for sub-module, skipping: {reason}")
+            return
+
+        quantize_model_nvfp4(module, layer_filter=_default_layer_filter)
diff --git a/src/scope/core/pipelines/reward_forcing/pipeline.py b/src/scope/core/pipelines/reward_forcing/pipeline.py
index f36263f55..59983028d 100644
--- a/src/scope/core/pipelines/reward_forcing/pipeline.py
+++ b/src/scope/core/pipelines/reward_forcing/pipeline.py
@@ -15,6 +15,7 @@
 )
 from ..interface import Pipeline, Requirements
 from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
 from ..utils import Quantization, load_model_config, validate_resolution
 from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
 from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -84,29 +85,7 @@ def __init__(
         # Initialize any additional, user-configured LoRA adapters via shared manager.
         generator.model = self._init_loras(config, generator.model)
 
-        if quantization == Quantization.FP8_E4M3FN:
-            # Cast before optional quantization
-            generator = generator.to(dtype=dtype)
-
-            start = time.time()
-
-            from torchao.quantization.quant_api import (
-                Float8DynamicActivationFloat8WeightConfig,
-                PerTensor,
-                quantize_,
-            )
-
-            # Move to target device during quantization
-            # Defaults to using fp8_e4m3fn for both weights and activations
-            quantize_(
-                generator,
-                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-                device=device,
-            )
-
-            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
-        else:
-            generator = generator.to(device=device, dtype=dtype)
+        generator = apply_quantization(generator, quantization, device, dtype)
 
         start = time.time()
         text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/streamdiffusionv2/pipeline.py b/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
index 5c4bfb46b..9d1e106a1 100644
--- a/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
+++ b/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
@@ -15,6 +15,7 @@
 )
 from ..interface import Pipeline, Requirements
 from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
 from ..utils import Quantization, load_model_config, validate_resolution
 from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
 from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -87,29 +88,7 @@ def __init__(
         # Initialize optional LoRA adapters on the underlying model.
         generator.model = self._init_loras(config, generator.model)
 
-        if quantization == Quantization.FP8_E4M3FN:
-            # Cast before optional quantization
-            generator = generator.to(dtype=dtype)
-
-            start = time.time()
-
-            from torchao.quantization.quant_api import (
-                Float8DynamicActivationFloat8WeightConfig,
-                PerTensor,
-                quantize_,
-            )
-
-            # Move to target device during quantization
-            # Defaults to using fp8_e4m3fn for both weights and activations
-            quantize_(
-                generator,
-                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-                device=device,
-            )
-
-            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
-        else:
-            generator = generator.to(device=device, dtype=dtype)
+        generator = apply_quantization(generator, quantization, device, dtype)
 
         start = time.time()
         text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/utils.py b/src/scope/core/pipelines/utils.py
index c05e5bd01..43483bf52 100644
--- a/src/scope/core/pipelines/utils.py
+++ b/src/scope/core/pipelines/utils.py
@@ -10,6 +10,9 @@
 from .enums import Quantization as Quantization  # noqa: PLC0414
 from .enums import VaeType as VaeType  # noqa: PLC0414
 
+# Re-export quantization utilities
+from .quantization_utils import apply_quantization as apply_quantization  # noqa: PLC0414
+
 
 def load_state_dict(weights_path: str) -> dict:
     """Load weights with automatic format detection."""
diff --git a/src/scope/core/pipelines/wan2_1/vace/mixin.py b/src/scope/core/pipelines/wan2_1/vace/mixin.py
index d984ba023..d080a1ce3 100644
--- a/src/scope/core/pipelines/wan2_1/vace/mixin.py
+++ b/src/scope/core/pipelines/wan2_1/vace/mixin.py
@@ -144,42 +144,33 @@ def _init_vace(
 
         # Quantize VACE components if quantization is enabled
         if quantization is not None:
-            # Import here to avoid circular dependency
             try:
-                from ...utils import Quantization
-
-                if quantization == Quantization.FP8_E4M3FN:
-                    logger.info(
-                        "_init_vace: Quantizing VACE components to FP8 (matching base model)..."
-                    )
-                    start = time.time()
-
-                    from torchao.quantization.quant_api import (
-                        Float8DynamicActivationFloat8WeightConfig,
-                        PerTensor,
-                        quantize_,
-                    )
-
-                    quantize_(
-                        vace_wrapped_model.vace_patch_embedding,
-                        Float8DynamicActivationFloat8WeightConfig(
-                            granularity=PerTensor()
-                        ),
-                        device=device,
-                    )
-                    quantize_(
-                        vace_wrapped_model.vace_blocks,
-                        Float8DynamicActivationFloat8WeightConfig(
-                            granularity=PerTensor()
-                        ),
-                        device=device,
-                    )
-                    logger.info(
-                        f"_init_vace: Quantized VACE to FP8 in {time.time() - start:.3f}s"
-                    )
+                from ...quantization_utils import apply_quantization_to_module
+
+                logger.info(
+                    f"_init_vace: Quantizing VACE components with {quantization}..."
+                )
+                start = time.time()
+
+                apply_quantization_to_module(
+                    vace_wrapped_model.vace_patch_embedding,
+                    quantization,
+                    device,
+                    dtype,
+                )
+                apply_quantization_to_module(
+                    vace_wrapped_model.vace_blocks,
+                    quantization,
+                    device,
+                    dtype,
+                )
+
+                logger.info(
+                    f"_init_vace: Quantized VACE components in {time.time() - start:.3f}s"
+                )
             except ImportError:
                 logger.warning(
-                    "_init_vace: Could not import Quantization, skipping quantization check"
+                    "_init_vace: Could not import quantization_utils, skipping quantization"
                 )
 
         self.vace_enabled = True

From f59fa9355a6367b683b293c2201b88b4f54aed5b Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 10 Feb 2026 20:01:44 -0500
Subject: [PATCH 2/5] Added dependencies required for nvfp4 and audio for
 future-proofing.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 987fa935d..01ba73f9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
     "uvicorn>=0.35.0",
     "torch==2.9.1",
     "torchvision==0.24.1",
+    "torchaudio==2.9.1",
     "easydict>=1.13",
     "diffusers>=0.31.0",
     "ftfy>=6.3.1",
@@ -57,6 +58,7 @@ dependencies = [
     "triton-windows==3.5.1.post24; sys_platform == 'win32'",
     "SpoutGL>=0.1.1; sys_platform == 'win32'",
     "PyOpenGL>=3.1.10; sys_platform == 'win32'",
+    "comfy-kitchen[cublas]>=0.1.0; sys_platform == 'linux' or sys_platform == 'win32'",
 ]
 
 [project.optional-dependencies]
@@ -95,6 +97,9 @@ torch = [
 torchvision = [
     { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
 ]
+torchaudio = [
+    { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
 flash-attn = [
     # Prebuilt Linux wheels from https://github.com/Dao-AILab/flash-attention
     { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux'" },

From 0fc457ff453915a7263d716a4a535e530d782e5e Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 10 Feb 2026 20:08:03 -0500
Subject: [PATCH 3/5] Built nvfp4 detection via hardware info.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/components/ComplexFields.tsx |  9 ++++++---
 frontend/src/components/SettingsPanel.tsx | 12 +++++++++---
 frontend/src/hooks/useStreamState.ts      | 14 ++++++++++++++
 frontend/src/lib/api.ts                   |  1 +
 frontend/src/pages/StreamPage.tsx         |  2 ++
 src/scope/server/app.py                   |  6 ++++++
 src/scope/server/cloud_proxy.py           |  1 +
 src/scope/server/schema.py                |  4 ++++
 8 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/frontend/src/components/ComplexFields.tsx b/frontend/src/components/ComplexFields.tsx
index 61860e571..566a29448 100644
--- a/frontend/src/components/ComplexFields.tsx
+++ b/frontend/src/components/ComplexFields.tsx
@@ -67,6 +67,7 @@ export interface SchemaComplexFieldContext {
   inputMode?: "text" | "video";
   supportsNoiseControls?: boolean;
   supportsQuantization?: boolean;
+  supportsNvfp4?: boolean;
   supportsCacheManagement?: boolean;
   supportsKvCacheBias?: boolean;
   isStreaming?: boolean;
@@ -497,9 +498,11 @@ export function SchemaComplexField({
                   <SelectItem value="fp8_e4m3fn">
                     fp8_e4m3fn (Dynamic)
                   </SelectItem>
-                  <SelectItem value="nvfp4">
-                    nvfp4 (Blackwell)
-                  </SelectItem>
+                  {ctx.supportsNvfp4 && (
+                    <SelectItem value="nvfp4">
+                      nvfp4 (Blackwell)
+                    </SelectItem>
+                  )}
                 </SelectContent>
               </Select>
             </div>
diff --git a/frontend/src/components/SettingsPanel.tsx b/frontend/src/components/SettingsPanel.tsx
index e58e7aa75..1fc293d2e 100644
--- a/frontend/src/components/SettingsPanel.tsx
+++ b/frontend/src/components/SettingsPanel.tsx
@@ -90,6 +90,8 @@ interface SettingsPanelProps {
   // Spout settings
   spoutSender?: SettingsState["spoutSender"];
   onSpoutSenderChange?: (spoutSender: SettingsState["spoutSender"]) => void;
+  // Whether GPU supports NVFP4 quantization (Blackwell SM >= 10.0)
+  supportsNvfp4?: boolean;
   // Whether Spout is available (server-side detection for native Windows, not WSL)
   spoutAvailable?: boolean;
   // VACE settings
@@ -143,6 +145,7 @@ export function SettingsPanel({
   loraMergeStrategy = "permanent_merge",
   inputMode,
   supportsNoiseControls = false,
+  supportsNvfp4 = false,
   spoutSender,
   onSpoutSenderChange,
   spoutAvailable = false,
@@ -525,6 +528,7 @@ export function SettingsPanel({
               supportsNoiseControls,
               supportsQuantization:
                 pipelines?.[pipelineId]?.supportsQuantization,
+              supportsNvfp4,
               supportsCacheManagement:
                 pipelines?.[pipelineId]?.supportsCacheManagement,
               supportsKvCacheBias: pipelines?.[pipelineId]?.supportsKvCacheBias,
@@ -964,9 +968,11 @@ export function SettingsPanel({
                             <SelectItem value="fp8_e4m3fn">
                               fp8_e4m3fn (Dynamic)
                             </SelectItem>
-                            <SelectItem value="nvfp4">
-                              nvfp4 (Blackwell)
-                            </SelectItem>
+                            {supportsNvfp4 && (
+                              <SelectItem value="nvfp4">
+                                nvfp4 (Blackwell)
+                              </SelectItem>
+                            )}
                           </SelectContent>
                         </Select>
                       </div>
diff --git a/frontend/src/hooks/useStreamState.ts b/frontend/src/hooks/useStreamState.ts
index d4e827005..3b6d8e0a2 100644
--- a/frontend/src/hooks/useStreamState.ts
+++ b/frontend/src/hooks/useStreamState.ts
@@ -334,6 +334,20 @@ export function useStreamState() {
     }
   }, [settings.pipelineId, hardwareInfo, pipelineSchemas]);
 
+  // Reset nvfp4 selection if GPU doesn't support it (e.g. from persisted state)
+  useEffect(() => {
+    if (
+      hardwareInfo &&
+      !hardwareInfo.supports_nvfp4 &&
+      settings.quantization === "nvfp4"
+    ) {
+      setSettings(prev => ({
+        ...prev,
+        quantization: "fp8_e4m3fn",
+      }));
+    }
+  }, [hardwareInfo, settings.quantization]);
+
   // Set recommended VACE enabled state based on pipeline schema and available VRAM
   // VACE is enabled by default, but disabled if VRAM is below recommended_quantization_vram_threshold
   useEffect(() => {
diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts
index fbea0bc9e..3f9cbce4c 100644
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@@ -206,6 +206,7 @@ export const downloadPipelineModels = async (
 export interface HardwareInfoResponse {
   vram_gb: number | null;
   spout_available: boolean;
+  supports_nvfp4: boolean;
 }
 
 export const getHardwareInfo = async (): Promise<HardwareInfoResponse> => {
diff --git a/frontend/src/pages/StreamPage.tsx b/frontend/src/pages/StreamPage.tsx
index beea3352b..60b61765f 100644
--- a/frontend/src/pages/StreamPage.tsx
+++ b/frontend/src/pages/StreamPage.tsx
@@ -104,6 +104,7 @@ export function StreamPage() {
     getDefaults,
     supportsNoiseControls,
     spoutAvailable,
+    hardwareInfo,
     refreshPipelineSchemas,
     refreshHardwareInfo,
   } = useStreamState();
@@ -1532,6 +1533,7 @@ export function StreamPage() {
             loraMergeStrategy={settings.loraMergeStrategy ?? "permanent_merge"}
             inputMode={settings.inputMode}
             supportsNoiseControls={supportsNoiseControls(settings.pipelineId)}
+            supportsNvfp4={hardwareInfo?.supports_nvfp4 ?? false}
             spoutSender={settings.spoutSender}
             onSpoutSenderChange={handleSpoutSenderChange}
             spoutAvailable={spoutAvailable}
diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index f14769ddc..bb2a61de5 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -1152,15 +1152,21 @@ async def get_hardware_info(
         import torch  # Lazy import to avoid loading at CLI startup
 
         vram_gb = None
+        supports_nvfp4 = False
 
         if torch.cuda.is_available():
             # Get total VRAM from the first GPU (in bytes), convert to GB
             _, total_mem = torch.cuda.mem_get_info(0)
             vram_gb = total_mem / (1024**3)
 
+            # Blackwell GPUs (SM >= 10.0) support NVFP4 quantization
+            cap = torch.cuda.get_device_capability()
+            supports_nvfp4 = cap >= (10, 0)
+
         return HardwareInfoResponse(
             vram_gb=vram_gb,
             spout_available=is_spout_available(),
+            supports_nvfp4=supports_nvfp4,
         )
     except HTTPException:
         raise
diff --git a/src/scope/server/cloud_proxy.py b/src/scope/server/cloud_proxy.py
index 2e1f4d623..90b085fc9 100644
--- a/src/scope/server/cloud_proxy.py
+++ b/src/scope/server/cloud_proxy.py
@@ -130,6 +130,7 @@ async def get_hardware_info_from_cloud(
     return HardwareInfoResponse(
         vram_gb=data.get("vram_gb"),
         spout_available=spout_available,
+        supports_nvfp4=data.get("supports_nvfp4", False),
     )
 
 
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index e0b9a2a91..3950d2217 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -251,6 +251,10 @@ class HardwareInfoResponse(BaseModel):
         default=False,
         description="Whether Spout is available (Windows only, not WSL)",
     )
+    supports_nvfp4: bool = Field(
+        default=False,
+        description="Whether GPU supports NVFP4 quantization (Blackwell SM >= 10.0)",
+    )
 
 
 class PipelineStatusEnum(str, Enum):

From ca83d1b46f06dd6d74c648d1b803cbc8268e3385 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 10 Feb 2026 20:43:21 -0500
Subject: [PATCH 4/5] Linting.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/components/ComplexFields.tsx |  4 +-
 frontend/src/components/SettingsPanel.tsx |  8 +-
 frontend/src/pages/StreamPage.tsx         |  4 +-
 uv.lock                                   | 92 +++++++++++++++++++++++
 4 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/frontend/src/components/ComplexFields.tsx b/frontend/src/components/ComplexFields.tsx
index 566a29448..53da45962 100644
--- a/frontend/src/components/ComplexFields.tsx
+++ b/frontend/src/components/ComplexFields.tsx
@@ -499,9 +499,7 @@ export function SchemaComplexField({
                     fp8_e4m3fn (Dynamic)
                   </SelectItem>
                   {ctx.supportsNvfp4 && (
-                    <SelectItem value="nvfp4">
-                      nvfp4 (Blackwell)
-                    </SelectItem>
+                    <SelectItem value="nvfp4">nvfp4 (Blackwell)</SelectItem>
                   )}
                 </SelectContent>
               </Select>
diff --git a/frontend/src/components/SettingsPanel.tsx b/frontend/src/components/SettingsPanel.tsx
index 1fc293d2e..fc8f1c6b6 100644
--- a/frontend/src/components/SettingsPanel.tsx
+++ b/frontend/src/components/SettingsPanel.tsx
@@ -621,8 +621,8 @@ export function SettingsPanel({
                     <div className="flex items-start gap-1.5 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
                       <Info className="h-3.5 w-3.5 mt-0.5 shrink-0 text-amber-600 dark:text-amber-500" />
                       <p className="text-xs text-amber-600 dark:text-amber-500">
-                        VACE is incompatible with quantization. Please
-                        disable quantization to use VACE.
+                        VACE is incompatible with quantization. Please disable
+                        quantization to use VACE.
                       </p>
                     </div>
                   )}
@@ -955,7 +955,9 @@ export function SettingsPanel({
                           value={quantization || "none"}
                           onValueChange={value => {
                             onQuantizationChange?.(
-                              value === "none" ? null : (value as "fp8_e4m3fn" | "nvfp4")
+                              value === "none"
+                                ? null
+                                : (value as "fp8_e4m3fn" | "nvfp4")
                             );
                           }}
                           disabled={isStreaming || vaceEnabled}
diff --git a/frontend/src/pages/StreamPage.tsx b/frontend/src/pages/StreamPage.tsx
index 60b61765f..b8e6f133a 100644
--- a/frontend/src/pages/StreamPage.tsx
+++ b/frontend/src/pages/StreamPage.tsx
@@ -589,7 +589,9 @@ export function StreamPage() {
     });
   };
 
-  const handleQuantizationChange = (quantization: "fp8_e4m3fn" | "nvfp4" | null) => {
+  const handleQuantizationChange = (
+    quantization: "fp8_e4m3fn" | "nvfp4" | null
+  ) => {
     updateSettings({ quantization });
     // Note: This setting requires pipeline reload, so we don't send parameter update here
   };
diff --git a/uv.lock b/uv.lock
index cc6638621..5b4470e16 100644
--- a/uv.lock
+++ b/uv.lock
@@ -462,6 +462,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
 
+[[package]]
+name = "comfy-kitchen"
+version = "0.2.7"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/e1/9b6e7764f8dcd5cb9b9ae369e55660bf24b7f48825584521246e3bddf43e/comfy_kitchen-0.2.7-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a168eb1fcdbb31707bb0e1226c6d44e1bd1b0a5ac1ac0a4d9c6eb7296b903ae", size = 680629, upload-time = "2026-01-17T03:48:13.922Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/6b/1cea270d5014a465929375c434c2f78a35fadde5dfb6f436864e4c8f7a52/comfy_kitchen-0.2.7-cp312-abi3-win_amd64.whl", hash = "sha256:047b9ac7c8c1a845a51b0de3fb05c8d007666d68a3e776e07ecb5db21f15fbdd", size = 592877, upload-time = "2026-01-17T03:48:15.262Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/65/d483613734d0b9753bd9bfa297ff334cb2c7766e82306099db6b259b4e2c/comfy_kitchen-0.2.7-py3-none-any.whl", hash = "sha256:f8faa579b69d331d2f1eac09e96a95586c2a6b958a54bc19e7f1c1a77852dd36", size = 58034, upload-time = "2026-01-17T03:48:16.561Z" },
+]
+
+[package.optional-dependencies]
+cublas = [
+    { name = "nvidia-cublas", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
+
 [[package]]
 name = "cryptography"
 version = "46.0.3"
@@ -527,6 +542,7 @@ dependencies = [
     { name = "aiohttp" },
     { name = "aiortc" },
     { name = "click" },
+    { name = "comfy-kitchen", extra = ["cublas"], marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
     { name = "diffusers" },
     { name = "easydict" },
     { name = "einops" },
@@ -549,6 +565,9 @@ dependencies = [
     { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
     { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
     { name = "torchao" },
+    { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
+    { name = "torchaudio", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
+    { name = "torchaudio", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
     { name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
     { name = "torchvision", version = "0.24.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
     { name = "torchvision", version = "0.24.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
@@ -582,6 +601,7 @@ requires-dist = [
     { name = "aiokafka", marker = "extra == 'kafka'", specifier = ">=0.10.0" },
     { name = "aiortc", specifier = ">=1.13.0" },
     { name = "click", specifier = ">=8.3.1" },
+    { name = "comfy-kitchen", extras = ["cublas"], marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = ">=0.1.0" },
     { name = "diffusers", specifier = ">=0.31.0" },
     { name = "easydict", specifier = ">=1.13" },
     { name = "einops", specifier = ">=0.8.1" },
@@ -604,6 +624,8 @@ requires-dist = [
     { name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==2.9.1" },
     { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128" },
     { name = "torchao", specifier = "==0.15.0" },
+    { name = "torchaudio", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==2.9.1" },
+    { name = "torchaudio", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128" },
     { name = "torchvision", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==0.24.1" },
     { name = "torchvision", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu128" },
     { name = "transformers", specifier = ">=4.49.0" },
@@ -1505,6 +1527,16 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ad/0d/eca3d962f9eef265f01a8e0d20085c6dd1f443cbffc11b6dede81fd82356/numpy-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:6436cffb4f2bf26c974344439439c95e152c9a527013f26b3577be6c2ca64295", size = 10667121, upload-time = "2026-01-10T06:44:41.644Z" },
 ]
 
+[[package]]
+name = "nvidia-cublas"
+version = "13.2.1.1"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/36/0124129e1378e9834e0cbe19781fbe0ffd5f870c2af6f01cdf17a9869c39/nvidia_cublas-13.2.1.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8b4a4cd8b73772fde9ccaa1f3967eb001ae5fde8b1dc37f7442d072b64d6f5da", size = 502470979, upload-time = "2026-01-13T22:39:37.619Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/e7/39e43c0688f9788c88da0b91ea18125448c5f515104aadf65a70243f144f/nvidia_cublas-13.2.1.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8c13c93cf8be4480b4909905c96d2d31575b4af43fcd3af0e84af94762665e4f", size = 401085577, upload-time = "2026-01-13T22:40:18.702Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/9b/d9788b63872c6e4ce0fb292f2000642e73a8ae4da2d6f6b33759b77059af/nvidia_cublas-13.2.1.1-py3-none-win_amd64.whl", hash = "sha256:bc94f0597c21cfd6fea9446b18309b2351630ff227bb4e8575196494fb51c6b6", size = 385519499, upload-time = "2026-01-13T22:57:28.499Z" },
+]
+
 [[package]]
 name = "nvidia-cublas-cu12"
 version = "12.8.4.1"
@@ -2558,6 +2590,66 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f6/3b/6b9d5618720f63dbc2e2509cd6b57aae9c0d61b738d1d2172f4d5d9efaab/torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c", size = 1080679, upload-time = "2025-12-18T23:14:43.807Z" },
 ]
 
+[[package]]
+name = "torchaudio"
+version = "2.9.1"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+    "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:59f70f6aa6a7e77a1fd51756d7d25fec22bead0b50ce7bed4ede75a5fa6b21d1" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ea6fe3b9525493df0a5eb5eed5c22925065b5830f6999980ed76bb36c4592d34" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cfd604a617a245d26a5381ad7d669047ac1c152896227d8a006aad12151f8" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:bb790f968539f6b4115e637dbf3aab71f3f92e41f0c12e6bc7d52324f0051113" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:7d8da8816dfa25869da206eb4cdddb11603b042ab59326c6466a65b6e64d2684" },
+]
+
+[[package]]
+name = "torchaudio"
+version = "2.9.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+    { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f1/83/71cbadd7b66753818b5775f2088bad4f721d581de276996df4968000a626/torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7581ef170794c599aed55918e00d0acd9e5c9a0f19400c9a9a840955180365c5", size = 808098, upload-time = "2025-11-12T15:26:01.408Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/1b/3321ad6379ac2d968064704e8d015c31ccae5d1ece070f87fb44b17d90e6/torchaudio-2.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bb69557484c92513a980027ec4cb314b0f43cf4442bbfd97440e66528dbad22d", size = 808136, upload-time = "2025-11-12T15:26:00.276Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/58/e82d8b5f447abdddc950965f1395f36baef3602643dd069100c6369ba73e/torchaudio-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9290f6a6409deb1f9113d5aef97ec646eeee6410b6bcc57ab8b57066b54da7c1", size = 813456, upload-time = "2025-11-12T15:26:13.963Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/38/0dabf362f946ab5773d3db3322718d652d70ad12a82f500d54c6c8b9cc88/torchaudio-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:69a582650279ee16ff9087f99b4234fe5d766e1bf7f0be352db5f46991854c1e", size = 810496, upload-time = "2025-11-12T15:26:11.515Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/f6/237e00a04dea497a40a8567d024dfb39193abec3ca3695ad51919ad633d1/torchaudio-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e13cb38971ac259fc4e102282a3e48f6df5f0ab00eb785ca5155e3392d1e86f1", size = 813463, upload-time = "2025-11-12T15:26:16.261Z" },
+]
+
+[[package]]
+name = "torchaudio"
+version = "2.9.1+cu128"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+    "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux')",
+    "sys_platform == 'win32'",
+]
+dependencies = [
+    { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
+]
+wheels = [
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:54eb19e634b8c567886a1b53b4184506d943c3ba5139198e9fe1b941bc566f30" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:88896c7bfa486102439fab6c85ac834176617e9c06eb0be9074c07ee1183b47d" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:52297b7dfb7c42e311385572bc9c0186e602ea1a5f20c42923765baea99aff83" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:abb0ee5a40c883ad17d90cdd45965c06deef42a0a2ffc58c51e32729642292f0" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ddc7410908858693d3b81346f53b5e5e51f987b3b7128978be6c774314377204" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:9d29dc3a2e0c43da66d33bcb9e22ad58c58f0ae1b6dcfe2d8d94bda279ddcf89" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:12c3e3d3aaf856d679328a5a9d46d866bc88b4c5290f2128f306abff975fa51e" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:eb6c714557c8d47f4fc65ec58b14a21cb4150940c242fe77e7517636c20ed3c3" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:6d9f5d53861b2fc057c1dd5051721f60b2253176c44a856d0f19100e312add3f" },
+    { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:150a8d7d51df9f667b5386cff5850f685b6059c59db51e056d7157955aad9e75" },
+]
+
 [[package]]
 name = "torchvision"
 version = "0.24.1"

From 202b03148a14f5579a34ab6f8000e97f68fdc1d5 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 10 Feb 2026 20:45:41 -0500
Subject: [PATCH 5/5] Linting.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/core/pipelines/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/scope/core/pipelines/utils.py b/src/scope/core/pipelines/utils.py
index 43483bf52..3d2f022be 100644
--- a/src/scope/core/pipelines/utils.py
+++ b/src/scope/core/pipelines/utils.py
@@ -11,7 +11,9 @@
 from .enums import VaeType as VaeType  # noqa: PLC0414
 
 # Re-export quantization utilities
-from .quantization_utils import apply_quantization as apply_quantization  # noqa: PLC0414
+from .quantization_utils import (
+    apply_quantization as apply_quantization,  # noqa: PLC0414
+)
 
 
 def load_state_dict(weights_path: str) -> dict: