diff --git a/frontend/src/components/ComplexFields.tsx b/frontend/src/components/ComplexFields.tsx
index 77e162d32..53da45962 100644
--- a/frontend/src/components/ComplexFields.tsx
+++ b/frontend/src/components/ComplexFields.tsx
@@ -49,7 +49,7 @@ export interface SchemaComplexFieldContext {
vaceUseInputVideo?: boolean;
onVaceUseInputVideoChange?: (enabled: boolean) => void;
vaceContextScaleSlider?: SliderState;
- quantization?: "fp8_e4m3fn" | null;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
loras?: LoRAConfig[];
onLorasChange?: (loras: LoRAConfig[]) => void;
loraMergeStrategy?: LoraMergeStrategy;
@@ -63,10 +63,11 @@ export interface SchemaComplexFieldContext {
noiseScaleSlider?: SliderState;
noiseController?: boolean;
onNoiseControllerChange?: (enabled: boolean) => void;
- onQuantizationChange?: (q: "fp8_e4m3fn" | null) => void;
+ onQuantizationChange?: (q: "fp8_e4m3fn" | "nvfp4" | null) => void;
inputMode?: "text" | "video";
supportsNoiseControls?: boolean;
supportsQuantization?: boolean;
+ supportsNvfp4?: boolean;
supportsCacheManagement?: boolean;
supportsKvCacheBias?: boolean;
isStreaming?: boolean;
@@ -156,7 +157,7 @@ export function SchemaComplexField({
- VACE is incompatible with FP8 quantization. Please disable
+ VACE is incompatible with quantization. Please disable
quantization to use VACE.
@@ -482,14 +483,14 @@ export function SchemaComplexField({
value={ctx.quantization ?? "none"}
onValueChange={v =>
ctx.onQuantizationChange?.(
- v === "none" ? null : (v as "fp8_e4m3fn")
+ v === "none" ? null : (v as "fp8_e4m3fn" | "nvfp4")
)
}
disabled={
(ctx.isStreaming ?? false) || (ctx.vaceEnabled ?? false)
}
>
-
+
@@ -497,12 +498,15 @@ export function SchemaComplexField({
fp8_e4m3fn (Dynamic)
+ {ctx.supportsNvfp4 && (
+ nvfp4 (Blackwell)
+ )}
{ctx.vaceEnabled && (
- Disabled because VACE is enabled. Disable VACE to use FP8
+ Disabled because VACE is enabled. Disable VACE to use
quantization.
)}
diff --git a/frontend/src/components/SettingsPanel.tsx b/frontend/src/components/SettingsPanel.tsx
index 3125e22ca..fc8f1c6b6 100644
--- a/frontend/src/components/SettingsPanel.tsx
+++ b/frontend/src/components/SettingsPanel.tsx
@@ -75,8 +75,8 @@ interface SettingsPanelProps {
onNoiseControllerChange?: (enabled: boolean) => void;
manageCache?: boolean;
onManageCacheChange?: (enabled: boolean) => void;
- quantization?: "fp8_e4m3fn" | null;
- onQuantizationChange?: (quantization: "fp8_e4m3fn" | null) => void;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
+ onQuantizationChange?: (quantization: "fp8_e4m3fn" | "nvfp4" | null) => void;
kvCacheAttentionBias?: number;
onKvCacheAttentionBiasChange?: (bias: number) => void;
onResetCache?: () => void;
@@ -90,6 +90,8 @@ interface SettingsPanelProps {
// Spout settings
spoutSender?: SettingsState["spoutSender"];
onSpoutSenderChange?: (spoutSender: SettingsState["spoutSender"]) => void;
+ // Whether GPU supports NVFP4 quantization (Blackwell SM >= 10.0)
+ supportsNvfp4?: boolean;
// Whether Spout is available (server-side detection for native Windows, not WSL)
spoutAvailable?: boolean;
// VACE settings
@@ -143,6 +145,7 @@ export function SettingsPanel({
loraMergeStrategy = "permanent_merge",
inputMode,
supportsNoiseControls = false,
+ supportsNvfp4 = false,
spoutSender,
onSpoutSenderChange,
spoutAvailable = false,
@@ -525,6 +528,7 @@ export function SettingsPanel({
supportsNoiseControls,
supportsQuantization:
pipelines?.[pipelineId]?.supportsQuantization,
+ supportsNvfp4,
supportsCacheManagement:
pipelines?.[pipelineId]?.supportsCacheManagement,
supportsKvCacheBias: pipelines?.[pipelineId]?.supportsKvCacheBias,
@@ -617,8 +621,8 @@ export function SettingsPanel({
- VACE is incompatible with FP8 quantization. Please
- disable quantization to use VACE.
+ VACE is incompatible with quantization. Please disable
+ quantization to use VACE.
)}
@@ -951,12 +955,14 @@ export function SettingsPanel({
value={quantization || "none"}
onValueChange={value => {
onQuantizationChange?.(
- value === "none" ? null : (value as "fp8_e4m3fn")
+ value === "none"
+ ? null
+ : (value as "fp8_e4m3fn" | "nvfp4")
);
}}
disabled={isStreaming || vaceEnabled}
>
-
+
@@ -964,6 +970,11 @@ export function SettingsPanel({
fp8_e4m3fn (Dynamic)
+ {supportsNvfp4 && (
+
+ nvfp4 (Blackwell)
+
+ )}
@@ -971,7 +982,7 @@ export function SettingsPanel({
{vaceEnabled && (
Disabled because VACE is enabled. Disable VACE to use
- FP8 quantization.
+ quantization.
)}
diff --git a/frontend/src/hooks/useStreamState.ts b/frontend/src/hooks/useStreamState.ts
index 3f76c424d..3b6d8e0a2 100644
--- a/frontend/src/hooks/useStreamState.ts
+++ b/frontend/src/hooks/useStreamState.ts
@@ -38,7 +38,7 @@ function getFallbackDefaults(mode?: InputMode) {
noiseController: isVideoMode ? true : undefined,
defaultTemporalInterpolationSteps: undefined as number | undefined,
inputMode: effectiveMode,
- quantization: undefined as "fp8_e4m3fn" | undefined,
+ quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
};
}
@@ -125,7 +125,7 @@ export function useStreamState() {
noiseController,
defaultTemporalInterpolationSteps,
inputMode: effectiveMode,
- quantization: undefined as "fp8_e4m3fn" | undefined,
+ quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
};
}
// Fallback to derived defaults if schemas not loaded
@@ -334,6 +334,20 @@ export function useStreamState() {
}
}, [settings.pipelineId, hardwareInfo, pipelineSchemas]);
+ // Reset nvfp4 selection if GPU doesn't support it (e.g. from persisted state)
+ useEffect(() => {
+ if (
+ hardwareInfo &&
+ !hardwareInfo.supports_nvfp4 &&
+ settings.quantization === "nvfp4"
+ ) {
+ setSettings(prev => ({
+ ...prev,
+ quantization: "fp8_e4m3fn",
+ }));
+ }
+ }, [hardwareInfo, settings.quantization]);
+
// Set recommended VACE enabled state based on pipeline schema and available VRAM
// VACE is enabled by default, but disabled if VRAM is below recommended_quantization_vram_threshold
useEffect(() => {
diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts
index fbea0bc9e..3f9cbce4c 100644
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@@ -206,6 +206,7 @@ export const downloadPipelineModels = async (
export interface HardwareInfoResponse {
vram_gb: number | null;
spout_available: boolean;
+ supports_nvfp4: boolean;
}
export const getHardwareInfo = async (): Promise => {
diff --git a/frontend/src/pages/StreamPage.tsx b/frontend/src/pages/StreamPage.tsx
index a35f5da09..b8e6f133a 100644
--- a/frontend/src/pages/StreamPage.tsx
+++ b/frontend/src/pages/StreamPage.tsx
@@ -104,6 +104,7 @@ export function StreamPage() {
getDefaults,
supportsNoiseControls,
spoutAvailable,
+ hardwareInfo,
refreshPipelineSchemas,
refreshHardwareInfo,
} = useStreamState();
@@ -588,7 +589,9 @@ export function StreamPage() {
});
};
- const handleQuantizationChange = (quantization: "fp8_e4m3fn" | null) => {
+ const handleQuantizationChange = (
+ quantization: "fp8_e4m3fn" | "nvfp4" | null
+ ) => {
updateSettings({ quantization });
// Note: This setting requires pipeline reload, so we don't send parameter update here
};
@@ -1532,6 +1535,7 @@ export function StreamPage() {
loraMergeStrategy={settings.loraMergeStrategy ?? "permanent_merge"}
inputMode={settings.inputMode}
supportsNoiseControls={supportsNoiseControls(settings.pipelineId)}
+ supportsNvfp4={hardwareInfo?.supports_nvfp4 ?? false}
spoutSender={settings.spoutSender}
onSpoutSenderChange={handleSpoutSenderChange}
spoutAvailable={spoutAvailable}
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index 35d01686d..ac0074f3c 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -55,7 +55,7 @@ export interface SettingsState {
noiseScale?: number;
noiseController?: boolean;
manageCache?: boolean;
- quantization?: "fp8_e4m3fn" | null;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
kvCacheAttentionBias?: number;
paused?: boolean;
loras?: LoRAConfig[];
diff --git a/pyproject.toml b/pyproject.toml
index 987fa935d..01ba73f9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
"uvicorn>=0.35.0",
"torch==2.9.1",
"torchvision==0.24.1",
+ "torchaudio==2.9.1",
"easydict>=1.13",
"diffusers>=0.31.0",
"ftfy>=6.3.1",
@@ -57,6 +58,7 @@ dependencies = [
"triton-windows==3.5.1.post24; sys_platform == 'win32'",
"SpoutGL>=0.1.1; sys_platform == 'win32'",
"PyOpenGL>=3.1.10; sys_platform == 'win32'",
+ "comfy-kitchen[cublas]>=0.1.0; sys_platform == 'linux' or sys_platform == 'win32'",
]
[project.optional-dependencies]
@@ -95,6 +97,9 @@ torch = [
torchvision = [
{ index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
+torchaudio = [
+ { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
flash-attn = [
# Prebuilt Linux wheels from https://github.com/Dao-AILab/flash-attention
{ url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux'" },
diff --git a/src/scope/core/pipelines/enums.py b/src/scope/core/pipelines/enums.py
index 9de333da1..ca33e4a34 100644
--- a/src/scope/core/pipelines/enums.py
+++ b/src/scope/core/pipelines/enums.py
@@ -12,6 +12,7 @@ class Quantization(str, Enum):
"""Quantization method enumeration."""
FP8_E4M3FN = "fp8_e4m3fn"
+ NVFP4 = "nvfp4"
class VaeType(str, Enum):
diff --git a/src/scope/core/pipelines/krea_realtime_video/pipeline.py b/src/scope/core/pipelines/krea_realtime_video/pipeline.py
index c411db2c9..603db1cee 100644
--- a/src/scope/core/pipelines/krea_realtime_video/pipeline.py
+++ b/src/scope/core/pipelines/krea_realtime_video/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -111,29 +112,7 @@ def __init__(
# Initialize optional LoRA adapters on the underlying model AFTER VACE.
generator.model = self._init_loras(config, generator.model)
- if quantization == Quantization.FP8_E4M3FN:
- # Cast before optional quantization
- generator = generator.to(dtype=dtype)
-
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- # Move to target device during quantization
- # Defaults to using fp8_e4m3fn for both weights and activations
- quantize_(
- generator,
- Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
- device=device,
- )
-
- print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
- generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)
if compile:
# Only compile the attention blocks
diff --git a/src/scope/core/pipelines/longlive/pipeline.py b/src/scope/core/pipelines/longlive/pipeline.py
index f0b53b55d..dc2e80c9e 100644
--- a/src/scope/core/pipelines/longlive/pipeline.py
+++ b/src/scope/core/pipelines/longlive/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
# This is additive and does not replace the original LongLive performance LoRA.
generator.model = self._init_loras(config, generator.model)
- if quantization == Quantization.FP8_E4M3FN:
- # Cast before optional quantization
- generator = generator.to(dtype=dtype)
-
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- # Move to target device during quantization
- # Defaults to using fp8_e4m3fn for both weights and activations
- quantize_(
- generator,
- Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
- device=device,
- )
-
- print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
- generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)
start = time.time()
text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/memflow/pipeline.py b/src/scope/core/pipelines/memflow/pipeline.py
index e4e5b9d78..5f1efb90a 100644
--- a/src/scope/core/pipelines/memflow/pipeline.py
+++ b/src/scope/core/pipelines/memflow/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
# This is additive and does not replace the original MemFlow performance LoRA.
generator.model = self._init_loras(config, generator.model)
- if quantization == Quantization.FP8_E4M3FN:
- # Cast before optional quantization
- generator = generator.to(dtype=dtype)
-
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- # Move to target device during quantization
- # Defaults to using fp8_e4m3fn for both weights and activations
- quantize_(
- generator,
- Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
- device=device,
- )
-
- print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
- generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)
start = time.time()
text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/quantization_utils.py b/src/scope/core/pipelines/quantization_utils.py
new file mode 100644
index 000000000..be40ddcfe
--- /dev/null
+++ b/src/scope/core/pipelines/quantization_utils.py
@@ -0,0 +1,474 @@
+"""Quantization utilities for pipeline models.
+
+Provides shared quantization functions used across all pipelines that support
+quantization (FP8 via torchao, NVFP4 via comfy-kitchen).
+
+NVFP4 (E2M1) provides ~4x weight memory reduction on Blackwell GPUs (SM >= 10.0)
+using comfy-kitchen's QuantizedTensor and optimized CUDA kernels.
+
+FP8 (E4M3FN) provides ~2x weight memory reduction on Ada+ GPUs (SM >= 8.9)
+using torchao's dynamic activation quantization.
+"""
+
+from __future__ import annotations
+
+import gc
+import logging
+import time
+from collections.abc import Callable
+
+import torch
+
+from .enums import Quantization
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# NVFP4 Support
+# ============================================================================
+
+# Minimum SM version for NVFP4 hardware acceleration
+MIN_NVFP4_SM_VERSION = (10, 0) # Blackwell
+
+# Layout name for comfy-kitchen's NVFP4 layout
+NVFP4_LAYOUT = "TensorCoreNVFP4Layout"
+
+
+def check_nvfp4_support() -> tuple[bool, str]:
+ """Check if NVFP4 is supported on current hardware.
+
+ Returns:
+ Tuple of (is_supported, reason_if_not)
+ """
+ if not torch.cuda.is_available():
+ return False, "CUDA not available"
+
+ cap = torch.cuda.get_device_capability()
+ if cap < MIN_NVFP4_SM_VERSION:
+ return (
+ False,
+ f"Requires SM >= {MIN_NVFP4_SM_VERSION[0]}.{MIN_NVFP4_SM_VERSION[1]} (Blackwell), "
+ f"current: SM {cap[0]}.{cap[1]}",
+ )
+
+ # Check if comfy-kitchen is available
+ try:
+ import comfy_kitchen # noqa: F401
+ except ImportError:
+ return (
+ False,
+            "comfy-kitchen package not installed. Install with: pip install 'comfy-kitchen[cublas]'",
+ )
+
+ # Check if QuantizedTensor and NVFP4 layout are available
+ try:
+ from comfy_kitchen.tensor import ( # noqa: F401
+ QuantizedTensor,
+ TensorCoreNVFP4Layout,
+ )
+ except ImportError:
+ return False, "comfy-kitchen QuantizedTensor not available"
+
+ return True, ""
+
+
+class NVFP4Linear(torch.nn.Module):
+ """Linear layer with NVFP4 quantized weights using comfy-kitchen.
+
+ Stores weights as comfy-kitchen QuantizedTensor which automatically
+ dispatches to optimized NVFP4 kernels during matmul.
+
+ The weight is stored as an nn.Parameter containing a QuantizedTensor,
+ enabling the __torch_dispatch__ mechanism to route F.linear calls
+ to optimized NVFP4 kernels.
+ """
+
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ bias: bool = True,
+ device: torch.device | None = None,
+ dtype: torch.dtype | None = None,
+ ):
+ super().__init__()
+ self.in_features = in_features
+ self.out_features = out_features
+ self._orig_dtype = dtype or torch.bfloat16
+ self._layout_type = NVFP4_LAYOUT
+
+ self.register_parameter("weight", None)
+
+ if bias:
+ self.bias = torch.nn.Parameter(
+ torch.zeros(out_features, device=device, dtype=dtype or torch.bfloat16)
+ )
+ else:
+ self.register_parameter("bias", None)
+
+ @classmethod
+ def from_linear(cls, linear: torch.nn.Linear) -> NVFP4Linear:
+ """Create NVFP4Linear from a standard Linear layer.
+
+ Note: Does NOT free the original linear layer's memory.
+ The caller is responsible for cleanup after this returns.
+ """
+ from comfy_kitchen.tensor import QuantizedTensor
+
+ in_features = linear.in_features
+ out_features = linear.out_features
+ has_bias = linear.bias is not None
+ device = linear.weight.device
+ dtype = linear.weight.dtype
+
+ nvfp4_linear = cls(
+ in_features=in_features,
+ out_features=out_features,
+ bias=has_bias,
+ device=device,
+ dtype=dtype,
+ )
+
+ weight_2d = linear.weight.data.detach()
+ quantized_weight = QuantizedTensor.from_float(weight_2d, NVFP4_LAYOUT)
+ nvfp4_linear.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
+
+ if has_bias:
+ nvfp4_linear.bias = torch.nn.Parameter(
+ linear.bias.data.detach().clone().to(dtype), requires_grad=False
+ )
+
+ return nvfp4_linear
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Forward pass with NVFP4 quantized computation."""
+ from comfy_kitchen.tensor import QuantizedTensor
+
+ orig_shape = x.shape
+ reshaped_3d = x.dim() == 3
+
+ if reshaped_3d:
+ x = x.reshape(-1, orig_shape[2])
+
+ if x.dim() == 2:
+ x_qt = QuantizedTensor.from_float(x, self._layout_type)
+ out = torch.nn.functional.linear(x_qt, self.weight, self.bias)
+ else:
+ weight_dq = (
+ self.weight.dequantize()
+ if hasattr(self.weight, "dequantize")
+ else self.weight
+ )
+ out = torch.nn.functional.linear(x, weight_dq, self.bias)
+
+ if reshaped_3d:
+            out = out.reshape(orig_shape[0], orig_shape[1], self.out_features)
+
+ return out
+
+ def extra_repr(self) -> str:
+ return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
+
+
+def _default_layer_filter(name: str, module: torch.nn.Module) -> bool:
+ """Default filter for selecting transformer block linear layers for quantization.
+
+ Quantizes attention projections and MLP/FFN layers.
+ Excludes embedding layers, layer norms, output projections, and LoRA adapters.
+ """
+ if not isinstance(module, torch.nn.Linear):
+ return False
+
+ name_lower = name.lower()
+
+ # Skip LoRA adapter layers
+ name_parts = name.split(".")
+ is_lora_layer = any(
+ part.lower().startswith("lora_") or part in ("lora_A", "lora_B")
+ for part in name_parts
+ )
+ if is_lora_layer:
+ return False
+
+ # Skip embedding, output, and input projection layers
+ skip_patterns = [
+ "embed",
+ "lm_head",
+ "output_proj",
+ "final",
+ "norm",
+ "ln_",
+ "layernorm",
+ "patchify",
+ "caption_projection",
+ ]
+ for pattern in skip_patterns:
+ if pattern in name_lower:
+ return False
+
+ # Include attention and MLP layers
+ include_patterns = [
+ "attn",
+ "attention",
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "out_proj",
+ "qkv",
+ "mlp",
+ "ffn",
+ "fc1",
+ "fc2",
+ "gate",
+ "up_proj",
+ "down_proj",
+ "dense",
+ "linear",
+ "proj",
+ ]
+
+ for pattern in include_patterns:
+ if pattern in name_lower:
+ return True
+
+ # Default: quantize layers inside transformer blocks
+ block_patterns = ["block", "layer", "transformer"]
+ for pattern in block_patterns:
+ if pattern in name_lower:
+ return True
+
+ return False
+
+
+def quantize_model_nvfp4(
+ model: torch.nn.Module,
+ layer_filter: Callable[[str, torch.nn.Module], bool] | None = None,
+ streaming: bool = False,
+ target_device: torch.device | None = None,
+) -> None:
+ """Quantize Linear layers in a model to NVFP4 in-place.
+
+ Replaces nn.Linear layers with NVFP4Linear layers for ~4x weight memory
+ reduction and hardware-accelerated matmul on Blackwell GPUs.
+
+ Args:
+ model: PyTorch model to quantize
+ layer_filter: Optional function (name, module) -> bool to filter layers.
+ If None, uses _default_layer_filter.
+ streaming: If True, use streaming mode for low-VRAM GPUs.
+ target_device: Target device for quantization (only used in streaming mode).
+ """
+ if layer_filter is None:
+ layer_filter = _default_layer_filter
+
+ # Only store layer names to avoid keeping module references alive
+ layer_names_to_replace: list[str] = []
+ skipped_lora: list[str] = []
+
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ name_parts = name.split(".")
+ is_lora_layer = any(
+ part.startswith("lora_") or part in ("lora_A", "lora_B")
+ for part in name_parts
+ )
+
+ if is_lora_layer:
+ skipped_lora.append(name)
+ continue
+
+ if layer_filter(name, module):
+ layer_names_to_replace.append(name)
+
+ if skipped_lora:
+ logger.info(f"Skipped {len(skipped_lora)} LoRA adapter layers")
+
+ num_layers = len(layer_names_to_replace)
+ logger.info(f"Quantizing {num_layers} Linear layers to NVFP4")
+
+ if streaming and target_device is None:
+ target_device = (
+ torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ )
+
+ if streaming:
+ logger.info(
+ f"Using streaming quantization mode (target device: {target_device})"
+ )
+
+ # Log memory before quantization
+ mem_before = 0.0
+ if torch.cuda.is_available():
+ gc.collect()
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ mem_before = torch.cuda.memory_allocated() / 1024**3
+ logger.info(f"GPU memory before NVFP4 quantization: {mem_before:.2f} GB")
+
+ for i, name in enumerate(layer_names_to_replace):
+ parts = name.split(".")
+ parent = model
+ for part in parts[:-1]:
+ parent = getattr(parent, part)
+
+ module = getattr(parent, parts[-1])
+
+ if not isinstance(module, torch.nn.Linear):
+ continue
+
+ if streaming:
+ original_device = module.weight.device
+ if original_device != target_device:
+ module = module.to(target_device)
+ setattr(parent, parts[-1], module)
+
+ nvfp4_module = NVFP4Linear.from_linear(module)
+ nvfp4_module = nvfp4_module.to("cpu")
+ setattr(parent, parts[-1], nvfp4_module)
+
+ if module.weight is not None:
+ module.weight.data = torch.empty(0, device="cpu", dtype=torch.float32)
+ if module.bias is not None:
+ module.bias.data = torch.empty(0, device="cpu", dtype=torch.float32)
+ del module
+
+ gc.collect()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ else:
+ nvfp4_module = NVFP4Linear.from_linear(module)
+ setattr(parent, parts[-1], nvfp4_module)
+
+ if module.weight is not None:
+ module.weight.data = torch.empty(0, device="cpu", dtype=torch.float32)
+ if module.bias is not None:
+ module.bias.data = torch.empty(0, device="cpu", dtype=torch.float32)
+ del module
+
+ if (i + 1) % 25 == 0:
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ if (i + 1) % 100 == 0 or (streaming and (i + 1) % 50 == 0):
+ if torch.cuda.is_available():
+ current_mem = torch.cuda.memory_allocated() / 1024**3
+ logger.info(
+ f"Quantized {i + 1}/{num_layers} layers, "
+ f"GPU memory: {current_mem:.2f} GB"
+ )
+
+ gc.collect()
+ torch.cuda.empty_cache()
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ mem_after = torch.cuda.memory_allocated() / 1024**3
+ mem_saved = mem_before - mem_after
+ if mem_saved > 0:
+ logger.info(f"NVFP4 quantization saved {mem_saved:.2f} GB GPU memory")
+
+
+# ============================================================================
+# Unified Quantization API
+# ============================================================================
+
+
+def apply_quantization(
+ model: torch.nn.Module,
+ quantization: Quantization | None,
+ device: torch.device | str,
+ dtype: torch.dtype,
+) -> torch.nn.Module:
+ """Apply quantization to a model and move it to the target device.
+
+ This is the shared entry point used by all pipelines that support quantization.
+ It handles both FP8 and NVFP4 quantization methods, falling back to a simple
+ device/dtype cast when quantization is None.
+
+ Args:
+ model: The model to quantize (typically the diffusion generator)
+ quantization: Quantization method to apply, or None for no quantization
+ device: Target device
+ dtype: Target dtype (typically torch.bfloat16)
+
+ Returns:
+ The quantized model on the target device
+ """
+ if quantization == Quantization.FP8_E4M3FN:
+ # Cast before quantization
+ model = model.to(dtype=dtype)
+
+ start = time.time()
+
+ from torchao.quantization.quant_api import (
+ Float8DynamicActivationFloat8WeightConfig,
+ PerTensor,
+ quantize_,
+ )
+
+ quantize_(
+ model,
+ Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
+ device=device,
+ )
+
+ print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
+
+ elif quantization == Quantization.NVFP4:
+ supported, reason = check_nvfp4_support()
+ if not supported:
+ raise RuntimeError(f"NVFP4 quantization not supported: {reason}")
+
+ # Cast to dtype first, then move to device
+ model = model.to(dtype=dtype, device=device)
+
+ start = time.time()
+ quantize_model_nvfp4(model, layer_filter=_default_layer_filter)
+ print(f"Quantized diffusion model to nvfp4 in {time.time() - start:.3f}s")
+
+ else:
+ model = model.to(device=device, dtype=dtype)
+
+ return model
+
+
+def apply_quantization_to_module(
+ module: torch.nn.Module,
+ quantization: Quantization | None,
+ device: torch.device | str,
+ dtype: torch.dtype,
+) -> None:
+ """Apply quantization to a specific module (e.g., VACE components).
+
+ Unlike apply_quantization, this operates on sub-modules that are
+ already on the correct device and doesn't return the module.
+
+ Args:
+ module: The module to quantize
+ quantization: Quantization method to apply
+ device: Target device
+ dtype: Target dtype
+ """
+ if quantization is None:
+ return
+
+ if quantization == Quantization.FP8_E4M3FN:
+ from torchao.quantization.quant_api import (
+ Float8DynamicActivationFloat8WeightConfig,
+ PerTensor,
+ quantize_,
+ )
+
+ quantize_(
+ module,
+ Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
+ device=device,
+ )
+
+ elif quantization == Quantization.NVFP4:
+ supported, reason = check_nvfp4_support()
+ if not supported:
+ logger.warning(f"NVFP4 not supported for sub-module, skipping: {reason}")
+ return
+
+ quantize_model_nvfp4(module, layer_filter=_default_layer_filter)
diff --git a/src/scope/core/pipelines/reward_forcing/pipeline.py b/src/scope/core/pipelines/reward_forcing/pipeline.py
index f36263f55..59983028d 100644
--- a/src/scope/core/pipelines/reward_forcing/pipeline.py
+++ b/src/scope/core/pipelines/reward_forcing/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -84,29 +85,7 @@ def __init__(
# Initialize any additional, user-configured LoRA adapters via shared manager.
generator.model = self._init_loras(config, generator.model)
- if quantization == Quantization.FP8_E4M3FN:
- # Cast before optional quantization
- generator = generator.to(dtype=dtype)
-
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- # Move to target device during quantization
- # Defaults to using fp8_e4m3fn for both weights and activations
- quantize_(
- generator,
- Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
- device=device,
- )
-
- print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
- generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)
start = time.time()
text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/streamdiffusionv2/pipeline.py b/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
index 5c4bfb46b..9d1e106a1 100644
--- a/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
+++ b/src/scope/core/pipelines/streamdiffusionv2/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -87,29 +88,7 @@ def __init__(
# Initialize optional LoRA adapters on the underlying model.
generator.model = self._init_loras(config, generator.model)
- if quantization == Quantization.FP8_E4M3FN:
- # Cast before optional quantization
- generator = generator.to(dtype=dtype)
-
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- # Move to target device during quantization
- # Defaults to using fp8_e4m3fn for both weights and activations
- quantize_(
- generator,
- Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
- device=device,
- )
-
- print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
- generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)
start = time.time()
text_encoder = WanTextEncoderWrapper(
diff --git a/src/scope/core/pipelines/utils.py b/src/scope/core/pipelines/utils.py
index c05e5bd01..3d2f022be 100644
--- a/src/scope/core/pipelines/utils.py
+++ b/src/scope/core/pipelines/utils.py
@@ -10,6 +10,11 @@
from .enums import Quantization as Quantization # noqa: PLC0414
from .enums import VaeType as VaeType # noqa: PLC0414
+# Re-export quantization utilities
+from .quantization_utils import (
+ apply_quantization as apply_quantization, # noqa: PLC0414
+)
+
def load_state_dict(weights_path: str) -> dict:
"""Load weights with automatic format detection."""
diff --git a/src/scope/core/pipelines/wan2_1/vace/mixin.py b/src/scope/core/pipelines/wan2_1/vace/mixin.py
index d984ba023..d080a1ce3 100644
--- a/src/scope/core/pipelines/wan2_1/vace/mixin.py
+++ b/src/scope/core/pipelines/wan2_1/vace/mixin.py
@@ -144,42 +144,33 @@ def _init_vace(
# Quantize VACE components if quantization is enabled
if quantization is not None:
- # Import here to avoid circular dependency
try:
- from ...utils import Quantization
-
- if quantization == Quantization.FP8_E4M3FN:
- logger.info(
- "_init_vace: Quantizing VACE components to FP8 (matching base model)..."
- )
- start = time.time()
-
- from torchao.quantization.quant_api import (
- Float8DynamicActivationFloat8WeightConfig,
- PerTensor,
- quantize_,
- )
-
- quantize_(
- vace_wrapped_model.vace_patch_embedding,
- Float8DynamicActivationFloat8WeightConfig(
- granularity=PerTensor()
- ),
- device=device,
- )
- quantize_(
- vace_wrapped_model.vace_blocks,
- Float8DynamicActivationFloat8WeightConfig(
- granularity=PerTensor()
- ),
- device=device,
- )
- logger.info(
- f"_init_vace: Quantized VACE to FP8 in {time.time() - start:.3f}s"
- )
+ from ...quantization_utils import apply_quantization_to_module
+
+ logger.info(
+ f"_init_vace: Quantizing VACE components with {quantization}..."
+ )
+ start = time.time()
+
+ apply_quantization_to_module(
+ vace_wrapped_model.vace_patch_embedding,
+ quantization,
+ device,
+ dtype,
+ )
+ apply_quantization_to_module(
+ vace_wrapped_model.vace_blocks,
+ quantization,
+ device,
+ dtype,
+ )
+
+ logger.info(
+ f"_init_vace: Quantized VACE components in {time.time() - start:.3f}s"
+ )
except ImportError:
logger.warning(
- "_init_vace: Could not import Quantization, skipping quantization check"
+ "_init_vace: Could not import quantization_utils, skipping quantization"
)
self.vace_enabled = True
diff --git a/src/scope/server/app.py b/src/scope/server/app.py
index f14769ddc..bb2a61de5 100644
--- a/src/scope/server/app.py
+++ b/src/scope/server/app.py
@@ -1152,15 +1152,21 @@ async def get_hardware_info(
import torch # Lazy import to avoid loading at CLI startup
vram_gb = None
+ supports_nvfp4 = False
if torch.cuda.is_available():
# Get total VRAM from the first GPU (in bytes), convert to GB
_, total_mem = torch.cuda.mem_get_info(0)
vram_gb = total_mem / (1024**3)
+ # Blackwell GPUs (SM >= 10.0) support NVFP4 quantization
+ cap = torch.cuda.get_device_capability()
+ supports_nvfp4 = cap >= (10, 0)
+
return HardwareInfoResponse(
vram_gb=vram_gb,
spout_available=is_spout_available(),
+ supports_nvfp4=supports_nvfp4,
)
except HTTPException:
raise
diff --git a/src/scope/server/cloud_proxy.py b/src/scope/server/cloud_proxy.py
index 2e1f4d623..90b085fc9 100644
--- a/src/scope/server/cloud_proxy.py
+++ b/src/scope/server/cloud_proxy.py
@@ -130,6 +130,7 @@ async def get_hardware_info_from_cloud(
return HardwareInfoResponse(
vram_gb=data.get("vram_gb"),
spout_available=spout_available,
+ supports_nvfp4=data.get("supports_nvfp4", False),
)
diff --git a/src/scope/server/schema.py b/src/scope/server/schema.py
index e0b9a2a91..3950d2217 100644
--- a/src/scope/server/schema.py
+++ b/src/scope/server/schema.py
@@ -251,6 +251,10 @@ class HardwareInfoResponse(BaseModel):
default=False,
description="Whether Spout is available (Windows only, not WSL)",
)
+ supports_nvfp4: bool = Field(
+ default=False,
+ description="Whether GPU supports NVFP4 quantization (Blackwell SM >= 10.0)",
+ )
class PipelineStatusEnum(str, Enum):
diff --git a/uv.lock b/uv.lock
index cc6638621..5b4470e16 100644
--- a/uv.lock
+++ b/uv.lock
@@ -462,6 +462,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
+[[package]]
+name = "comfy-kitchen"
+version = "0.2.7"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/44/e1/9b6e7764f8dcd5cb9b9ae369e55660bf24b7f48825584521246e3bddf43e/comfy_kitchen-0.2.7-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a168eb1fcdbb31707bb0e1226c6d44e1bd1b0a5ac1ac0a4d9c6eb7296b903ae", size = 680629, upload-time = "2026-01-17T03:48:13.922Z" },
+ { url = "https://files.pythonhosted.org/packages/b5/6b/1cea270d5014a465929375c434c2f78a35fadde5dfb6f436864e4c8f7a52/comfy_kitchen-0.2.7-cp312-abi3-win_amd64.whl", hash = "sha256:047b9ac7c8c1a845a51b0de3fb05c8d007666d68a3e776e07ecb5db21f15fbdd", size = 592877, upload-time = "2026-01-17T03:48:15.262Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/65/d483613734d0b9753bd9bfa297ff334cb2c7766e82306099db6b259b4e2c/comfy_kitchen-0.2.7-py3-none-any.whl", hash = "sha256:f8faa579b69d331d2f1eac09e96a95586c2a6b958a54bc19e7f1c1a77852dd36", size = 58034, upload-time = "2026-01-17T03:48:16.561Z" },
+]
+
+[package.optional-dependencies]
+cublas = [
+ { name = "nvidia-cublas", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
+
[[package]]
name = "cryptography"
version = "46.0.3"
@@ -527,6 +542,7 @@ dependencies = [
{ name = "aiohttp" },
{ name = "aiortc" },
{ name = "click" },
+ { name = "comfy-kitchen", extra = ["cublas"], marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "diffusers" },
{ name = "easydict" },
{ name = "einops" },
@@ -549,6 +565,9 @@ dependencies = [
{ name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "torchao" },
+ { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
+ { name = "torchaudio", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
+ { name = "torchaudio", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
{ name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
{ name = "torchvision", version = "0.24.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torchvision", version = "0.24.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
@@ -582,6 +601,7 @@ requires-dist = [
{ name = "aiokafka", marker = "extra == 'kafka'", specifier = ">=0.10.0" },
{ name = "aiortc", specifier = ">=1.13.0" },
{ name = "click", specifier = ">=8.3.1" },
+ { name = "comfy-kitchen", extras = ["cublas"], marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = ">=0.1.0" },
{ name = "diffusers", specifier = ">=0.31.0" },
{ name = "easydict", specifier = ">=1.13" },
{ name = "einops", specifier = ">=0.8.1" },
@@ -604,6 +624,8 @@ requires-dist = [
{ name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==2.9.1" },
{ name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128" },
{ name = "torchao", specifier = "==0.15.0" },
+ { name = "torchaudio", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==2.9.1" },
+ { name = "torchaudio", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128" },
{ name = "torchvision", marker = "sys_platform != 'linux' and sys_platform != 'win32'", specifier = "==0.24.1" },
{ name = "torchvision", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu128" },
{ name = "transformers", specifier = ">=4.49.0" },
@@ -1505,6 +1527,16 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ad/0d/eca3d962f9eef265f01a8e0d20085c6dd1f443cbffc11b6dede81fd82356/numpy-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:6436cffb4f2bf26c974344439439c95e152c9a527013f26b3577be6c2ca64295", size = 10667121, upload-time = "2026-01-10T06:44:41.644Z" },
]
+[[package]]
+name = "nvidia-cublas"
+version = "13.2.1.1"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9d/36/0124129e1378e9834e0cbe19781fbe0ffd5f870c2af6f01cdf17a9869c39/nvidia_cublas-13.2.1.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8b4a4cd8b73772fde9ccaa1f3967eb001ae5fde8b1dc37f7442d072b64d6f5da", size = 502470979, upload-time = "2026-01-13T22:39:37.619Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/e7/39e43c0688f9788c88da0b91ea18125448c5f515104aadf65a70243f144f/nvidia_cublas-13.2.1.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8c13c93cf8be4480b4909905c96d2d31575b4af43fcd3af0e84af94762665e4f", size = 401085577, upload-time = "2026-01-13T22:40:18.702Z" },
+ { url = "https://files.pythonhosted.org/packages/8b/9b/d9788b63872c6e4ce0fb292f2000642e73a8ae4da2d6f6b33759b77059af/nvidia_cublas-13.2.1.1-py3-none-win_amd64.whl", hash = "sha256:bc94f0597c21cfd6fea9446b18309b2351630ff227bb4e8575196494fb51c6b6", size = 385519499, upload-time = "2026-01-13T22:57:28.499Z" },
+]
+
[[package]]
name = "nvidia-cublas-cu12"
version = "12.8.4.1"
@@ -2558,6 +2590,66 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/3b/6b9d5618720f63dbc2e2509cd6b57aae9c0d61b738d1d2172f4d5d9efaab/torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c", size = 1080679, upload-time = "2025-12-18T23:14:43.807Z" },
]
+[[package]]
+name = "torchaudio"
+version = "2.9.1"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+ "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'",
+]
+dependencies = [
+ { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" },
+]
+wheels = [
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:59f70f6aa6a7e77a1fd51756d7d25fec22bead0b50ce7bed4ede75a5fa6b21d1" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ea6fe3b9525493df0a5eb5eed5c22925065b5830f6999980ed76bb36c4592d34" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cfd604a617a245d26a5381ad7d669047ac1c152896227d8a006aad12151f8" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:bb790f968539f6b4115e637dbf3aab71f3f92e41f0c12e6bc7d52324f0051113" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:7d8da8816dfa25869da206eb4cdddb11603b042ab59326c6466a65b6e64d2684" },
+]
+
+[[package]]
+name = "torchaudio"
+version = "2.9.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "sys_platform != 'linux' and sys_platform != 'win32'",
+]
+dependencies = [
+ { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f1/83/71cbadd7b66753818b5775f2088bad4f721d581de276996df4968000a626/torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7581ef170794c599aed55918e00d0acd9e5c9a0f19400c9a9a840955180365c5", size = 808098, upload-time = "2025-11-12T15:26:01.408Z" },
+ { url = "https://files.pythonhosted.org/packages/c0/1b/3321ad6379ac2d968064704e8d015c31ccae5d1ece070f87fb44b17d90e6/torchaudio-2.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bb69557484c92513a980027ec4cb314b0f43cf4442bbfd97440e66528dbad22d", size = 808136, upload-time = "2025-11-12T15:26:00.276Z" },
+ { url = "https://files.pythonhosted.org/packages/0c/58/e82d8b5f447abdddc950965f1395f36baef3602643dd069100c6369ba73e/torchaudio-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9290f6a6409deb1f9113d5aef97ec646eeee6410b6bcc57ab8b57066b54da7c1", size = 813456, upload-time = "2025-11-12T15:26:13.963Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/38/0dabf362f946ab5773d3db3322718d652d70ad12a82f500d54c6c8b9cc88/torchaudio-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:69a582650279ee16ff9087f99b4234fe5d766e1bf7f0be352db5f46991854c1e", size = 810496, upload-time = "2025-11-12T15:26:11.515Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/f6/237e00a04dea497a40a8567d024dfb39193abec3ca3695ad51919ad633d1/torchaudio-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e13cb38971ac259fc4e102282a3e48f6df5f0ab00eb785ca5155e3392d1e86f1", size = 813463, upload-time = "2025-11-12T15:26:16.261Z" },
+]
+
+[[package]]
+name = "torchaudio"
+version = "2.9.1+cu128"
+source = { registry = "https://download.pytorch.org/whl/cu128" }
+resolution-markers = [
+ "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux')",
+ "sys_platform == 'win32'",
+]
+dependencies = [
+ { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.15' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or sys_platform == 'win32'" },
+]
+wheels = [
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:54eb19e634b8c567886a1b53b4184506d943c3ba5139198e9fe1b941bc566f30" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:88896c7bfa486102439fab6c85ac834176617e9c06eb0be9074c07ee1183b47d" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:52297b7dfb7c42e311385572bc9c0186e602ea1a5f20c42923765baea99aff83" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:abb0ee5a40c883ad17d90cdd45965c06deef42a0a2ffc58c51e32729642292f0" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ddc7410908858693d3b81346f53b5e5e51f987b3b7128978be6c774314377204" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:9d29dc3a2e0c43da66d33bcb9e22ad58c58f0ae1b6dcfe2d8d94bda279ddcf89" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:12c3e3d3aaf856d679328a5a9d46d866bc88b4c5290f2128f306abff975fa51e" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:eb6c714557c8d47f4fc65ec58b14a21cb4150940c242fe77e7517636c20ed3c3" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:6d9f5d53861b2fc057c1dd5051721f60b2253176c44a856d0f19100e312add3f" },
+ { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:150a8d7d51df9f667b5386cff5850f685b6059c59db51e056d7157955aad9e75" },
+]
+
[[package]]
name = "torchvision"
version = "0.24.1"