16 changes: 10 additions & 6 deletions frontend/src/components/ComplexFields.tsx
@@ -49,7 +49,7 @@ export interface SchemaComplexFieldContext {
vaceUseInputVideo?: boolean;
onVaceUseInputVideoChange?: (enabled: boolean) => void;
vaceContextScaleSlider?: SliderState;
- quantization?: "fp8_e4m3fn" | null;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
loras?: LoRAConfig[];
onLorasChange?: (loras: LoRAConfig[]) => void;
loraMergeStrategy?: LoraMergeStrategy;
@@ -63,10 +63,11 @@ export interface SchemaComplexFieldContext {
noiseScaleSlider?: SliderState;
noiseController?: boolean;
onNoiseControllerChange?: (enabled: boolean) => void;
- onQuantizationChange?: (q: "fp8_e4m3fn" | null) => void;
+ onQuantizationChange?: (q: "fp8_e4m3fn" | "nvfp4" | null) => void;
inputMode?: "text" | "video";
supportsNoiseControls?: boolean;
supportsQuantization?: boolean;
+ supportsNvfp4?: boolean;
supportsCacheManagement?: boolean;
supportsKvCacheBias?: boolean;
isStreaming?: boolean;
@@ -156,7 +157,7 @@ export function SchemaComplexField({
<div className="flex items-start gap-1.5 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
<Info className="h-3.5 w-3.5 mt-0.5 shrink-0 text-amber-600 dark:text-amber-500" />
<p className="text-xs text-amber-600 dark:text-amber-500">
- VACE is incompatible with FP8 quantization. Please disable
+ VACE is incompatible with quantization. Please disable
quantization to use VACE.
</p>
</div>
@@ -482,27 +483,30 @@ export function SchemaComplexField({
value={ctx.quantization ?? "none"}
onValueChange={v =>
ctx.onQuantizationChange?.(
v === "none" ? null : (v as "fp8_e4m3fn")
v === "none" ? null : (v as "fp8_e4m3fn" | "nvfp4")
)
}
disabled={
(ctx.isStreaming ?? false) || (ctx.vaceEnabled ?? false)
}
>
<SelectTrigger className="w-[140px] h-7">
<SelectTrigger className="w-[180px] h-7">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="none">None</SelectItem>
<SelectItem value="fp8_e4m3fn">
fp8_e4m3fn (Dynamic)
</SelectItem>
+ {ctx.supportsNvfp4 && (
+   <SelectItem value="nvfp4">nvfp4 (Blackwell)</SelectItem>
+ )}
</SelectContent>
</Select>
</div>
{ctx.vaceEnabled && (
<p className="text-xs text-muted-foreground">
- Disabled because VACE is enabled. Disable VACE to use FP8
+ Disabled because VACE is enabled. Disable VACE to use
quantization.
</p>
)}
25 changes: 18 additions & 7 deletions frontend/src/components/SettingsPanel.tsx
@@ -75,8 +75,8 @@ interface SettingsPanelProps {
onNoiseControllerChange?: (enabled: boolean) => void;
manageCache?: boolean;
onManageCacheChange?: (enabled: boolean) => void;
- quantization?: "fp8_e4m3fn" | null;
- onQuantizationChange?: (quantization: "fp8_e4m3fn" | null) => void;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
+ onQuantizationChange?: (quantization: "fp8_e4m3fn" | "nvfp4" | null) => void;
kvCacheAttentionBias?: number;
onKvCacheAttentionBiasChange?: (bias: number) => void;
onResetCache?: () => void;
@@ -90,6 +90,8 @@ interface SettingsPanelProps {
// Spout settings
spoutSender?: SettingsState["spoutSender"];
onSpoutSenderChange?: (spoutSender: SettingsState["spoutSender"]) => void;
+ // Whether GPU supports NVFP4 quantization (Blackwell SM >= 10.0)
+ supportsNvfp4?: boolean;
// Whether Spout is available (server-side detection for native Windows, not WSL)
spoutAvailable?: boolean;
// VACE settings
@@ -143,6 +145,7 @@ export function SettingsPanel({
loraMergeStrategy = "permanent_merge",
inputMode,
supportsNoiseControls = false,
+ supportsNvfp4 = false,
spoutSender,
onSpoutSenderChange,
spoutAvailable = false,
@@ -525,6 +528,7 @@ export function SettingsPanel({
supportsNoiseControls,
supportsQuantization:
pipelines?.[pipelineId]?.supportsQuantization,
+ supportsNvfp4,
supportsCacheManagement:
pipelines?.[pipelineId]?.supportsCacheManagement,
supportsKvCacheBias: pipelines?.[pipelineId]?.supportsKvCacheBias,
@@ -617,8 +621,8 @@ export function SettingsPanel({
<div className="flex items-start gap-1.5 p-2 rounded-md bg-amber-500/10 border border-amber-500/20">
<Info className="h-3.5 w-3.5 mt-0.5 shrink-0 text-amber-600 dark:text-amber-500" />
<p className="text-xs text-amber-600 dark:text-amber-500">
- VACE is incompatible with FP8 quantization. Please
- disable quantization to use VACE.
+ VACE is incompatible with quantization. Please disable
+ quantization to use VACE.
</p>
</div>
)}
@@ -951,27 +955,34 @@ export function SettingsPanel({
value={quantization || "none"}
onValueChange={value => {
onQuantizationChange?.(
value === "none" ? null : (value as "fp8_e4m3fn")
value === "none"
? null
: (value as "fp8_e4m3fn" | "nvfp4")
);
}}
disabled={isStreaming || vaceEnabled}
>
<SelectTrigger className="w-[140px] h-7">
<SelectTrigger className="w-[180px] h-7">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="none">None</SelectItem>
<SelectItem value="fp8_e4m3fn">
fp8_e4m3fn (Dynamic)
</SelectItem>
+ {supportsNvfp4 && (
+   <SelectItem value="nvfp4">
+     nvfp4 (Blackwell)
+   </SelectItem>
+ )}
</SelectContent>
</Select>
</div>
{/* Note when quantization is disabled due to VACE */}
{vaceEnabled && (
<p className="text-xs text-muted-foreground">
Disabled because VACE is enabled. Disable VACE to use
- FP8 quantization.
+ quantization.
</p>
)}
</div>
18 changes: 16 additions & 2 deletions frontend/src/hooks/useStreamState.ts
@@ -38,7 +38,7 @@ function getFallbackDefaults(mode?: InputMode) {
noiseController: isVideoMode ? true : undefined,
defaultTemporalInterpolationSteps: undefined as number | undefined,
inputMode: effectiveMode,
- quantization: undefined as "fp8_e4m3fn" | undefined,
+ quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
};
}

@@ -125,7 +125,7 @@ export function useStreamState() {
noiseController,
defaultTemporalInterpolationSteps,
inputMode: effectiveMode,
- quantization: undefined as "fp8_e4m3fn" | undefined,
+ quantization: undefined as "fp8_e4m3fn" | "nvfp4" | undefined,
};
}
// Fallback to derived defaults if schemas not loaded
@@ -334,6 +334,20 @@ export function useStreamState() {
}
}, [settings.pipelineId, hardwareInfo, pipelineSchemas]);

+ // Reset nvfp4 selection if GPU doesn't support it (e.g. from persisted state)
+ useEffect(() => {
+   if (
+     hardwareInfo &&
+     !hardwareInfo.supports_nvfp4 &&
+     settings.quantization === "nvfp4"
+   ) {
+     setSettings(prev => ({
+       ...prev,
+       quantization: "fp8_e4m3fn",
+     }));
+   }
+ }, [hardwareInfo, settings.quantization]);

// Set recommended VACE enabled state based on pipeline schema and available VRAM
// VACE is enabled by default, but disabled if VRAM is below recommended_quantization_vram_threshold
useEffect(() => {
1 change: 1 addition & 0 deletions frontend/src/lib/api.ts
@@ -206,6 +206,7 @@ export const downloadPipelineModels = async (
export interface HardwareInfoResponse {
vram_gb: number | null;
spout_available: boolean;
+ supports_nvfp4: boolean;
}

export const getHardwareInfo = async (): Promise<HardwareInfoResponse> => {
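Note: the server-side detection that fills supports_nvfp4 is not part of this diff. Based on the "Blackwell SM >= 10.0" comment added in SettingsPanel.tsx, a minimal sketch of what the hardware-info endpoint might check (the helper name and its placement are assumptions):

    import torch

    def detect_supports_nvfp4() -> bool:
        # Hypothetical helper: NVFP4 kernels target Blackwell-class GPUs,
        # i.e. CUDA compute capability (SM) 10.0 or newer.
        if not torch.cuda.is_available():
            return False
        major, minor = torch.cuda.get_device_capability()
        return (major, minor) >= (10, 0)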
6 changes: 5 additions & 1 deletion frontend/src/pages/StreamPage.tsx
@@ -104,6 +104,7 @@ export function StreamPage() {
getDefaults,
supportsNoiseControls,
spoutAvailable,
+ hardwareInfo,
refreshPipelineSchemas,
refreshHardwareInfo,
} = useStreamState();
@@ -588,7 +589,9 @@ export function StreamPage() {
});
};

- const handleQuantizationChange = (quantization: "fp8_e4m3fn" | null) => {
+ const handleQuantizationChange = (
+   quantization: "fp8_e4m3fn" | "nvfp4" | null
+ ) => {
updateSettings({ quantization });
// Note: This setting requires pipeline reload, so we don't send parameter update here
};
@@ -1532,6 +1535,7 @@ export function StreamPage() {
loraMergeStrategy={settings.loraMergeStrategy ?? "permanent_merge"}
inputMode={settings.inputMode}
supportsNoiseControls={supportsNoiseControls(settings.pipelineId)}
+ supportsNvfp4={hardwareInfo?.supports_nvfp4 ?? false}
spoutSender={settings.spoutSender}
onSpoutSenderChange={handleSpoutSenderChange}
spoutAvailable={spoutAvailable}
2 changes: 1 addition & 1 deletion frontend/src/types/index.ts
@@ -55,7 +55,7 @@ export interface SettingsState {
noiseScale?: number;
noiseController?: boolean;
manageCache?: boolean;
- quantization?: "fp8_e4m3fn" | null;
+ quantization?: "fp8_e4m3fn" | "nvfp4" | null;
kvCacheAttentionBias?: number;
paused?: boolean;
loras?: LoRAConfig[];
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
"uvicorn>=0.35.0",
"torch==2.9.1",
"torchvision==0.24.1",
"torchaudio==2.9.1",
"easydict>=1.13",
"diffusers>=0.31.0",
"ftfy>=6.3.1",
@@ -57,6 +58,7 @@ dependencies = [
"triton-windows==3.5.1.post24; sys_platform == 'win32'",
"SpoutGL>=0.1.1; sys_platform == 'win32'",
"PyOpenGL>=3.1.10; sys_platform == 'win32'",
"comfy-kitchen[cublas]>=0.1.0; sys_platform == 'linux' or sys_platform == 'win32'",
]

[project.optional-dependencies]
@@ -95,6 +97,9 @@ torch = [
torchvision = [
{ index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
+ torchaudio = [
+   { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+ ]
flash-attn = [
# Prebuilt Linux wheels from https://github.com/Dao-AILab/flash-attention
{ url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux'" },
1 change: 1 addition & 0 deletions src/scope/core/pipelines/enums.py
@@ -12,6 +12,7 @@ class Quantization(str, Enum):
"""Quantization method enumeration."""

FP8_E4M3FN = "fp8_e4m3fn"
NVFP4 = "nvfp4"


class VaeType(str, Enum):
25 changes: 2 additions & 23 deletions src/scope/core/pipelines/krea_realtime_video/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+ from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -111,29 +112,7 @@ def __init__(
# Initialize optional LoRA adapters on the underlying model AFTER VACE.
generator.model = self._init_loras(config, generator.model)

- if quantization == Quantization.FP8_E4M3FN:
-     # Cast before optional quantization
-     generator = generator.to(dtype=dtype)
-
-     start = time.time()
-
-     from torchao.quantization.quant_api import (
-         Float8DynamicActivationFloat8WeightConfig,
-         PerTensor,
-         quantize_,
-     )
-
-     # Move to target device during quantization
-     # Defaults to using fp8_e4m3fn for both weights and activations
-     quantize_(
-         generator,
-         Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-         device=device,
-     )
-
-     print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
-     generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)

if compile:
# Only compile the attention blocks
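This diff and the two pipeline diffs that follow collapse the same inline torchao FP8 branch into a shared apply_quantization helper. The new quantization_utils module itself is not shown in this PR; a minimal sketch consistent with the removed inline code might look like the following. The nvfp4 branch is only a placeholder: the diff shows that the comfy-kitchen[cublas] dependency was added, but not how it is invoked, so no API is assumed for it.

    import time

    from .utils import Quantization


    def apply_quantization(generator, quantization, device, dtype):
        """Quantize the generator if requested, otherwise just move it to the device."""
        if quantization == Quantization.FP8_E4M3FN:
            # Cast before optional quantization (as in the removed inline code)
            generator = generator.to(dtype=dtype)

            start = time.time()

            from torchao.quantization.quant_api import (
                Float8DynamicActivationFloat8WeightConfig,
                PerTensor,
                quantize_,
            )

            # Moves to the target device during quantization; defaults to
            # fp8_e4m3fn for both weights and activations.
            quantize_(
                generator,
                Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
                device=device,
            )

            print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
        elif quantization == Quantization.NVFP4:
            # Placeholder: the real nvfp4 path presumably dispatches to the
            # comfy-kitchen kernels added in pyproject.toml, but that code is
            # not visible in this diff.
            raise NotImplementedError("nvfp4 quantization path not shown in this diff")
        else:
            generator = generator.to(device=device, dtype=dtype)

        return generator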
25 changes: 2 additions & 23 deletions src/scope/core/pipelines/longlive/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+ from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
# This is additive and does not replace the original LongLive performance LoRA.
generator.model = self._init_loras(config, generator.model)

- if quantization == Quantization.FP8_E4M3FN:
-     # Cast before optional quantization
-     generator = generator.to(dtype=dtype)
-
-     start = time.time()
-
-     from torchao.quantization.quant_api import (
-         Float8DynamicActivationFloat8WeightConfig,
-         PerTensor,
-         quantize_,
-     )
-
-     # Move to target device during quantization
-     # Defaults to using fp8_e4m3fn for both weights and activations
-     quantize_(
-         generator,
-         Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-         device=device,
-     )
-
-     print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
-     generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)

start = time.time()
text_encoder = WanTextEncoderWrapper(
25 changes: 2 additions & 23 deletions src/scope/core/pipelines/memflow/pipeline.py
@@ -15,6 +15,7 @@
)
from ..interface import Pipeline, Requirements
from ..process import postprocess_chunk
+ from ..quantization_utils import apply_quantization
from ..utils import Quantization, load_model_config, validate_resolution
from ..wan2_1.components import WanDiffusionWrapper, WanTextEncoderWrapper
from ..wan2_1.lora.mixin import LoRAEnabledPipeline
@@ -110,29 +111,7 @@ def __init__(
# This is additive and does not replace the original MemFlow performance LoRA.
generator.model = self._init_loras(config, generator.model)

- if quantization == Quantization.FP8_E4M3FN:
-     # Cast before optional quantization
-     generator = generator.to(dtype=dtype)
-
-     start = time.time()
-
-     from torchao.quantization.quant_api import (
-         Float8DynamicActivationFloat8WeightConfig,
-         PerTensor,
-         quantize_,
-     )
-
-     # Move to target device during quantization
-     # Defaults to using fp8_e4m3fn for both weights and activations
-     quantize_(
-         generator,
-         Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
-         device=device,
-     )
-
-     print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")
- else:
-     generator = generator.to(device=device, dtype=dtype)
+ generator = apply_quantization(generator, quantization, device, dtype)

start = time.time()
text_encoder = WanTextEncoderWrapper(