From 91443b1a9db76225096169fa900a201f00e6e303 Mon Sep 17 00:00:00 2001 From: Zhiyao Cen <2523403608@qq.com> Date: Wed, 1 Apr 2026 21:30:43 +0800 Subject: [PATCH 1/5] [test] add dedicated perf test suite with entrypoint matrix Split perf benchmarks into tests/perf_tests with shared benchmarking helpers and add class/instance/function/method coverage plus torch-compile mode checks across MLP, norm-residual fusion, and pointwise chains. Made-with: Cursor --- tests/model_definition.py | 32 ++- tests/perf_tests/__init__.py | 101 +++++++++ tests/perf_tests/conftest.py | 13 ++ tests/perf_tests/test_mlp_perf.py | 202 +++++++++++++++++ .../test_norm_residual_fusion_perf.py | 207 ++++++++++++++++++ .../perf_tests/test_pointwise_fusion_perf.py | 199 +++++++++++++++++ tests/utils.py | 124 +++++++++++ 7 files changed, 869 insertions(+), 9 deletions(-) create mode 100644 tests/perf_tests/__init__.py create mode 100644 tests/perf_tests/conftest.py create mode 100644 tests/perf_tests/test_mlp_perf.py create mode 100644 tests/perf_tests/test_norm_residual_fusion_perf.py create mode 100644 tests/perf_tests/test_pointwise_fusion_perf.py diff --git a/tests/model_definition.py b/tests/model_definition.py index 1fc7c9e..13c75bd 100644 --- a/tests/model_definition.py +++ b/tests/model_definition.py @@ -55,9 +55,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x.to(input_dtype) -@magi_compile(dynamic_arg_dims={"x": 0}) -class MLP(torch.nn.Module): - """MLP module with traditional architecture (up-projection, activation, and down-projection)""" +class RawMLP(torch.nn.Module): + """MLP module with traditional architecture (up-projection, activation, and down-projection). + + This is the uncompiled base class. Use ``MLP`` for the magi_compile-wrapped variant. 
+ """ config: MLPConfig @@ -81,20 +83,25 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: - x: (num_tokens, hidden_size) - output: (num_tokens, hidden_size) """ - # Pre-normalization x = self.pre_norm(x).to(torch.bfloat16) - # Up-projection x = self.up_proj(x).to(torch.float32) - # Activation (SiLU) x = F.silu(x).to(torch.bfloat16) - # Down-projection x = self.down_proj(x).to(torch.float32) return x @magi_compile(dynamic_arg_dims={"x": 0}) -class RMSNormModule(torch.nn.Module): - """Compiled RMSNorm module for testing""" +class MLP(RawMLP): + """Compiled MLP module (magi_compile-wrapped ``RawMLP``).""" + + pass + + +class RawRMSNormModule(torch.nn.Module): + """RMSNorm module for testing. + + This is the uncompiled base class. Use ``RMSNormModule`` for the magi_compile-wrapped variant. + """ config: RMSNormConfig @@ -119,6 +126,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return self.norm(x) +@magi_compile(dynamic_arg_dims={"x": 0}) +class RMSNormModule(RawRMSNormModule): + """Compiled RMSNorm module (magi_compile-wrapped ``RawRMSNormModule``).""" + + pass + + def create_rms_norm_model(config: RMSNormConfig, device: torch.device) -> RMSNormModule: """Create RMSNorm model diff --git a/tests/perf_tests/__init__.py b/tests/perf_tests/__init__.py new file mode 100644 index 0000000..cbe7710 --- /dev/null +++ b/tests/perf_tests/__init__.py @@ -0,0 +1,101 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import statistics +from collections.abc import Callable +from dataclasses import dataclass + +import torch +from triton.testing import do_bench + + +@dataclass +class BenchmarkResult: + times_ms: list[float] + + @property + def median(self) -> float: + return statistics.median(self.times_ms) + + @property + def mean(self) -> float: + return statistics.mean(self.times_ms) + + @property + def min(self) -> float: + return min(self.times_ms) + + @property + def stdev(self) -> float: + return statistics.stdev(self.times_ms) if len(self.times_ms) > 1 else 0.0 + + def summary(self, label: str = "") -> str: + prefix = f"[{label}] " if label else "" + return ( + f"{prefix}median={self.median:.3f}ms mean={self.mean:.3f}ms " + f"min={self.min:.3f}ms stdev={self.stdev:.3f}ms (n={len(self.times_ms)})" + ) + + +def cuda_benchmark( + fn: Callable[[], object], + *, + warmup: int = 25, + rep: int = 100, + grad_to_none: list[torch.Tensor] | None = None, + compilation_warmup: int = 0, +) -> BenchmarkResult: + if compilation_warmup > 0: + for _ in range(compilation_warmup): + fn() + torch.cuda.synchronize() + + times = do_bench( + fn, + warmup=warmup, + rep=rep, + grad_to_none=grad_to_none, + return_mode="all", + ) + return BenchmarkResult(times_ms=times) + + +def print_perf_comparison( + title: str, + eager: BenchmarkResult, + magi: BenchmarkResult, + torch_compile: BenchmarkResult | None = None, + extra_info: str = "", +) -> tuple[float, float]: + magi_vs_eager = eager.median / magi.median + torch_vs_eager = eager.median / torch_compile.median if torch_compile else 0.0 + magi_vs_torch = torch_compile.median / magi.median if torch_compile else 0.0 + + print(f"\n{'=' * 78}") + print(title) + if extra_info: + print(f" {extra_info}") + print(f"{'=' * 78}") + print(f" {eager.summary('eager ')}") + if torch_compile is not None: + print(f" {torch_compile.summary('torch.compile ')}") + print(f" {magi.summary('magi_compile ')}") + print(" ---") + if torch_compile is not None: + print(f" 
torch.compile vs eager: {torch_vs_eager:.2f}x") + print(f" magi_compile vs eager: {magi_vs_eager:.2f}x") + if torch_compile is not None: + print(f" magi_compile vs torch.compile: {magi_vs_torch:.2f}x") + print(f"{'=' * 78}") + return magi_vs_eager, magi_vs_torch diff --git a/tests/perf_tests/conftest.py b/tests/perf_tests/conftest.py new file mode 100644 index 0000000..3dbb800 --- /dev/null +++ b/tests/perf_tests/conftest.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py new file mode 100644 index 0000000..b3ff667 --- /dev/null +++ b/tests/perf_tests/test_mlp_perf.py @@ -0,0 +1,202 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Performance test: end-to-end MLP block. + +Covers all supported compilation paths (class, instance, instance+TC, function, method). 
+ +Measured baseline (H100): + torch.compile ~1.8x vs eager + magi_compile ~1.8x vs eager (all paths) +""" + +import pytest +import torch + +from magi_compiler import magi_compile +from magi_compiler.config import CompileMode +from tests.model_definition import MLPConfig, RawMLP +from tests.perf_tests import cuda_benchmark, print_perf_comparison + +HIDDEN_SIZE = 2048 +INTERMEDIATE_SIZE = 8192 +NUM_TOKENS = 8192 +SPEEDUP_VS_EAGER_THRESHOLD = 1.65 + + +def _build_config(): + return MLPConfig( + hidden_size=HIDDEN_SIZE, + intermediate_size=INTERMEDIATE_SIZE, + params_dtype=torch.bfloat16, + ) + + +# ── Shared baselines (computed once per module) ──────────────────────── + + +@pytest.fixture(scope="module") +def mlp_device(): + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@pytest.fixture(scope="module") +def mlp_input(mlp_device): + return torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=mlp_device, dtype=torch.bfloat16) + + +@pytest.fixture(scope="module") +def mlp_baselines(mlp_device, mlp_input): + """Eager and torch.compile baselines, benchmarked once for the whole module.""" + config = _build_config() + eager_model = RawMLP(config).to(mlp_device).eval() + torch_compiled = torch.compile( + RawMLP(config).to(mlp_device).eval(), backend="inductor" + ) + with torch.no_grad(): + eager_result = cuda_benchmark(lambda: eager_model(mlp_input)) + torch_result = cuda_benchmark(lambda: torch_compiled(mlp_input), compilation_warmup=3) + return eager_result, torch_result + + +# ── Helpers ──────────────────────────────────────────────────────────── + + +def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): + assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( + f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. 
" + f"Got {magi_vs_eager:.2f}x " + f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" + ) + + +# ── Tests ────────────────────────────────────────────────────────────── + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_mlp_class_decoration(mlp_device, mlp_input, mlp_baselines): + """MLP block: @magi_compile class decoration.""" + eager_result, torch_result = mlp_baselines + config = _build_config() + + @magi_compile(dynamic_arg_dims={"x": 0}) + class CompiledMLP(RawMLP): + pass + + magi_compiled = CompiledMLP(config).to(mlp_device).eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "MLP - class decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines): + """MLP block: magi_compile(instance) decoration.""" + eager_result, torch_result = mlp_baselines + config = _build_config() + + magi_compiled = magi_compile( + RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0} + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "MLP - instance decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_mlp_instance_torch_compile_mode(mlp_device, mlp_input, mlp_baselines): + 
"""MLP block: magi_compile(instance, mode=TORCH_COMPILE).""" + eager_result, torch_result = mlp_baselines + config = _build_config() + + def _tc_mode(cfg): + cfg.compile_mode = CompileMode.TORCH_COMPILE + return cfg + + magi_compiled = magi_compile( + RawMLP(config).to(mlp_device), + dynamic_arg_dims={"x": 0}, + config_patch=_tc_mode, + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "MLP - instance (TORCH_COMPILE mode)", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_mlp_function_decoration(mlp_device, mlp_input, mlp_baselines): + """MLP block: @magi_compile function-level entry.""" + eager_result, torch_result = mlp_baselines + config = _build_config() + + model = RawMLP(config).to(mlp_device).eval() + + @magi_compile(dynamic_arg_dims={"x": 0}) + def compiled_entry(x: torch.Tensor) -> torch.Tensor: + return model(x) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: compiled_entry(mlp_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "MLP - function decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines): + """MLP block: magi_compile(model.forward) method decoration.""" + eager_result, torch_result = mlp_baselines + config = _build_config() + + magi_compiled = RawMLP(config).to(mlp_device).eval() 
+ magi_compiled.forward = magi_compile( + magi_compiled.forward, dynamic_arg_dims={"x": 0} + ) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "MLP - method decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py new file mode 100644 index 0000000..3eebc17 --- /dev/null +++ b/tests/perf_tests/test_norm_residual_fusion_perf.py @@ -0,0 +1,207 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Performance test: norm + residual + activation fusion. + +Covers all supported compilation paths (class, instance, instance+TC, function, method). 
+ +Measured baseline (H100): + torch.compile ~10.0x vs eager + magi_compile ~4.5x vs eager (all paths) +""" + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F + +from magi_compiler import magi_compile +from magi_compiler.config import CompileMode +from tests.model_definition import RMSNorm +from tests.perf_tests import cuda_benchmark, print_perf_comparison + +HIDDEN_SIZE = 4096 +NUM_TOKENS = 16384 +SPEEDUP_VS_EAGER_THRESHOLD = 4.05 + + +class NormResidualActivation(nn.Module): + def __init__(self, hidden_size: int): + super().__init__() + self.norm = RMSNorm(hidden_size) + + def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor: + return F.silu(self.norm(x) + residual) + + +# ── Shared baselines (computed once per module) ──────────────────────── + + +@pytest.fixture(scope="module") +def nra_device(): + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@pytest.fixture(scope="module") +def nra_inputs(nra_device): + x = torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=nra_device, dtype=torch.bfloat16) + residual = torch.randn_like(x) + return x, residual + + +@pytest.fixture(scope="module") +def nra_baselines(nra_device, nra_inputs): + """Eager and torch.compile baselines, benchmarked once for the whole module.""" + x, residual = nra_inputs + eager_model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval() + torch_compiled = torch.compile( + NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor" + ) + with torch.no_grad(): + eager_result = cuda_benchmark(lambda: eager_model(x, residual)) + torch_result = cuda_benchmark(lambda: torch_compiled(x, residual), compilation_warmup=3) + return eager_result, torch_result + + +# ── Helpers ──────────────────────────────────────────────────────────── + + +def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): + assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( + f"[{label}] magi_compile must achieve >= 
{SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. " + f"Got {magi_vs_eager:.2f}x " + f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" + ) + + +# ── Tests ────────────────────────────────────────────────────────────── + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_norm_residual_class_decoration(nra_device, nra_inputs, nra_baselines): + """Norm+residual+SiLU: @magi_compile class decoration.""" + eager_result, torch_result = nra_baselines + x, residual = nra_inputs + + @magi_compile(dynamic_arg_dims={"x": 0, "residual": 0}) + class CompiledNRA(NormResidualActivation): + pass + + magi_compiled = CompiledNRA(HIDDEN_SIZE).to(nra_device).eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Norm+Residual+SiLU - class decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines): + """Norm+residual+SiLU: magi_compile(instance) decoration.""" + eager_result, torch_result = nra_baselines + x, residual = nra_inputs + + magi_compiled = magi_compile( + NormResidualActivation(HIDDEN_SIZE).to(nra_device), + dynamic_arg_dims={"x": 0, "residual": 0}, + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Norm+Residual+SiLU - instance decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + + +@pytest.mark.skipif(not 
torch.cuda.is_available(), reason="Requires CUDA support") +def test_norm_residual_instance_torch_compile_mode(nra_device, nra_inputs, nra_baselines): + """Norm+residual+SiLU: magi_compile(instance, mode=TORCH_COMPILE).""" + eager_result, torch_result = nra_baselines + x, residual = nra_inputs + + def _tc_mode(cfg): + cfg.compile_mode = CompileMode.TORCH_COMPILE + return cfg + + magi_compiled = magi_compile( + NormResidualActivation(HIDDEN_SIZE).to(nra_device), + dynamic_arg_dims={"x": 0, "residual": 0}, + config_patch=_tc_mode, + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Norm+Residual+SiLU - instance (TORCH_COMPILE mode)", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_norm_residual_function_decoration(nra_device, nra_inputs, nra_baselines): + """Norm+residual+SiLU: @magi_compile function-level entry.""" + eager_result, torch_result = nra_baselines + x, residual = nra_inputs + + model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval() + + @magi_compile(dynamic_arg_dims={"x": 0, "residual": 0}) + def compiled_entry(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor: + return model(x, residual) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: compiled_entry(x, residual), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Norm+Residual+SiLU - function decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def 
test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines): + """Norm+residual+SiLU: magi_compile(model.forward) method decoration.""" + eager_result, torch_result = nra_baselines + x, residual = nra_inputs + + magi_compiled = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval() + magi_compiled.forward = magi_compile( + magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0} + ) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Norm+Residual+SiLU - method decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py new file mode 100644 index 0000000..35eac0c --- /dev/null +++ b/tests/perf_tests/test_pointwise_fusion_perf.py @@ -0,0 +1,199 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Performance test: pointwise operator chain. + +Covers all supported compilation paths (class, instance, instance+TC, function, method). 
+ +Measured baseline (H100): + torch.compile ~5.9x vs eager + magi_compile ~3.5x vs eager (all paths) +""" + +import pytest +import torch +import torch.nn as nn + +from magi_compiler import magi_compile +from magi_compiler.config import CompileMode +from tests.perf_tests import cuda_benchmark, print_perf_comparison + +HIDDEN_SIZE = 4096 +NUM_TOKENS = 16384 +SPEEDUP_VS_EAGER_THRESHOLD = 3.15 + + +class PointwiseFusionChain(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x * 0.5 + x = x + 1.0 + x = torch.relu(x) + x = x * x + x = x - 0.5 + x = torch.sigmoid(x) + return x + + +# ── Shared baselines (computed once per module) ──────────────────────── + + +@pytest.fixture(scope="module") +def pointwise_device(): + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@pytest.fixture(scope="module") +def pointwise_input(pointwise_device): + return torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=pointwise_device, dtype=torch.float32) + + +@pytest.fixture(scope="module") +def pointwise_baselines(pointwise_device, pointwise_input): + """Eager and torch.compile baselines, benchmarked once for the whole module.""" + x = pointwise_input + eager_model = PointwiseFusionChain().to(pointwise_device).eval() + torch_compiled = torch.compile( + PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor" + ) + with torch.no_grad(): + eager_result = cuda_benchmark(lambda: eager_model(x)) + torch_result = cuda_benchmark(lambda: torch_compiled(x), compilation_warmup=3) + return eager_result, torch_result + + +# ── Helpers ──────────────────────────────────────────────────────────── + + +def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): + assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( + f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. 
" + f"Got {magi_vs_eager:.2f}x " + f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" + ) + + +# ── Tests ────────────────────────────────────────────────────────────── + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_pointwise_class_decoration(pointwise_device, pointwise_input, pointwise_baselines): + """Pointwise chain: @magi_compile class decoration.""" + eager_result, torch_result = pointwise_baselines + + @magi_compile(dynamic_arg_dims={"x": 0}) + class CompiledPointwise(PointwiseFusionChain): + pass + + magi_compiled = CompiledPointwise().to(pointwise_device).eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Pointwise - class decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointwise_baselines): + """Pointwise chain: magi_compile(instance) decoration.""" + eager_result, torch_result = pointwise_baselines + + magi_compiled = magi_compile( + PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0} + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Pointwise - instance decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_pointwise_instance_torch_compile_mode(pointwise_device, pointwise_input, 
pointwise_baselines): + """Pointwise chain: magi_compile(instance, mode=TORCH_COMPILE).""" + eager_result, torch_result = pointwise_baselines + + def _tc_mode(cfg): + cfg.compile_mode = CompileMode.TORCH_COMPILE + return cfg + + magi_compiled = magi_compile( + PointwiseFusionChain().to(pointwise_device), + dynamic_arg_dims={"x": 0}, + config_patch=_tc_mode, + ) + magi_compiled.eval() + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Pointwise - instance (TORCH_COMPILE mode)", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_pointwise_function_decoration(pointwise_device, pointwise_input, pointwise_baselines): + """Pointwise chain: @magi_compile function-level entry.""" + eager_result, torch_result = pointwise_baselines + + model = PointwiseFusionChain().to(pointwise_device).eval() + + @magi_compile(dynamic_arg_dims={"x": 0}) + def compiled_entry(x: torch.Tensor) -> torch.Tensor: + return model(x) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: compiled_entry(pointwise_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Pointwise - function decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") +def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwise_baselines): + """Pointwise chain: magi_compile(model.forward) method decoration.""" + eager_result, torch_result = pointwise_baselines + + magi_compiled = 
PointwiseFusionChain().to(pointwise_device).eval() + magi_compiled.forward = magi_compile( + magi_compiled.forward, dynamic_arg_dims={"x": 0} + ) + + with torch.no_grad(): + magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3) + + magi_vs_eager, _ = print_perf_comparison( + "Pointwise - method decoration", + eager_result, magi_result, torch_result, + extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", + ) + _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/utils.py b/tests/utils.py index a5fa647..be24321 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,12 @@ # limitations under the License. import shutil +import statistics +from collections.abc import Callable +from dataclasses import dataclass + +import torch +from triton.testing import do_bench from magi_compiler.config import get_compile_config @@ -28,6 +34,124 @@ def __exit__(self, exc_type, exc_val, exc_tb): shutil.rmtree(get_compile_config().cache_root_dir, ignore_errors=True) +@dataclass +class BenchmarkResult: + """Timing results from a CUDA benchmark run (all times in milliseconds). + + Follows the methodology of ``triton.testing.do_bench``: time-based warmup/rep, + L2 cache flush between iterations, per-iteration CUDA event timing. 
+ """ + + times_ms: list[float] + + @property + def median(self) -> float: + return statistics.median(self.times_ms) + + @property + def mean(self) -> float: + return statistics.mean(self.times_ms) + + @property + def min(self) -> float: + return min(self.times_ms) + + @property + def stdev(self) -> float: + return statistics.stdev(self.times_ms) if len(self.times_ms) > 1 else 0.0 + + def summary(self, label: str = "") -> str: + prefix = f"[{label}] " if label else "" + return ( + f"{prefix}median={self.median:.3f}ms mean={self.mean:.3f}ms " + f"min={self.min:.3f}ms stdev={self.stdev:.3f}ms (n={len(self.times_ms)})" + ) + + +def cuda_benchmark( + fn: Callable[[], object], + *, + warmup: int = 25, + rep: int = 100, + grad_to_none: list[torch.Tensor] | None = None, + compilation_warmup: int = 0, +) -> BenchmarkResult: + """Benchmark a GPU callable using ``triton.testing.do_bench`` methodology. + + Uses time-based warmup/rep (in ms), L2 cache flush between iterations, + and per-iteration CUDA event timing -- the same approach torch inductor + uses internally for autotuning fused triton kernels. + + Args: + fn: Zero-argument callable to benchmark. + warmup: Warmup duration in milliseconds (passed to ``do_bench``). + rep: Benchmark repetition duration in milliseconds. + grad_to_none: Tensors whose ``.grad`` should be cleared between iterations. + compilation_warmup: Number of extra invocations **before** ``do_bench`` + to ensure lazy compilation (e.g. ``magi_compile``) is fully finished. + These calls are *not* timed. + + Returns: + BenchmarkResult with all per-iteration times in milliseconds. 
+ """ + if compilation_warmup > 0: + for _ in range(compilation_warmup): + fn() + torch.cuda.synchronize() + + times = do_bench( + fn, + warmup=warmup, + rep=rep, + grad_to_none=grad_to_none, + return_mode="all", + ) + return BenchmarkResult(times_ms=times) + + +def print_perf_comparison( + title: str, + eager: BenchmarkResult, + magi: BenchmarkResult, + torch_compile: BenchmarkResult, + extra_info: str = "", + magi_torch_compile: BenchmarkResult | None = None, +) -> tuple[float, float]: + """Print a comparison table and return speedup ratios. + + When *magi_torch_compile* is provided the table includes the + ``magi_compile(compile_mode=TORCH_COMPILE)`` variant as well. + + Returns: + (magi_vs_eager_speedup, magi_vs_torch_compile_speedup) based on median. + """ + magi_vs_eager = eager.median / magi.median + torch_vs_eager = eager.median / torch_compile.median + magi_vs_torch = torch_compile.median / magi.median + + print(f"\n{'=' * 78}") + print(title) + if extra_info: + print(f" {extra_info}") + print(f"{'=' * 78}") + print(f" {eager.summary('eager ')}") + print(f" {torch_compile.summary('torch.compile ')}") + if magi_torch_compile is not None: + print(f" {magi_torch_compile.summary('magi (torch_compile mode) ')}") + print(f" {magi.summary('magi_compile ')}") + print(f" ---") + print(f" torch.compile vs eager: {torch_vs_eager:.2f}x") + if magi_torch_compile is not None: + mtc_vs_eager = eager.median / magi_torch_compile.median + mtc_vs_torch = torch_compile.median / magi_torch_compile.median + print(f" magi(torch mode) vs eager: {mtc_vs_eager:.2f}x") + print(f" magi(torch mode) vs torch.compile: {mtc_vs_torch:.2f}x") + print(f" magi_compile vs eager: {magi_vs_eager:.2f}x") + print(f" magi_compile vs torch.compile: {magi_vs_torch:.2f}x") + print(f"{'=' * 78}") + return magi_vs_eager, magi_vs_torch + + def enable_remote_debug(): import os From 02c87ae33f95db3761c113414aada40f96e80b32 Mon Sep 17 00:00:00 2001 From: Zhiyao Cen <2523403608@qq.com> Date: Wed, 1 Apr 2026 
21:34:53 +0800 Subject: [PATCH 2/5] [chore] add perf-fusion-gap TODO markers in fusion benchmarks Document the known magi vs torch.compile gap in fusion-heavy perf suites so follow-up optimization work has explicit tracking context. Made-with: Cursor --- tests/perf_tests/test_norm_residual_fusion_perf.py | 3 +++ tests/perf_tests/test_pointwise_fusion_perf.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py index 3eebc17..7922b10 100644 --- a/tests/perf_tests/test_norm_residual_fusion_perf.py +++ b/tests/perf_tests/test_norm_residual_fusion_perf.py @@ -19,6 +19,9 @@ Measured baseline (H100): torch.compile ~10.0x vs eager magi_compile ~4.5x vs eager (all paths) + +TODO(perf-fusion-gap): magi_compile still trails torch.compile in fusion-heavy workloads; +investigate graph partitioning/fusion opportunities and reduce the gap. """ import pytest diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py index 35eac0c..bdc02c9 100644 --- a/tests/perf_tests/test_pointwise_fusion_perf.py +++ b/tests/perf_tests/test_pointwise_fusion_perf.py @@ -19,6 +19,9 @@ Measured baseline (H100): torch.compile ~5.9x vs eager magi_compile ~3.5x vs eager (all paths) + +TODO(perf-fusion-gap): magi_compile still trails torch.compile in fusion-heavy workloads; +investigate graph partitioning/fusion opportunities and reduce the gap. """ import pytest From 988b5095a07f986a88d66689f45bab4e2b09ece2 Mon Sep 17 00:00:00 2001 From: Zhiyao Cen <2523403608@qq.com> Date: Wed, 1 Apr 2026 22:01:32 +0800 Subject: [PATCH 3/5] [chore] apply pre-commit formatting updates Apply black-driven formatting updates for perf benchmark utilities and perf test files so repository hooks pass consistently in local and CI workflows. 
Made-with: Cursor --- tests/perf_tests/__init__.py | 8 +--- tests/perf_tests/test_mlp_perf.py | 44 +++++++++---------- .../test_norm_residual_fusion_perf.py | 37 ++++++++-------- .../perf_tests/test_pointwise_fusion_perf.py | 38 ++++++++-------- tests/utils.py | 8 +--- 5 files changed, 60 insertions(+), 75 deletions(-) diff --git a/tests/perf_tests/__init__.py b/tests/perf_tests/__init__.py index cbe7710..6a79447 100644 --- a/tests/perf_tests/__init__.py +++ b/tests/perf_tests/__init__.py @@ -61,13 +61,7 @@ def cuda_benchmark( fn() torch.cuda.synchronize() - times = do_bench( - fn, - warmup=warmup, - rep=rep, - grad_to_none=grad_to_none, - return_mode="all", - ) + times = do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, return_mode="all") return BenchmarkResult(times_ms=times) diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py index b3ff667..68d5bc9 100644 --- a/tests/perf_tests/test_mlp_perf.py +++ b/tests/perf_tests/test_mlp_perf.py @@ -36,11 +36,7 @@ def _build_config(): - return MLPConfig( - hidden_size=HIDDEN_SIZE, - intermediate_size=INTERMEDIATE_SIZE, - params_dtype=torch.bfloat16, - ) + return MLPConfig(hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE, params_dtype=torch.bfloat16) # ── Shared baselines (computed once per module) ──────────────────────── @@ -61,9 +57,7 @@ def mlp_baselines(mlp_device, mlp_input): """Eager and torch.compile baselines, benchmarked once for the whole module.""" config = _build_config() eager_model = RawMLP(config).to(mlp_device).eval() - torch_compiled = torch.compile( - RawMLP(config).to(mlp_device).eval(), backend="inductor" - ) + torch_compiled = torch.compile(RawMLP(config).to(mlp_device).eval(), backend="inductor") with torch.no_grad(): eager_result = cuda_benchmark(lambda: eager_model(mlp_input)) torch_result = cuda_benchmark(lambda: torch_compiled(mlp_input), compilation_warmup=3) @@ -101,7 +95,9 @@ class CompiledMLP(RawMLP): magi_vs_eager, _ = 
print_perf_comparison( "MLP - class decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") @@ -113,9 +109,7 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines): eager_result, torch_result = mlp_baselines config = _build_config() - magi_compiled = magi_compile( - RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0} - ) + magi_compiled = magi_compile(RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0}) magi_compiled.eval() with torch.no_grad(): @@ -123,7 +117,9 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines): magi_vs_eager, _ = print_perf_comparison( "MLP - instance decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") @@ -139,11 +135,7 @@ def _tc_mode(cfg): cfg.compile_mode = CompileMode.TORCH_COMPILE return cfg - magi_compiled = magi_compile( - RawMLP(config).to(mlp_device), - dynamic_arg_dims={"x": 0}, - config_patch=_tc_mode, - ) + magi_compiled = magi_compile(RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0}, config_patch=_tc_mode) magi_compiled.eval() with torch.no_grad(): @@ -151,7 +143,9 @@ def _tc_mode(cfg): magi_vs_eager, _ = print_perf_comparison( "MLP - instance (TORCH_COMPILE mode)", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") @@ -174,7 +168,9 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor: magi_vs_eager, _ = print_perf_comparison( "MLP - function 
decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") @@ -187,16 +183,16 @@ def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines): config = _build_config() magi_compiled = RawMLP(config).to(mlp_device).eval() - magi_compiled.forward = magi_compile( - magi_compiled.forward, dynamic_arg_dims={"x": 0} - ) + magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0}) with torch.no_grad(): magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3) magi_vs_eager, _ = print_perf_comparison( "MLP - method decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py index 7922b10..b534444 100644 --- a/tests/perf_tests/test_norm_residual_fusion_perf.py +++ b/tests/perf_tests/test_norm_residual_fusion_perf.py @@ -68,9 +68,7 @@ def nra_baselines(nra_device, nra_inputs): """Eager and torch.compile baselines, benchmarked once for the whole module.""" x, residual = nra_inputs eager_model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval() - torch_compiled = torch.compile( - NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor" - ) + torch_compiled = torch.compile(NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor") with torch.no_grad(): eager_result = cuda_benchmark(lambda: eager_model(x, residual)) torch_result = cuda_benchmark(lambda: torch_compiled(x, residual), compilation_warmup=3) @@ -108,7 +106,9 @@ class 
CompiledNRA(NormResidualActivation): magi_vs_eager, _ = print_perf_comparison( "Norm+Residual+SiLU - class decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") @@ -120,10 +120,7 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines eager_result, torch_result = nra_baselines x, residual = nra_inputs - magi_compiled = magi_compile( - NormResidualActivation(HIDDEN_SIZE).to(nra_device), - dynamic_arg_dims={"x": 0, "residual": 0}, - ) + magi_compiled = magi_compile(NormResidualActivation(HIDDEN_SIZE).to(nra_device), dynamic_arg_dims={"x": 0, "residual": 0}) magi_compiled.eval() with torch.no_grad(): @@ -131,7 +128,9 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines magi_vs_eager, _ = print_perf_comparison( "Norm+Residual+SiLU - instance decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") @@ -148,9 +147,7 @@ def _tc_mode(cfg): return cfg magi_compiled = magi_compile( - NormResidualActivation(HIDDEN_SIZE).to(nra_device), - dynamic_arg_dims={"x": 0, "residual": 0}, - config_patch=_tc_mode, + NormResidualActivation(HIDDEN_SIZE).to(nra_device), dynamic_arg_dims={"x": 0, "residual": 0}, config_patch=_tc_mode ) magi_compiled.eval() @@ -159,7 +156,9 @@ def _tc_mode(cfg): magi_vs_eager, _ = print_perf_comparison( "Norm+Residual+SiLU - instance (TORCH_COMPILE mode)", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") @@ -182,7 +181,9 @@ def compiled_entry(x: torch.Tensor, residual: 
torch.Tensor) -> torch.Tensor: magi_vs_eager, _ = print_perf_comparison( "Norm+Residual+SiLU - function decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") @@ -195,16 +196,16 @@ def test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines): x, residual = nra_inputs magi_compiled = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval() - magi_compiled.forward = magi_compile( - magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0} - ) + magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0}) with torch.no_grad(): magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3) magi_vs_eager, _ = print_perf_comparison( "Norm+Residual+SiLU - method decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py index bdc02c9..18079cf 100644 --- a/tests/perf_tests/test_pointwise_fusion_perf.py +++ b/tests/perf_tests/test_pointwise_fusion_perf.py @@ -66,9 +66,7 @@ def pointwise_baselines(pointwise_device, pointwise_input): """Eager and torch.compile baselines, benchmarked once for the whole module.""" x = pointwise_input eager_model = PointwiseFusionChain().to(pointwise_device).eval() - torch_compiled = torch.compile( - PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor" - ) + torch_compiled = torch.compile(PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor") with torch.no_grad(): eager_result = cuda_benchmark(lambda: eager_model(x)) torch_result = cuda_benchmark(lambda: 
torch_compiled(x), compilation_warmup=3) @@ -105,7 +103,9 @@ class CompiledPointwise(PointwiseFusionChain): magi_vs_eager, _ = print_perf_comparison( "Pointwise - class decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") @@ -116,9 +116,7 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw """Pointwise chain: magi_compile(instance) decoration.""" eager_result, torch_result = pointwise_baselines - magi_compiled = magi_compile( - PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0} - ) + magi_compiled = magi_compile(PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0}) magi_compiled.eval() with torch.no_grad(): @@ -126,7 +124,9 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw magi_vs_eager, _ = print_perf_comparison( "Pointwise - instance decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") @@ -141,11 +141,7 @@ def _tc_mode(cfg): cfg.compile_mode = CompileMode.TORCH_COMPILE return cfg - magi_compiled = magi_compile( - PointwiseFusionChain().to(pointwise_device), - dynamic_arg_dims={"x": 0}, - config_patch=_tc_mode, - ) + magi_compiled = magi_compile(PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0}, config_patch=_tc_mode) magi_compiled.eval() with torch.no_grad(): @@ -153,7 +149,9 @@ def _tc_mode(cfg): magi_vs_eager, _ = print_perf_comparison( "Pointwise - instance (TORCH_COMPILE mode)", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") @@ -175,7 
+173,9 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor: magi_vs_eager, _ = print_perf_comparison( "Pointwise - function decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") @@ -187,16 +187,16 @@ def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwis eager_result, torch_result = pointwise_baselines magi_compiled = PointwiseFusionChain().to(pointwise_device).eval() - magi_compiled.forward = magi_compile( - magi_compiled.forward, dynamic_arg_dims={"x": 0} - ) + magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0}) with torch.no_grad(): magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3) magi_vs_eager, _ = print_perf_comparison( "Pointwise - method decoration", - eager_result, magi_result, torch_result, + eager_result, + magi_result, + torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") diff --git a/tests/utils.py b/tests/utils.py index be24321..7580163 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -99,13 +99,7 @@ def cuda_benchmark( fn() torch.cuda.synchronize() - times = do_bench( - fn, - warmup=warmup, - rep=rep, - grad_to_none=grad_to_none, - return_mode="all", - ) + times = do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, return_mode="all") return BenchmarkResult(times_ms=times) From ebb7b4cbadcaf945e27a019cedc78ad5782a6bc9 Mon Sep 17 00:00:00 2001 From: Zhiyao Cen <2523403608@qq.com> Date: Wed, 1 Apr 2026 22:16:39 +0800 Subject: [PATCH 4/5] [test] relax perf speedup thresholds for CI stability Lower MLP, norm-residual, and pointwise speedup gates to reflect observed CI variance while preserving meaningful eager-baseline improvements across entrypoints. 
Made-with: Cursor --- tests/perf_tests/test_mlp_perf.py | 2 +- tests/perf_tests/test_norm_residual_fusion_perf.py | 2 +- tests/perf_tests/test_pointwise_fusion_perf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py index 68d5bc9..a35397c 100644 --- a/tests/perf_tests/test_mlp_perf.py +++ b/tests/perf_tests/test_mlp_perf.py @@ -32,7 +32,7 @@ HIDDEN_SIZE = 2048 INTERMEDIATE_SIZE = 8192 NUM_TOKENS = 8192 -SPEEDUP_VS_EAGER_THRESHOLD = 1.65 +SPEEDUP_VS_EAGER_THRESHOLD = 1.5 def _build_config(): diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py index b534444..a81c1f1 100644 --- a/tests/perf_tests/test_norm_residual_fusion_perf.py +++ b/tests/perf_tests/test_norm_residual_fusion_perf.py @@ -36,7 +36,7 @@ HIDDEN_SIZE = 4096 NUM_TOKENS = 16384 -SPEEDUP_VS_EAGER_THRESHOLD = 4.05 +SPEEDUP_VS_EAGER_THRESHOLD = 3.7 class NormResidualActivation(nn.Module): diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py index 18079cf..e5bd1ac 100644 --- a/tests/perf_tests/test_pointwise_fusion_perf.py +++ b/tests/perf_tests/test_pointwise_fusion_perf.py @@ -34,7 +34,7 @@ HIDDEN_SIZE = 4096 NUM_TOKENS = 16384 -SPEEDUP_VS_EAGER_THRESHOLD = 3.15 +SPEEDUP_VS_EAGER_THRESHOLD = 2.9 class PointwiseFusionChain(nn.Module): From 6fdd4ddf5223cae341b8f541a59688b9798e3ce1 Mon Sep 17 00:00:00 2001 From: Zhiyao Cen <2523403608@qq.com> Date: Wed, 1 Apr 2026 23:50:25 +0800 Subject: [PATCH 5/5] [test] deduplicate perf speedup assertions Move the repeated perf speedup assertion helper into tests/perf_tests/utils.py and reuse it across MLP, norm-residual fusion, and pointwise perf tests to reduce duplication and keep threshold checks consistent. 
Made-with: Cursor --- tests/perf_tests/test_mlp_perf.py | 22 +++++----------- .../test_norm_residual_fusion_perf.py | 22 +++++----------- .../perf_tests/test_pointwise_fusion_perf.py | 22 +++++----------- tests/perf_tests/utils.py | 25 +++++++++++++++++++ 4 files changed, 43 insertions(+), 48 deletions(-) create mode 100644 tests/perf_tests/utils.py diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py index a35397c..9563e2d 100644 --- a/tests/perf_tests/test_mlp_perf.py +++ b/tests/perf_tests/test_mlp_perf.py @@ -28,6 +28,7 @@ from magi_compiler.config import CompileMode from tests.model_definition import MLPConfig, RawMLP from tests.perf_tests import cuda_benchmark, print_perf_comparison +from tests.perf_tests.utils import assert_speedup HIDDEN_SIZE = 2048 INTERMEDIATE_SIZE = 8192 @@ -64,17 +65,6 @@ def mlp_baselines(mlp_device, mlp_input): return eager_result, torch_result -# ── Helpers ──────────────────────────────────────────────────────────── - - -def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): - assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( - f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. 
" - f"Got {magi_vs_eager:.2f}x " - f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" - ) - - # ── Tests ────────────────────────────────────────────────────────────── @@ -100,7 +90,7 @@ class CompiledMLP(RawMLP): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -122,7 +112,7 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -148,7 +138,7 @@ def _tc_mode(cfg): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -173,7 +163,7 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor: torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -195,4 +185,4 @@ def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines): 
torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) intermediate={INTERMEDIATE_SIZE} dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") + assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD) diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py index a81c1f1..4e20a68 100644 --- a/tests/perf_tests/test_norm_residual_fusion_perf.py +++ b/tests/perf_tests/test_norm_residual_fusion_perf.py @@ -33,6 +33,7 @@ from magi_compiler.config import CompileMode from tests.model_definition import RMSNorm from tests.perf_tests import cuda_benchmark, print_perf_comparison +from tests.perf_tests.utils import assert_speedup HIDDEN_SIZE = 4096 NUM_TOKENS = 16384 @@ -75,17 +76,6 @@ def nra_baselines(nra_device, nra_inputs): return eager_result, torch_result -# ── Helpers ──────────────────────────────────────────────────────────── - - -def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): - assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( - f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. 
" - f"Got {magi_vs_eager:.2f}x " - f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" - ) - - # ── Tests ────────────────────────────────────────────────────────────── @@ -111,7 +101,7 @@ class CompiledNRA(NormResidualActivation): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -133,7 +123,7 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -161,7 +151,7 @@ def _tc_mode(cfg): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -186,7 +176,7 @@ def compiled_entry(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor: torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -208,4 +198,4 @@ def test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE}) dtype=bf16", ) - 
_assert_speedup(magi_vs_eager, eager_result, magi_result, "method") + assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD) diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py index e5bd1ac..526f3cd 100644 --- a/tests/perf_tests/test_pointwise_fusion_perf.py +++ b/tests/perf_tests/test_pointwise_fusion_perf.py @@ -31,6 +31,7 @@ from magi_compiler import magi_compile from magi_compiler.config import CompileMode from tests.perf_tests import cuda_benchmark, print_perf_comparison +from tests.perf_tests.utils import assert_speedup HIDDEN_SIZE = 4096 NUM_TOKENS = 16384 @@ -73,17 +74,6 @@ def pointwise_baselines(pointwise_device, pointwise_input): return eager_result, torch_result -# ── Helpers ──────────────────────────────────────────────────────────── - - -def _assert_speedup(magi_vs_eager, eager_result, magi_result, label): - assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, ( - f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. 
" - f"Got {magi_vs_eager:.2f}x " - f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" - ) - - # ── Tests ────────────────────────────────────────────────────────────── @@ -108,7 +98,7 @@ class CompiledPointwise(PointwiseFusionChain): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "class") + assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -129,7 +119,7 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -154,7 +144,7 @@ def _tc_mode(cfg): torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc") + assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -178,7 +168,7 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor: torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "function") + assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD) @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support") @@ -199,4 +189,4 @@ def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwis torch_result, extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})", ) - _assert_speedup(magi_vs_eager, eager_result, magi_result, "method") + 
assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD) diff --git a/tests/perf_tests/utils.py b/tests/perf_tests/utils.py new file mode 100644 index 0000000..80ba884 --- /dev/null +++ b/tests/perf_tests/utils.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025 SandAI. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tests.perf_tests import BenchmarkResult + + +def assert_speedup( + magi_vs_eager: float, eager_result: BenchmarkResult, magi_result: BenchmarkResult, label: str, threshold: float +) -> None: + assert magi_vs_eager >= threshold, ( + f"[{label}] magi_compile must achieve >= {threshold:.2f}x over eager. " + f"Got {magi_vs_eager:.2f}x " + f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)" + )