From 91443b1a9db76225096169fa900a201f00e6e303 Mon Sep 17 00:00:00 2001
From: Zhiyao Cen <2523403608@qq.com>
Date: Wed, 1 Apr 2026 21:30:43 +0800
Subject: [PATCH 1/5] [test] add dedicated perf test suite with entrypoint
 matrix

Split perf benchmarks into tests/perf_tests with shared benchmarking helpers and add class/instance/function/method coverage plus torch-compile mode checks across MLP, norm-residual fusion, and pointwise chains.

Made-with: Cursor
---
 tests/model_definition.py                     |  32 ++-
 tests/perf_tests/__init__.py                  | 101 +++++++++
 tests/perf_tests/conftest.py                  |  13 ++
 tests/perf_tests/test_mlp_perf.py             | 202 +++++++++++++++++
 .../test_norm_residual_fusion_perf.py         | 207 ++++++++++++++++++
 .../perf_tests/test_pointwise_fusion_perf.py  | 199 +++++++++++++++++
 tests/utils.py                                | 124 +++++++++++
 7 files changed, 869 insertions(+), 9 deletions(-)
 create mode 100644 tests/perf_tests/__init__.py
 create mode 100644 tests/perf_tests/conftest.py
 create mode 100644 tests/perf_tests/test_mlp_perf.py
 create mode 100644 tests/perf_tests/test_norm_residual_fusion_perf.py
 create mode 100644 tests/perf_tests/test_pointwise_fusion_perf.py

diff --git a/tests/model_definition.py b/tests/model_definition.py
index 1fc7c9e..13c75bd 100644
--- a/tests/model_definition.py
+++ b/tests/model_definition.py
@@ -55,9 +55,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x.to(input_dtype)
 
 
-@magi_compile(dynamic_arg_dims={"x": 0})
-class MLP(torch.nn.Module):
-    """MLP module with traditional architecture (up-projection, activation, and down-projection)"""
+class RawMLP(torch.nn.Module):
+    """MLP module with traditional architecture (up-projection, activation, and down-projection).
+
+    This is the uncompiled base class. Use ``MLP`` for the magi_compile-wrapped variant.
+    """
 
     config: MLPConfig
 
@@ -81,20 +83,25 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             - x: (num_tokens, hidden_size)
             - output: (num_tokens, hidden_size)
         """
-        # Pre-normalization
         x = self.pre_norm(x).to(torch.bfloat16)
-        # Up-projection
         x = self.up_proj(x).to(torch.float32)
-        # Activation (SiLU)
         x = F.silu(x).to(torch.bfloat16)
-        # Down-projection
         x = self.down_proj(x).to(torch.float32)
         return x
 
 
 @magi_compile(dynamic_arg_dims={"x": 0})
-class RMSNormModule(torch.nn.Module):
-    """Compiled RMSNorm module for testing"""
+class MLP(RawMLP):
+    """Compiled MLP module (magi_compile-wrapped ``RawMLP``)."""
+
+    pass
+
+
+class RawRMSNormModule(torch.nn.Module):
+    """RMSNorm module for testing.
+
+    This is the uncompiled base class. Use ``RMSNormModule`` for the magi_compile-wrapped variant.
+    """
 
     config: RMSNormConfig
 
@@ -119,6 +126,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.norm(x)
 
 
+@magi_compile(dynamic_arg_dims={"x": 0})
+class RMSNormModule(RawRMSNormModule):
+    """Compiled RMSNorm module (magi_compile-wrapped ``RawRMSNormModule``)."""
+
+    pass
+
+
 def create_rms_norm_model(config: RMSNormConfig, device: torch.device) -> RMSNormModule:
     """Create RMSNorm model
 
diff --git a/tests/perf_tests/__init__.py b/tests/perf_tests/__init__.py
new file mode 100644
index 0000000..cbe7710
--- /dev/null
+++ b/tests/perf_tests/__init__.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import statistics
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import torch
+from triton.testing import do_bench
+
+
+@dataclass
+class BenchmarkResult:
+    times_ms: list[float]
+
+    @property
+    def median(self) -> float:
+        return statistics.median(self.times_ms)
+
+    @property
+    def mean(self) -> float:
+        return statistics.mean(self.times_ms)
+
+    @property
+    def min(self) -> float:
+        return min(self.times_ms)
+
+    @property
+    def stdev(self) -> float:
+        return statistics.stdev(self.times_ms) if len(self.times_ms) > 1 else 0.0
+
+    def summary(self, label: str = "") -> str:
+        prefix = f"[{label}] " if label else ""
+        return (
+            f"{prefix}median={self.median:.3f}ms  mean={self.mean:.3f}ms  "
+            f"min={self.min:.3f}ms  stdev={self.stdev:.3f}ms  (n={len(self.times_ms)})"
+        )
+
+
+def cuda_benchmark(
+    fn: Callable[[], object],
+    *,
+    warmup: int = 25,
+    rep: int = 100,
+    grad_to_none: list[torch.Tensor] | None = None,
+    compilation_warmup: int = 0,
+) -> BenchmarkResult:
+    if compilation_warmup > 0:
+        for _ in range(compilation_warmup):
+            fn()
+        torch.cuda.synchronize()
+
+    times = do_bench(
+        fn,
+        warmup=warmup,
+        rep=rep,
+        grad_to_none=grad_to_none,
+        return_mode="all",
+    )
+    return BenchmarkResult(times_ms=times)
+
+
+def print_perf_comparison(
+    title: str,
+    eager: BenchmarkResult,
+    magi: BenchmarkResult,
+    torch_compile: BenchmarkResult | None = None,
+    extra_info: str = "",
+) -> tuple[float, float]:
+    magi_vs_eager = eager.median / magi.median
+    torch_vs_eager = eager.median / torch_compile.median if torch_compile is not None else 0.0
+    magi_vs_torch = torch_compile.median / magi.median if torch_compile is not None else 0.0
+
+    print(f"\n{'=' * 78}")
+    print(title)
+    if extra_info:
+        print(f"  {extra_info}")
+    print(f"{'=' * 78}")
+    print(f"  {eager.summary('eager                       ')}")
+    if torch_compile is not None:
+        print(f"  {torch_compile.summary('torch.compile               ')}")
+    print(f"  {magi.summary('magi_compile                ')}")
+    print("  ---")
+    if torch_compile is not None:
+        print(f"  torch.compile     vs eager:         {torch_vs_eager:.2f}x")
+    print(f"  magi_compile      vs eager:         {magi_vs_eager:.2f}x")
+    if torch_compile is not None:
+        print(f"  magi_compile      vs torch.compile: {magi_vs_torch:.2f}x")
+    print(f"{'=' * 78}")
+    return magi_vs_eager, magi_vs_torch
diff --git a/tests/perf_tests/conftest.py b/tests/perf_tests/conftest.py
new file mode 100644
index 0000000..3dbb800
--- /dev/null
+++ b/tests/perf_tests/conftest.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py
new file mode 100644
index 0000000..b3ff667
--- /dev/null
+++ b/tests/perf_tests/test_mlp_perf.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Performance test: end-to-end MLP block.
+
+Covers all supported compilation paths (class, instance, instance+TC, function, method).
+
+Measured baseline (H100):
+  torch.compile  ~1.8x vs eager
+  magi_compile   ~1.8x vs eager (all paths)
+"""
+
+import pytest
+import torch
+
+from magi_compiler import magi_compile
+from magi_compiler.config import CompileMode
+from tests.model_definition import MLPConfig, RawMLP
+from tests.perf_tests import cuda_benchmark, print_perf_comparison
+
+HIDDEN_SIZE = 2048
+INTERMEDIATE_SIZE = 8192
+NUM_TOKENS = 8192
+SPEEDUP_VS_EAGER_THRESHOLD = 1.65
+
+
+def _build_config():
+    return MLPConfig(
+        hidden_size=HIDDEN_SIZE,
+        intermediate_size=INTERMEDIATE_SIZE,
+        params_dtype=torch.bfloat16,
+    )
+
+
+# ── Shared baselines (computed once per module) ────────────────────────
+
+
+@pytest.fixture(scope="module")
+def mlp_device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@pytest.fixture(scope="module")
+def mlp_input(mlp_device):
+    return torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=mlp_device, dtype=torch.bfloat16)
+
+
+@pytest.fixture(scope="module")
+def mlp_baselines(mlp_device, mlp_input):
+    """Eager and torch.compile baselines, benchmarked once for the whole module."""
+    config = _build_config()
+    eager_model = RawMLP(config).to(mlp_device).eval()
+    torch_compiled = torch.compile(
+        RawMLP(config).to(mlp_device).eval(), backend="inductor"
+    )
+    with torch.no_grad():
+        eager_result = cuda_benchmark(lambda: eager_model(mlp_input))
+        torch_result = cuda_benchmark(lambda: torch_compiled(mlp_input), compilation_warmup=3)
+    return eager_result, torch_result
+
+
+# ── Helpers ────────────────────────────────────────────────────────────
+
+
+def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
+    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
+        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
+        f"Got {magi_vs_eager:.2f}x "
+        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
+    )
+
+
+# ── Tests ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_mlp_class_decoration(mlp_device, mlp_input, mlp_baselines):
+    """MLP block: @magi_compile class decoration."""
+    eager_result, torch_result = mlp_baselines
+    config = _build_config()
+
+    @magi_compile(dynamic_arg_dims={"x": 0})
+    class CompiledMLP(RawMLP):
+        pass
+
+    magi_compiled = CompiledMLP(config).to(mlp_device).eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "MLP - class decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines):
+    """MLP block: magi_compile(instance) decoration."""
+    eager_result, torch_result = mlp_baselines
+    config = _build_config()
+
+    magi_compiled = magi_compile(
+        RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0}
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "MLP - instance decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_mlp_instance_torch_compile_mode(mlp_device, mlp_input, mlp_baselines):
+    """MLP block: magi_compile(instance, mode=TORCH_COMPILE)."""
+    eager_result, torch_result = mlp_baselines
+    config = _build_config()
+
+    def _tc_mode(cfg):
+        cfg.compile_mode = CompileMode.TORCH_COMPILE
+        return cfg
+
+    magi_compiled = magi_compile(
+        RawMLP(config).to(mlp_device),
+        dynamic_arg_dims={"x": 0},
+        config_patch=_tc_mode,
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "MLP - instance (TORCH_COMPILE mode)",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_mlp_function_decoration(mlp_device, mlp_input, mlp_baselines):
+    """MLP block: @magi_compile function-level entry."""
+    eager_result, torch_result = mlp_baselines
+    config = _build_config()
+
+    model = RawMLP(config).to(mlp_device).eval()
+
+    @magi_compile(dynamic_arg_dims={"x": 0})
+    def compiled_entry(x: torch.Tensor) -> torch.Tensor:
+        return model(x)
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: compiled_entry(mlp_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "MLP - function decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines):
+    """MLP block: magi_compile(model.forward) method decoration."""
+    eager_result, torch_result = mlp_baselines
+    config = _build_config()
+
+    magi_compiled = RawMLP(config).to(mlp_device).eval()
+    magi_compiled.forward = magi_compile(
+        magi_compiled.forward, dynamic_arg_dims={"x": 0}
+    )
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "MLP - method decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py
new file mode 100644
index 0000000..3eebc17
--- /dev/null
+++ b/tests/perf_tests/test_norm_residual_fusion_perf.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Performance test: norm + residual + activation fusion.
+
+Covers all supported compilation paths (class, instance, instance+TC, function, method).
+
+Measured baseline (H100):
+  torch.compile  ~10.0x vs eager
+  magi_compile   ~4.5x vs eager (all paths)
+"""
+
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from magi_compiler import magi_compile
+from magi_compiler.config import CompileMode
+from tests.model_definition import RMSNorm
+from tests.perf_tests import cuda_benchmark, print_perf_comparison
+
+HIDDEN_SIZE = 4096
+NUM_TOKENS = 16384
+SPEEDUP_VS_EAGER_THRESHOLD = 4.05
+
+
+class NormResidualActivation(nn.Module):
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.norm = RMSNorm(hidden_size)
+
+    def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
+        return F.silu(self.norm(x) + residual)
+
+
+# ── Shared baselines (computed once per module) ────────────────────────
+
+
+@pytest.fixture(scope="module")
+def nra_device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@pytest.fixture(scope="module")
+def nra_inputs(nra_device):
+    x = torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=nra_device, dtype=torch.bfloat16)
+    residual = torch.randn_like(x)
+    return x, residual
+
+
+@pytest.fixture(scope="module")
+def nra_baselines(nra_device, nra_inputs):
+    """Eager and torch.compile baselines, benchmarked once for the whole module."""
+    x, residual = nra_inputs
+    eager_model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval()
+    torch_compiled = torch.compile(
+        NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor"
+    )
+    with torch.no_grad():
+        eager_result = cuda_benchmark(lambda: eager_model(x, residual))
+        torch_result = cuda_benchmark(lambda: torch_compiled(x, residual), compilation_warmup=3)
+    return eager_result, torch_result
+
+
+# ── Helpers ────────────────────────────────────────────────────────────
+
+
+def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
+    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
+        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
+        f"Got {magi_vs_eager:.2f}x "
+        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
+    )
+
+
+# ── Tests ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_norm_residual_class_decoration(nra_device, nra_inputs, nra_baselines):
+    """Norm+residual+SiLU: @magi_compile class decoration."""
+    eager_result, torch_result = nra_baselines
+    x, residual = nra_inputs
+
+    @magi_compile(dynamic_arg_dims={"x": 0, "residual": 0})
+    class CompiledNRA(NormResidualActivation):
+        pass
+
+    magi_compiled = CompiledNRA(HIDDEN_SIZE).to(nra_device).eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Norm+Residual+SiLU - class decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines):
+    """Norm+residual+SiLU: magi_compile(instance) decoration."""
+    eager_result, torch_result = nra_baselines
+    x, residual = nra_inputs
+
+    magi_compiled = magi_compile(
+        NormResidualActivation(HIDDEN_SIZE).to(nra_device),
+        dynamic_arg_dims={"x": 0, "residual": 0},
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Norm+Residual+SiLU - instance decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_norm_residual_instance_torch_compile_mode(nra_device, nra_inputs, nra_baselines):
+    """Norm+residual+SiLU: magi_compile(instance, mode=TORCH_COMPILE)."""
+    eager_result, torch_result = nra_baselines
+    x, residual = nra_inputs
+
+    def _tc_mode(cfg):
+        cfg.compile_mode = CompileMode.TORCH_COMPILE
+        return cfg
+
+    magi_compiled = magi_compile(
+        NormResidualActivation(HIDDEN_SIZE).to(nra_device),
+        dynamic_arg_dims={"x": 0, "residual": 0},
+        config_patch=_tc_mode,
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Norm+Residual+SiLU - instance (TORCH_COMPILE mode)",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_norm_residual_function_decoration(nra_device, nra_inputs, nra_baselines):
+    """Norm+residual+SiLU: @magi_compile function-level entry."""
+    eager_result, torch_result = nra_baselines
+    x, residual = nra_inputs
+
+    model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval()
+
+    @magi_compile(dynamic_arg_dims={"x": 0, "residual": 0})
+    def compiled_entry(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
+        return model(x, residual)
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: compiled_entry(x, residual), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Norm+Residual+SiLU - function decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines):
+    """Norm+residual+SiLU: magi_compile(model.forward) method decoration."""
+    eager_result, torch_result = nra_baselines
+    x, residual = nra_inputs
+
+    magi_compiled = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval()
+    magi_compiled.forward = magi_compile(
+        magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0}
+    )
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Norm+Residual+SiLU - method decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py
new file mode 100644
index 0000000..35eac0c
--- /dev/null
+++ b/tests/perf_tests/test_pointwise_fusion_perf.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Performance test: pointwise operator chain.
+
+Covers all supported compilation paths (class, instance, instance+TC, function, method).
+
+Measured baseline (H100):
+  torch.compile  ~5.9x vs eager
+  magi_compile   ~3.5x vs eager (all paths)
+"""
+
+import pytest
+import torch
+import torch.nn as nn
+
+from magi_compiler import magi_compile
+from magi_compiler.config import CompileMode
+from tests.perf_tests import cuda_benchmark, print_perf_comparison
+
+HIDDEN_SIZE = 4096
+NUM_TOKENS = 16384
+SPEEDUP_VS_EAGER_THRESHOLD = 3.15
+
+
+class PointwiseFusionChain(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x * 0.5
+        x = x + 1.0
+        x = torch.relu(x)
+        x = x * x
+        x = x - 0.5
+        x = torch.sigmoid(x)
+        return x
+
+
+# ── Shared baselines (computed once per module) ────────────────────────
+
+
+@pytest.fixture(scope="module")
+def pointwise_device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@pytest.fixture(scope="module")
+def pointwise_input(pointwise_device):
+    return torch.randn(NUM_TOKENS, HIDDEN_SIZE, device=pointwise_device, dtype=torch.float32)
+
+
+@pytest.fixture(scope="module")
+def pointwise_baselines(pointwise_device, pointwise_input):
+    """Eager and torch.compile baselines, benchmarked once for the whole module."""
+    x = pointwise_input
+    eager_model = PointwiseFusionChain().to(pointwise_device).eval()
+    torch_compiled = torch.compile(
+        PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor"
+    )
+    with torch.no_grad():
+        eager_result = cuda_benchmark(lambda: eager_model(x))
+        torch_result = cuda_benchmark(lambda: torch_compiled(x), compilation_warmup=3)
+    return eager_result, torch_result
+
+
+# ── Helpers ────────────────────────────────────────────────────────────
+
+
+def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
+    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
+        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
+        f"Got {magi_vs_eager:.2f}x "
+        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
+    )
+
+
+# ── Tests ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_pointwise_class_decoration(pointwise_device, pointwise_input, pointwise_baselines):
+    """Pointwise chain: @magi_compile class decoration."""
+    eager_result, torch_result = pointwise_baselines
+
+    @magi_compile(dynamic_arg_dims={"x": 0})
+    class CompiledPointwise(PointwiseFusionChain):
+        pass
+
+    magi_compiled = CompiledPointwise().to(pointwise_device).eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Pointwise - class decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointwise_baselines):
+    """Pointwise chain: magi_compile(instance) decoration."""
+    eager_result, torch_result = pointwise_baselines
+
+    magi_compiled = magi_compile(
+        PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0}
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Pointwise - instance decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_pointwise_instance_torch_compile_mode(pointwise_device, pointwise_input, pointwise_baselines):
+    """Pointwise chain: magi_compile(instance, mode=TORCH_COMPILE)."""
+    eager_result, torch_result = pointwise_baselines
+
+    def _tc_mode(cfg):
+        cfg.compile_mode = CompileMode.TORCH_COMPILE
+        return cfg
+
+    magi_compiled = magi_compile(
+        PointwiseFusionChain().to(pointwise_device),
+        dynamic_arg_dims={"x": 0},
+        config_patch=_tc_mode,
+    )
+    magi_compiled.eval()
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Pointwise - instance (TORCH_COMPILE mode)",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_pointwise_function_decoration(pointwise_device, pointwise_input, pointwise_baselines):
+    """Pointwise chain: @magi_compile function-level entry."""
+    eager_result, torch_result = pointwise_baselines
+
+    model = PointwiseFusionChain().to(pointwise_device).eval()
+
+    @magi_compile(dynamic_arg_dims={"x": 0})
+    def compiled_entry(x: torch.Tensor) -> torch.Tensor:
+        return model(x)
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: compiled_entry(pointwise_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Pointwise - function decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
+def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwise_baselines):
+    """Pointwise chain: magi_compile(model.forward) method decoration."""
+    eager_result, torch_result = pointwise_baselines
+
+    magi_compiled = PointwiseFusionChain().to(pointwise_device).eval()
+    magi_compiled.forward = magi_compile(
+        magi_compiled.forward, dynamic_arg_dims={"x": 0}
+    )
+
+    with torch.no_grad():
+        magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3)
+
+    magi_vs_eager, _ = print_perf_comparison(
+        "Pointwise - method decoration",
+        eager_result, magi_result, torch_result,
+        extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
+    )
+    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/utils.py b/tests/utils.py
index a5fa647..be24321 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -13,6 +13,12 @@
 # limitations under the License.
 
 import shutil
+import statistics
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import torch
+from triton.testing import do_bench
 
 from magi_compiler.config import get_compile_config
 
@@ -28,6 +34,124 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         shutil.rmtree(get_compile_config().cache_root_dir, ignore_errors=True)
 
 
+@dataclass
+class BenchmarkResult:
+    """Timing results from a CUDA benchmark run (all times in milliseconds).
+
+    Samples come from ``triton.testing.do_bench`` (time-based warmup/rep, L2 cache flush and CUDA
+    event timing per iteration); ``stdev`` is reported as 0.0 when fewer than two samples exist.
+    """
+
+    times_ms: list[float]
+
+    @property
+    def median(self) -> float:
+        return statistics.median(self.times_ms)
+
+    @property
+    def mean(self) -> float:
+        return statistics.mean(self.times_ms)
+
+    @property
+    def min(self) -> float:
+        return min(self.times_ms)
+
+    @property
+    def stdev(self) -> float:
+        return statistics.stdev(self.times_ms) if len(self.times_ms) > 1 else 0.0
+
+    def summary(self, label: str = "") -> str:
+        prefix = f"[{label}] " if label else ""
+        return (
+            f"{prefix}median={self.median:.3f}ms  mean={self.mean:.3f}ms  "
+            f"min={self.min:.3f}ms  stdev={self.stdev:.3f}ms  (n={len(self.times_ms)})"
+        )
+
+
+def cuda_benchmark(
+    fn: Callable[[], object],
+    *,
+    warmup: int = 25,
+    rep: int = 100,
+    grad_to_none: list[torch.Tensor] | None = None,
+    compilation_warmup: int = 0,
+) -> BenchmarkResult:
+    """Benchmark a GPU callable using ``triton.testing.do_bench`` methodology.
+
+    Uses time-based warmup/rep (in ms), L2 cache flush between iterations,
+    and per-iteration CUDA event timing -- the same approach torch inductor
+    uses internally for autotuning fused triton kernels.
+
+    Args:
+        fn: Zero-argument callable to benchmark.
+        warmup: Warmup duration in milliseconds (passed to ``do_bench``).
+        rep: Benchmark repetition duration in milliseconds.
+        grad_to_none: Tensors whose ``.grad`` should be cleared between iterations.
+        compilation_warmup: Number of extra invocations **before** ``do_bench``
+            to ensure lazy compilation (e.g. ``magi_compile``) is fully finished.
+            These calls are *not* timed; the device is synchronized once they finish.
+
+    Returns:
+        BenchmarkResult with all per-iteration times in milliseconds.
+    """
+    if compilation_warmup > 0:
+        for _ in range(compilation_warmup):
+            fn()
+        torch.cuda.synchronize()
+
+    times = do_bench(
+        fn,
+        warmup=warmup,
+        rep=rep,
+        grad_to_none=grad_to_none,
+        return_mode="all",
+    )
+    return BenchmarkResult(times_ms=times)
+
+
+def print_perf_comparison(
+    title: str,
+    eager: BenchmarkResult,
+    magi: BenchmarkResult,
+    torch_compile: BenchmarkResult,
+    extra_info: str = "",
+    magi_torch_compile: BenchmarkResult | None = None,
+) -> tuple[float, float]:
+    """Print a median-based comparison table and return magi_compile's speedup ratios.
+
+    When *magi_torch_compile* is provided its row and ratios are printed as well
+    (the ``magi_compile(compile_mode=TORCH_COMPILE)`` variant); the return value is unaffected.
+
+    Returns:
+        (eager.median / magi.median, torch_compile.median / magi.median); > 1 means magi is faster.
+    """
+    magi_vs_eager = eager.median / magi.median
+    torch_vs_eager = eager.median / torch_compile.median
+    magi_vs_torch = torch_compile.median / magi.median
+
+    print(f"\n{'=' * 78}")
+    print(title)
+    if extra_info:
+        print(f"  {extra_info}")
+    print(f"{'=' * 78}")
+    print(f"  {eager.summary('eager                       ')}")
+    print(f"  {torch_compile.summary('torch.compile               ')}")
+    if magi_torch_compile is not None:
+        print(f"  {magi_torch_compile.summary('magi (torch_compile mode)   ')}")
+    print(f"  {magi.summary('magi_compile                ')}")
+    print("  ---")
+    print(f"  torch.compile     vs eager:         {torch_vs_eager:.2f}x")
+    if magi_torch_compile is not None:
+        mtc_vs_eager = eager.median / magi_torch_compile.median
+        mtc_vs_torch = torch_compile.median / magi_torch_compile.median
+        print(f"  magi(torch mode)  vs eager:         {mtc_vs_eager:.2f}x")
+        print(f"  magi(torch mode)  vs torch.compile: {mtc_vs_torch:.2f}x")
+    print(f"  magi_compile      vs eager:         {magi_vs_eager:.2f}x")
+    print(f"  magi_compile      vs torch.compile: {magi_vs_torch:.2f}x")
+    print(f"{'=' * 78}")
+    return magi_vs_eager, magi_vs_torch
+
+
 def enable_remote_debug():
     import os
 

From 02c87ae33f95db3761c113414aada40f96e80b32 Mon Sep 17 00:00:00 2001
From: Zhiyao Cen <2523403608@qq.com>
Date: Wed, 1 Apr 2026 21:34:53 +0800
Subject: [PATCH 2/5] [chore] add perf-fusion-gap TODO markers in fusion
 benchmarks

Document the known magi vs torch.compile gap in fusion-heavy perf suites so follow-up optimization work has explicit tracking context.

Made-with: Cursor
---
 tests/perf_tests/test_norm_residual_fusion_perf.py | 3 +++
 tests/perf_tests/test_pointwise_fusion_perf.py     | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py
index 3eebc17..7922b10 100644
--- a/tests/perf_tests/test_norm_residual_fusion_perf.py
+++ b/tests/perf_tests/test_norm_residual_fusion_perf.py
@@ -19,6 +19,9 @@
 Measured baseline (H100):
   torch.compile  ~10.0x vs eager
   magi_compile   ~4.5x vs eager (all paths)
+
+TODO(perf-fusion-gap): magi_compile still trails torch.compile in fusion-heavy workloads;
+investigate graph partitioning/fusion opportunities and reduce the gap.
 """
 
 import pytest
diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py
index 35eac0c..bdc02c9 100644
--- a/tests/perf_tests/test_pointwise_fusion_perf.py
+++ b/tests/perf_tests/test_pointwise_fusion_perf.py
@@ -19,6 +19,9 @@
 Measured baseline (H100):
   torch.compile  ~5.9x vs eager
   magi_compile   ~3.5x vs eager (all paths)
+
+TODO(perf-fusion-gap): magi_compile still trails torch.compile in fusion-heavy workloads;
+investigate graph partitioning/fusion opportunities and reduce the gap.
 """
 
 import pytest

From 988b5095a07f986a88d66689f45bab4e2b09ece2 Mon Sep 17 00:00:00 2001
From: Zhiyao Cen <2523403608@qq.com>
Date: Wed, 1 Apr 2026 22:01:32 +0800
Subject: [PATCH 3/5] [chore] apply pre-commit formatting updates

Apply black-driven formatting updates for perf benchmark utilities and perf test files so repository hooks pass consistently in local and CI workflows.

Made-with: Cursor
---
 tests/perf_tests/__init__.py                  |  8 +---
 tests/perf_tests/test_mlp_perf.py             | 44 +++++++++----------
 .../test_norm_residual_fusion_perf.py         | 37 ++++++++--------
 .../perf_tests/test_pointwise_fusion_perf.py  | 38 ++++++++--------
 tests/utils.py                                |  8 +---
 5 files changed, 60 insertions(+), 75 deletions(-)

diff --git a/tests/perf_tests/__init__.py b/tests/perf_tests/__init__.py
index cbe7710..6a79447 100644
--- a/tests/perf_tests/__init__.py
+++ b/tests/perf_tests/__init__.py
@@ -61,13 +61,7 @@ def cuda_benchmark(
             fn()
         torch.cuda.synchronize()
 
-    times = do_bench(
-        fn,
-        warmup=warmup,
-        rep=rep,
-        grad_to_none=grad_to_none,
-        return_mode="all",
-    )
+    times = do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, return_mode="all")
     return BenchmarkResult(times_ms=times)
 
 
diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py
index b3ff667..68d5bc9 100644
--- a/tests/perf_tests/test_mlp_perf.py
+++ b/tests/perf_tests/test_mlp_perf.py
@@ -36,11 +36,7 @@
 
 
 def _build_config():
-    return MLPConfig(
-        hidden_size=HIDDEN_SIZE,
-        intermediate_size=INTERMEDIATE_SIZE,
-        params_dtype=torch.bfloat16,
-    )
+    return MLPConfig(hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE, params_dtype=torch.bfloat16)
 
 
 # ── Shared baselines (computed once per module) ────────────────────────
@@ -61,9 +57,7 @@ def mlp_baselines(mlp_device, mlp_input):
     """Eager and torch.compile baselines, benchmarked once for the whole module."""
     config = _build_config()
     eager_model = RawMLP(config).to(mlp_device).eval()
-    torch_compiled = torch.compile(
-        RawMLP(config).to(mlp_device).eval(), backend="inductor"
-    )
+    torch_compiled = torch.compile(RawMLP(config).to(mlp_device).eval(), backend="inductor")
     with torch.no_grad():
         eager_result = cuda_benchmark(lambda: eager_model(mlp_input))
         torch_result = cuda_benchmark(lambda: torch_compiled(mlp_input), compilation_warmup=3)
@@ -101,7 +95,9 @@ class CompiledMLP(RawMLP):
 
     magi_vs_eager, _ = print_perf_comparison(
         "MLP - class decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
@@ -113,9 +109,7 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines):
     eager_result, torch_result = mlp_baselines
     config = _build_config()
 
-    magi_compiled = magi_compile(
-        RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0}
-    )
+    magi_compiled = magi_compile(RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0})
     magi_compiled.eval()
 
     with torch.no_grad():
@@ -123,7 +117,9 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines):
 
     magi_vs_eager, _ = print_perf_comparison(
         "MLP - instance decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
@@ -139,11 +135,7 @@ def _tc_mode(cfg):
         cfg.compile_mode = CompileMode.TORCH_COMPILE
         return cfg
 
-    magi_compiled = magi_compile(
-        RawMLP(config).to(mlp_device),
-        dynamic_arg_dims={"x": 0},
-        config_patch=_tc_mode,
-    )
+    magi_compiled = magi_compile(RawMLP(config).to(mlp_device), dynamic_arg_dims={"x": 0}, config_patch=_tc_mode)
     magi_compiled.eval()
 
     with torch.no_grad():
@@ -151,7 +143,9 @@ def _tc_mode(cfg):
 
     magi_vs_eager, _ = print_perf_comparison(
         "MLP - instance (TORCH_COMPILE mode)",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
@@ -174,7 +168,9 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor:
 
     magi_vs_eager, _ = print_perf_comparison(
         "MLP - function decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
@@ -187,16 +183,16 @@ def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines):
     config = _build_config()
 
     magi_compiled = RawMLP(config).to(mlp_device).eval()
-    magi_compiled.forward = magi_compile(
-        magi_compiled.forward, dynamic_arg_dims={"x": 0}
-    )
+    magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0})
 
     with torch.no_grad():
         magi_result = cuda_benchmark(lambda: magi_compiled(mlp_input), compilation_warmup=3)
 
     magi_vs_eager, _ = print_perf_comparison(
         "MLP - method decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py
index 7922b10..b534444 100644
--- a/tests/perf_tests/test_norm_residual_fusion_perf.py
+++ b/tests/perf_tests/test_norm_residual_fusion_perf.py
@@ -68,9 +68,7 @@ def nra_baselines(nra_device, nra_inputs):
     """Eager and torch.compile baselines, benchmarked once for the whole module."""
     x, residual = nra_inputs
     eager_model = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval()
-    torch_compiled = torch.compile(
-        NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor"
-    )
+    torch_compiled = torch.compile(NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval(), backend="inductor")
     with torch.no_grad():
         eager_result = cuda_benchmark(lambda: eager_model(x, residual))
         torch_result = cuda_benchmark(lambda: torch_compiled(x, residual), compilation_warmup=3)
@@ -108,7 +106,9 @@ class CompiledNRA(NormResidualActivation):
 
     magi_vs_eager, _ = print_perf_comparison(
         "Norm+Residual+SiLU - class decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
@@ -120,10 +120,7 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines
     eager_result, torch_result = nra_baselines
     x, residual = nra_inputs
 
-    magi_compiled = magi_compile(
-        NormResidualActivation(HIDDEN_SIZE).to(nra_device),
-        dynamic_arg_dims={"x": 0, "residual": 0},
-    )
+    magi_compiled = magi_compile(NormResidualActivation(HIDDEN_SIZE).to(nra_device), dynamic_arg_dims={"x": 0, "residual": 0})
     magi_compiled.eval()
 
     with torch.no_grad():
@@ -131,7 +128,9 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines
 
     magi_vs_eager, _ = print_perf_comparison(
         "Norm+Residual+SiLU - instance decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
@@ -148,9 +147,7 @@ def _tc_mode(cfg):
         return cfg
 
     magi_compiled = magi_compile(
-        NormResidualActivation(HIDDEN_SIZE).to(nra_device),
-        dynamic_arg_dims={"x": 0, "residual": 0},
-        config_patch=_tc_mode,
+        NormResidualActivation(HIDDEN_SIZE).to(nra_device), dynamic_arg_dims={"x": 0, "residual": 0}, config_patch=_tc_mode
     )
     magi_compiled.eval()
 
@@ -159,7 +156,9 @@ def _tc_mode(cfg):
 
     magi_vs_eager, _ = print_perf_comparison(
         "Norm+Residual+SiLU - instance (TORCH_COMPILE mode)",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
@@ -182,7 +181,9 @@ def compiled_entry(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
 
     magi_vs_eager, _ = print_perf_comparison(
         "Norm+Residual+SiLU - function decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
@@ -195,16 +196,16 @@ def test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines):
     x, residual = nra_inputs
 
     magi_compiled = NormResidualActivation(HIDDEN_SIZE).to(nra_device).eval()
-    magi_compiled.forward = magi_compile(
-        magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0}
-    )
+    magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0, "residual": 0})
 
     with torch.no_grad():
         magi_result = cuda_benchmark(lambda: magi_compiled(x, residual), compilation_warmup=3)
 
     magi_vs_eager, _ = print_perf_comparison(
         "Norm+Residual+SiLU - method decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py
index bdc02c9..18079cf 100644
--- a/tests/perf_tests/test_pointwise_fusion_perf.py
+++ b/tests/perf_tests/test_pointwise_fusion_perf.py
@@ -66,9 +66,7 @@ def pointwise_baselines(pointwise_device, pointwise_input):
     """Eager and torch.compile baselines, benchmarked once for the whole module."""
     x = pointwise_input
     eager_model = PointwiseFusionChain().to(pointwise_device).eval()
-    torch_compiled = torch.compile(
-        PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor"
-    )
+    torch_compiled = torch.compile(PointwiseFusionChain().to(pointwise_device).eval(), backend="inductor")
     with torch.no_grad():
         eager_result = cuda_benchmark(lambda: eager_model(x))
         torch_result = cuda_benchmark(lambda: torch_compiled(x), compilation_warmup=3)
@@ -105,7 +103,9 @@ class CompiledPointwise(PointwiseFusionChain):
 
     magi_vs_eager, _ = print_perf_comparison(
         "Pointwise - class decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
@@ -116,9 +116,7 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw
     """Pointwise chain: magi_compile(instance) decoration."""
     eager_result, torch_result = pointwise_baselines
 
-    magi_compiled = magi_compile(
-        PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0}
-    )
+    magi_compiled = magi_compile(PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0})
     magi_compiled.eval()
 
     with torch.no_grad():
@@ -126,7 +124,9 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw
 
     magi_vs_eager, _ = print_perf_comparison(
         "Pointwise - instance decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
@@ -141,11 +141,7 @@ def _tc_mode(cfg):
         cfg.compile_mode = CompileMode.TORCH_COMPILE
         return cfg
 
-    magi_compiled = magi_compile(
-        PointwiseFusionChain().to(pointwise_device),
-        dynamic_arg_dims={"x": 0},
-        config_patch=_tc_mode,
-    )
+    magi_compiled = magi_compile(PointwiseFusionChain().to(pointwise_device), dynamic_arg_dims={"x": 0}, config_patch=_tc_mode)
     magi_compiled.eval()
 
     with torch.no_grad():
@@ -153,7 +149,9 @@ def _tc_mode(cfg):
 
     magi_vs_eager, _ = print_perf_comparison(
         "Pointwise - instance (TORCH_COMPILE mode)",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
@@ -175,7 +173,9 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor:
 
     magi_vs_eager, _ = print_perf_comparison(
         "Pointwise - function decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
@@ -187,16 +187,16 @@ def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwis
     eager_result, torch_result = pointwise_baselines
 
     magi_compiled = PointwiseFusionChain().to(pointwise_device).eval()
-    magi_compiled.forward = magi_compile(
-        magi_compiled.forward, dynamic_arg_dims={"x": 0}
-    )
+    magi_compiled.forward = magi_compile(magi_compiled.forward, dynamic_arg_dims={"x": 0})
 
     with torch.no_grad():
         magi_result = cuda_benchmark(lambda: magi_compiled(pointwise_input), compilation_warmup=3)
 
     magi_vs_eager, _ = print_perf_comparison(
         "Pointwise - method decoration",
-        eager_result, magi_result, torch_result,
+        eager_result,
+        magi_result,
+        torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
     _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
diff --git a/tests/utils.py b/tests/utils.py
index be24321..7580163 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -99,13 +99,7 @@ def cuda_benchmark(
             fn()
         torch.cuda.synchronize()
 
-    times = do_bench(
-        fn,
-        warmup=warmup,
-        rep=rep,
-        grad_to_none=grad_to_none,
-        return_mode="all",
-    )
+    times = do_bench(fn, warmup=warmup, rep=rep, grad_to_none=grad_to_none, return_mode="all")
     return BenchmarkResult(times_ms=times)
 
 

From ebb7b4cbadcaf945e27a019cedc78ad5782a6bc9 Mon Sep 17 00:00:00 2001
From: Zhiyao Cen <2523403608@qq.com>
Date: Wed, 1 Apr 2026 22:16:39 +0800
Subject: [PATCH 4/5] [test] relax perf speedup thresholds for CI stability

Lower MLP, norm-residual, and pointwise speedup gates to reflect observed CI variance while preserving meaningful eager-baseline improvements across entrypoints.

Made-with: Cursor
---
 tests/perf_tests/test_mlp_perf.py                  | 2 +-
 tests/perf_tests/test_norm_residual_fusion_perf.py | 2 +-
 tests/perf_tests/test_pointwise_fusion_perf.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py
index 68d5bc9..a35397c 100644
--- a/tests/perf_tests/test_mlp_perf.py
+++ b/tests/perf_tests/test_mlp_perf.py
@@ -32,7 +32,7 @@
 HIDDEN_SIZE = 2048
 INTERMEDIATE_SIZE = 8192
 NUM_TOKENS = 8192
-SPEEDUP_VS_EAGER_THRESHOLD = 1.65
+SPEEDUP_VS_EAGER_THRESHOLD = 1.5
 
 
 def _build_config():
diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py
index b534444..a81c1f1 100644
--- a/tests/perf_tests/test_norm_residual_fusion_perf.py
+++ b/tests/perf_tests/test_norm_residual_fusion_perf.py
@@ -36,7 +36,7 @@
 
 HIDDEN_SIZE = 4096
 NUM_TOKENS = 16384
-SPEEDUP_VS_EAGER_THRESHOLD = 4.05
+SPEEDUP_VS_EAGER_THRESHOLD = 3.7
 
 
 class NormResidualActivation(nn.Module):
diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py
index 18079cf..e5bd1ac 100644
--- a/tests/perf_tests/test_pointwise_fusion_perf.py
+++ b/tests/perf_tests/test_pointwise_fusion_perf.py
@@ -34,7 +34,7 @@
 
 HIDDEN_SIZE = 4096
 NUM_TOKENS = 16384
-SPEEDUP_VS_EAGER_THRESHOLD = 3.15
+SPEEDUP_VS_EAGER_THRESHOLD = 2.9
 
 
 class PointwiseFusionChain(nn.Module):

From 6fdd4ddf5223cae341b8f541a59688b9798e3ce1 Mon Sep 17 00:00:00 2001
From: Zhiyao Cen <2523403608@qq.com>
Date: Wed, 1 Apr 2026 23:50:25 +0800
Subject: [PATCH 5/5] [test] deduplicate perf speedup assertions

Move the repeated perf speedup assertion helper into tests/perf_tests/utils.py and reuse it across MLP, norm-residual fusion, and pointwise perf tests to reduce duplication and keep threshold checks consistent.

Made-with: Cursor
---
 tests/perf_tests/test_mlp_perf.py             | 22 +++++-----------
 .../test_norm_residual_fusion_perf.py         | 22 +++++-----------
 .../perf_tests/test_pointwise_fusion_perf.py  | 22 +++++-----------
 tests/perf_tests/utils.py                     | 25 +++++++++++++++++++
 4 files changed, 43 insertions(+), 48 deletions(-)
 create mode 100644 tests/perf_tests/utils.py

diff --git a/tests/perf_tests/test_mlp_perf.py b/tests/perf_tests/test_mlp_perf.py
index a35397c..9563e2d 100644
--- a/tests/perf_tests/test_mlp_perf.py
+++ b/tests/perf_tests/test_mlp_perf.py
@@ -28,6 +28,7 @@
 from magi_compiler.config import CompileMode
 from tests.model_definition import MLPConfig, RawMLP
 from tests.perf_tests import cuda_benchmark, print_perf_comparison
+from tests.perf_tests.utils import assert_speedup
 
 HIDDEN_SIZE = 2048
 INTERMEDIATE_SIZE = 8192
@@ -64,17 +65,6 @@ def mlp_baselines(mlp_device, mlp_input):
     return eager_result, torch_result
 
 
-# ── Helpers ────────────────────────────────────────────────────────────
-
-
-def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
-    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
-        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
-        f"Got {magi_vs_eager:.2f}x "
-        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
-    )
-
-
 # ── Tests ──────────────────────────────────────────────────────────────
 
 
@@ -100,7 +90,7 @@ class CompiledMLP(RawMLP):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -122,7 +112,7 @@ def test_mlp_instance_decoration(mlp_device, mlp_input, mlp_baselines):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -148,7 +138,7 @@ def _tc_mode(cfg):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -173,7 +163,7 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor:
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -195,4 +185,4 @@ def test_mlp_method_decoration(mlp_device, mlp_input, mlp_baselines):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  intermediate={INTERMEDIATE_SIZE}  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD)
diff --git a/tests/perf_tests/test_norm_residual_fusion_perf.py b/tests/perf_tests/test_norm_residual_fusion_perf.py
index a81c1f1..4e20a68 100644
--- a/tests/perf_tests/test_norm_residual_fusion_perf.py
+++ b/tests/perf_tests/test_norm_residual_fusion_perf.py
@@ -33,6 +33,7 @@
 from magi_compiler.config import CompileMode
 from tests.model_definition import RMSNorm
 from tests.perf_tests import cuda_benchmark, print_perf_comparison
+from tests.perf_tests.utils import assert_speedup
 
 HIDDEN_SIZE = 4096
 NUM_TOKENS = 16384
@@ -75,17 +76,6 @@ def nra_baselines(nra_device, nra_inputs):
     return eager_result, torch_result
 
 
-# ── Helpers ────────────────────────────────────────────────────────────
-
-
-def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
-    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
-        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
-        f"Got {magi_vs_eager:.2f}x "
-        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
-    )
-
-
 # ── Tests ──────────────────────────────────────────────────────────────
 
 
@@ -111,7 +101,7 @@ class CompiledNRA(NormResidualActivation):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -133,7 +123,7 @@ def test_norm_residual_instance_decoration(nra_device, nra_inputs, nra_baselines
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -161,7 +151,7 @@ def _tc_mode(cfg):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -186,7 +176,7 @@ def compiled_entry(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -208,4 +198,4 @@ def test_norm_residual_method_decoration(nra_device, nra_inputs, nra_baselines):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})  dtype=bf16",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD)
diff --git a/tests/perf_tests/test_pointwise_fusion_perf.py b/tests/perf_tests/test_pointwise_fusion_perf.py
index e5bd1ac..526f3cd 100644
--- a/tests/perf_tests/test_pointwise_fusion_perf.py
+++ b/tests/perf_tests/test_pointwise_fusion_perf.py
@@ -31,6 +31,7 @@
 from magi_compiler import magi_compile
 from magi_compiler.config import CompileMode
 from tests.perf_tests import cuda_benchmark, print_perf_comparison
+from tests.perf_tests.utils import assert_speedup
 
 HIDDEN_SIZE = 4096
 NUM_TOKENS = 16384
@@ -73,17 +74,6 @@ def pointwise_baselines(pointwise_device, pointwise_input):
     return eager_result, torch_result
 
 
-# ── Helpers ────────────────────────────────────────────────────────────
-
-
-def _assert_speedup(magi_vs_eager, eager_result, magi_result, label):
-    assert magi_vs_eager >= SPEEDUP_VS_EAGER_THRESHOLD, (
-        f"[{label}] magi_compile must achieve >= {SPEEDUP_VS_EAGER_THRESHOLD:.2f}x over eager. "
-        f"Got {magi_vs_eager:.2f}x "
-        f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
-    )
-
-
 # ── Tests ──────────────────────────────────────────────────────────────
 
 
@@ -108,7 +98,7 @@ class CompiledPointwise(PointwiseFusionChain):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "class")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "class", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -129,7 +119,7 @@ def test_pointwise_instance_decoration(pointwise_device, pointwise_input, pointw
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -154,7 +144,7 @@ def _tc_mode(cfg):
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "instance_tc", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -178,7 +168,7 @@ def compiled_entry(x: torch.Tensor) -> torch.Tensor:
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "function")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "function", SPEEDUP_VS_EAGER_THRESHOLD)
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA support")
@@ -199,4 +189,4 @@ def test_pointwise_method_decoration(pointwise_device, pointwise_input, pointwis
         torch_result,
         extra_info=f"shape=({NUM_TOKENS}, {HIDDEN_SIZE})",
     )
-    _assert_speedup(magi_vs_eager, eager_result, magi_result, "method")
+    assert_speedup(magi_vs_eager, eager_result, magi_result, "method", SPEEDUP_VS_EAGER_THRESHOLD)
diff --git a/tests/perf_tests/utils.py b/tests/perf_tests/utils.py
new file mode 100644
index 0000000..80ba884
--- /dev/null
+++ b/tests/perf_tests/utils.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2025 SandAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tests.perf_tests import BenchmarkResult
+
+
+def assert_speedup(
+    magi_vs_eager: float, eager_result: BenchmarkResult, magi_result: BenchmarkResult, label: str, threshold: float
+) -> None:
+    """Fail the calling perf test unless ``magi_vs_eager`` reaches ``threshold``.
+
+    Args:
+        magi_vs_eager: Speedup ratio of magi_compile over eager, as computed by the caller.
+        eager_result: Eager benchmark result; only ``median`` is read, for the failure message.
+        magi_result: magi_compile benchmark result; only ``median`` is read, for the failure message.
+        label: Short tag identifying the test variant, shown in brackets in the failure message.
+        threshold: Minimum acceptable value of ``magi_vs_eager`` (inclusive).
+
+    Raises:
+        AssertionError: If ``magi_vs_eager`` is below ``threshold`` (or is NaN).
+    """
+    # Hide this helper's frame from pytest tracebacks so failures point at the calling test.
+    __tracebackhide__ = True
+    # Raise explicitly instead of using ``assert``: this module is a helper, not a test module,
+    # so a bare assert here would be stripped under ``python -O`` and the gate would silently pass.
+    if not (magi_vs_eager >= threshold):
+        raise AssertionError(
+            f"[{label}] magi_compile must achieve >= {threshold:.2f}x over eager. "
+            f"Got {magi_vs_eager:.2f}x "
+            f"(eager={eager_result.median:.3f}ms, magi={magi_result.median:.3f}ms)"
+        )