Make fork default precompilation strategy (#979)

oulgen · web-flow · commit 454932d152c5 · 2025-10-16T23:37:47.000-07:00
diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -118,10 +118,10 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 .. autoattribute:: Settings.autotune_precompile
 
    Select the autotuner precompile mode, which adds parallelism and
-   checks for errors/timeouts. ``"spawn"`` (default) runs kernel
-   warm-up in a fresh process including running to check for errors,
-   ``"fork"`` is faster but does not include the error check run,
-   or None to disables precompile checks altogether. Controlled by
+   checks for errors/timeouts. ``"fork"`` (default) is faster but does
+   not include the error check run; ``"spawn"`` runs kernel warm-up in a
+   fresh process, including a run to check for errors; ``None``
+   disables precompile checks altogether. Controlled by
    ``HELION_AUTOTUNE_PRECOMPILE``.
 
 .. autoattribute:: Settings.autotune_random_seed
@@ -240,7 +240,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_DISALLOW_AUTOTUNING`` | ``check_autotuning_disabled`` | Hard-disable autotuning; kernels must supply explicit configs when this is ``1``. |
 | ``HELION_AUTOTUNE_COMPILE_TIMEOUT`` | ``autotune_compile_timeout`` | Maximum seconds to wait for Triton compilation during autotuning. |
 | ``HELION_AUTOTUNE_LOG_LEVEL`` | ``autotune_log_level`` | Adjust logging verbosity; accepts names like ``INFO`` or numeric levels. |
-| ``HELION_AUTOTUNE_PRECOMPILE`` | ``autotune_precompile`` | Select the autotuner precompile mode (``"spawn"``, ``"fork"``, or disable when empty). |
+| ``HELION_AUTOTUNE_PRECOMPILE`` | ``autotune_precompile`` | Select the autotuner precompile mode (``"fork"`` (default), ``"spawn"``, or disable when empty). |
 | ``HELION_AUTOTUNE_PRECOMPILE_JOBS`` | ``autotune_precompile_jobs`` | Cap the number of concurrent Triton precompile subprocesses. |
 | ``HELION_AUTOTUNE_RANDOM_SEED`` | ``autotune_random_seed`` | Seed used for randomized autotuning searches. |
 | ``HELION_AUTOTUNE_MAX_GENERATIONS`` | ``autotune_max_generations`` | Upper bound on generations for Pattern Search and Differential Evolution. |
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -264,7 +264,7 @@ class _Settings:
         default_factory=functools.partial(
             _env_get_literal,
             "HELION_AUTOTUNE_PRECOMPILE",
-            cast("PrecompileMode", "spawn"),
+            cast("PrecompileMode", "fork"),
             mapping={
                 "spawn": "spawn",
                 "fork": "fork",
@@ -367,7 +367,7 @@ class Settings(_Settings):
             "Use HELION_AUTOTUNE_LOG_LEVEL to override or set 0 to disable output."
         ),
         "autotune_compile_timeout": "Timeout for Triton compilation in seconds used for autotuning. Default is 60 seconds.",
-        "autotune_precompile": "Autotuner precompile mode: 'spawn', 'fork', or falsy/None to disable. Defaults to 'spawn' on non-Windows platforms.",
+        "autotune_precompile": "Autotuner precompile mode: 'fork', 'spawn', or falsy/None to disable. Defaults to 'fork' on non-Windows platforms.",
         "autotune_precompile_jobs": "Maximum concurrent Triton precompile processes, default to cpu count.",
         "autotune_random_seed": "Seed used for autotuner random number generation. Defaults to HELION_AUTOTUNE_RANDOM_SEED or a time-based seed.",
         "autotune_accuracy_check": "If True, validate candidate configs against the baseline kernel output before accepting them during autotuning.",