diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py
index 1113dffc9..bb4110ab9 100644
--- a/helion/autotuner/local_cache.py
+++ b/helion/autotuner/local_cache.py
@@ -6,6 +6,7 @@
 import logging
 import os
 from pathlib import Path
+import platform
 import textwrap
 from typing import TYPE_CHECKING
 import uuid
@@ -58,17 +59,34 @@ def _generate_key(self) -> LooseAutotuneCacheKey:
         for arg in self.args:
             if isinstance(arg, torch.Tensor):
-                nms = torch.xpu if torch.xpu.is_available() else torch.cuda
-                device_properties = nms.get_device_properties(arg.device)
-                if torch.version.cuda is not None:  # pyright: ignore[reportAttributeAccessIssue]
-                    hardware = device_properties.name
-                    runtime_name = str(torch.version.cuda)
-                elif torch.version.hip is not None:  # pyright: ignore[reportAttributeAccessIssue]
-                    hardware = device_properties.gcnArchName
-                    runtime_name = torch.version.hip  # pyright: ignore[reportAttributeAccessIssue]
-                else:
+                dev = arg.device
+                # CPU support
+                if dev.type == "cpu":
+                    hardware = "cpu"
+                    runtime_name = platform.machine().lower()
+                    break
+
+                # XPU (Intel) path
+                if (
+                    dev.type == "xpu"
+                    and getattr(torch, "xpu", None) is not None
+                    and torch.xpu.is_available()
+                ):  # pyright: ignore[reportAttributeAccessIssue]
+                    device_properties = torch.xpu.get_device_properties(dev)
                     hardware = device_properties.name
                     runtime_name = device_properties.driver_version  # pyright: ignore[reportAttributeAccessIssue]
+                    break
+
+                # CUDA/ROCm path
+                if dev.type == "cuda" and torch.cuda.is_available():
+                    device_properties = torch.cuda.get_device_properties(dev)
+                    if torch.version.cuda is not None:  # pyright: ignore[reportAttributeAccessIssue]
+                        hardware = device_properties.name
+                        runtime_name = str(torch.version.cuda)
+                    elif torch.version.hip is not None:  # pyright: ignore[reportAttributeAccessIssue]
+                        hardware = device_properties.gcnArchName
+                        runtime_name = torch.version.hip  # pyright: ignore[reportAttributeAccessIssue]
+                    break
 
         assert hardware is not None and runtime_name is not None
 
         return LooseAutotuneCacheKey(
diff --git a/helion/runtime/__init__.py b/helion/runtime/__init__.py
index 5def1da5c..9c1c28884 100644
--- a/helion/runtime/__init__.py
+++ b/helion/runtime/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextvars
+import os
 from typing import TYPE_CHECKING
 
 import torch
@@ -47,7 +48,13 @@ def get_num_sm(device: torch.device) -> int:
     Returns:
         Grid size to use for a persistent kernel on the device.
     """
-    assert device.type in ["cuda", "xpu"], "TODO: implement for other devices"
+    assert device.type in ["cuda", "xpu", "cpu"], "TODO: implement for other devices"
+    if device.type == "cpu":
+        try:
+            num_threads = int(torch.get_num_threads())
+        except Exception:
+            num_threads = 0
+        return num_threads if num_threads > 0 else int(os.cpu_count() or 1)
     if device.type == "cuda":
         return torch.cuda.get_device_properties(device.index).multi_processor_count
     # TODO(EikanWang): gpu_subslice_count is an out-of-date term. we change update it to XeCore number.