diff --git a/helion/autotuner/local_cache.py b/helion/autotuner/local_cache.py
index 1113dffc9..bb4110ab9 100644
--- a/helion/autotuner/local_cache.py
+++ b/helion/autotuner/local_cache.py
@@ -6,6 +6,7 @@
 import logging
 import os
 from pathlib import Path
+import platform
 import textwrap
 from typing import TYPE_CHECKING
 import uuid
@@ -58,17 +59,34 @@ def _generate_key(self) -> LooseAutotuneCacheKey:
         for arg in self.args:
             if isinstance(arg, torch.Tensor):
-                nms = torch.xpu if torch.xpu.is_available() else torch.cuda
-                device_properties = nms.get_device_properties(arg.device)
-                if torch.version.cuda is not None:  # pyright: ignore[reportAttributeAccessIssue]
-                    hardware = device_properties.name
-                    runtime_name = str(torch.version.cuda)
-                elif torch.version.hip is not None:  # pyright: ignore[reportAttributeAccessIssue]
-                    hardware = device_properties.gcnArchName
-                    runtime_name = torch.version.hip  # pyright: ignore[reportAttributeAccessIssue]
-                else:
+                dev = arg.device
+                # CPU support
+                if dev.type == "cpu":
+                    hardware = "cpu"
+                    runtime_name = platform.machine().lower()
+                    break
+
+                # XPU (Intel) path
+                if (
+                    dev.type == "xpu"
+                    and getattr(torch, "xpu", None) is not None
+                    and torch.xpu.is_available()
+                ):  # pyright: ignore[reportAttributeAccessIssue]
+                    device_properties = torch.xpu.get_device_properties(dev)
                     hardware = device_properties.name
                     runtime_name = device_properties.driver_version  # pyright: ignore[reportAttributeAccessIssue]
+                    break
+
+                # CUDA/ROCm path
+                if dev.type == "cuda" and torch.cuda.is_available():
+                    device_properties = torch.cuda.get_device_properties(dev)
+                    if torch.version.cuda is not None:  # pyright: ignore[reportAttributeAccessIssue]
+                        hardware = device_properties.name
+                        runtime_name = str(torch.version.cuda)
+                    elif torch.version.hip is not None:  # pyright: ignore[reportAttributeAccessIssue]
+                        hardware = device_properties.gcnArchName
+                        runtime_name = torch.version.hip  # pyright: ignore[reportAttributeAccessIssue]
+                    break
 
         assert hardware is not None and runtime_name is not None
 
         return LooseAutotuneCacheKey(
diff --git a/helion/runtime/__init__.py b/helion/runtime/__init__.py
index 5def1da5c..9c1c28884 100644
--- a/helion/runtime/__init__.py
+++ b/helion/runtime/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextvars
+import os
 from typing import TYPE_CHECKING
 
 import torch
@@ -47,7 +48,13 @@ def get_num_sm(device: torch.device) -> int:
     Returns:
         Grid size to use for a persistent kernel on the device.
     """
-    assert device.type in ["cuda", "xpu"], "TODO: implement for other devices"
+    assert device.type in ["cuda", "xpu", "cpu"], "TODO: implement for other devices"
+    if device.type == "cpu":
+        try:
+            num_threads = int(torch.get_num_threads())
+        except Exception:
+            num_threads = 0
+        return num_threads if num_threads > 0 else int(os.cpu_count() or 1)
     if device.type == "cuda":
         return torch.cuda.get_device_properties(device.index).multi_processor_count
     # TODO(EikanWang): gpu_subslice_count is an out-of-date term. we change update it to XeCore number.