devops/scripts/benchmarks/benches/compute.py (186 changes: 123 additions & 63 deletions)
@@ -3,6 +3,7 @@
# See LICENSE.TXT
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from itertools import product
import os
import csv
import io
@@ -24,6 +25,11 @@ class RUNTIMES(Enum):
UR = "ur"


class PROFILERS(Enum):
TIMER = "timer"
CPU_COUNTER = "cpuCounter"


def runtime_to_name(runtime: RUNTIMES) -> str:
return {
RUNTIMES.SYCL_PREVIEW: "SYCL Preview",
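
The new PROFILERS enum centralizes the profiler names that were previously passed around as bare strings ("timer", "cpuCounter"). Call sites now hold enum members and only unwrap .value when building the benchmark command line, as the bin_args changes later in this diff show. A minimal sketch of that mapping, assuming the PROFILERS enum defined in the hunk above (profiler_flag is an illustrative helper, not part of the diff):

    # Illustrative sketch only; profiler_flag() is a hypothetical helper.
    def profiler_flag(profiler: PROFILERS) -> str:
        # PROFILERS.TIMER.value == "timer", PROFILERS.CPU_COUNTER.value == "cpuCounter"
        return f"--profilerType={profiler.value}"

    # Iterating the enum yields both members in definition order, which is what
    # the `for profiler_type in list(PROFILERS)` loops below rely on:
    assert list(PROFILERS) == [PROFILERS.TIMER, PROFILERS.CPU_COUNTER]
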
@@ -171,69 +177,116 @@ def benchmarks(self) -> list[Benchmark]:

# hand-picked value so that total execution time of the benchmark is
# similar on all architectures
long_lernel_exec_time_ioq = [20]
long_kernel_exec_time_ioq = [20]
# For BMG server, a new value 200 is used, but we have to create metadata
# for both values to keep the dashboard consistent.
# See SubmitKernel.enabled()
long_kernel_exec_time_ooo = [20, 200]

# The Combo Profiler is available only for selected sycl benchmarks
profiler_types = ["timer", "cpuCounter"]

for runtime in list(RUNTIMES):
# Add SubmitKernel benchmarks using loops
for in_order_queue in [0, 1]:
for measure_completion in [0, 1]:
for use_events in [0, 1]:
long_kernel_exec_time = (
long_lernel_exec_time_ioq
if in_order_queue
else long_kernel_exec_time_ooo
)
for kernel_exec_time in [1, *long_kernel_exec_time]:
for profiler_type in profiler_types:
benches.append(
SubmitKernel(
self,
runtime,
in_order_queue,
measure_completion,
use_events,
kernel_exec_time,
profiler_type,
)
)

# Add SinKernelGraph benchmarks
for with_graphs in [0, 1]:
for num_kernels in [5, 100]:
submit_kernel_params = product(
list(RUNTIMES),
[0, 1], # in_order_queue
[0, 1], # measure_completion
[0, 1], # use_events
)
for (
runtime,
in_order_queue,
measure_completion,
use_events,
) in submit_kernel_params:
long_kernel_exec_time = (
long_kernel_exec_time_ioq
if in_order_queue
else long_kernel_exec_time_ooo
)
for kernel_exec_time in [1, *long_kernel_exec_time]:
benches.append(
SubmitKernel(
self,
runtime,
in_order_queue,
measure_completion,
use_events,
kernel_exec_time,
)
)
if runtime == RUNTIMES.SYCL:
# Create CPU count variant
benches.append(
GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
SubmitKernel(
self,
runtime,
in_order_queue,
measure_completion,
use_events,
kernel_exec_time,
profiler_type=PROFILERS.CPU_COUNTER,
)
)

# Add SinKernelGraph benchmarks
sin_kernel_graph_params = product(
list(RUNTIMES),
[0, 1], # with_graphs
[5, 100], # num_kernels
)
for runtime, with_graphs, num_kernels in sin_kernel_graph_params:
benches.append(
GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
)

# Add ULLS benchmarks
for profiler_type in profiler_types:
benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
for runtime in list(RUNTIMES):
if runtime == RUNTIMES.SYCL:
benches.append(
UllsEmptyKernel(
self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER
)
)
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

# Add GraphApiSubmitGraph benchmarks
for in_order_queue in [0, 1]:
for num_kernels in self.submit_graph_num_kernels:
for measure_completion_time in [0, 1]:
for use_events in [0, 1]:
for profiler_type in profiler_types:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
profiler_type,
use_events,
useHostTasks=0,
)
)
# Add GraphApiSubmitGraph benchmarks
submit_graph_params = product(
list(RUNTIMES),
[0, 1], # in_order_queue
self.submit_graph_num_kernels,
[0, 1], # measure_completion_time
[0, 1], # use_events
)
for (
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
use_events,
) in submit_graph_params:
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
use_events,
useHostTasks=0,
)
)
if runtime == RUNTIMES.SYCL:
# Create CPU count variant
benches.append(
GraphApiSubmitGraph(
self,
runtime,
in_order_queue,
num_kernels,
measure_completion_time,
use_events,
useHostTasks=0,
profiler_type=PROFILERS.CPU_COUNTER,
)
)

# Add other benchmarks
benches += [
Expand All @@ -244,7 +297,7 @@ def benchmarks(self) -> list[Benchmark]:
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
]
for profiler_type in profiler_types:
for profiler_type in list(PROFILERS):
benches.append(
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
)
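
For reference, the itertools.product calls in the hunk above enumerate exactly the same parameter combinations as the nested for loops they replace, one tuple per combination; only the long_kernel_exec_time selection and the SYCL-only profiler variants remain inside the loop body. A small, self-contained sketch of that equivalence (the value lists here are trimmed stand-ins, not the real ones):

    from itertools import product

    # Stand-in value lists; the real code uses list(RUNTIMES), [0, 1], etc.
    runtimes = ["sycl", "level_zero", "ur"]
    combos = list(product(runtimes, [0, 1], [0, 1]))  # runtime, in_order_queue, use_events

    # Same cardinality and ordering as three nested for loops:
    nested = [(r, q, e) for r in runtimes for q in [0, 1] for e in [0, 1]]
    assert combos == nested
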
@@ -310,7 +363,12 @@ def parse_unit_type(compute_unit):

class ComputeBenchmark(Benchmark):
def __init__(
self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
self,
bench,
name,
test,
runtime: RUNTIMES = None,
profiler_type: PROFILERS = PROFILERS.TIMER,
):
super().__init__(bench.directory, bench)
self.bench = bench
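
Because profiler_type now defaults to PROFILERS.TIMER here, every benchmark gets a timer-profiled variant without any extra argument, and the cpuCounter variant is constructed explicitly only where it is supported, i.e. for the SYCL runtime, matching the `if runtime == RUNTIMES.SYCL` branches in benchmarks() above. A hedged sketch of that pattern (append_variants and make_bench are hypothetical names, not part of this diff):

    # Hypothetical helper illustrating the "timer by default, cpuCounter for
    # SYCL only" pattern used throughout benchmarks(); not part of this diff.
    def append_variants(benches, make_bench, runtime):
        benches.append(make_bench())  # profiler_type defaults to PROFILERS.TIMER
        if runtime == RUNTIMES.SYCL:
            benches.append(make_bench(profiler_type=PROFILERS.CPU_COUNTER))

Here make_bench could be, for example, a functools.partial wrapping one of the constructors above with its shared arguments, such as UllsEmptyKernel(self, runtime, 1000, 256).
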
@@ -478,7 +536,7 @@ def __init__(
MeasureCompletion=0,
UseEvents=0,
KernelExecTime=1,
profiler_type="",
profiler_type=PROFILERS.TIMER,
):
self.ioq = ioq
self.MeasureCompletion = MeasureCompletion
@@ -578,7 +636,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--UseEvents={self.UseEvents}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
bin_args.append(f"--profilerType={self.profiler_type.value}")
return bin_args

def get_metadata(self) -> dict[str, BenchmarkMetadata]:
@@ -647,7 +705,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--dst={self.destination}",
f"--size={self.size}",
"--withCopyOffload=0",
f"--profilerType={self.profiler_type}",
f"--profilerType={self.profiler_type.value}",
]


@@ -693,7 +751,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--size={self.size}",
"--count=100",
"--withCopyOffload=0",
f"--profilerType={self.profiler_type}",
f"--profilerType={self.profiler_type.value}",
]


@@ -731,7 +789,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--sourcePlacement={self.source}",
f"--destinationPlacement={self.destination}",
f"--size={self.size}",
f"--profilerType={self.profiler_type}",
f"--profilerType={self.profiler_type.value}",
]


@@ -970,9 +1028,9 @@ def __init__(
inOrderQueue,
numKernels,
measureCompletionTime,
profiler_type,
useEvents,
useHostTasks,
profiler_type=PROFILERS.TIMER,
):
self.inOrderQueue = inOrderQueue
self.numKernels = numKernels
@@ -1037,12 +1095,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--UseHostTasks={self.useHostTasks}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
bin_args.append(f"--profilerType={self.profiler_type.value}")
return bin_args


class UllsEmptyKernel(ComputeBenchmark):
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
def __init__(
self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER
):
self.wgc = wgc
self.wgs = wgs
# iterations per bin_args: --iterations=10000
@@ -1084,7 +1144,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
f"--wgc={self.wgc}",
]
if self.runtime == RUNTIMES.SYCL:
bin_args.append(f"--profilerType={self.profiler_type}")
bin_args.append(f"--profilerType={self.profiler_type.value}")
return bin_args

