Commit e59226b

bugfix: add driver support to CUPTI benchmark function, issue #2145 (#2154)
## 📌 Description

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **New Features**
  * GPU timing now captures driver-level activities alongside runtime and kernel events for more complete timing.
  * Activity records include richer metadata to improve event correlation and reporting.
  * CUPTI measurement window adjusted to ensure driver activity is collected during profiling.
* **Bug Fixes**
  * Improved filtering and aggregation so collected activities are correlated and reported more accurately.
1 parent dc37789 commit e59226b

File tree: 1 file changed (+18 −5 lines)

flashinfer/testing/utils.py — 18 additions, 5 deletions

```diff
@@ -740,7 +740,7 @@ def func_buffer_requested():
         return buffer_size, max_num_records

     def func_buffer_completed(
-        launches: list[tuple[float, float, int]],
+        launches: list[tuple[float, float, int, int, int]],
         kernels: list[tuple[str, float, float, int]],
         activities: list,
     ):
@@ -755,9 +755,20 @@ def func_buffer_completed(
                     activity.correlation_id,
                 )
             )
-            elif activity.kind == cupti.ActivityKind.RUNTIME:
-                # Runtime activity
-                launches.append((activity.start, activity.end, activity.correlation_id))
+            elif activity.kind in (
+                cupti.ActivityKind.RUNTIME,
+                cupti.ActivityKind.DRIVER,
+            ):
+                # Runtime or Driver activity
+                launches.append(
+                    (
+                        activity.start,
+                        activity.end,
+                        activity.correlation_id,
+                        activity.cbid,
+                        activity.kind,
+                    )
+                )

     if l2_flush:
         l2_flush_size = int(l2_flush_size_mb) * 1024 * 1024
@@ -815,11 +826,12 @@ def func_buffer_completed(
     torch.cuda.synchronize()

     # CUPTI measurement
-    launches: list[tuple[float, float, int]] = []
+    launches: list[tuple[float, float, int, int, int]] = []
     kernels: list[tuple[str, float, float, int]] = []
     iter_timestamps = []
     cupti.activity_enable(cupti.ActivityKind.RUNTIME)
     cupti.activity_enable(cupti.ActivityKind.CONCURRENT_KERNEL)
+    cupti.activity_enable(cupti.ActivityKind.DRIVER)
     cupti.activity_register_callbacks(
         func_buffer_requested, partial(func_buffer_completed, launches, kernels)
     )
@@ -836,6 +848,7 @@ def func_buffer_completed(
     cupti.activity_flush_all(0)
     cupti.activity_disable(cupti.ActivityKind.RUNTIME)
     cupti.activity_disable(cupti.ActivityKind.CONCURRENT_KERNEL)
+    cupti.activity_disable(cupti.ActivityKind.DRIVER)
     cupti.finalize()

     # Process activities
```
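The diff widens each launch record so that it carries the CUPTI `cbid` and `ActivityKind` in addition to the timing and correlation ID. A minimal, self-contained sketch of how such launch records can then be matched to kernel records by correlation ID — the aggregation step this PR touches. All values below are hypothetical, and `RUNTIME`/`DRIVER` are plain-integer stand-ins for the real `cupti.ActivityKind` enum values; this is an illustration, not the FlashInfer implementation.

```python
# Stand-ins for cupti.ActivityKind.RUNTIME / cupti.ActivityKind.DRIVER
RUNTIME, DRIVER = 1, 2

# Launch records with the widened shape from the PR:
# (start, end, correlation_id, cbid, kind) — values are made up.
launches = [
    (100.0, 110.0, 7, 211, RUNTIME),  # e.g. a runtime-API launch
    (200.0, 215.0, 8, 307, DRIVER),   # e.g. a driver-API launch
]

# Kernel records: (name, start, end, correlation_id)
kernels = [
    ("my_kernel", 120.0, 180.0, 7),
    ("my_kernel", 220.0, 290.0, 8),
]

def correlate(launches, kernels):
    """Pair each kernel with the launch that produced it via correlation_id."""
    by_id = {
        corr_id: (start, end, cbid, kind)
        for start, end, corr_id, cbid, kind in launches
    }
    matched = []
    for name, k_start, k_end, corr_id in kernels:
        if corr_id in by_id:
            # Launch-side span and cbid are available here for richer reporting.
            l_start, l_end, cbid, kind = by_id[corr_id]
            matched.append((name, k_end - k_start, kind))
    return matched

print(correlate(launches, kernels))
# → [('my_kernel', 60.0, 1), ('my_kernel', 70.0, 2)]
```

Keeping `kind` in the tuple is what lets the reporting side distinguish kernels launched through the runtime API from those launched through the driver API (e.g. by CUDA graphs), which is why enabling `ActivityKind.DRIVER` alone is not enough without the extra fields.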
