From a08ee956e42e47a1c8150e404fd82f10e8b764a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Thu, 13 Nov 2025 18:33:38 +0100 Subject: [PATCH] [Bench] Add stats for benchmark executions in GHA summary --- devops/actions/run-tests/benchmark/action.yml | 4 + devops/scripts/benchmarks/compare.py | 9 +- devops/scripts/benchmarks/main.py | 136 +++++++++++++++--- devops/scripts/benchmarks/options.py | 3 +- 4 files changed, 132 insertions(+), 20 deletions(-) diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 94dcd44feea05..c83e8c54cc931 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -231,6 +231,9 @@ runs: WORKDIR="$(realpath ./llvm_test_workdir)" if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi + # Clean up potentially existing, old summary file + [ -f "github_summary.md" ] && rm github_summary.md + numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \ ./devops/scripts/benchmarks/main.py "$WORKDIR" \ --sycl "$(realpath ./toolchain)" \ @@ -243,6 +246,7 @@ runs: --preset "$PRESET" \ --timestamp-override "$SAVE_TIMESTAMP" \ --detect-version sycl,compute_runtime \ + --produce-github-summary \ ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }} # TODO: add back: "--flamegraph inclusive" once works properly diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 45baff5c75aa0..d8b442127ba79 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -357,7 +357,8 @@ def to_hist( parser_avg.add_argument( "--produce-github-summary", action="store_true", - help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.", + help=f"Produce regression summary for Github workflow, in file '{options.github_summary_filename}'.", + default=False, ) args = parser.parse_args() @@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False): if not args.dry_run: if args.produce_github_summary: - with open(options.github_summary_filename, "w") as f: + with open(options.github_summary_filename, "a") as f: f.write("\n".join(gh_summary)) exit(1) # Exit 1 to trigger Github test failure log.info("No unexpected regressions found!") if args.produce_github_summary: + gh_summary.append("") + gh_summary.append("### Regressions") gh_summary.append("No unexpected regressions found!") - with open(options.github_summary_filename, "w") as f: + with open(options.github_summary_filename, "a") as f: f.write("\n".join(gh_summary)) else: diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index a2fe028a7d63e..64f7afed7f272 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -32,6 +32,65 @@ from presets import enabled_suites, presets +def generate_github_summary(execution_stats, failures): + """Generate GitHub workflow summary with execution statistics""" + gh_summary: list[str] = [] + gh_summary.append("### Benchmarks Execution") + + # Overall statistics + total_tests = execution_stats["total_tests"] + passed_tests = execution_stats["tests_passed"] + failed_tests = execution_stats["tests_failed"] + warnings = execution_stats["warnings"] + errors = len(failures) + + gh_summary.append("#### Overall Statistics") + gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}") + 
gh_summary.append(f"- **Tests Passed:** {passed_tests}") + gh_summary.append(f"- **Tests Failed:** {failed_tests}") + gh_summary.append(f"- **Errors:** {errors}") + gh_summary.append(f"- **Warnings:** {warnings}") + gh_summary.append("") + + # Overall status of execution + if failed_tests == 0 and errors == 0: + gh_summary.append("#### ✅ Status: SUCCESS") + gh_summary.append("Benchmarks seem to have executed successfully!") + elif failed_tests > 0 or errors > 0: + gh_summary.append("#### ❌ Status: FAILURES DETECTED") + gh_summary.append("Some benchmarks failed or encountered errors.") + + if warnings > 0: + gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED") + gh_summary.append("Some benchmarks executed with warnings.") + + gh_summary.append("") + + # Detailed failures info + if failures: + gh_summary.append("#### Failure Details") + gh_summary.append( + f"
{len(failures)} failed benchmarks:" + ) + gh_summary.append("") + + for benchmark_name, failure_reason in failures.items(): + gh_summary.append(f"##### {benchmark_name}") + gh_summary.append(f"- **Reason:** {failure_reason}") + gh_summary.append("") + + gh_summary.append("
") + gh_summary.append("") + + # Write the summary to file + try: + with open(options.github_summary_filename, "w") as f: + f.write("\n".join(gh_summary)) + log.info(f"GitHub summary written to {options.github_summary_filename}") + except Exception as e: + log.error(f"Failed to write GitHub summary: {e}") + + def run_iterations( benchmark: Benchmark, env_vars, @@ -40,7 +99,12 @@ def run_iterations( failures: dict[str, str], run_trace: TracingType = TracingType.NONE, force_trace: bool = False, -): +) -> bool: + """ + Returns True if all iterations completed successfully, False otherwise. + Unless options.exit_on_failure is set, then exception is raised. + """ + for iter in range(iters): log.info(f"running {benchmark.name()}, iteration {iter}... ") try: @@ -49,10 +113,10 @@ def run_iterations( ) if bench_results is None: if options.exit_on_failure: - raise RuntimeError(f"Benchmark produced no results!") + raise RuntimeError("Benchmark produced no results!") else: failures[benchmark.name()] = "benchmark produced no results!" - break + return False for bench_result in bench_results: log.info( @@ -73,10 +137,15 @@ def run_iterations( f"Benchmark failed: {failure_label} verification failed: {str(e)}" ) else: - failures[failure_label] = f"verification failed: {str(e)}" - log.error(f"complete ({failure_label}: verification failed: {str(e)}).") + failures[failure_label] = ( + f"{failure_label}: verification failed: {str(e)}" + ) + log.error(f"{failure_label}: verification failed: {str(e)}.") continue + # Iterations completed successfully + return True + # https://www.statology.org/modified-z-score/ def modified_z_score(values: list[float]) -> list[float]: @@ -110,7 +179,7 @@ def remove_outliers( def process_results( - results: dict[str, list[Result]], stddev_threshold_override + results: dict[str, list[Result]], stddev_threshold_override, execution_stats ) -> tuple[bool, list[Result]]: processed: list[Result] = [] # technically, we can detect whether result is below or above threshold per @@ -142,6 +211,7 @@ def process_results( log.warning( f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}" ) + execution_stats["warnings"] += 1 valid_results = False rlist.sort(key=lambda res: res.value) @@ -170,7 +240,7 @@ def collect_metadata(suites): return metadata -def main(directory, additional_env_vars, compare_names, filter): +def main(directory, additional_env_vars, compare_names, filter, execution_stats): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) if options.dry_run: @@ -218,7 +288,7 @@ def main(directory, additional_env_vars, compare_names, filter): # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup # TODO: do not add benchmarks whose suite setup failed - # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI + # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI for s in suites: if s.name() not in enabled_suites(options.preset): @@ -246,9 +316,9 @@ def main(directory, additional_env_vars, compare_names, filter): except Exception as e: if options.exit_on_failure: raise e - failures[s.name()] = f"Suite setup failure: {e}" + failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}" log.error( - f"{type(s).__name__} setup failed. Benchmarks won't be added." + f"Suite {type(s).__name__} setup failed. Benchmarks won't be added." 
) log.error(f"failed: {e}") else: @@ -265,12 +335,15 @@ def main(directory, additional_env_vars, compare_names, filter): if options.exit_on_failure: raise e else: - failures[benchmark.name()] = f"Benchmark setup failure: {e}" + failures[benchmark.name()] = ( + f"Benchmark '{benchmark.name()}' setup failure: {e}" + ) log.error(f"failed: {e}") results = [] if benchmarks: log.info(f"Running {len(benchmarks)} benchmarks...") + execution_stats["total_tests"] = len(benchmarks) elif not options.dry_run: raise RuntimeError("No benchmarks to run.") for benchmark in benchmarks: @@ -278,6 +351,7 @@ def main(directory, additional_env_vars, compare_names, filter): merged_env_vars = {**additional_env_vars} intermediate_results: dict[str, list[Result]] = {} processed: list[Result] = [] + iterations_rc = False # Determine if we should run regular benchmarks # Run regular benchmarks if: @@ -292,7 +366,7 @@ def main(directory, additional_env_vars, compare_names, filter): if should_run_regular: for _ in range(options.iterations_stddev): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, options.iterations, @@ -301,7 +375,9 @@ def main(directory, additional_env_vars, compare_names, filter): run_trace=TracingType.NONE, ) valid, processed = process_results( - intermediate_results, benchmark.stddev_threshold() + intermediate_results, + benchmark.stddev_threshold(), + execution_stats, ) if valid: break @@ -310,7 +386,7 @@ def main(directory, additional_env_vars, compare_names, filter): if options.unitrace and ( benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force" ): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, 1, @@ -324,7 +400,7 @@ def main(directory, additional_env_vars, compare_names, filter): benchmark.traceable(TracingType.FLAMEGRAPH) or args.flamegraph == "force" ): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, 1, @@ -335,11 +411,18 @@ def main(directory, additional_env_vars, compare_names, filter): ) results += processed + if iterations_rc: + execution_stats["tests_passed"] += 1 + else: + execution_stats["tests_failed"] += 1 except Exception as e: + execution_stats["tests_failed"] += 1 if options.exit_on_failure: raise e else: - failures[benchmark.name()] = f"Benchmark run failure: {e}" + failures[benchmark.name()] = ( + f"Benchmark '{benchmark.name()}' run failure: {e}" + ) log.error(f"failed: {e}") this_name = options.current_run_name @@ -408,6 +491,10 @@ def main(directory, additional_env_vars, compare_names, filter): generate_html(history, compare_names, html_path, metadata) log.info(f"HTML with benchmark results has been generated") + # Generate GitHub summary + if options.produce_github_summary: + generate_github_summary(execution_stats, failures) + if options.exit_on_failure and failures: # just in case code missed to raise earlier raise RuntimeError(str(failures)) @@ -691,6 +778,12 @@ def validate_and_parse_env_args(env_args): help="Set the logging level", default="info", ) + parser.add_argument( + "--produce-github-summary", + action="store_true", + help=f"Produce execution stats summary for Github workflow, in file '{options.github_summary_filename}'.", + default=False, + ) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -724,6 +817,7 @@ def validate_and_parse_env_args(env_args): options.flamegraph = args.flamegraph is not None options.archive_baseline_days = args.archive_baseline_after options.archive_pr_days = args.archive_pr_after + 
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +832,14 @@ def validate_and_parse_env_args(env_args):
             parser.error("Specified --output-dir is not a valid path")
         options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +882,7 @@ def validate_and_parse_env_args(env_args):
             options.device_architecture = ""
             log.warning(f"Failed to fetch device architecture: {e}")
             log.warning("Defaulting to generic benchmark parameters.")
+            execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +891,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index 1168affaf2575..33c4e556cfd76 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -97,7 +97,8 @@ class Options:
     # CI scripts vs SYCl build source.
     github_repo_override: str = None
     git_commit_override: str = None
-    # Filename used to store Github summary files:
+    # Flag and filename used to store Github summary files:
+    produce_github_summary: bool = False
     github_summary_filename: str = "github_summary.md"
     # Archiving settings
     # Archived runs are stored separately from the main dataset but are still accessible
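
Note: this patch only writes github_summary.md next to the other benchmark artifacts; publishing that file to the GitHub Actions job summary is assumed to happen in a separate workflow step that is not part of this diff. A minimal sketch of such a step, using the standard GITHUB_STEP_SUMMARY mechanism (step name and placement are hypothetical):

    - name: Publish benchmark summary
      shell: bash
      run: |
        # Append the generated summary, if any, to the GitHub Actions job summary
        if [ -f github_summary.md ]; then
          cat github_summary.md >> "$GITHUB_STEP_SUMMARY"
        fi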