From a08ee956e42e47a1c8150e404fd82f10e8b764a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stolarczuk?= Date: Thu, 13 Nov 2025 18:33:38 +0100 Subject: [PATCH] [Bench] Add stats for benchmark executions in GHA summary --- devops/actions/run-tests/benchmark/action.yml | 4 + devops/scripts/benchmarks/compare.py | 9 +- devops/scripts/benchmarks/main.py | 136 +++++++++++++++--- devops/scripts/benchmarks/options.py | 3 +- 4 files changed, 132 insertions(+), 20 deletions(-) diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index 94dcd44feea05..c83e8c54cc931 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -231,6 +231,9 @@ runs: WORKDIR="$(realpath ./llvm_test_workdir)" if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi + # Clean up potentially existing, old summary file + [ -f "github_summary.md" ] && rm github_summary.md + numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \ ./devops/scripts/benchmarks/main.py "$WORKDIR" \ --sycl "$(realpath ./toolchain)" \ @@ -243,6 +246,7 @@ runs: --preset "$PRESET" \ --timestamp-override "$SAVE_TIMESTAMP" \ --detect-version sycl,compute_runtime \ + --produce-github-summary \ ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }} # TODO: add back: "--flamegraph inclusive" once works properly diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 45baff5c75aa0..d8b442127ba79 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -357,7 +357,8 @@ def to_hist( parser_avg.add_argument( "--produce-github-summary", action="store_true", - help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.", + help=f"Produce regression summary for Github workflow, in file '{options.github_summary_filename}'.", + default=False, ) args = parser.parse_args() @@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False): if not args.dry_run: if args.produce_github_summary: - with open(options.github_summary_filename, "w") as f: + with open(options.github_summary_filename, "a") as f: f.write("\n".join(gh_summary)) exit(1) # Exit 1 to trigger Github test failure log.info("No unexpected regressions found!") if args.produce_github_summary: + gh_summary.append("") + gh_summary.append("### Regressions") gh_summary.append("No unexpected regressions found!") - with open(options.github_summary_filename, "w") as f: + with open(options.github_summary_filename, "a") as f: f.write("\n".join(gh_summary)) else: diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index a2fe028a7d63e..64f7afed7f272 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -32,6 +32,65 @@ from presets import enabled_suites, presets +def generate_github_summary(execution_stats, failures): + """Generate GitHub workflow summary with execution statistics""" + gh_summary: list[str] = [] + gh_summary.append("### Benchmarks Execution") + + # Overall statistics + total_tests = execution_stats["total_tests"] + passed_tests = execution_stats["tests_passed"] + failed_tests = execution_stats["tests_failed"] + warnings = execution_stats["warnings"] + errors = len(failures) + + gh_summary.append("#### Overall Statistics") + gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}") + 
gh_summary.append(f"- **Tests Passed:** {passed_tests}") + gh_summary.append(f"- **Tests Failed:** {failed_tests}") + gh_summary.append(f"- **Errors:** {errors}") + gh_summary.append(f"- **Warnings:** {warnings}") + gh_summary.append("") + + # Overall status of execution + if failed_tests == 0 and errors == 0: + gh_summary.append("#### ✅ Status: SUCCESS") + gh_summary.append("Benchmarks seem to have executed successfully!") + elif failed_tests > 0 or errors > 0: + gh_summary.append("#### ❌ Status: FAILURES DETECTED") + gh_summary.append("Some benchmarks failed or encountered errors.") + + if warnings > 0: + gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED") + gh_summary.append("Some benchmarks executed with warnings.") + + gh_summary.append("") + + # Detailed failures info + if failures: + gh_summary.append("#### Failure Details") + gh_summary.append( + f"
{len(failures)} failed benchmarks:" + ) + gh_summary.append("") + + for benchmark_name, failure_reason in failures.items(): + gh_summary.append(f"##### {benchmark_name}") + gh_summary.append(f"- **Reason:** {failure_reason}") + gh_summary.append("") + + gh_summary.append("
") + gh_summary.append("") + + # Write the summary to file + try: + with open(options.github_summary_filename, "w") as f: + f.write("\n".join(gh_summary)) + log.info(f"GitHub summary written to {options.github_summary_filename}") + except Exception as e: + log.error(f"Failed to write GitHub summary: {e}") + + def run_iterations( benchmark: Benchmark, env_vars, @@ -40,7 +99,12 @@ def run_iterations( failures: dict[str, str], run_trace: TracingType = TracingType.NONE, force_trace: bool = False, -): +) -> bool: + """ + Returns True if all iterations completed successfully, False otherwise. + Unless options.exit_on_failure is set, then exception is raised. + """ + for iter in range(iters): log.info(f"running {benchmark.name()}, iteration {iter}... ") try: @@ -49,10 +113,10 @@ def run_iterations( ) if bench_results is None: if options.exit_on_failure: - raise RuntimeError(f"Benchmark produced no results!") + raise RuntimeError("Benchmark produced no results!") else: failures[benchmark.name()] = "benchmark produced no results!" - break + return False for bench_result in bench_results: log.info( @@ -73,10 +137,15 @@ def run_iterations( f"Benchmark failed: {failure_label} verification failed: {str(e)}" ) else: - failures[failure_label] = f"verification failed: {str(e)}" - log.error(f"complete ({failure_label}: verification failed: {str(e)}).") + failures[failure_label] = ( + f"{failure_label}: verification failed: {str(e)}" + ) + log.error(f"{failure_label}: verification failed: {str(e)}.") continue + # Iterations completed successfully + return True + # https://www.statology.org/modified-z-score/ def modified_z_score(values: list[float]) -> list[float]: @@ -110,7 +179,7 @@ def remove_outliers( def process_results( - results: dict[str, list[Result]], stddev_threshold_override + results: dict[str, list[Result]], stddev_threshold_override, execution_stats ) -> tuple[bool, list[Result]]: processed: list[Result] = [] # technically, we can detect whether result is below or above threshold per @@ -142,6 +211,7 @@ def process_results( log.warning( f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}" ) + execution_stats["warnings"] += 1 valid_results = False rlist.sort(key=lambda res: res.value) @@ -170,7 +240,7 @@ def collect_metadata(suites): return metadata -def main(directory, additional_env_vars, compare_names, filter): +def main(directory, additional_env_vars, compare_names, filter, execution_stats): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) if options.dry_run: @@ -218,7 +288,7 @@ def main(directory, additional_env_vars, compare_names, filter): # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup # TODO: do not add benchmarks whose suite setup failed - # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI + # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI for s in suites: if s.name() not in enabled_suites(options.preset): @@ -246,9 +316,9 @@ def main(directory, additional_env_vars, compare_names, filter): except Exception as e: if options.exit_on_failure: raise e - failures[s.name()] = f"Suite setup failure: {e}" + failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}" log.error( - f"{type(s).__name__} setup failed. Benchmarks won't be added." + f"Suite {type(s).__name__} setup failed. Benchmarks won't be added." 
) log.error(f"failed: {e}") else: @@ -265,12 +335,15 @@ def main(directory, additional_env_vars, compare_names, filter): if options.exit_on_failure: raise e else: - failures[benchmark.name()] = f"Benchmark setup failure: {e}" + failures[benchmark.name()] = ( + f"Benchmark '{benchmark.name()}' setup failure: {e}" + ) log.error(f"failed: {e}") results = [] if benchmarks: log.info(f"Running {len(benchmarks)} benchmarks...") + execution_stats["total_tests"] = len(benchmarks) elif not options.dry_run: raise RuntimeError("No benchmarks to run.") for benchmark in benchmarks: @@ -278,6 +351,7 @@ def main(directory, additional_env_vars, compare_names, filter): merged_env_vars = {**additional_env_vars} intermediate_results: dict[str, list[Result]] = {} processed: list[Result] = [] + iterations_rc = False # Determine if we should run regular benchmarks # Run regular benchmarks if: @@ -292,7 +366,7 @@ def main(directory, additional_env_vars, compare_names, filter): if should_run_regular: for _ in range(options.iterations_stddev): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, options.iterations, @@ -301,7 +375,9 @@ def main(directory, additional_env_vars, compare_names, filter): run_trace=TracingType.NONE, ) valid, processed = process_results( - intermediate_results, benchmark.stddev_threshold() + intermediate_results, + benchmark.stddev_threshold(), + execution_stats, ) if valid: break @@ -310,7 +386,7 @@ def main(directory, additional_env_vars, compare_names, filter): if options.unitrace and ( benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force" ): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, 1, @@ -324,7 +400,7 @@ def main(directory, additional_env_vars, compare_names, filter): benchmark.traceable(TracingType.FLAMEGRAPH) or args.flamegraph == "force" ): - run_iterations( + iterations_rc = run_iterations( benchmark, merged_env_vars, 1, @@ -335,11 +411,18 @@ def main(directory, additional_env_vars, compare_names, filter): ) results += processed + if iterations_rc: + execution_stats["tests_passed"] += 1 + else: + execution_stats["tests_failed"] += 1 except Exception as e: + execution_stats["tests_failed"] += 1 if options.exit_on_failure: raise e else: - failures[benchmark.name()] = f"Benchmark run failure: {e}" + failures[benchmark.name()] = ( + f"Benchmark '{benchmark.name()}' run failure: {e}" + ) log.error(f"failed: {e}") this_name = options.current_run_name @@ -408,6 +491,10 @@ def main(directory, additional_env_vars, compare_names, filter): generate_html(history, compare_names, html_path, metadata) log.info(f"HTML with benchmark results has been generated") + # Generate GitHub summary + if options.produce_github_summary: + generate_github_summary(execution_stats, failures) + if options.exit_on_failure and failures: # just in case code missed to raise earlier raise RuntimeError(str(failures)) @@ -691,6 +778,12 @@ def validate_and_parse_env_args(env_args): help="Set the logging level", default="info", ) + parser.add_argument( + "--produce-github-summary", + action="store_true", + help=f"Produce execution stats summary for Github workflow, in file '{options.github_summary_filename}'.", + default=False, + ) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -724,6 +817,7 @@ def validate_and_parse_env_args(env_args): options.flamegraph = args.flamegraph is not None options.archive_baseline_days = args.archive_baseline_after options.archive_pr_days = args.archive_pr_after + 
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +832,14 @@ def validate_and_parse_env_args(env_args):
             parser.error("Specified --output-dir is not a valid path")
         options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +882,7 @@ def validate_and_parse_env_args(env_args):
             options.device_architecture = ""
             log.warning(f"Failed to fetch device architecture: {e}")
             log.warning("Defaulting to generic benchmark parameters.")
+            execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +891,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index 1168affaf2575..33c4e556cfd76 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -97,7 +97,8 @@ class Options:
     # CI scripts vs SYCl build source.
     github_repo_override: str = None
     git_commit_override: str = None
-    # Filename used to store Github summary files:
+    # Flag and filename used to store Github summary files:
+    produce_github_summary: bool = False
     github_summary_filename: str = "github_summary.md"
     # Archiving settings
     # Archived runs are stored separately from the main dataset but are still accessible
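
Note: this patch only writes github_summary.md next to the other benchmark artifacts; publishing that file to the GitHub Actions job summary is assumed to happen in a separate workflow step that is not part of this diff. A minimal sketch of such a step, using the standard GITHUB_STEP_SUMMARY mechanism (step name and placement are hypothetical):

    - name: Publish benchmark summary
      shell: bash
      run: |
        # Append the generated summary, if any, to the GitHub Actions job summary
        if [ -f github_summary.md ]; then
          cat github_summary.md >> "$GITHUB_STEP_SUMMARY"
        fi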