Commit 3e28015

[Bench] Add stats for benchmark executions in GHA summary
1 parent 62397e8 commit 3e28015

File tree

4 files changed: +111 −13 lines changed


devops/actions/run-tests/benchmark/action.yml

Lines changed: 4 additions & 0 deletions
@@ -231,6 +231,9 @@ runs:
 WORKDIR="$(realpath ./llvm_test_workdir)"
 if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi

+# Clean up potentially existing, old summary file
+[ -f "github_summary.md" ] && rm github_summary.md
+
 numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \
   ./devops/scripts/benchmarks/main.py "$WORKDIR" \
   --sycl "$(realpath ./toolchain)" \
@@ -243,6 +246,7 @@ runs:
   --preset "$PRESET" \
   --timestamp-override "$SAVE_TIMESTAMP" \
   --detect-version sycl,compute_runtime \
+  --produce-github-summary \
   ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }}
 # TODO: add back: "--flamegraph inclusive" once works properly

devops/scripts/benchmarks/compare.py

Lines changed: 7 additions & 4 deletions
@@ -357,7 +357,8 @@ def to_hist(
 parser_avg.add_argument(
     "--produce-github-summary",
     action="store_true",
-    help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.",
+    help=f"Produce regression summary for Github workflow, in file '{options.github_summary_filename}'.",
+    default=False,
 )

 args = parser.parse_args()
@@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False):

         if not args.dry_run:
             if args.produce_github_summary:
-                with open(options.github_summary_filename, "w") as f:
+                with open(options.github_summary_filename, "a") as f:
                     f.write("\n".join(gh_summary))
             exit(1) # Exit 1 to trigger Github test failure

     log.info("No unexpected regressions found!")
     if args.produce_github_summary:
-        gh_summary.append("No unexpected regressions found!")
-        with open(options.github_summary_filename, "w") as f:
+        gh_summary.append("")
+        gh_summary.append("### Regressions")
+        gh_summary.append("✅ No unexpected regressions found!")
+        with open(options.github_summary_filename, "a") as f:
             f.write("\n".join(gh_summary))

 else:
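Note on the "w" → "a" switch above: main.py now creates github_summary.md with the execution-stats section first (and the action deletes any stale copy before the run), so compare.py appends its regression verdict to the same file instead of overwriting it. A minimal, self-contained sketch of that write-then-append flow, for illustration only (not the project's code; the sample section contents are made up to mirror the headings in this diff):

from pathlib import Path

summary = Path("github_summary.md")  # same name as options.github_summary_filename

# main.py side: start the file fresh with the execution-stats section ("w" mode).
with summary.open("w") as f:
    f.write("### Benchmarks Execution\n")
    f.write("#### Overall Statistics\n- **Tests Passed:** 3\n")

# compare.py side: append the regression section to the same file ("a" mode),
# so both sections end up in one GitHub workflow summary.
with summary.open("a") as f:
    f.write("\n### Regressions\n")
    f.write("✅ No unexpected regressions found!\n")

print(summary.read_text())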

devops/scripts/benchmarks/main.py

Lines changed: 98 additions & 8 deletions
@@ -32,6 +32,65 @@
 from presets import enabled_suites, presets


+def generate_github_summary(execution_stats, failures):
+    """Generate GitHub workflow summary with execution statistics"""
+    gh_summary: list[str] = []
+    gh_summary.append("### Benchmarks Execution")
+
+    # Overall statistics
+    total_tests = execution_stats["total_tests"]
+    passed_tests = execution_stats["tests_passed"]
+    failed_tests = execution_stats["tests_failed"]
+    warnings = execution_stats["warnings"]
+    errors = len(failures)
+
+    gh_summary.append("#### Overall Statistics")
+    gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}")
+    gh_summary.append(f"- **Tests Passed:** {passed_tests}")
+    gh_summary.append(f"- **Tests Failed:** {failed_tests}")
+    gh_summary.append(f"- **Errors:** {errors}")
+    gh_summary.append(f"- **Warnings:** {warnings}")
+    gh_summary.append("")
+
+    # Overall status of execution
+    if failed_tests == 0 and errors == 0:
+        gh_summary.append("#### ✅ Status: SUCCESS")
+        gh_summary.append("Benchmarks seem to have executed successfully!")
+    elif failed_tests > 0 or errors > 0:
+        gh_summary.append("#### ❌ Status: FAILURES DETECTED")
+        gh_summary.append("Some benchmarks failed or encountered errors.")
+
+    if warnings > 0:
+        gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
+        gh_summary.append("Some benchmarks executed with warnings.")
+
+    gh_summary.append("")
+
+    # Detailed failures info
+    if failures:
+        gh_summary.append("#### Failure Details")
+        gh_summary.append(
+            f"<details><summary>{len(failures)} failed benchmarks:</summary>"
+        )
+        gh_summary.append("")
+
+        for benchmark_name, failure_reason in failures.items():
+            gh_summary.append(f"##### {benchmark_name}")
+            gh_summary.append(f"- **Reason:** {failure_reason}")
+            gh_summary.append("")
+
+        gh_summary.append("</details>")
+        gh_summary.append("")
+
+    # Write the summary to file
+    try:
+        with open(options.github_summary_filename, "w") as f:
+            f.write("\n".join(gh_summary))
+        log.info(f"GitHub summary written to {options.github_summary_filename}")
+    except Exception as e:
+        log.error(f"Failed to write GitHub summary: {e}")
+
+
 def run_iterations(
     benchmark: Benchmark,
     env_vars,
@@ -110,7 +169,7 @@ def remove_outliers(


 def process_results(
-    results: dict[str, list[Result]], stddev_threshold_override
+    results: dict[str, list[Result]], stddev_threshold_override, execution_stats
 ) -> tuple[bool, list[Result]]:
     processed: list[Result] = []
     # technically, we can detect whether result is below or above threshold per
@@ -142,6 +201,7 @@ def process_results(
             log.warning(
                 f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
             )
+            execution_stats["warnings"] += 1
             valid_results = False

         rlist.sort(key=lambda res: res.value)
@@ -170,7 +230,7 @@ def collect_metadata(suites):
     return metadata


-def main(directory, additional_env_vars, compare_names, filter):
+def main(directory, additional_env_vars, compare_names, filter, execution_stats):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

     if options.dry_run:
@@ -218,7 +278,7 @@ def main(directory, additional_env_vars, compare_names, filter):

     # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
     # TODO: do not add benchmarks whose suite setup failed
-    # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
+    # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

     for s in suites:
         if s.name() not in enabled_suites(options.preset):
@@ -246,9 +306,9 @@ def main(directory, additional_env_vars, compare_names, filter):
         except Exception as e:
             if options.exit_on_failure:
                 raise e
-            failures[s.name()] = f"Suite setup failure: {e}"
+            failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
             log.error(
-                f"{type(s).__name__} setup failed. Benchmarks won't be added."
+                f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
             )
             log.error(f"failed: {e}")
         else:
@@ -265,12 +325,15 @@ def main(directory, additional_env_vars, compare_names, filter):
                 if options.exit_on_failure:
                     raise e
                 else:
-                    failures[benchmark.name()] = f"Benchmark setup failure: {e}"
+                    failures[benchmark.name()] = (
+                        f"Benchmark '{benchmark.name()}' setup failure: {e}"
+                    )
                     log.error(f"failed: {e}")

     results = []
     if benchmarks:
         log.info(f"Running {len(benchmarks)} benchmarks...")
+        execution_stats["total_tests"] = len(benchmarks)
     elif not options.dry_run:
         raise RuntimeError("No benchmarks to run.")
     for benchmark in benchmarks:
@@ -301,7 +364,9 @@ def main(directory, additional_env_vars, compare_names, filter):
                     run_trace=TracingType.NONE,
                 )
                 valid, processed = process_results(
-                    intermediate_results, benchmark.stddev_threshold()
+                    intermediate_results,
+                    benchmark.stddev_threshold(),
+                    execution_stats,
                 )
                 if valid:
                     break
@@ -335,11 +400,15 @@ def main(directory, additional_env_vars, compare_names, filter):
             )

             results += processed
+            execution_stats["tests_passed"] += 1
         except Exception as e:
+            execution_stats["tests_failed"] += 1
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark run failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' run failure: {e}"
+                )
                 log.error(f"failed: {e}")

     this_name = options.current_run_name
@@ -408,6 +477,10 @@ def main(directory, additional_env_vars, compare_names, filter):
         generate_html(history, compare_names, html_path, metadata)
         log.info(f"HTML with benchmark results has been generated")

+    # Generate GitHub summary
+    if options.produce_github_summary:
+        generate_github_summary(execution_stats, failures)
+
     if options.exit_on_failure and failures:
         # just in case code missed to raise earlier
         raise RuntimeError(str(failures))
@@ -691,6 +764,12 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--produce-github-summary",
+        action="store_true",
+        help=f"Produce execution stats summary for Github workflow, in file '{options.github_summary_filename}'.",
+        default=False,
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +803,7 @@ def validate_and_parse_env_args(env_args):
     options.flamegraph = args.flamegraph is not None
     options.archive_baseline_days = args.archive_baseline_after
     options.archive_pr_days = args.archive_pr_after
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +818,14 @@ def validate_and_parse_env_args(env_args):
         parser.error("Specified --output-dir is not a valid path")
     options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +868,7 @@ def validate_and_parse_env_args(env_args):
         options.device_architecture = ""
         log.warning(f"Failed to fetch device architecture: {e}")
         log.warning("Defaulting to generic benchmark parameters.")
+        execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +877,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )
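Taken together, these main.py changes thread an execution_stats dict from the entry point into main(), bump its counters as benchmarks pass, fail, or emit warnings, and hand it (together with the failures map) to the new generate_github_summary() when --produce-github-summary is set. A condensed, standalone sketch of that counter flow, for illustration only (benchmark names and error text are made up):

execution_stats = {"total_tests": 0, "tests_passed": 0, "tests_failed": 0, "warnings": 0}
failures = {}

# Stand-ins for real benchmark runs: True means the run passed, False means it raised.
fake_runs = {"api_overhead_bench": True, "memcpy_bench": False}

execution_stats["total_tests"] = len(fake_runs)
for name, passed in fake_runs.items():
    if passed:
        execution_stats["tests_passed"] += 1
    else:
        execution_stats["tests_failed"] += 1
        failures[name] = f"Benchmark '{name}' run failure: example error"

# With --produce-github-summary set, main() would then call
# generate_github_summary(execution_stats, failures), which renders these
# numbers into the "### Benchmarks Execution" section of github_summary.md.
print(execution_stats, failures)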

devops/scripts/benchmarks/options.py

Lines changed: 2 additions & 1 deletion
@@ -97,7 +97,8 @@ class Options:
     # CI scripts vs SYCl build source.
     github_repo_override: str = None
     git_commit_override: str = None
-    # Filename used to store Github summary files:
+    # Flag and filename used to store Github summary files:
+    produce_github_summary: bool = False
     github_summary_filename: str = "github_summary.md"
     # Archiving settings
     # Archived runs are stored separately from the main dataset but are still accessible
