4 changes: 4 additions & 0 deletions devops/actions/run-tests/benchmark/action.yml
@@ -231,6 +231,9 @@ runs:
WORKDIR="$(realpath ./llvm_test_workdir)"
if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi

# Clean up a potentially existing old summary file
[ -f "github_summary.md" ] && rm github_summary.md

numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \
./devops/scripts/benchmarks/main.py "$WORKDIR" \
--sycl "$(realpath ./toolchain)" \
Expand All @@ -243,6 +246,7 @@ runs:
--preset "$PRESET" \
--timestamp-override "$SAVE_TIMESTAMP" \
--detect-version sycl,compute_runtime \
--produce-github-summary \
${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }}
# TODO: add back: "--flamegraph inclusive" once works properly

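Note on the wiring above: the action removes any stale `github_summary.md` before the run and passes `--produce-github-summary`, so the summary file is rebuilt from scratch on every invocation. How the file is ultimately published is not part of this diff; a later workflow step would typically append it to the job's step summary. A minimal sketch of such a publishing step, assuming the standard `GITHUB_STEP_SUMMARY` environment variable (the step itself is hypothetical):

```python
import os
from pathlib import Path

# Hypothetical publishing step: append the generated summary, if present,
# to the file the GitHub runner exposes via GITHUB_STEP_SUMMARY.
summary = Path("github_summary.md")
step_summary = os.environ.get("GITHUB_STEP_SUMMARY")

if summary.is_file() and step_summary:
    with open(step_summary, "a") as out:
        out.write(summary.read_text())
        out.write("\n")
```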
9 changes: 6 additions & 3 deletions devops/scripts/benchmarks/compare.py
@@ -357,7 +357,8 @@ def to_hist(
parser_avg.add_argument(
"--produce-github-summary",
action="store_true",
help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.",
help=f"Produce a regression summary for the GitHub workflow, written to '{options.github_summary_filename}'.",
default=False,
)

args = parser.parse_args()
@@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False):

if not args.dry_run:
if args.produce_github_summary:
with open(options.github_summary_filename, "w") as f:
with open(options.github_summary_filename, "a") as f:
f.write("\n".join(gh_summary))
exit(1) # Exit 1 to trigger Github test failure

log.info("No unexpected regressions found!")
if args.produce_github_summary:
gh_summary.append("")
gh_summary.append("### Regressions")
gh_summary.append("No unexpected regressions found!")
with open(options.github_summary_filename, "w") as f:
with open(options.github_summary_filename, "a") as f:
f.write("\n".join(gh_summary))

else:
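The switch from mode "w" to "a" above is deliberate: `main.py` now creates `github_summary.md` first (execution statistics, see below), and `compare.py` appends its "### Regressions" section afterwards instead of overwriting it. A simplified sketch of the intended ordering, with placeholder content rather than the scripts' real sections:

```python
# Simplified sketch of the write-then-append contract between the two scripts.
filename = "github_summary.md"

# main.py: create the file and write the execution-stats section.
with open(filename, "w") as f:
    f.write("### Benchmarks Execution\n- **Tests Passed:** 42\n")

# compare.py: append the regression section to the same file.
with open(filename, "a") as f:
    f.write("\n### Regressions\nNo unexpected regressions found!\n")
```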
136 changes: 120 additions & 16 deletions devops/scripts/benchmarks/main.py
@@ -32,6 +32,65 @@
from presets import enabled_suites, presets


def generate_github_summary(execution_stats, failures):
"""Generate GitHub workflow summary with execution statistics"""
gh_summary: list[str] = []
gh_summary.append("### Benchmarks Execution")

# Overall statistics
total_tests = execution_stats["total_tests"]
passed_tests = execution_stats["tests_passed"]
failed_tests = execution_stats["tests_failed"]
warnings = execution_stats["warnings"]
errors = len(failures)

gh_summary.append("#### Overall Statistics")
gh_summary.append(f"- **Total Benchmarks:** {total_tests}")
gh_summary.append(f"- **Tests Passed:** {passed_tests}")
gh_summary.append(f"- **Tests Failed:** {failed_tests}")
gh_summary.append(f"- **Errors:** {errors}")
gh_summary.append(f"- **Warnings:** {warnings}")
gh_summary.append("")

# Overall status of execution
if failed_tests == 0 and errors == 0:
gh_summary.append("#### ✅ Status: SUCCESS")
gh_summary.append("Benchmarks seem to have executed successfully!")
elif failed_tests > 0 or errors > 0:
gh_summary.append("#### ❌ Status: FAILURES DETECTED")
gh_summary.append("Some benchmarks failed or encountered errors.")

if warnings > 0:
gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
gh_summary.append("Some benchmarks executed with warnings.")

gh_summary.append("")

# Detailed failures info
if failures:
gh_summary.append("#### Failure Details")
gh_summary.append(
f"<details><summary>{len(failures)} failed benchmarks:</summary>"
)
gh_summary.append("")

for benchmark_name, failure_reason in failures.items():
gh_summary.append(f"##### {benchmark_name}")
gh_summary.append(f"- **Reason:** {failure_reason}")
gh_summary.append("")

gh_summary.append("</details>")
gh_summary.append("")

# Write the summary to file
try:
with open(options.github_summary_filename, "w") as f:
f.write("\n".join(gh_summary))
log.info(f"GitHub summary written to {options.github_summary_filename}")
except Exception as e:
log.error(f"Failed to write GitHub summary: {e}")


def run_iterations(
benchmark: Benchmark,
env_vars,
@@ -40,7 +99,12 @@ def run_iterations(
failures: dict[str, str],
run_trace: TracingType = TracingType.NONE,
force_trace: bool = False,
):
) -> bool:
"""
Returns True if all iterations completed successfully, False otherwise.
If options.exit_on_failure is set, an exception is raised instead of returning False.
"""

for iter in range(iters):
log.info(f"running {benchmark.name()}, iteration {iter}... ")
try:
@@ -49,10 +113,10 @@
)
if bench_results is None:
if options.exit_on_failure:
raise RuntimeError(f"Benchmark produced no results!")
raise RuntimeError("Benchmark produced no results!")
else:
failures[benchmark.name()] = "benchmark produced no results!"
break
return False

for bench_result in bench_results:
log.info(
@@ -73,10 +137,15 @@
f"Benchmark failed: {failure_label} verification failed: {str(e)}"
)
else:
failures[failure_label] = f"verification failed: {str(e)}"
log.error(f"complete ({failure_label}: verification failed: {str(e)}).")
failures[failure_label] = (
f"{failure_label}: verification failed: {str(e)}"
)
log.error(f"{failure_label}: verification failed: {str(e)}.")
continue

# Iterations completed successfully
return True


# https://www.statology.org/modified-z-score/
def modified_z_score(values: list[float]) -> list[float]:
@@ -110,7 +179,7 @@ def remove_outliers(


def process_results(
results: dict[str, list[Result]], stddev_threshold_override
results: dict[str, list[Result]], stddev_threshold_override, execution_stats
) -> tuple[bool, list[Result]]:
processed: list[Result] = []
# technically, we can detect whether result is below or above threshold per
@@ -142,6 +211,7 @@
log.warning(
f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
)
execution_stats["warnings"] += 1
valid_results = False

rlist.sort(key=lambda res: res.value)
@@ -170,7 +240,7 @@ def collect_metadata(suites):
return metadata


def main(directory, additional_env_vars, compare_names, filter):
def main(directory, additional_env_vars, compare_names, filter, execution_stats):
prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

if options.dry_run:
@@ -218,7 +288,7 @@ def main(directory, additional_env_vars, compare_names, filter):

# TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
# TODO: do not add benchmarks whose suite setup failed
# TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
# TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

for s in suites:
if s.name() not in enabled_suites(options.preset):
@@ -246,9 +316,9 @@
except Exception as e:
if options.exit_on_failure:
raise e
failures[s.name()] = f"Suite setup failure: {e}"
failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
log.error(
f"{type(s).__name__} setup failed. Benchmarks won't be added."
f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
)
log.error(f"failed: {e}")
else:
@@ -265,19 +335,23 @@
if options.exit_on_failure:
raise e
else:
failures[benchmark.name()] = f"Benchmark setup failure: {e}"
failures[benchmark.name()] = (
f"Benchmark '{benchmark.name()}' setup failure: {e}"
)
log.error(f"failed: {e}")

results = []
if benchmarks:
log.info(f"Running {len(benchmarks)} benchmarks...")
execution_stats["total_tests"] = len(benchmarks)
elif not options.dry_run:
raise RuntimeError("No benchmarks to run.")
for benchmark in benchmarks:
try:
merged_env_vars = {**additional_env_vars}
intermediate_results: dict[str, list[Result]] = {}
processed: list[Result] = []
iterations_rc = False

# Determine if we should run regular benchmarks
# Run regular benchmarks if:
@@ -292,7 +366,7 @@

if should_run_regular:
for _ in range(options.iterations_stddev):
run_iterations(
iterations_rc = run_iterations(
benchmark,
merged_env_vars,
options.iterations,
@@ -301,7 +375,9 @@
run_trace=TracingType.NONE,
)
valid, processed = process_results(
intermediate_results, benchmark.stddev_threshold()
intermediate_results,
benchmark.stddev_threshold(),
execution_stats,
)
if valid:
break
@@ -310,7 +386,7 @@
if options.unitrace and (
benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
):
run_iterations(
iterations_rc = run_iterations(
benchmark,
merged_env_vars,
1,
@@ -324,7 +400,7 @@
benchmark.traceable(TracingType.FLAMEGRAPH)
or args.flamegraph == "force"
):
run_iterations(
iterations_rc = run_iterations(
benchmark,
merged_env_vars,
1,
@@ -335,11 +411,18 @@
)

results += processed
if iterations_rc:
execution_stats["tests_passed"] += 1
else:
execution_stats["tests_failed"] += 1
except Exception as e:
execution_stats["tests_failed"] += 1
if options.exit_on_failure:
raise e
else:
failures[benchmark.name()] = f"Benchmark run failure: {e}"
failures[benchmark.name()] = (
f"Benchmark '{benchmark.name()}' run failure: {e}"
)
log.error(f"failed: {e}")

this_name = options.current_run_name
@@ -408,6 +491,10 @@ def main(directory, additional_env_vars, compare_names, filter):
generate_html(history, compare_names, html_path, metadata)
log.info(f"HTML with benchmark results has been generated")

# Generate GitHub summary
if options.produce_github_summary:
generate_github_summary(execution_stats, failures)

if options.exit_on_failure and failures:
# just in case code missed to raise earlier
raise RuntimeError(str(failures))
@@ -691,6 +778,12 @@ def validate_and_parse_env_args(env_args):
help="Set the logging level",
default="info",
)
parser.add_argument(
"--produce-github-summary",
action="store_true",
help=f"Produce an execution-stats summary for the GitHub workflow, written to '{options.github_summary_filename}'.",
default=False,
)

args = parser.parse_args()
additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +817,7 @@
options.flamegraph = args.flamegraph is not None
options.archive_baseline_days = args.archive_baseline_after
options.archive_pr_days = args.archive_pr_after
options.produce_github_summary = args.produce_github_summary

# Initialize logger with command line arguments
initialize_logger(args.verbose, args.log_level)
@@ -738,6 +832,14 @@
parser.error("Specified --output-dir is not a valid path")
options.output_directory = os.path.abspath(args.output_dir)

# Initialize GitHub summary tracking
execution_stats = {
"total_tests": 0,
"tests_passed": 0,
"tests_failed": 0,
"warnings": 0,
}

# Options intended for CI:
options.timestamp_override = args.timestamp_override
if args.results_dir is not None:
@@ -780,6 +882,7 @@
options.device_architecture = ""
log.warning(f"Failed to fetch device architecture: {e}")
log.warning("Defaulting to generic benchmark parameters.")
execution_stats["warnings"] += 1

log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +891,5 @@
additional_env_vars,
args.compare,
benchmark_filter,
execution_stats,
)
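To recap the main.py changes: `execution_stats` is a plain dict created at the entry point, threaded through `main()` and `process_results()`, and updated per benchmark from the bool that `run_iterations()` now returns; `generate_github_summary()` finally renders it together with `failures`. A trimmed-down, self-contained sketch of that bookkeeping, using hypothetical stand-in names (`FakeBenchmark`, `run_one`) rather than the real classes:

```python
from dataclasses import dataclass

# Stand-ins for the real benchmark objects; names here are hypothetical.
@dataclass
class FakeBenchmark:
    name: str
    ok: bool = True

    def run(self) -> bool:  # plays the role of run_iterations(...)
        return self.ok

def run_one(benchmark: FakeBenchmark, stats: dict, failures: dict) -> None:
    try:
        ok = benchmark.run()
    except Exception as e:  # run errors are recorded, not re-raised
        stats["tests_failed"] += 1
        failures[benchmark.name] = f"Benchmark '{benchmark.name}' run failure: {e}"
        return
    if ok:
        stats["tests_passed"] += 1
    else:
        stats["tests_failed"] += 1

stats = {"total_tests": 2, "tests_passed": 0, "tests_failed": 0, "warnings": 0}
failures: dict[str, str] = {}
for b in (FakeBenchmark("api_overhead"), FakeBenchmark("submit_kernel", ok=False)):
    run_one(b, stats, failures)
# stats -> {'total_tests': 2, 'tests_passed': 1, 'tests_failed': 1, 'warnings': 0}
```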
3 changes: 2 additions & 1 deletion devops/scripts/benchmarks/options.py
@@ -97,7 +97,8 @@ class Options:
# CI scripts vs SYCL build source.
github_repo_override: str = None
git_commit_override: str = None
# Filename used to store Github summary files:
# Flag and filename used for the GitHub summary file:
produce_github_summary: bool = False
github_summary_filename: str = "github_summary.md"
# Archiving settings
# Archived runs are stored separately from the main dataset but are still accessible
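Taken together, the three files wire one new flag end to end: `options.py` adds `produce_github_summary` (default `False`) next to the existing `github_summary_filename`, `main.py` maps `--produce-github-summary` onto it, and summary generation is gated on the flag. A minimal standalone sketch of that pattern, mirroring the field names above in a reduced `Options` dataclass:

```python
import argparse
from dataclasses import dataclass

# Reduced Options dataclass mirroring only the fields relevant here.
@dataclass
class Options:
    produce_github_summary: bool = False
    github_summary_filename: str = "github_summary.md"

options = Options()

parser = argparse.ArgumentParser()
parser.add_argument(
    "--produce-github-summary",
    action="store_true",
    default=False,
    help=f"Write a workflow summary to '{options.github_summary_filename}'.",
)
args = parser.parse_args(["--produce-github-summary"])  # sample CLI input
options.produce_github_summary = args.produce_github_summary

if options.produce_github_summary:
    with open(options.github_summary_filename, "w") as f:
        f.write("### Benchmarks Execution\n")
```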