From 1e69a7795d956d7f71e2b2ab3c1eae05351ea4a5 Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 25 Jun 2025 12:49:18 +0100
Subject: [PATCH 1/2] feat: make (fixed-width) evaluation return a non-zero
 exit code on mismatch

This PR changes the evaluation script to exit with a non-zero exit code
whenever one of the ripgrep checks finds a mismatched number (i.e.,
whenever it currently prints FAIL to the log). This exit code should
cause CI to fail, meaning we no longer have to manually check the logs
to confirm that the evaluation actually produced the expected numbers.

The trade-off is that whenever we do expect the numbers to change, we
have to update the expected values in the script before CI can pass.
Forcing us to keep these numbers in sync with reality seems more like
a feature than a bug, though.
---
 bv-evaluation/collect-data-llvm.py             | 8 ++++++++
 bv-evaluation/results/InstCombine/.placeholder | 0
 2 files changed, 8 insertions(+)
 delete mode 100644 bv-evaluation/results/InstCombine/.placeholder

diff --git a/bv-evaluation/collect-data-llvm.py b/bv-evaluation/collect-data-llvm.py
index 836dd99830..32143a38fd 100644
--- a/bv-evaluation/collect-data-llvm.py
+++ b/bv-evaluation/collect-data-llvm.py
@@ -257,7 +257,11 @@
 print("There were "+str(inconsistencies)+" inconsistencies")
 print("Errors raised: "+str(errTot))
 
+shell_command_mismatch = False
+
 # Double Checking Against Grep Output
+# This function sets the global `shell_command_mismatch` to True if the output
+# does not match the expected value.
 def run_shell_command_and_assert_output_eq_int(cwd : str, cmd : str, expected_val : int) -> int:
     response = subprocess.check_output(cmd, shell=True, cwd=cwd, text=True)
     val = int(response)
@@ -305,3 +309,7 @@ def run_shell_command_and_assert_output_eq_int(cwd : str, cmd : str, expected_va
 df.to_csv(raw_data_dir+'llvm-proved-data.csv')
 df_ceg.to_csv(raw_data_dir+'llvm-ceg-data.csv')
 
+# If any of the shell commands were not as expected, set a non-zero exit code
+# to signal this failure (in particular, making CI fail).
+if shell_command_mismatch:
+    exit(1)
\ No newline at end of file
diff --git a/bv-evaluation/results/InstCombine/.placeholder b/bv-evaluation/results/InstCombine/.placeholder
deleted file mode 100644
index e69de29bb2..0000000000

From ff7dd03b3889ecc63635da3aee4a70aa93faea97 Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 25 Jun 2025 15:10:14 +0100
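Subject: [PATCH 2/2] fix: actually modify the global exit code var

PATCH 1/2 introduced the `shell_command_mismatch` flag and the final
`exit(1)` check, but nothing ever set the flag, so a mismatch could
never trigger the non-zero exit. Declare the flag `global` inside the
check function and set it whenever a check fails.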

---
 bv-evaluation/collect-data-llvm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bv-evaluation/collect-data-llvm.py b/bv-evaluation/collect-data-llvm.py
index 32143a38fd..8d89b2b953 100644
--- a/bv-evaluation/collect-data-llvm.py
+++ b/bv-evaluation/collect-data-llvm.py
@@ -263,10 +263,14 @@
 # This function sets the global `shell_command_mismatch` to True if the output
 # does not match the expected value.
 def run_shell_command_and_assert_output_eq_int(cwd : str, cmd : str, expected_val : int) -> int:
+    global shell_command_mismatch
+
     response = subprocess.check_output(cmd, shell=True, cwd=cwd, text=True)
     val = int(response)
     failed =  val != expected_val
     failed_str = "FAIL" if failed else "SUCCESS"
+    if failed:
+        shell_command_mismatch = True
     print(f"ran {cmd}, expected {expected_val}, found {val}, {failed_str}")
 
 run_shell_command_and_assert_output_eq_int("results/InstCombine/", "rg 'Bitwuzla failed' | wc -l", both_failed+bw_only_failed)