diff --git a/bv-evaluation/collect-data-llvm.py b/bv-evaluation/collect-data-llvm.py
index 836dd99830..8d89b2b953 100644
--- a/bv-evaluation/collect-data-llvm.py
+++ b/bv-evaluation/collect-data-llvm.py
@@ -257,12 +257,20 @@
 print("There were "+str(inconsistencies)+" inconsistencies")
 print("Errors raised: "+str(errTot))
 
+shell_command_mismatch = False
+
 # Double Checking Against Grep Output
+# This function sets the global `shell_command_mismatch` to True if the output
+# does not match the expected value.
 def run_shell_command_and_assert_output_eq_int(cwd : str, cmd : str, expected_val : int) -> int:
+    global shell_command_mismatch
+
     response = subprocess.check_output(cmd, shell=True, cwd=cwd, text=True)
     val = int(response)
     failed =  val != expected_val
     failed_str = "FAIL" if failed else "SUCCESS"
+    if failed:
+        shell_command_mismatch = True
     print(f"ran {cmd}, expected {expected_val}, found {val}, {failed_str}")
 
 run_shell_command_and_assert_output_eq_int("results/InstCombine/", "rg 'Bitwuzla failed' | wc -l", both_failed+bw_only_failed)
@@ -305,3 +313,7 @@ def run_shell_command_and_assert_output_eq_int(cwd : str, cmd : str, expected_va
 df.to_csv(raw_data_dir+'llvm-proved-data.csv')
 df_ceg.to_csv(raw_data_dir+'llvm-ceg-data.csv')
 
+# If any of the shell commands were not as expected, set a non-zero exit code
+# to signal this failure (in particular, making CI fail).
+if shell_command_mismatch:
+    exit(1)
\ No newline at end of file
diff --git a/bv-evaluation/results/InstCombine/.placeholder b/bv-evaluation/results/InstCombine/.placeholder
deleted file mode 100644
index e69de29bb2..0000000000