From 1b42ee8ab5bc4991f4426d17da5b5c4868a29e4e Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 16:53:55 +0100
Subject: [PATCH 1/8] feat: evaluation harness to capture messages and
 serialize them as a single JSON object

---
 EvaluationHarness/.gitignore                  |   1 +
 EvaluationHarness/EvaluationHarness.lean      |   3 +
 .../EvaluationHarness/Basic.lean              | 107 ++++++++++++++++++
 .../EvaluationHarness/Tests.lean              |  20 ++++
 EvaluationHarness/Main.lean                   |   4 +
 EvaluationHarness/README.md                   |   1 +
 EvaluationHarness/lakefile.toml               |   6 +
 EvaluationHarness/lean-toolchain              |   1 +
 lake-manifest.json                            |   7 ++
 lakefile.toml                                 |   4 +
 10 files changed, 154 insertions(+)
 create mode 100644 EvaluationHarness/.gitignore
 create mode 100644 EvaluationHarness/EvaluationHarness.lean
 create mode 100644 EvaluationHarness/EvaluationHarness/Basic.lean
 create mode 100644 EvaluationHarness/EvaluationHarness/Tests.lean
 create mode 100644 EvaluationHarness/Main.lean
 create mode 100644 EvaluationHarness/README.md
 create mode 100644 EvaluationHarness/lakefile.toml
 create mode 100644 EvaluationHarness/lean-toolchain

diff --git a/EvaluationHarness/.gitignore b/EvaluationHarness/.gitignore
new file mode 100644
index 0000000000..bfb30ec8c7
--- /dev/null
+++ b/EvaluationHarness/.gitignore
@@ -0,0 +1 @@
+/.lake
diff --git a/EvaluationHarness/EvaluationHarness.lean b/EvaluationHarness/EvaluationHarness.lean
new file mode 100644
index 0000000000..abb0242cd4
--- /dev/null
+++ b/EvaluationHarness/EvaluationHarness.lean
@@ -0,0 +1,3 @@
+-- This module serves as the root of the `EvaluationHarness` library.
+-- Import modules here that should be built as part of the library.
+import EvaluationHarness.Basic
diff --git a/EvaluationHarness/EvaluationHarness/Basic.lean b/EvaluationHarness/EvaluationHarness/Basic.lean
new file mode 100644
index 0000000000..d9aca99892
--- /dev/null
+++ b/EvaluationHarness/EvaluationHarness/Basic.lean
@@ -0,0 +1,107 @@
+import Lean
+
+/-!
+# Evaluation Harness
+
+This file defines the `#evaluation in ...` command wrapper, for easier evaluation.
+-/
+namespace EvaluationHarness
+
+open Lean
+open Elab Command
+
+/--
+`SimpleMessage` contains the subset of fields from `SerialMessage` that we
+wish to include in the logged output.
+-/
+protected structure SimpleMessage where
+  severity: MessageSeverity
+  data: String
+  caption: String
+  pos: Position
+deriving ToJson, FromJson
+
+protected structure Messages where
+  messages : List SerialMessage
+
+open EvaluationHarness (Messages SimpleMessage)
+
+/-! ## Message Collection -/
+
+/--
+Given a monadic action `cmd : CommandElabM Unit`, run the action, collect all
+messages (trace, info, warning and error messages) that the action logged, and
+return those. The collected messages are *not* left in the environment.
+-/
+protected def collectMessages (runCmd : CommandElabM Unit) : CommandElabM Messages := do
+  -- NOTE: the following is largely copied from the `#guard_msgs in` command,
+  --       with some slight adaptations
+
+  let initMsgs ← modifyGet fun st => (st.messages, { st with messages := {} })
+  -- do not forward snapshot as we don't want messages assigned to it to leak outside
+  withReader ({ · with snap? := none }) runCmd
+  -- collect sync and async messages
+  let msgs := (← get).messages ++
+    (← get).snapshotTasks.foldl (· ++ ·.get.getAll.foldl (· ++ ·.diagnostics.msgLog) default) {}
+  -- clear async messages as we don't want them to leak outside, and
+  -- reset messages to initial state
+  modify ({ · with
+    snapshotTasks := #[]
+    messages := initMsgs
+  })
+  let messages ← msgs.toList
+    |>.filter (!·.isSilent)
+    |>.mapM (·.serialize)
+  return { messages }
+open EvaluationHarness (collectMessages)
+
+def Messages.toSimple (self : Messages) : List SimpleMessage :=
+  self.messages.map fun m => { m with }
+
+def Messages.toJson (self : Messages) : Json :=
+  ToJson.toJson self.toSimple
+
+/-! ## EvaluationInfo -/
+
+structure EvaluationInfo where
+  filename: String
+  /-- The line number on which `#evaluation in cmd` occurred. -/
+  line: Nat
+  /-- The collected messages generated by command `cmd`. -/
+  messages: List SimpleMessage
+  -- TODO: `data` field for more structured data reporting
+  /--
+  The name of the elaborated definition/theorem, or `none` if the command
+  was neither definition nor theorem.
+  -/
+  name : Option String
+deriving ToJson, FromJson
+
+open Lean.Parser.Command in
+/--
+If `cmd` is a `def` or `theorem`, return the name of the defininition/theorem.
+-/
+def getDefLikeName? (cmd : TSyntax `command) : Option String :=
+  let raw := cmd.raw
+  let kind := raw.getKind
+  if kind == ``declaration then
+    let name := raw[1]![1]![0]!.getId
+    some <| name.eraseMacroScopes.toString
+  else
+    none
+
+/--
+`#evalation in cmd` captures the messages generated by the command `cmd`, and
+serializes them into a single json object, together with some extra metadata
+such as the name of the current file.
+-/
+elab "#evaluation" " in " cmd:command : command => do
+  let msgs ← collectMessages do
+    elabCommand cmd
+  let info : EvaluationInfo := {
+    filename := ← getFileName
+    line := (← getRefPosition).line
+    messages := msgs.toSimple
+    name := getDefLikeName? cmd
+  }
+  IO.println s!"{toJson info}"
diff --git a/EvaluationHarness/EvaluationHarness/Tests.lean b/EvaluationHarness/EvaluationHarness/Tests.lean
new file mode 100644
index 0000000000..8f6fd895b9
--- /dev/null
+++ b/EvaluationHarness/EvaluationHarness/Tests.lean
@@ -0,0 +1,20 @@
+import EvaluationHarness.Basic
+
+namespace EvaluationHarness.Tests
+
+/-! ## getDefLikeName -/
+
+/-- info: some "foo" -/
+#guard_msgs in #eval do
+  let cmd ← `(command| theorem foo : False := sorry)
+  return getDefLikeName cmd
+
+/-- info: some "foo.bar" -/
+#guard_msgs in #eval do
+  let cmd ← `(command| theorem foo.bar : False := sorry)
+  return getDefLikeName cmd
+
+/-- info: some "baz" -/
+#guard_msgs in #eval do
+  let cmd ← `(command| def baz : False := sorry)
+  return getDefLikeName cmd
diff --git a/EvaluationHarness/Main.lean b/EvaluationHarness/Main.lean
new file mode 100644
index 0000000000..d4cd5f99e3
--- /dev/null
+++ b/EvaluationHarness/Main.lean
@@ -0,0 +1,4 @@
+import EvaluationHarness
+
+def main : IO Unit :=
+  IO.println s!"Hello, {hello}!"
diff --git a/EvaluationHarness/README.md b/EvaluationHarness/README.md
new file mode 100644
index 0000000000..9a614dbe70
--- /dev/null
+++ b/EvaluationHarness/README.md
@@ -0,0 +1 @@
+# evaluation-harness
\ No newline at end of file
diff --git a/EvaluationHarness/lakefile.toml b/EvaluationHarness/lakefile.toml
new file mode 100644
index 0000000000..8e761d908b
--- /dev/null
+++ b/EvaluationHarness/lakefile.toml
@@ -0,0 +1,6 @@
+name = "EvaluationHarness"
+version = "0.1.0"
+defaultTargets = ["EvaluationHarness"]
+
+[[lean_lib]]
+name = "EvaluationHarness"
diff --git a/EvaluationHarness/lean-toolchain b/EvaluationHarness/lean-toolchain
new file mode 100644
index 0000000000..a835e7cb3c
--- /dev/null
+++ b/EvaluationHarness/lean-toolchain
@@ -0,0 +1 @@
+leanprover/lean4-nightly:nightly-2025-08-11
diff --git a/lake-manifest.json b/lake-manifest.json
index dd9f56bbe9..6b03b4978a 100644
--- a/lake-manifest.json
+++ b/lake-manifest.json
@@ -31,6 +31,13 @@
    "inputRev": "nightly-testing-2025-08-11",
    "inherited": false,
    "configFile": "lakefile.lean"},
+  {"type": "path",
+   "scope": "",
+   "name": "EvaluationHarness",
+   "manifestFile": "lake-manifest.json",
+   "inherited": false,
+   "dir": "EvaluationHarness",
+   "configFile": "lakefile.toml"},
   {"type": "path",
    "scope": "",
    "name": "Blase",
diff --git a/lakefile.toml b/lakefile.toml
index 4f2bdf2131..bbd68e2789 100644
--- a/lakefile.toml
+++ b/lakefile.toml
@@ -14,6 +14,10 @@ name = "Blase"
 path = "Blase"
 rev = "main"
 
+[[require]]
+name = "EvaluationHarness"
+path = "EvaluationHarness"
+
 [[require]]
 name = "mathlib"
 git = "https://github.com/leanprover-community/mathlib4-nightly-testing"

From 8668292fb6f98665d6682f7b895c37ca094ded7f Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 12:23:57 +0100
Subject: [PATCH 2/8] move and fix tests

---
 .../EvaluationHarness/{Tests.lean => Tests/Basic.lean}      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename EvaluationHarness/EvaluationHarness/{Tests.lean => Tests/Basic.lean} (82%)

diff --git a/EvaluationHarness/EvaluationHarness/Tests.lean b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
similarity index 82%
rename from EvaluationHarness/EvaluationHarness/Tests.lean
rename to EvaluationHarness/EvaluationHarness/Tests/Basic.lean
index 8f6fd895b9..0efa08d62c 100644
--- a/EvaluationHarness/EvaluationHarness/Tests.lean
+++ b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
@@ -7,14 +7,14 @@ namespace EvaluationHarness.Tests
 /-- info: some "foo" -/
 #guard_msgs in #eval do
   let cmd ← `(command| theorem foo : False := sorry)
-  return getDefLikeName cmd
+  return getDefLikeName? cmd
 
 /-- info: some "foo.bar" -/
 #guard_msgs in #eval do
   let cmd ← `(command| theorem foo.bar : False := sorry)
-  return getDefLikeName cmd
+  return getDefLikeName? cmd
 
 /-- info: some "baz" -/
 #guard_msgs in #eval do
   let cmd ← `(command| def baz : False := sorry)
-  return getDefLikeName cmd
+  return getDefLikeName? cmd

From 4d590947482437d9aef2083133166422c62c5a8c Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 13:33:07 +0100
Subject: [PATCH 3/8] feat: add `strategy` field

---
 .../EvaluationHarness/Basic.lean              | 58 ++++++++++++++++---
 .../EvaluationHarness/Tests/Basic.lean        | 13 +++++
 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/EvaluationHarness/EvaluationHarness/Basic.lean b/EvaluationHarness/EvaluationHarness/Basic.lean
index d9aca99892..217ab2ac3b 100644
--- a/EvaluationHarness/EvaluationHarness/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Basic.lean
@@ -7,8 +7,7 @@ This file defines the `#evaluation in ...` command wrapper, for easier evaluatio
 -/
 namespace EvaluationHarness
 
-open Lean
-open Elab Command
+open Lean Meta Elab Command
 
 /--
 `SimpleMessage` contains the subset of fields from `SerialMessage` that we
@@ -61,6 +60,31 @@ def Messages.toSimple (self : Messages) : List SimpleMessage :=
 def Messages.toJson (self : Messages) : Json :=
   ToJson.toJson self.toSimple
 
+/-! ## EvaluationConfig -/
+
+structure EvaluationConfig where
+  strategy: Option String := none
+  outputAsLog: Bool := false
+deriving Inhabited
+
+syntax evalConfigElem :=
+  ("strategy" " := " str)
+syntax evalConfig := "(" evalConfigElem,* ")"
+
+def parseEvaluationConfig (cfgElems : TSyntax ``evalConfig) : EvaluationConfig := Id.run <| do
+  let mut cfg := { }
+  let `(evalConfig| ( $cfgElems,* ) ) := cfgElems
+    | return cfg
+  for cfgElem in cfgElems.getElems do
+    cfg := match cfgElem with
+    | `(evalConfigElem| strategy := $s) =>
+        { cfg with strategy := some s.getString }
+    | _ =>
+        panic! "Unknown configuration option: {elem}"
+
+  return cfg
+
+
 /-! ## EvaluationInfo -/
 
 structure EvaluationInfo where
@@ -69,12 +93,19 @@ structure EvaluationInfo where
   line: Nat
   /-- The collected messages generated by command `cmd`. -/
   messages: List SimpleMessage
+  /-- Whether any errors were raised by `cmd`. -/
+  hasErrors: Bool
   -- TODO: `data` field for more structured data reporting
   /--
   The name of the elaborated definition/theorem, or `none` if the command
   was neither definition nor theorem.
   -/
   name : Option String
+  /--
+  An optional user-defined string that can be embedded in the output,
+  see `EvaluationConfig` for details.
+  -/
+  strategy : Option String
 deriving ToJson, FromJson
 
 open Lean.Parser.Command in
@@ -90,18 +121,29 @@ def getDefLikeName? (cmd : TSyntax `command) : Option String :=
   else
     none
 
-/--
-`#evalation in cmd` captures the messages generated by the command `cmd`, and
-serializes them into a single json object, together with some extra metadata
-such as the name of the current file.
--/
-elab "#evaluation" " in " cmd:command : command => do
+def evalEvaluation (cmd : TSyntax `command) (config : EvaluationConfig) : CommandElabM Unit := do
   let msgs ← collectMessages do
     elabCommand cmd
   let info : EvaluationInfo := {
     filename := ← getFileName
     line := (← getRefPosition).line
     messages := msgs.toSimple
+    hasErrors := msgs.messages.any (·.severity == .error)
     name := getDefLikeName? cmd
+    strategy := config.strategy
   }
   IO.println s!"{toJson info}"
+
+/--
+`#evalation in cmd` captures the messages generated by the command `cmd`, and
+serializes them into a single json object, together with some extra metadata
+such as the name of the current file.
+
+See `EvluationConfig` for details on extra configuration options.
+-/
+syntax "#evaluation" (evalConfig)? "in" command : command
+elab_rules : command
+| `(#evaluation in $cmd:command) => evalEvaluation cmd { }
+| `(#evaluation $cfg in $cmd:command) => do
+    let cfg := parseEvaluationConfig cfg
+    evalEvaluation cmd cfg
diff --git a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
index 0efa08d62c..87e2ef9f66 100644
--- a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
@@ -18,3 +18,16 @@ namespace EvaluationHarness.Tests
 #guard_msgs in #eval do
   let cmd ← `(command| def baz : False := sorry)
   return getDefLikeName? cmd
+
+/-! ## `#evaluation`-/
+
+/-
+NOTE: `#guard_msgs` doesn't work, presumably because we output the data
+with `IO.println` instead of `logInfo` or the like.
+-/
+
+#evaluation in
+  def foo : False := sorry
+
+#evaluation (strategy := "barStrat") in
+  def foo : False := sorry

From 3b40bbf4c046316d56acc2d5d8e2852b70e4d601 Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 14:04:26 +0100
Subject: [PATCH 4/8] WIP

---
 .../EvaluationHarness/Tests/Basic.lean        |  6 +-
 .../InstCombine/tests/proofs/gand_proof.lean  | 64 ++++++++++++++++++-
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
index 87e2ef9f66..e70358e674 100644
--- a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
@@ -30,4 +30,8 @@ with `IO.println` instead of `logInfo` or the like.
   def foo : False := sorry
 
 #evaluation (strategy := "barStrat") in
-  def foo : False := sorry
+  def bar : False := sorry
+
+#evaluation in
+  def shouldError : False := by
+    fail
diff --git a/SSA/Projects/InstCombine/tests/proofs/gand_proof.lean b/SSA/Projects/InstCombine/tests/proofs/gand_proof.lean
index 86d369e89c..9822a0a3f2 100644
--- a/SSA/Projects/InstCombine/tests/proofs/gand_proof.lean
+++ b/SSA/Projects/InstCombine/tests/proofs/gand_proof.lean
@@ -1,5 +1,67 @@
 
-import SSA.Projects.InstCombine.TacticAuto
+
+import SSA.Projects.InstCombine.Tactic.SimpLLVM
+import SSA.Projects.InstCombine.LLVM.Semantics
+
+import SSA.Core.Tactic.ExtractGoals
+
+open BitVec
+open LLVM
+
+set_option linter.unusedTactic false
+set_option linter.unreachableTactic false
+set_option maxHeartbeats 5000000
+set_option maxRecDepth 1000000
+set_option Elab.async false
+
+section gand_proof
+theorem test_with_1_thm (e : IntW 32) :
+  LLVM.and (shl (const? 32 1) e) (const? 32 1) ⊑ zext 32 (icmp IntPred.eq e (const? 32 0)) := by
+    simp_alive_undef
+    simp_alive_ops
+    simp_alive_case_bash
+    simp_alive_split
+    extract_goals
+    all_goals sorry
+
+
+theorem test_with_3_thm (e : IntW 32) :
+  LLVM.and (shl (const? 32 3) e) (const? 32 1) ⊑ zext 32 (icmp IntPred.eq e (const? 32 0)) := by
+    simp_alive_undef
+    simp_alive_ops
+    simp_alive_case_bash
+    simp_alive_split
+    extract_goals
+    all_goals sorry
+
+
+theorem test_with_5_thm (e : IntW 32) :
+  LLVM.and (shl (const? 32 5) e) (const? 32 1) ⊑ zext 32 (icmp IntPred.eq e (const? 32 0)) := by
+    simp_alive_undef
+    simp_alive_ops
+    simp_alive_case_bash
+    simp_alive_split
+    extract_goals
+    all_goals sorry
+
+
+theorem test_with_neg_5_thm (e : IntW 32) :
+  LLVM.and (shl (const? 32 (-5)) e) (const? 32 1) ⊑ zext 32 (icmp IntPred.eq e (const? 32 0)) := by
+    simp_alive_undef
+    simp_alive_ops
+    simp_alive_case_bash
+    simp_alive_split
+    extract_goals
+    all_goals sorry
+
+
+theorem test_with_even_thm (e : IntW 32) : LLVM.and (shl (const? 32 4) e) (const? 32 1) ⊑ const? 32 0 := by
+    simp_alive_undef
+    simp_alive_ops
+    simp_alive_case_bash
+    simp_alive_split
+    extract_goals
+    all_goals sorry
 import SSA.Projects.InstCombine.LLVM.Semantics
 open BitVec
 open LLVM

From 805c3c46ae443dcdf68db752f00df549940e7e10 Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 14:04:37 +0100
Subject: [PATCH 5/8] WIP

---
 .../EvaluationHarness/Tests/ExampleFile.lean  | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 EvaluationHarness/EvaluationHarness/Tests/ExampleFile.lean

diff --git a/EvaluationHarness/EvaluationHarness/Tests/ExampleFile.lean b/EvaluationHarness/EvaluationHarness/Tests/ExampleFile.lean
new file mode 100644
index 0000000000..5fe31a5396
--- /dev/null
+++ b/EvaluationHarness/EvaluationHarness/Tests/ExampleFile.lean
@@ -0,0 +1,19 @@
+import EvaluationHarness.Basic
+import Std.Tactic.BVDecide
+
+open BitVec
+
+#evaluation in
+theorem test_with_1_thm.extracted_1._1 : ∀ (x : BitVec 32),
+    ¬x ≥ ↑32 → 1#32 <<< x &&& 1#32 = zeroExtend 32 (ofBool (x == 0#32)) := by
+  bv_decide
+
+#evaluation in
+theorem test_with_3_thm.extracted_1._1 : ∀ (x : BitVec 32),
+    ¬x ≥ ↑32 → 3#32 <<< x &&& 1#32 = zeroExtend 32 (ofBool (x == 0#32)) := by
+  bv_decide
+
+#evaluation in
+theorem test_with_5_thm.extracted_1._1 : ∀ (x : BitVec 32),
+    ¬x ≥ ↑32 → 5#32 <<< x &&& 1#32 = zeroExtend 32 (ofBool (x == 0#32)) := by
+  fail

From bff1a741ff892fc31e2647a08fd96b275540a52c Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 16:52:42 +0100
Subject: [PATCH 6/8] add private/protected def test cases for getDefLikeName

---
 EvaluationHarness/EvaluationHarness/Tests/Basic.lean | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
index e70358e674..eba1f80198 100644
--- a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
@@ -19,6 +19,16 @@ namespace EvaluationHarness.Tests
   let cmd ← `(command| def baz : False := sorry)
   return getDefLikeName? cmd
 
+/-- info: some "privateBaz" -/
+#guard_msgs in #eval do
+  let cmd ← `(command| private def privateBaz : False := sorry)
+  return getDefLikeName? cmd
+
+/-- info: some "protBaz" -/
+#guard_msgs in #eval do
+  let cmd ← `(command| protected def protBaz : False := sorry)
+  return getDefLikeName? cmd
+
 /-! ## `#evaluation`-/
 
 /-

From db4f14c792389ea2052f65b1e881608bf15762cc Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Wed, 24 Sep 2025 17:53:27 +0100
Subject: [PATCH 7/8] add & implement various options

some options suggested by @bollu, such as omitting the details of messages and including walltime, and some other options to make it easier to run tests
---
 .../EvaluationHarness/Basic.lean              | 79 +++++++++++++++----
 .../EvaluationHarness/Options.lean            | 34 ++++++++
 .../EvaluationHarness/Tests/Basic.lean        | 68 ++++++++++++++--
 3 files changed, 158 insertions(+), 23 deletions(-)
 create mode 100644 EvaluationHarness/EvaluationHarness/Options.lean

diff --git a/EvaluationHarness/EvaluationHarness/Basic.lean b/EvaluationHarness/EvaluationHarness/Basic.lean
index 217ab2ac3b..d65413ec5b 100644
--- a/EvaluationHarness/EvaluationHarness/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Basic.lean
@@ -1,5 +1,7 @@
 import Lean
 
+import EvaluationHarness.Options
+
 /-!
 # Evaluation Harness
 
@@ -32,13 +34,13 @@ Given a monadic action `cmd : CommandElabM Unit`, run the action, collect all
 messages (trace, info, warning and error messages) that the action logged, and
 return those. The collected messages are *not* left in the environment.
 -/
-protected def collectMessages (runCmd : CommandElabM Unit) : CommandElabM Messages := do
+protected def collectMessages (runCmd : CommandElabM α) : CommandElabM (α × Messages) := do
   -- NOTE: the following is largely copied from the `#guard_msgs in` command,
   --       with some slight adaptations
 
   let initMsgs ← modifyGet fun st => (st.messages, { st with messages := {} })
   -- do not forward snapshot as we don't want messages assigned to it to leak outside
-  withReader ({ · with snap? := none }) runCmd
+  let ret ← withReader ({ · with snap? := none }) runCmd
   -- collect sync and async messages
   let msgs := (← get).messages ++
     (← get).snapshotTasks.foldl (· ++ ·.get.getAll.foldl (· ++ ·.diagnostics.msgLog) default) {}
@@ -51,20 +53,24 @@ protected def collectMessages (runCmd : CommandElabM Unit) : CommandElabM Messag
   let messages ← msgs.toList
     |>.filter (!·.isSilent)
     |>.mapM (·.serialize)
-  return { messages }
+  return ⟨ret, { messages }⟩
 open EvaluationHarness (collectMessages)
 
-def Messages.toSimple (self : Messages) : List SimpleMessage :=
-  self.messages.map fun m => { m with }
-
-def Messages.toJson (self : Messages) : Json :=
-  ToJson.toJson self.toSimple
+variable {m} [Monad m] [MonadOptions m] in
+def Messages.toSimple (self : Messages) : m (Option <| List SimpleMessage) := do
+  if (← getBoolOption `evaluation.includeMessages true) then
+    let includePosition ← getBoolOption `evaluation.includePosition true
+    some <$> self.messages.mapM fun m =>
+      return { m with
+        pos := if includePosition then m.pos else ⟨0, 0⟩
+      }
+  else
+    return none
 
 /-! ## EvaluationConfig -/
 
 structure EvaluationConfig where
   strategy: Option String := none
-  outputAsLog: Bool := false
 deriving Inhabited
 
 syntax evalConfigElem :=
@@ -91,10 +97,22 @@ structure EvaluationInfo where
   filename: String
   /-- The line number on which `#evaluation in cmd` occurred. -/
   line: Nat
-  /-- The collected messages generated by command `cmd`. -/
-  messages: List SimpleMessage
-  /-- Whether any errors were raised by `cmd`. -/
+  /--
+  The collected messages generated by command `cmd`, or
+  `none` when the `evaluation.includeMessages` option is set to `false`.
+  -/
+  messages: Option (List SimpleMessage)
+  /--
+  Whether any errors were raised by `cmd`.
+
+  NOTE: this is reported regardless of the `includeMessages` option.
+  -/
   hasErrors: Bool
+  /--
+  The time elapsed while elaborating `cmd`, or `none` when the
+  `evaluation.includeWalltime` option is set to `false` (which is the default).
+  -/
+  wallTime : Option Float
   -- TODO: `data` field for more structured data reporting
   /--
   The name of the elaborated definition/theorem, or `none` if the command
@@ -121,18 +139,47 @@ def getDefLikeName? (cmd : TSyntax `command) : Option String :=
   else
     none
 
+/--
+Run `cmd` and return the elapsed wall-time, or run the command but return `none`
+when the `evaluation.includeWalltime` option is set to `false` (which is the
+default).
+-/
+def measureWallTime (cmd : CommandElabM Unit) : CommandElabM (Option Float) := do
+  if (← getBoolOption `evaluation.includeWallTime false) then
+    let t1 ← IO.monoNanosNow
+    cmd
+    let t2 ← IO.monoNanosNow
+    return some (deltaInMs t2 t1)
+  else
+    cmd
+    return none
+where
+  deltaInMs (now past : Nat) : Float :=
+    (Float.ofNat <| now - past) / 1000000.0
+
 def evalEvaluation (cmd : TSyntax `command) (config : EvaluationConfig) : CommandElabM Unit := do
-  let msgs ← collectMessages do
+  let (wallTime, msgs) ← collectMessages <| measureWallTime <| do
     elabCommand cmd
   let info : EvaluationInfo := {
     filename := ← getFileName
-    line := (← getRefPosition).line
-    messages := msgs.toSimple
+    line :=← getRefLine
+    messages := ← msgs.toSimple
     hasErrors := msgs.messages.any (·.severity == .error)
+    wallTime
     name := getDefLikeName? cmd
     strategy := config.strategy
   }
-  IO.println s!"{toJson info}"
+  let info := s!"{toJson info}"
+  if (← getBoolOption `evaluation.outputAsLog) then
+    logInfo info
+  else
+    IO.println info
+where
+  getRefLine {m} [Monad m] [MonadOptions m] [MonadLog m] : m Nat := do
+    if (← getBoolOption `evaluation.includePosition true) then
+      return (← getRefPosition).line
+    else
+      return 0
 
 /--
 `#evalation in cmd` captures the messages generated by the command `cmd`, and
diff --git a/EvaluationHarness/EvaluationHarness/Options.lean b/EvaluationHarness/EvaluationHarness/Options.lean
new file mode 100644
index 0000000000..ef4fb3991b
--- /dev/null
+++ b/EvaluationHarness/EvaluationHarness/Options.lean
@@ -0,0 +1,34 @@
+import Lean
+
+/-!
+# Evaluation Harness Options
+This file defines various options which can be used to influence the behaviour
+of `#evaluation in cmd`.
+-/
+namespace EvaluationHarness
+open Lean
+
+initialize
+  registerOption `evaluation.includeWallTime {
+    defValue := false
+    descr := "Whether to include elapsed wall-time in the output of `#evaluation in cmd`"
+  }
+  registerOption `evaluation.includeMessages {
+    defValue := true
+    descr := "Whether to include a log of all messages in the output of `#evaluation in cmd`.\
+      When set to false, messages are still captured, but then they are discarded \
+      instead of incorporated into the output."
+  }
+
+  registerOption `evaluation.outputAsLog {
+    defValue := false
+    descr := "When set, output is emitted via `logInfo`, rather than `IO.println`"
+  }
+  registerOption `evaluation.includePosition {
+    defValue := true
+    descr := "Whether to include line/column information in the output. \
+      When set to false, all lines and columns are reported as 0. \
+      \n\n\
+      Can be particularly useful to disable in tests, to make the output less
+      brittle to changes in the overal file."
+  }
diff --git a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
index eba1f80198..6bcd781cb6 100644
--- a/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
+++ b/EvaluationHarness/EvaluationHarness/Tests/Basic.lean
@@ -30,18 +30,72 @@ namespace EvaluationHarness.Tests
   return getDefLikeName? cmd
 
 /-! ## `#evaluation`-/
+set_option evaluation.outputAsLog true
+set_option evaluation.includePosition false
 
-/-
-NOTE: `#guard_msgs` doesn't work, presumably because we output the data
-with `IO.println` instead of `logInfo` or the like.
+/--
+info: {"wallTime": null,
+ "strategy": null,
+ "name": "foo",
+ "messages":
+ [{"severity": "warning",
+   "pos": {"line": 0, "column": 0},
+   "data": "declaration uses 'sorry'",
+   "caption": ""}],
+ "line": 0,
+ "hasErrors": false,
+ "filename":
+ "/home/alex/Workspace/PhD/lean-mlir-alt/EvaluationHarness/EvaluationHarness/Tests/Basic.lean"}
 -/
-
-#evaluation in
+#guard_msgs in #evaluation in
   def foo : False := sorry
 
-#evaluation (strategy := "barStrat") in
+/--
+info: {"wallTime": null,
+ "strategy": "barStrat",
+ "name": "bar",
+ "messages":
+ [{"severity": "warning",
+   "pos": {"line": 0, "column": 0},
+   "data": "declaration uses 'sorry'",
+   "caption": ""}],
+ "line": 0,
+ "hasErrors": false,
+ "filename":
+ "/home/alex/Workspace/PhD/lean-mlir-alt/EvaluationHarness/EvaluationHarness/Tests/Basic.lean"}
+-/
+#guard_msgs in #evaluation (strategy := "barStrat") in
   def bar : False := sorry
 
-#evaluation in
+/--
+info: {"wallTime": null,
+ "strategy": null,
+ "name": "shouldError",
+ "messages":
+ [{"severity": "error",
+   "pos": {"line": 0, "column": 0},
+   "data": "Failed: `fail` tactic was invoked\n⊢ False",
+   "caption": ""}],
+ "line": 0,
+ "hasErrors": true,
+ "filename":
+ "/home/alex/Workspace/PhD/lean-mlir-alt/EvaluationHarness/EvaluationHarness/Tests/Basic.lean"}
+-/
+#guard_msgs in #evaluation in
   def shouldError : False := by
     fail
+
+set_option evaluation.includeMessages false in
+/--
+info: {"wallTime": null,
+ "strategy": null,
+ "name": "shouldErrorWithoutMsg",
+ "messages": null,
+ "line": 0,
+ "hasErrors": true,
+ "filename":
+ "/home/alex/Workspace/PhD/lean-mlir-alt/EvaluationHarness/EvaluationHarness/Tests/Basic.lean"}
+-/
+#guard_msgs in #evaluation in
+  def shouldErrorWithoutMsg : False := by
+    fail

From 75dbcf14dd02e6969ad0c0f866ea27c7dc425345 Mon Sep 17 00:00:00 2001
From: Alex Keizer <alex@keizer.dev>
Date: Thu, 25 Sep 2025 13:50:35 +0100
Subject: [PATCH 8/8] ensure lean-toolchain is a symlink

---
 EvaluationHarness/lean-toolchain | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 120000 EvaluationHarness/lean-toolchain

diff --git a/EvaluationHarness/lean-toolchain b/EvaluationHarness/lean-toolchain
deleted file mode 100644
index a835e7cb3c..0000000000
--- a/EvaluationHarness/lean-toolchain
+++ /dev/null
@@ -1 +0,0 @@
-leanprover/lean4-nightly:nightly-2025-08-11
diff --git a/EvaluationHarness/lean-toolchain b/EvaluationHarness/lean-toolchain
new file mode 120000
index 0000000000..6f68e61b37
--- /dev/null
+++ b/EvaluationHarness/lean-toolchain
@@ -0,0 +1 @@
+../lean-toolchain
\ No newline at end of file