From 91d87daef61b1d3bf84cacb10a002f75291b83b9 Mon Sep 17 00:00:00 2001
From: Muhammad Saqlain <2mesaqlain@gmail.com>
Date: Fri, 26 Dec 2025 01:42:09 +0500
Subject: [PATCH 1/2] Test: Assert specific evaluation scores for sample data

---
 tests/test_main.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_main.py b/tests/test_main.py
index 4aea9480a..e11db2307 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -524,6 +524,8 @@ def test_evaluate(m, tmpdir):
 
     df = pd.read_csv(csv_file)
     assert results["results"].shape[0] == df.shape[0]
+    assert results["box_precision"] == pytest.approx(0.8, abs=0.01)
+    assert results["box_recall"] == pytest.approx(0.7213, abs=0.01)
 
 
 def test_train_callbacks(m):

From a9b3b5e7b75db673b8b70f29eb12ad478d0d5219 Mon Sep 17 00:00:00 2001
From: Muhammad Saqlain <2mesaqlain@gmail.com>
Date: Thu, 8 Jan 2026 06:09:35 +0500
Subject: [PATCH 2/2] Refactor: Move strict eval checks to benchmark test and
 relax unit test

---
 tests/test_benchmark.py | 20 ++++++++++++++++++++
 tests/test_main.py      | 13 +++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_benchmark.py

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
new file mode 100644
index 000000000..0237670ca
--- /dev/null
+++ b/tests/test_benchmark.py
@@ -0,0 +1,20 @@
+import pytest
+from deepforest import main, get_data
+
+def test_benchmark_release():
+    """
+    Benchmark test to ensure the specific release version of the model
+    produces consistent results.
+    """
+    # Load the model pinned to a specific revision (commit SHA).
+    release_sha = "cc21436bc5d572dde8ff5f93c1e71a32f563cace"
+
+    m = main.deepforest()
+    m.load_model("weecology/deepforest-tree", revision=release_sha)
+
+    csv_file = get_data("OSBS_029.csv")
+    results = m.evaluate(csv_file, iou_threshold=0.4)
+
+    # Strict assertions: these scores are the benchmark for the pinned revision.
+    assert results["box_precision"] == pytest.approx(0.8, abs=0.01)
+    assert results["box_recall"] == pytest.approx(0.7213, abs=0.01)
diff --git a/tests/test_main.py b/tests/test_main.py
index e11db2307..4223a53b6 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -511,12 +511,16 @@ def test_predict_tile_from_array(m, path):
 
     assert not prediction.empty
 
-def test_evaluate(m, tmpdir):
+def test_evaluate(m):
     csv_file = get_data("OSBS_029.csv")
     results = m.evaluate(csv_file, iou_threshold=0.4)
 
-    assert np.round(results["box_precision"], 2) > 0.5
-    assert np.round(results["box_recall"], 2) > 0.5
+    # Relaxed assertions (sanity check only): lower bounds rather than exact
+    # scores, so that model improvements do not break this test.
+    assert results["box_precision"] > 0.7
+    assert results["box_recall"] > 0.5
+
+    # Structure and label checks
     assert len(results["results"].predicted_label.dropna().unique()) == 1
     assert results["results"].predicted_label.dropna().unique()[0] == "Tree"
     assert results["predictions"].shape[0] > 0
@@ -524,9 +528,6 @@ def test_evaluate(m, tmpdir):
 
     df = pd.read_csv(csv_file)
     assert results["results"].shape[0] == df.shape[0]
-    assert results["box_precision"] == pytest.approx(0.8, abs=0.01)
-    assert results["box_recall"] == pytest.approx(0.7213, abs=0.01)
-
 
 def test_train_callbacks(m):
     csv_file = get_data("example.csv")