From 7b196b543a11ecc9a7137a8b7f0e093f804e9412 Mon Sep 17 00:00:00 2001
From: Chris Mihiar <28452317+mihiarc@users.noreply.github.com>
Date: Sat, 14 Mar 2026 10:28:09 -0400
Subject: [PATCH 1/2] Fix EVALID year parsing for single-digit state FIPS codes

find_evalid() parsed EVALID as fixed-width SSYYTT using positional
string slicing, which fails for states with single-digit FIPS codes
(AL=1, AR=5). A 5-digit EVALID like 12401 (AL, year=24, type=01)
was misparsed as state=12, year=40, causing clip_most_recent() to
select 2003 periodic inventory instead of 2024 annual inventory.

Replace EVALID string parsing with END_INVYR from POP_EVAL, which is
an unambiguous 4-digit year already available in the joined dataframe.
Remove the now-unused _add_parsed_evalid_columns() function.

Closes #78
---
 src/pyfia/core/fia.py | 78 +++++++------------------------------------
 1 file changed, 12 insertions(+), 66 deletions(-)

diff --git a/src/pyfia/core/fia.py b/src/pyfia/core/fia.py
index 4120153..bf1a3fc 100755
--- a/src/pyfia/core/fia.py
+++ b/src/pyfia/core/fia.py
@@ -10,11 +10,10 @@
 import logging
 import warnings
 from pathlib import Path
-from typing import TYPE_CHECKING, overload
+from typing import TYPE_CHECKING
 
 import polars as pl
 
-from ..constants.defaults import EVALIDYearParsing
 from ..validation import sanitize_sql_path
 from .data_reader import FIADataReader
 from .exceptions import (
@@ -31,49 +30,6 @@
 logger = logging.getLogger(__name__)
 
 
-@overload
-def _add_parsed_evalid_columns(
-    df: pl.DataFrame,
-) -> pl.DataFrame: ...
-
-
-@overload
-def _add_parsed_evalid_columns(
-    df: pl.LazyFrame,
-) -> pl.LazyFrame: ...
-
-
-def _add_parsed_evalid_columns(
-    df: pl.DataFrame | pl.LazyFrame,
-) -> pl.DataFrame | pl.LazyFrame:
-    """
-    Add parsed EVALID columns to a DataFrame for sorting.
-
-    EVALID format: SSYYTT (State, Year 2-digit, Type)
-    Year uses Y2K windowing based on EVALIDYearParsing constants.
-
-    See EVALIDYearParsing for details on year interpretation.
-    """
-    year_expr = pl.col("EVALID").cast(pl.Utf8).str.slice(2, 2).cast(pl.Int32)
-    return df.with_columns(
-        [
-            pl.when(year_expr <= EVALIDYearParsing.Y2K_WINDOW_THRESHOLD)
-            .then(EVALIDYearParsing.CENTURY_2000 + year_expr)
-            .otherwise(EVALIDYearParsing.CENTURY_1900 + year_expr)
-            .alias("EVALID_YEAR"),
-            pl.col("EVALID")
-            .cast(pl.Utf8)
-            .str.slice(0, 2)
-            .cast(pl.Int32)
-            .alias("EVALID_STATE"),
-            pl.col("EVALID")
-            .cast(pl.Utf8)
-            .str.slice(4, 2)
-            .cast(pl.Int32)
-            .alias("EVALID_TYPE"),
-        ]
-    )
-
 
 class FIA:
     """
@@ -512,8 +468,10 @@ def find_evalid(
             df = df.filter(pl.col("EVAL_TYP") == eval_type_full)
 
         if most_recent:
-            # Add parsed EVALID columns for robust year sorting
-            df = _add_parsed_evalid_columns(df)
+            # Sort by END_INVYR from POP_EVAL to find the most recent evaluation.
+            # END_INVYR is an unambiguous 4-digit year, unlike EVALID which encodes
+            # the year as 2 digits with variable-width state codes (single-digit
+            # FIPS codes like AL=1 produce 5-digit EVALIDs that break positional parsing).
 
             # Special handling for Texas (STATECD=48)
             # Texas has separate East/West evaluations, but we want the full state
@@ -532,45 +490,33 @@ def find_evalid(
                         .otherwise(0)
                         .alias("IS_FULL_STATE")
                     )
-                    # Sort using parsed year for robust chronological ordering
                     df_texas = (
                         df_texas.sort(
-                            ["EVAL_TYP", "IS_FULL_STATE", "EVALID_YEAR", "EVALID_TYPE"],
-                            descending=[False, True, True, False],
+                            ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR"],
+                            descending=[False, True, True],
                         )
                         .group_by(["STATECD", "EVAL_TYP"])
                         .first()
-                        .drop(
-                            [
-                                "IS_FULL_STATE",
-                                "EVALID_YEAR",
-                                "EVALID_STATE",
-                                "EVALID_TYPE",
-                            ]
-                        )
+                        .drop(["IS_FULL_STATE"])
                     )
                 else:
-                    # Fallback if LOCATION_NM not available - use parsed year
                     df_texas = (
                         df_texas.sort(
-                            ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
-                            descending=[False, False, True, False],
+                            ["STATECD", "EVAL_TYP", "END_INVYR"],
+                            descending=[False, False, True],
                         )
                         .group_by(["STATECD", "EVAL_TYP"])
                         .first()
-                        .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
                     )
 
-            # For other states, use robust year sorting
             if not df_other.is_empty():
                 df_other = (
                     df_other.sort(
-                        ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
-                        descending=[False, False, True, False],
+                        ["STATECD", "EVAL_TYP", "END_INVYR"],
+                        descending=[False, False, True],
                     )
                     .group_by(["STATECD", "EVAL_TYP"])
                     .first()
-                    .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
                 )
 
             # Combine Texas and other states

From 5ba0221438b6586fc5e8a9bbddbb101c384c25b7 Mon Sep 17 00:00:00 2001
From: Chris Mihiar <28452317+mihiarc@users.noreply.github.com>
Date: Sat, 14 Mar 2026 10:31:59 -0400
Subject: [PATCH 2/2] Add EVALID tiebreaker to sort and unit tests for 5-digit
 EVALIDs

Add EVALID as secondary sort key (descending) for deterministic
tiebreaking when multiple evaluations share the same END_INVYR.

Add unit tests covering single-digit FIPS codes (Alabama, Arkansas),
standard 2-digit codes (Georgia), multi-state selection, and
tiebreaking behavior.
---
 src/pyfia/core/fia.py          |  12 +--
 tests/unit/test_find_evalid.py | 160 +++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+), 6 deletions(-)
 create mode 100644 tests/unit/test_find_evalid.py

diff --git a/src/pyfia/core/fia.py b/src/pyfia/core/fia.py
index bf1a3fc..c96a1fa 100755
--- a/src/pyfia/core/fia.py
+++ b/src/pyfia/core/fia.py
@@ -492,8 +492,8 @@ def find_evalid(
                     )
                     df_texas = (
                         df_texas.sort(
-                            ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR"],
-                            descending=[False, True, True],
+                            ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR", "EVALID"],
+                            descending=[False, True, True, True],
                         )
                         .group_by(["STATECD", "EVAL_TYP"])
                         .first()
@@ -502,8 +502,8 @@ def find_evalid(
                 else:
                     df_texas = (
                         df_texas.sort(
-                            ["STATECD", "EVAL_TYP", "END_INVYR"],
-                            descending=[False, False, True],
+                            ["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"],
+                            descending=[False, False, True, True],
                         )
                         .group_by(["STATECD", "EVAL_TYP"])
                         .first()
@@ -512,8 +512,8 @@ def find_evalid(
             if not df_other.is_empty():
                 df_other = (
                     df_other.sort(
-                        ["STATECD", "EVAL_TYP", "END_INVYR"],
-                        descending=[False, False, True],
+                        ["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"],
+                        descending=[False, False, True, True],
                     )
                     .group_by(["STATECD", "EVAL_TYP"])
                     .first()
diff --git a/tests/unit/test_find_evalid.py b/tests/unit/test_find_evalid.py
new file mode 100644
index 0000000..acec686
--- /dev/null
+++ b/tests/unit/test_find_evalid.py
@@ -0,0 +1,160 @@
+"""Unit tests for find_evalid() EVALID selection logic.
+
+Verifies that find_evalid(most_recent=True) correctly selects the most
+recent evaluation using END_INVYR, especially for states with single-digit
+FIPS codes that produce 5-digit EVALIDs.
+"""
+
+from unittest.mock import MagicMock, patch
+
+import polars as pl
+import pytest
+
+
+@pytest.fixture
+def mock_fia():
+    """Create a mock FIA instance with the real find_evalid method."""
+    from pyfia.core.fia import FIA
+
+    with patch.object(FIA, "__init__", lambda self: None):
+        db = FIA()
+        db.tables = {}
+        db.evalid = None
+        db.state_filter = None
+        db._reader = MagicMock()
+        return db
+
+
+def _setup_eval_tables(mock_fia, pop_eval_rows, pop_eval_typ_rows):
+    """Helper to set up POP_EVAL and POP_EVAL_TYP tables on mock FIA."""
+    mock_fia.tables["POP_EVAL"] = pl.DataFrame(pop_eval_rows).lazy()
+    mock_fia.tables["POP_EVAL_TYP"] = pl.DataFrame(pop_eval_typ_rows).lazy()
+
+
+class TestSingleDigitFIPSCode:
+    """find_evalid must handle 5-digit EVALIDs from single-digit FIPS codes."""
+
+    def test_alabama_selects_most_recent_by_end_invyr(self, mock_fia):
+        """Alabama (FIPS=1) produces 5-digit EVALIDs. Most recent should win."""
+        _setup_eval_tables(
+            mock_fia,
+            pop_eval_rows={
+                "CN": ["eval_old", "eval_new"],
+                "EVALID": [10301, 12401],
+                "STATECD": [1, 1],
+                "END_INVYR": [2003, 2024],
+                "LOCATION_NM": ["Alabama", "Alabama"],
+            },
+            pop_eval_typ_rows={
+                "CN": ["typ_old", "typ_new"],
+                "EVAL_CN": ["eval_old", "eval_new"],
+                "EVAL_TYP": ["EXPVOL", "EXPVOL"],
+            },
+        )
+
+        result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")
+
+        assert result == [12401], (
+            f"Expected EVALID 12401 (END_INVYR=2024), got {result}. "
+            "5-digit EVALID parsing may be broken."
+        )
+
+    def test_arkansas_selects_most_recent(self, mock_fia):
+        """Arkansas (FIPS=5) also produces 5-digit EVALIDs."""
+        _setup_eval_tables(
+            mock_fia,
+            pop_eval_rows={
+                "CN": ["eval_old", "eval_new"],
+                "EVALID": [50501, 52201],
+                "STATECD": [5, 5],
+                "END_INVYR": [2005, 2022],
+                "LOCATION_NM": ["Arkansas", "Arkansas"],
+            },
+            pop_eval_typ_rows={
+                "CN": ["typ_old", "typ_new"],
+                "EVAL_CN": ["eval_old", "eval_new"],
+                "EVAL_TYP": ["EXPVOL", "EXPVOL"],
+            },
+        )
+
+        result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")
+
+        assert result == [52201]
+
+
+class TestTwoDigitFIPSCode:
+    """Standard 2-digit FIPS codes should continue to work."""
+
+    def test_georgia_selects_most_recent(self, mock_fia):
+        """Georgia (FIPS=13) produces standard 6-digit EVALIDs."""
+        _setup_eval_tables(
+            mock_fia,
+            pop_eval_rows={
+                "CN": ["eval_old", "eval_new"],
+                "EVALID": [131901, 132301],
+                "STATECD": [13, 13],
+                "END_INVYR": [2019, 2023],
+                "LOCATION_NM": ["Georgia", "Georgia"],
+            },
+            pop_eval_typ_rows={
+                "CN": ["typ_old", "typ_new"],
+                "EVAL_CN": ["eval_old", "eval_new"],
+                "EVAL_TYP": ["EXPVOL", "EXPVOL"],
+            },
+        )
+
+        result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")
+
+        assert result == [132301]
+
+
+class TestMultiStateSelection:
+    """find_evalid should pick the most recent per state in multi-state DBs."""
+
+    def test_picks_most_recent_per_state(self, mock_fia):
+        """Each state should get its own most recent evaluation."""
+        _setup_eval_tables(
+            mock_fia,
+            pop_eval_rows={
+                "CN": ["al_old", "al_new", "ga_old", "ga_new"],
+                "EVALID": [10301, 12401, 131901, 132301],
+                "STATECD": [1, 1, 13, 13],
+                "END_INVYR": [2003, 2024, 2019, 2023],
+                "LOCATION_NM": ["Alabama", "Alabama", "Georgia", "Georgia"],
+            },
+            pop_eval_typ_rows={
+                "CN": ["t1", "t2", "t3", "t4"],
+                "EVAL_CN": ["al_old", "al_new", "ga_old", "ga_new"],
+                "EVAL_TYP": ["EXPVOL", "EXPVOL", "EXPVOL", "EXPVOL"],
+            },
+        )
+
+        result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")
+
+        assert sorted(result) == [12401, 132301]
+
+
+class TestTiebreaking:
+    """When END_INVYR ties, EVALID descending should break the tie."""
+
+    def test_same_end_invyr_picks_higher_evalid(self, mock_fia):
+        """Higher EVALID wins when END_INVYR is the same."""
+        _setup_eval_tables(
+            mock_fia,
+            pop_eval_rows={
+                "CN": ["eval_a", "eval_b"],
+                "EVALID": [132300, 132301],
+                "STATECD": [13, 13],
+                "END_INVYR": [2023, 2023],
+                "LOCATION_NM": ["Georgia", "Georgia"],
+            },
+            pop_eval_typ_rows={
+                "CN": ["typ_a", "typ_b"],
+                "EVAL_CN": ["eval_a", "eval_b"],
+                "EVAL_TYP": ["EXPVOL", "EXPVOL"],
+            },
+        )
+
+        result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")
+
+        assert result == [132301]