From 7b196b543a11ecc9a7137a8b7f0e093f804e9412 Mon Sep 17 00:00:00 2001 From: Chris Mihiar <28452317+mihiarc@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:28:09 -0400 Subject: [PATCH 1/2] Fix EVALID year parsing for single-digit state FIPS codes find_evalid() parsed EVALID as fixed-width SSYYTT using positional string slicing, which fails for states with single-digit FIPS codes (AL=1, AR=5). A 5-digit EVALID like 12401 (AL, year=24, type=01) was misparsed as state=12, year=40, causing clip_most_recent() to select 2003 periodic inventory instead of 2024 annual inventory. Replace EVALID string parsing with END_INVYR from POP_EVAL, which is an unambiguous 4-digit year already available in the joined dataframe. Remove the now-unused _add_parsed_evalid_columns() function. Closes #78 --- src/pyfia/core/fia.py | 78 +++++++------------------------------------ 1 file changed, 12 insertions(+), 66 deletions(-) diff --git a/src/pyfia/core/fia.py b/src/pyfia/core/fia.py index 4120153..bf1a3fc 100755 --- a/src/pyfia/core/fia.py +++ b/src/pyfia/core/fia.py @@ -10,11 +10,10 @@ import logging import warnings from pathlib import Path -from typing import TYPE_CHECKING, overload +from typing import TYPE_CHECKING import polars as pl -from ..constants.defaults import EVALIDYearParsing from ..validation import sanitize_sql_path from .data_reader import FIADataReader from .exceptions import ( @@ -31,49 +30,6 @@ logger = logging.getLogger(__name__) -@overload -def _add_parsed_evalid_columns( - df: pl.DataFrame, -) -> pl.DataFrame: ... - - -@overload -def _add_parsed_evalid_columns( - df: pl.LazyFrame, -) -> pl.LazyFrame: ... - - -def _add_parsed_evalid_columns( - df: pl.DataFrame | pl.LazyFrame, -) -> pl.DataFrame | pl.LazyFrame: - """ - Add parsed EVALID columns to a DataFrame for sorting. - - EVALID format: SSYYTT (State, Year 2-digit, Type) - Year uses Y2K windowing based on EVALIDYearParsing constants. - - See EVALIDYearParsing for details on year interpretation. - """ - year_expr = pl.col("EVALID").cast(pl.Utf8).str.slice(2, 2).cast(pl.Int32) - return df.with_columns( - [ - pl.when(year_expr <= EVALIDYearParsing.Y2K_WINDOW_THRESHOLD) - .then(EVALIDYearParsing.CENTURY_2000 + year_expr) - .otherwise(EVALIDYearParsing.CENTURY_1900 + year_expr) - .alias("EVALID_YEAR"), - pl.col("EVALID") - .cast(pl.Utf8) - .str.slice(0, 2) - .cast(pl.Int32) - .alias("EVALID_STATE"), - pl.col("EVALID") - .cast(pl.Utf8) - .str.slice(4, 2) - .cast(pl.Int32) - .alias("EVALID_TYPE"), - ] - ) - class FIA: """ @@ -512,8 +468,10 @@ def find_evalid( df = df.filter(pl.col("EVAL_TYP") == eval_type_full) if most_recent: - # Add parsed EVALID columns for robust year sorting - df = _add_parsed_evalid_columns(df) + # Sort by END_INVYR from POP_EVAL to find the most recent evaluation. + # END_INVYR is an unambiguous 4-digit year, unlike EVALID which encodes + # the year as 2 digits with variable-width state codes (single-digit + # FIPS codes like AL=1 produce 5-digit EVALIDs that break positional parsing). # Special handling for Texas (STATECD=48) # Texas has separate East/West evaluations, but we want the full state @@ -532,45 +490,33 @@ def find_evalid( .otherwise(0) .alias("IS_FULL_STATE") ) - # Sort using parsed year for robust chronological ordering df_texas = ( df_texas.sort( - ["EVAL_TYP", "IS_FULL_STATE", "EVALID_YEAR", "EVALID_TYPE"], - descending=[False, True, True, False], + ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR"], + descending=[False, True, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() - .drop( - [ - "IS_FULL_STATE", - "EVALID_YEAR", - "EVALID_STATE", - "EVALID_TYPE", - ] - ) + .drop(["IS_FULL_STATE"]) ) else: - # Fallback if LOCATION_NM not available - use parsed year df_texas = ( df_texas.sort( - ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"], - descending=[False, False, True, False], + ["STATECD", "EVAL_TYP", "END_INVYR"], + descending=[False, False, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() - .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"]) ) - # For other states, use robust year sorting if not df_other.is_empty(): df_other = ( df_other.sort( - ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"], - descending=[False, False, True, False], + ["STATECD", "EVAL_TYP", "END_INVYR"], + descending=[False, False, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() - .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"]) ) # Combine Texas and other states From 5ba0221438b6586fc5e8a9bbddbb101c384c25b7 Mon Sep 17 00:00:00 2001 From: Chris Mihiar <28452317+mihiarc@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:31:59 -0400 Subject: [PATCH 2/2] Add EVALID tiebreaker to sort and unit tests for 5-digit EVALIDs Add EVALID as secondary sort key (descending) for deterministic tiebreaking when multiple evaluations share the same END_INVYR. Add unit tests covering single-digit FIPS codes (Alabama, Arkansas), standard 2-digit codes (Georgia), multi-state selection, and tiebreaking behavior. --- src/pyfia/core/fia.py | 12 +-- tests/unit/test_find_evalid.py | 160 +++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 tests/unit/test_find_evalid.py diff --git a/src/pyfia/core/fia.py b/src/pyfia/core/fia.py index bf1a3fc..c96a1fa 100755 --- a/src/pyfia/core/fia.py +++ b/src/pyfia/core/fia.py @@ -492,8 +492,8 @@ def find_evalid( ) df_texas = ( df_texas.sort( - ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR"], - descending=[False, True, True], + ["EVAL_TYP", "IS_FULL_STATE", "END_INVYR", "EVALID"], + descending=[False, True, True, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() @@ -502,8 +502,8 @@ def find_evalid( else: df_texas = ( df_texas.sort( - ["STATECD", "EVAL_TYP", "END_INVYR"], - descending=[False, False, True], + ["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"], + descending=[False, False, True, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() @@ -512,8 +512,8 @@ def find_evalid( if not df_other.is_empty(): df_other = ( df_other.sort( - ["STATECD", "EVAL_TYP", "END_INVYR"], - descending=[False, False, True], + ["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"], + descending=[False, False, True, True], ) .group_by(["STATECD", "EVAL_TYP"]) .first() diff --git a/tests/unit/test_find_evalid.py b/tests/unit/test_find_evalid.py new file mode 100644 index 0000000..acec686 --- /dev/null +++ b/tests/unit/test_find_evalid.py @@ -0,0 +1,160 @@ +"""Unit tests for find_evalid() EVALID selection logic. + +Verifies that find_evalid(most_recent=True) correctly selects the most +recent evaluation using END_INVYR, especially for states with single-digit +FIPS codes that produce 5-digit EVALIDs. +""" + +from unittest.mock import MagicMock, patch + +import polars as pl +import pytest + + +@pytest.fixture +def mock_fia(): + """Create a mock FIA instance with the real find_evalid method.""" + from pyfia.core.fia import FIA + + with patch.object(FIA, "__init__", lambda self: None): + db = FIA() + db.tables = {} + db.evalid = None + db.state_filter = None + db._reader = MagicMock() + return db + + +def _setup_eval_tables(mock_fia, pop_eval_rows, pop_eval_typ_rows): + """Helper to set up POP_EVAL and POP_EVAL_TYP tables on mock FIA.""" + mock_fia.tables["POP_EVAL"] = pl.DataFrame(pop_eval_rows).lazy() + mock_fia.tables["POP_EVAL_TYP"] = pl.DataFrame(pop_eval_typ_rows).lazy() + + +class TestSingleDigitFIPSCode: + """find_evalid must handle 5-digit EVALIDs from single-digit FIPS codes.""" + + def test_alabama_selects_most_recent_by_end_invyr(self, mock_fia): + """Alabama (FIPS=1) produces 5-digit EVALIDs. Most recent should win.""" + _setup_eval_tables( + mock_fia, + pop_eval_rows={ + "CN": ["eval_old", "eval_new"], + "EVALID": [10301, 12401], + "STATECD": [1, 1], + "END_INVYR": [2003, 2024], + "LOCATION_NM": ["Alabama", "Alabama"], + }, + pop_eval_typ_rows={ + "CN": ["typ_old", "typ_new"], + "EVAL_CN": ["eval_old", "eval_new"], + "EVAL_TYP": ["EXPVOL", "EXPVOL"], + }, + ) + + result = mock_fia.find_evalid(most_recent=True, eval_type="VOL") + + assert result == [12401], ( + f"Expected EVALID 12401 (END_INVYR=2024), got {result}. " + "5-digit EVALID parsing may be broken." + ) + + def test_arkansas_selects_most_recent(self, mock_fia): + """Arkansas (FIPS=5) also produces 5-digit EVALIDs.""" + _setup_eval_tables( + mock_fia, + pop_eval_rows={ + "CN": ["eval_old", "eval_new"], + "EVALID": [50501, 52201], + "STATECD": [5, 5], + "END_INVYR": [2005, 2022], + "LOCATION_NM": ["Arkansas", "Arkansas"], + }, + pop_eval_typ_rows={ + "CN": ["typ_old", "typ_new"], + "EVAL_CN": ["eval_old", "eval_new"], + "EVAL_TYP": ["EXPVOL", "EXPVOL"], + }, + ) + + result = mock_fia.find_evalid(most_recent=True, eval_type="VOL") + + assert result == [52201] + + +class TestTwoDigitFIPSCode: + """Standard 2-digit FIPS codes should continue to work.""" + + def test_georgia_selects_most_recent(self, mock_fia): + """Georgia (FIPS=13) produces standard 6-digit EVALIDs.""" + _setup_eval_tables( + mock_fia, + pop_eval_rows={ + "CN": ["eval_old", "eval_new"], + "EVALID": [131901, 132301], + "STATECD": [13, 13], + "END_INVYR": [2019, 2023], + "LOCATION_NM": ["Georgia", "Georgia"], + }, + pop_eval_typ_rows={ + "CN": ["typ_old", "typ_new"], + "EVAL_CN": ["eval_old", "eval_new"], + "EVAL_TYP": ["EXPVOL", "EXPVOL"], + }, + ) + + result = mock_fia.find_evalid(most_recent=True, eval_type="VOL") + + assert result == [132301] + + +class TestMultiStateSelection: + """find_evalid should pick the most recent per state in multi-state DBs.""" + + def test_picks_most_recent_per_state(self, mock_fia): + """Each state should get its own most recent evaluation.""" + _setup_eval_tables( + mock_fia, + pop_eval_rows={ + "CN": ["al_old", "al_new", "ga_old", "ga_new"], + "EVALID": [10301, 12401, 131901, 132301], + "STATECD": [1, 1, 13, 13], + "END_INVYR": [2003, 2024, 2019, 2023], + "LOCATION_NM": ["Alabama", "Alabama", "Georgia", "Georgia"], + }, + pop_eval_typ_rows={ + "CN": ["t1", "t2", "t3", "t4"], + "EVAL_CN": ["al_old", "al_new", "ga_old", "ga_new"], + "EVAL_TYP": ["EXPVOL", "EXPVOL", "EXPVOL", "EXPVOL"], + }, + ) + + result = mock_fia.find_evalid(most_recent=True, eval_type="VOL") + + assert sorted(result) == [12401, 132301] + + +class TestTiebreaking: + """When END_INVYR ties, EVALID descending should break the tie.""" + + def test_same_end_invyr_picks_higher_evalid(self, mock_fia): + """Higher EVALID wins when END_INVYR is the same.""" + _setup_eval_tables( + mock_fia, + pop_eval_rows={ + "CN": ["eval_a", "eval_b"], + "EVALID": [132300, 132301], + "STATECD": [13, 13], + "END_INVYR": [2023, 2023], + "LOCATION_NM": ["Georgia", "Georgia"], + }, + pop_eval_typ_rows={ + "CN": ["typ_a", "typ_b"], + "EVAL_CN": ["eval_a", "eval_b"], + "EVAL_TYP": ["EXPVOL", "EXPVOL"], + }, + ) + + result = mock_fia.find_evalid(most_recent=True, eval_type="VOL") + + assert result == [132301]