Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 12 additions & 66 deletions src/pyfia/core/fia.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@
import logging
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, overload
from typing import TYPE_CHECKING

import polars as pl

from ..constants.defaults import EVALIDYearParsing
from ..validation import sanitize_sql_path
from .data_reader import FIADataReader
from .exceptions import (
Expand All @@ -31,49 +30,6 @@
logger = logging.getLogger(__name__)


@overload
def _add_parsed_evalid_columns(
df: pl.DataFrame,
) -> pl.DataFrame: ...


@overload
def _add_parsed_evalid_columns(
df: pl.LazyFrame,
) -> pl.LazyFrame: ...


def _add_parsed_evalid_columns(
df: pl.DataFrame | pl.LazyFrame,
) -> pl.DataFrame | pl.LazyFrame:
"""
Add parsed EVALID columns to a DataFrame for sorting.

EVALID format: SSYYTT (State, Year 2-digit, Type)
Year uses Y2K windowing based on EVALIDYearParsing constants.

See EVALIDYearParsing for details on year interpretation.
"""
year_expr = pl.col("EVALID").cast(pl.Utf8).str.slice(2, 2).cast(pl.Int32)
return df.with_columns(
[
pl.when(year_expr <= EVALIDYearParsing.Y2K_WINDOW_THRESHOLD)
.then(EVALIDYearParsing.CENTURY_2000 + year_expr)
.otherwise(EVALIDYearParsing.CENTURY_1900 + year_expr)
.alias("EVALID_YEAR"),
pl.col("EVALID")
.cast(pl.Utf8)
.str.slice(0, 2)
.cast(pl.Int32)
.alias("EVALID_STATE"),
pl.col("EVALID")
.cast(pl.Utf8)
.str.slice(4, 2)
.cast(pl.Int32)
.alias("EVALID_TYPE"),
]
)


class FIA:
"""
Expand Down Expand Up @@ -512,8 +468,10 @@ def find_evalid(
df = df.filter(pl.col("EVAL_TYP") == eval_type_full)

if most_recent:
# Add parsed EVALID columns for robust year sorting
df = _add_parsed_evalid_columns(df)
# Sort by END_INVYR from POP_EVAL to find the most recent evaluation.
# END_INVYR is an unambiguous 4-digit year, unlike EVALID which encodes
# the year as 2 digits with variable-width state codes (single-digit
# FIPS codes like AL=1 produce 5-digit EVALIDs that break positional parsing).

# Special handling for Texas (STATECD=48)
# Texas has separate East/West evaluations, but we want the full state
Expand All @@ -532,45 +490,33 @@ def find_evalid(
.otherwise(0)
.alias("IS_FULL_STATE")
)
# Sort using parsed year for robust chronological ordering
df_texas = (
df_texas.sort(
["EVAL_TYP", "IS_FULL_STATE", "EVALID_YEAR", "EVALID_TYPE"],
descending=[False, True, True, False],
["EVAL_TYP", "IS_FULL_STATE", "END_INVYR", "EVALID"],
descending=[False, True, True, True],
)
.group_by(["STATECD", "EVAL_TYP"])
.first()
.drop(
[
"IS_FULL_STATE",
"EVALID_YEAR",
"EVALID_STATE",
"EVALID_TYPE",
]
)
.drop(["IS_FULL_STATE"])
)
else:
# Fallback if LOCATION_NM not available - use parsed year
df_texas = (
df_texas.sort(
["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
descending=[False, False, True, False],
["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"],
descending=[False, False, True, True],
)
.group_by(["STATECD", "EVAL_TYP"])
.first()
.drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
)

# For other states, use robust year sorting
if not df_other.is_empty():
df_other = (
df_other.sort(
["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
descending=[False, False, True, False],
["STATECD", "EVAL_TYP", "END_INVYR", "EVALID"],
descending=[False, False, True, True],
)
.group_by(["STATECD", "EVAL_TYP"])
.first()
.drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
)

# Combine Texas and other states
Expand Down
160 changes: 160 additions & 0 deletions tests/unit/test_find_evalid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Unit tests for find_evalid() EVALID selection logic.

Verifies that find_evalid(most_recent=True) correctly selects the most
recent evaluation using END_INVYR, especially for states with single-digit
FIPS codes that produce 5-digit EVALIDs.
"""

from unittest.mock import MagicMock, patch

import polars as pl
import pytest


@pytest.fixture
def mock_fia():
"""Create a mock FIA instance with the real find_evalid method."""
from pyfia.core.fia import FIA

with patch.object(FIA, "__init__", lambda self: None):
db = FIA()
db.tables = {}
db.evalid = None
db.state_filter = None
db._reader = MagicMock()
return db


def _setup_eval_tables(mock_fia, pop_eval_rows, pop_eval_typ_rows):
"""Helper to set up POP_EVAL and POP_EVAL_TYP tables on mock FIA."""
mock_fia.tables["POP_EVAL"] = pl.DataFrame(pop_eval_rows).lazy()
mock_fia.tables["POP_EVAL_TYP"] = pl.DataFrame(pop_eval_typ_rows).lazy()


class TestSingleDigitFIPSCode:
"""find_evalid must handle 5-digit EVALIDs from single-digit FIPS codes."""

def test_alabama_selects_most_recent_by_end_invyr(self, mock_fia):
"""Alabama (FIPS=1) produces 5-digit EVALIDs. Most recent should win."""
_setup_eval_tables(
mock_fia,
pop_eval_rows={
"CN": ["eval_old", "eval_new"],
"EVALID": [10301, 12401],
"STATECD": [1, 1],
"END_INVYR": [2003, 2024],
"LOCATION_NM": ["Alabama", "Alabama"],
},
pop_eval_typ_rows={
"CN": ["typ_old", "typ_new"],
"EVAL_CN": ["eval_old", "eval_new"],
"EVAL_TYP": ["EXPVOL", "EXPVOL"],
},
)

result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")

assert result == [12401], (
f"Expected EVALID 12401 (END_INVYR=2024), got {result}. "
"5-digit EVALID parsing may be broken."
)

def test_arkansas_selects_most_recent(self, mock_fia):
"""Arkansas (FIPS=5) also produces 5-digit EVALIDs."""
_setup_eval_tables(
mock_fia,
pop_eval_rows={
"CN": ["eval_old", "eval_new"],
"EVALID": [50501, 52201],
"STATECD": [5, 5],
"END_INVYR": [2005, 2022],
"LOCATION_NM": ["Arkansas", "Arkansas"],
},
pop_eval_typ_rows={
"CN": ["typ_old", "typ_new"],
"EVAL_CN": ["eval_old", "eval_new"],
"EVAL_TYP": ["EXPVOL", "EXPVOL"],
},
)

result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")

assert result == [52201]


class TestTwoDigitFIPSCode:
"""Standard 2-digit FIPS codes should continue to work."""

def test_georgia_selects_most_recent(self, mock_fia):
"""Georgia (FIPS=13) produces standard 6-digit EVALIDs."""
_setup_eval_tables(
mock_fia,
pop_eval_rows={
"CN": ["eval_old", "eval_new"],
"EVALID": [131901, 132301],
"STATECD": [13, 13],
"END_INVYR": [2019, 2023],
"LOCATION_NM": ["Georgia", "Georgia"],
},
pop_eval_typ_rows={
"CN": ["typ_old", "typ_new"],
"EVAL_CN": ["eval_old", "eval_new"],
"EVAL_TYP": ["EXPVOL", "EXPVOL"],
},
)

result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")

assert result == [132301]


class TestMultiStateSelection:
"""find_evalid should pick the most recent per state in multi-state DBs."""

def test_picks_most_recent_per_state(self, mock_fia):
"""Each state should get its own most recent evaluation."""
_setup_eval_tables(
mock_fia,
pop_eval_rows={
"CN": ["al_old", "al_new", "ga_old", "ga_new"],
"EVALID": [10301, 12401, 131901, 132301],
"STATECD": [1, 1, 13, 13],
"END_INVYR": [2003, 2024, 2019, 2023],
"LOCATION_NM": ["Alabama", "Alabama", "Georgia", "Georgia"],
},
pop_eval_typ_rows={
"CN": ["t1", "t2", "t3", "t4"],
"EVAL_CN": ["al_old", "al_new", "ga_old", "ga_new"],
"EVAL_TYP": ["EXPVOL", "EXPVOL", "EXPVOL", "EXPVOL"],
},
)

result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")

assert sorted(result) == [12401, 132301]


class TestTiebreaking:
"""When END_INVYR ties, EVALID descending should break the tie."""

def test_same_end_invyr_picks_higher_evalid(self, mock_fia):
"""Higher EVALID wins when END_INVYR is the same."""
_setup_eval_tables(
mock_fia,
pop_eval_rows={
"CN": ["eval_a", "eval_b"],
"EVALID": [132300, 132301],
"STATECD": [13, 13],
"END_INVYR": [2023, 2023],
"LOCATION_NM": ["Georgia", "Georgia"],
},
pop_eval_typ_rows={
"CN": ["typ_a", "typ_b"],
"EVAL_CN": ["eval_a", "eval_b"],
"EVAL_TYP": ["EXPVOL", "EXPVOL"],
},
)

result = mock_fia.find_evalid(most_recent=True, eval_type="VOL")

assert result == [132301]
Loading