Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ on:
branches: [main]

jobs:
lint:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: Install ruff
      # The requirement must be quoted: unquoted, the shell treats ">=0.9.0"
      # as a redirect of stdout to a file named "=0.9.0", so pip installs an
      # unconstrained "ruff" and the minimum version is silently ignored.
      run: pip install "ruff>=0.9.0"
    - name: Check formatting
      # Fails the job if any file would be reformatted; does not modify files.
      run: ruff format --check .

test:
runs-on: ${{ matrix.os }}
strategy:
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

format:
ruff format .

changelog:
python .github/bump_version.py
towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
1 change: 1 addition & 0 deletions changelog.d/add-ruff.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added ruff formatter with CI check and Makefile target.
25 changes: 19 additions & 6 deletions policyengine_taxsim/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
from policyengine_taxsim.comparison.statistics import ComparisonStatistics
from policyengine_taxsim.core.yaml_generator import generate_pe_tests_yaml
from policyengine_taxsim.core.input_mapper import form_household_situation
from policyengine_taxsim.core.utils import get_state_code, convert_taxsim32_dependents
from policyengine_taxsim.core.utils import (
get_state_code,
convert_taxsim32_dependents,
)


def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):
Expand Down Expand Up @@ -66,7 +69,7 @@ def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):

# Generate YAML file
# Use taxsimid from row if available, otherwise use index + 1
taxsim_id = int(row['taxsimid']) if 'taxsimid' in row else idx + 1
taxsim_id = int(row["taxsimid"]) if "taxsimid" in row else idx + 1
yaml_filename = f"taxsim_record_{taxsim_id}_{year}.yaml"
generate_pe_tests_yaml(household, outputs, yaml_filename, logs=True)

Expand Down Expand Up @@ -151,7 +154,10 @@ def cli(ctx, logs, disable_salt, sample):
"--disable-salt", is_flag=True, default=False, help="Set SALT Deduction to 0"
)
@click.option(
"--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
"--assume-w2-wages",
is_flag=True,
default=False,
help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
)
@click.option("--sample", type=int, help="Sample N records from input")
def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample):
Expand All @@ -171,7 +177,9 @@ def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample
df = df.sample(n=sample, random_state=42)

# Use StitchedRunner: routes to PE (2021+) or TAXSIM (pre-2021)
runner = StitchedRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
runner = StitchedRunner(
df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
)
results_df = runner.run(show_progress=True)

# Use the runner's input_df which has taxsimid (auto-assigned if needed)
Expand Down Expand Up @@ -241,7 +249,10 @@ def taxsim(input_file, output, sample, taxsim_path):
)
@click.option("--logs", is_flag=True, help="Generate PolicyEngine YAML logs")
@click.option(
"--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
"--assume-w2-wages",
is_flag=True,
default=False,
help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
)
def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_wages):
"""Compare PolicyEngine and TAXSIM results"""
Expand Down Expand Up @@ -274,7 +285,9 @@ def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_

# Run PolicyEngine
click.echo("Running PolicyEngine...")
pe_runner = PolicyEngineRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
pe_runner = PolicyEngineRunner(
df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
)
pe_results = pe_runner.run()

# Use the runner's input_df which has taxsimid (auto-assigned if needed)
Expand Down
32 changes: 21 additions & 11 deletions policyengine_taxsim/core/input_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,11 @@ def form_household_situation(year, state, taxsim_vars):
for i in range(1, depx + 1):
dep_name = f"your {get_ordinal(i)} dependent"
people[dep_name] = {
"age": {str(year): int(taxsim_vars.get(f"age{i}", 10)) if taxsim_vars.get(f"age{i}") is not None else 10},
"age": {
str(year): int(taxsim_vars.get(f"age{i}", 10))
if taxsim_vars.get(f"age{i}") is not None
else 10
},
"employment_income": {str(year): 0},
"is_tax_unit_dependent": {str(year): True},
"is_tax_unit_spouse": {str(year): False},
Expand All @@ -195,38 +199,44 @@ def form_household_situation(year, state, taxsim_vars):
household_situation = add_additional_units(
state.lower(), year, household_situation, taxsim_vars
)

# Explicitly set SSI to 0 for all people to prevent PolicyEngine from imputing SSI benefits
# TAXSIM does not model SSI, so we need to ensure it's not automatically calculated
for person_name in household_situation["people"]:
household_situation["people"][person_name]["ssi"] = {str(year): 0}

# Explicitly set person-level benefit programs to 0 to prevent PolicyEngine from imputing these benefits
# TAXSIM does not model these programs, so we need to ensure they're not automatically calculated
for person_name in household_situation["people"]:
household_situation["people"][person_name]["head_start"] = {str(year): 0}
household_situation["people"][person_name]["early_head_start"] = {str(year): 0}
household_situation["people"][person_name]["commodity_supplemental_food_program"] = {str(year): 0}

household_situation["people"][person_name][
"commodity_supplemental_food_program"
] = {str(year): 0}

# Explicitly set SNAP to 0 for all SPM units to prevent PolicyEngine from imputing SNAP benefits
# TAXSIM does not model SNAP, so we need to ensure it's not automatically calculated
for spm_unit_name in household_situation["spm_units"]:
household_situation["spm_units"][spm_unit_name]["snap"] = {str(year): 0}

# Explicitly set TANF to 0 for all SPM units to prevent PolicyEngine from imputing TANF benefits
# TAXSIM does not model TANF, so we need to ensure it's not automatically calculated
for spm_unit_name in household_situation["spm_units"]:
household_situation["spm_units"][spm_unit_name]["tanf"] = {str(year): 0}

# Explicitly set free_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing free school meal benefits
# TAXSIM does not model free school meals, so we need to ensure it's not automatically calculated
for spm_unit_name in household_situation["spm_units"]:
household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {str(year): 0}

household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {
str(year): 0
}

# Explicitly set reduced_price_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing reduced price school meal benefits
# TAXSIM does not model reduced price school meals, so we need to ensure it's not automatically calculated
for spm_unit_name in household_situation["spm_units"]:
household_situation["spm_units"][spm_unit_name]["reduced_price_school_meals"] = {str(year): 0}
household_situation["spm_units"][spm_unit_name][
"reduced_price_school_meals"
] = {str(year): 0}

return household_situation

Expand Down Expand Up @@ -308,7 +318,7 @@ def generate_household(taxsim_vars):

# Convert TAXSIM32 dependent format if present
taxsim_vars = convert_taxsim32_dependents(taxsim_vars)

taxsim_vars = set_taxsim_defaults(taxsim_vars, int(year))

state = get_state_code(taxsim_vars["state"])
Expand Down
77 changes: 42 additions & 35 deletions policyengine_taxsim/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,112 +97,119 @@ def to_roundedup_number(value):

def convert_taxsim32_dependents(taxsim_vars):
    """
    Convert TAXSIM32 dependent count format (dep13, dep17, dep18)
    to individual age format (age1, age2, etc.).

    TAXSIM32 format uses cumulative counts:
    - dep13: Number of dependents under 13
    - dep17: Number of dependents under 17 (includes those under 13)
    - dep18: Number of dependents under 18 (includes those under 17 and 13)

    Conversion only happens when at least one of dep13/dep17/dep18 is present
    AND no age1..age11 field holds a non-None value. Representative ages are
    assigned per bracket: 10 (under 13), 15 (13-16), 17 (exactly 17) and
    21 (18 or older, i.e. the part of depx that exceeds dep18). At most 11
    age fields are written.

    Missing, None or NaN counts are treated as 0. Because the counts are
    cumulative, a smaller later count (e.g. only dep13 supplied) is raised to
    the preceding one so that no bracket ends up with a negative size.

    Independently of the conversion, any age1..age11 value that is NaN or 0
    is replaced by 10.

    Args:
        taxsim_vars (dict): Dictionary containing TAXSIM input variables.
            It is modified in place.

    Returns:
        dict: The same dictionary, with age1, age2, etc. fields added and
        depx raised to at least dep18 when a conversion took place.
    """
    # Function-scope import keeps this block self-contained.
    import math

    max_dependents = 11  # TAXSIM supports up to 11 dependents
    age_fields = [f"age{i}" for i in range(1, max_dependents + 1)]

    # Check for presence, not values: all three counts can be 0 while
    # depx > 0 (every dependent is 18 or older).
    has_taxsim32_fields = any(
        key in taxsim_vars for key in ("dep13", "dep17", "dep18")
    )

    # An age field counts as explicitly set when it exists and is not None.
    # NOTE(review): a NaN age also counts as "set" here and therefore blocks
    # the conversion -- confirm this is intended for pandas-sourced rows.
    has_individual_age_fields = any(
        field in taxsim_vars and taxsim_vars[field] is not None
        for field in age_fields
    )

    if has_taxsim32_fields and not has_individual_age_fields:
        # Enforce dep13 <= dep17 <= dep18 <= depx so bracket sizes below are
        # never negative, even for partially specified input.
        dep13 = _taxsim32_count(taxsim_vars, "dep13")
        dep17 = max(_taxsim32_count(taxsim_vars, "dep17"), dep13)
        dep18 = max(_taxsim32_count(taxsim_vars, "dep18"), dep17)
        depx = max(_taxsim32_count(taxsim_vars, "depx"), dep18)

        # depx must cover every dependent implied by the cumulative counts.
        taxsim_vars["depx"] = depx

        # Cumulative counts -> per-bracket counts -> one typical age each.
        inferred_ages = (
            [10] * dep13  # under 13
            + [15] * (dep17 - dep13)  # 13 to 16
            + [17] * (dep18 - dep17)  # exactly 17
            + [21] * (depx - dep18)  # 18 or older
        )
        # zip() stops at the 11th age field, which enforces the TAXSIM cap.
        for field, age in zip(age_fields, inferred_ages):
            taxsim_vars[field] = age

    # Handle NaN and age 0 inputs - default them to age 10.
    for field in age_fields:
        if field not in taxsim_vars:
            continue
        age_value = taxsim_vars[field]
        is_nan = isinstance(age_value, (float, np.floating)) and math.isnan(
            age_value
        )
        if is_nan or age_value == 0:
            taxsim_vars[field] = 10

    return taxsim_vars


def _taxsim32_count(taxsim_vars, key):
    """Return taxsim_vars[key] as an int, treating missing/None/NaN/empty as 0."""
    import math

    value = taxsim_vars.get(key, 0)
    if isinstance(value, (float, np.floating)) and math.isnan(value):
        # int(nan) raises ValueError; a NaN count means "not provided".
        return 0
    return int(value or 0)


Expand Down
8 changes: 6 additions & 2 deletions policyengine_taxsim/core/yaml_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,12 @@ def generate_yaml(
"ssi": person_data.get("ssi", {}).get(year_str, 0),
"wic": person_data.get("wic", {}).get(year_str, 0),
"head_start": person_data.get("head_start", {}).get(year_str, 0),
"early_head_start": person_data.get("early_head_start", {}).get(year_str, 0),
"commodity_supplemental_food_program": person_data.get("commodity_supplemental_food_program", {}).get(year_str, 0),
"early_head_start": person_data.get("early_head_start", {}).get(
year_str, 0
),
"commodity_supplemental_food_program": person_data.get(
"commodity_supplemental_food_program", {}
).get(year_str, 0),
}

# Add optional fields only if they have non-zero values
Expand Down
4 changes: 1 addition & 3 deletions policyengine_taxsim/runners/base_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ def _validate_input(self):
"""Validate that input data has required structure"""
# year is required by TAXSIM (1960-2023) and PolicyEngine
if "year" not in self.input_df.columns:
raise ValueError(
"Input data must contain a 'year' column"
)
raise ValueError("Input data must contain a 'year' column")

# Auto-assign taxsimid if not present
if "taxsimid" not in self.input_df.columns:
Expand Down
Loading