From 064582b23d2892d32498510a84a5ee44f5f79ca0 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 7 Mar 2026 08:58:46 -0500 Subject: [PATCH] Add ruff formatter with CI lint job and Makefile target Formats all Python files with ruff, adds ruff>=0.9.0 to dev dependencies, adds a `ruff format --check .` CI step, and a `make format` target. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 9 ++ Makefile | 3 + changelog.d/add-ruff.added.md | 1 + policyengine_taxsim/cli.py | 25 +++- policyengine_taxsim/core/input_mapper.py | 32 +++-- policyengine_taxsim/core/utils.py | 77 ++++++----- policyengine_taxsim/core/yaml_generator.py | 8 +- policyengine_taxsim/runners/base_runner.py | 4 +- .../runners/policyengine_runner.py | 67 +++++---- .../runners/stitched_runner.py | 7 +- policyengine_taxsim/runners/taxsim_runner.py | 26 +++- pyproject.toml | 1 + tests/test_assume_w2_wages.py | 4 +- tests/test_cli_entry_point.py | 4 +- tests/test_e2e.py | 78 ++++++----- tests/test_mappers.py | 5 +- tests/test_multi_state.py | 45 +++++- tests/test_performance.py | 130 +++++++++--------- tests/test_stitched_runner.py | 17 +-- 19 files changed, 323 insertions(+), 220 deletions(-) create mode 100644 changelog.d/add-ruff.added.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aa878dd3..90fd655e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,15 @@ on: branches: [main] jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install ruff + run: pip install ruff>=0.9.0 + - name: Check formatting + run: ruff format --check . + test: runs-on: ${{ matrix.os }} strategy: diff --git a/Makefile b/Makefile index 9fccb96e..acd9ef06 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,7 @@ +format: + ruff format . + changelog: python .github/bump_version.py towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))") diff --git a/changelog.d/add-ruff.added.md b/changelog.d/add-ruff.added.md new file mode 100644 index 00000000..993e55e3 --- /dev/null +++ b/changelog.d/add-ruff.added.md @@ -0,0 +1 @@ +Added ruff formatter with CI check and Makefile target. \ No newline at end of file diff --git a/policyengine_taxsim/cli.py b/policyengine_taxsim/cli.py index cdb17d7a..ada17a40 100644 --- a/policyengine_taxsim/cli.py +++ b/policyengine_taxsim/cli.py @@ -23,7 +23,10 @@ from policyengine_taxsim.comparison.statistics import ComparisonStatistics from policyengine_taxsim.core.yaml_generator import generate_pe_tests_yaml from policyengine_taxsim.core.input_mapper import form_household_situation - from policyengine_taxsim.core.utils import get_state_code, convert_taxsim32_dependents + from policyengine_taxsim.core.utils import ( + get_state_code, + convert_taxsim32_dependents, + ) def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame): @@ -66,7 +69,7 @@ def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame): # Generate YAML file # Use taxsimid from row if available, otherwise use index + 1 - taxsim_id = int(row['taxsimid']) if 'taxsimid' in row else idx + 1 + taxsim_id = int(row["taxsimid"]) if "taxsimid" in row else idx + 1 yaml_filename = f"taxsim_record_{taxsim_id}_{year}.yaml" generate_pe_tests_yaml(household, outputs, yaml_filename, logs=True) @@ -151,7 +154,10 @@ def cli(ctx, logs, disable_salt, sample): "--disable-salt", is_flag=True, default=False, help="Set SALT Deduction to 0" ) @click.option( - "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)" + "--assume-w2-wages", + is_flag=True, + default=False, + help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)", ) @click.option("--sample", type=int, help="Sample N records from input") def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample): @@ -171,7 +177,9 @@ def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample df = df.sample(n=sample, random_state=42) # Use StitchedRunner: routes to PE (2021+) or TAXSIM (pre-2021) - runner = StitchedRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages) + runner = StitchedRunner( + df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages + ) results_df = runner.run(show_progress=True) # Use the runner's input_df which has taxsimid (auto-assigned if needed) @@ -241,7 +249,10 @@ def taxsim(input_file, output, sample, taxsim_path): ) @click.option("--logs", is_flag=True, help="Generate PolicyEngine YAML logs") @click.option( - "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)" + "--assume-w2-wages", + is_flag=True, + default=False, + help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)", ) def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_wages): """Compare PolicyEngine and TAXSIM results""" @@ -274,7 +285,9 @@ def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_ # Run PolicyEngine click.echo("Running PolicyEngine...") - pe_runner = PolicyEngineRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages) + pe_runner = PolicyEngineRunner( + df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages + ) pe_results = pe_runner.run() # Use the runner's input_df which has taxsimid (auto-assigned if needed) diff --git a/policyengine_taxsim/core/input_mapper.py b/policyengine_taxsim/core/input_mapper.py index 1c9efeff..1d36227d 100644 --- a/policyengine_taxsim/core/input_mapper.py +++ b/policyengine_taxsim/core/input_mapper.py @@ -185,7 +185,11 @@ def form_household_situation(year, state, taxsim_vars): for i in range(1, depx + 1): dep_name = f"your {get_ordinal(i)} dependent" people[dep_name] = { - "age": {str(year): int(taxsim_vars.get(f"age{i}", 10)) if taxsim_vars.get(f"age{i}") is not None else 10}, + "age": { + str(year): int(taxsim_vars.get(f"age{i}", 10)) + if taxsim_vars.get(f"age{i}") is not None + else 10 + }, "employment_income": {str(year): 0}, "is_tax_unit_dependent": {str(year): True}, "is_tax_unit_spouse": {str(year): False}, @@ -195,38 +199,44 @@ def form_household_situation(year, state, taxsim_vars): household_situation = add_additional_units( state.lower(), year, household_situation, taxsim_vars ) - + # Explicitly set SSI to 0 for all people to prevent PolicyEngine from imputing SSI benefits # TAXSIM does not model SSI, so we need to ensure it's not automatically calculated for person_name in household_situation["people"]: household_situation["people"][person_name]["ssi"] = {str(year): 0} - + # Explicitly set person-level benefit programs to 0 to prevent PolicyEngine from imputing these benefits # TAXSIM does not model these programs, so we need to ensure they're not automatically calculated for person_name in household_situation["people"]: household_situation["people"][person_name]["head_start"] = {str(year): 0} household_situation["people"][person_name]["early_head_start"] = {str(year): 0} - household_situation["people"][person_name]["commodity_supplemental_food_program"] = {str(year): 0} - + household_situation["people"][person_name][ + "commodity_supplemental_food_program" + ] = {str(year): 0} + # Explicitly set SNAP to 0 for all SPM units to prevent PolicyEngine from imputing SNAP benefits # TAXSIM does not model SNAP, so we need to ensure it's not automatically calculated for spm_unit_name in household_situation["spm_units"]: household_situation["spm_units"][spm_unit_name]["snap"] = {str(year): 0} - + # Explicitly set TANF to 0 for all SPM units to prevent PolicyEngine from imputing TANF benefits # TAXSIM does not model TANF, so we need to ensure it's not automatically calculated for spm_unit_name in household_situation["spm_units"]: household_situation["spm_units"][spm_unit_name]["tanf"] = {str(year): 0} - + # Explicitly set free_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing free school meal benefits # TAXSIM does not model free school meals, so we need to ensure it's not automatically calculated for spm_unit_name in household_situation["spm_units"]: - household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {str(year): 0} - + household_situation["spm_units"][spm_unit_name]["free_school_meals"] = { + str(year): 0 + } + # Explicitly set reduced_price_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing reduced price school meal benefits # TAXSIM does not model reduced price school meals, so we need to ensure it's not automatically calculated for spm_unit_name in household_situation["spm_units"]: - household_situation["spm_units"][spm_unit_name]["reduced_price_school_meals"] = {str(year): 0} + household_situation["spm_units"][spm_unit_name][ + "reduced_price_school_meals" + ] = {str(year): 0} return household_situation @@ -308,7 +318,7 @@ def generate_household(taxsim_vars): # Convert TAXSIM32 dependent format if present taxsim_vars = convert_taxsim32_dependents(taxsim_vars) - + taxsim_vars = set_taxsim_defaults(taxsim_vars, int(year)) state = get_state_code(taxsim_vars["state"]) diff --git a/policyengine_taxsim/core/utils.py b/policyengine_taxsim/core/utils.py index b616520f..0a29b69d 100644 --- a/policyengine_taxsim/core/utils.py +++ b/policyengine_taxsim/core/utils.py @@ -97,112 +97,119 @@ def to_roundedup_number(value): def convert_taxsim32_dependents(taxsim_vars): """ - Convert TAXSIM32 dependent count format (dep13, dep17, dep18) + Convert TAXSIM32 dependent count format (dep13, dep17, dep18) to individual age format (age1, age2, etc.). - + TAXSIM32 format uses cumulative counts: - dep13: Number of dependents under 13 - dep17: Number of dependents under 17 (includes those under 13) - dep18: Number of dependents under 18 (includes those under 17 and 13) - + This function infers individual ages based on these counts. If depx exceeds dep18, additional dependents are assigned age 21. - + Args: taxsim_vars (dict): Dictionary containing TAXSIM input variables - + Returns: dict: Updated dictionary with age1, age2, etc. fields added """ # Check if we have the TAXSIM32 format fields present # Just check for presence, not values, since all three could be 0 with depx > 0 (all dependents 18+) - has_taxsim32_fields = 'dep13' in taxsim_vars or 'dep17' in taxsim_vars or 'dep18' in taxsim_vars - + has_taxsim32_fields = ( + "dep13" in taxsim_vars or "dep17" in taxsim_vars or "dep18" in taxsim_vars + ) + # Check if we already have individual age fields explicitly set (including 0 for newborns) # We consider age fields as explicitly set if they exist in the input has_individual_age_fields = any( - f'age{i}' in taxsim_vars and taxsim_vars[f'age{i}'] is not None + f"age{i}" in taxsim_vars and taxsim_vars[f"age{i}"] is not None for i in range(1, 12) ) - + # Get depx value - depx = int(taxsim_vars.get('depx', 0) or 0) - + depx = int(taxsim_vars.get("depx", 0) or 0) + # Only convert if: # 1. We have TAXSIM32 fields (dep13/17/18) with meaningful values # 2. AND we don't already have individual age fields set # This ensures we only convert when TAXSIM32 format is actually being used if has_taxsim32_fields and not has_individual_age_fields: - dep13 = int(taxsim_vars.get('dep13', 0) or 0) - dep17 = int(taxsim_vars.get('dep17', 0) or 0) - dep18 = int(taxsim_vars.get('dep18', 0) or 0) - depx = int(taxsim_vars.get('depx', 0) or 0) - + dep13 = int(taxsim_vars.get("dep13", 0) or 0) + dep17 = int(taxsim_vars.get("dep17", 0) or 0) + dep18 = int(taxsim_vars.get("dep18", 0) or 0) + depx = int(taxsim_vars.get("depx", 0) or 0) + # Calculate the number of dependents in each age bracket # Note: These are cumulative, so we need to subtract to get individual counts num_under_13 = dep13 num_13_to_16 = dep17 - dep13 # Those under 17 but not under 13 num_17 = dep18 - dep17 # Those under 18 but not under 17 (i.e., exactly 17) - + # Calculate number of dependents 18 or older num_18_or_older = 0 if depx > dep18: num_18_or_older = depx - dep18 # These will be assigned age 21 - + # Set depx to the total number of dependents if not already set - if 'depx' not in taxsim_vars or taxsim_vars['depx'] is None: - taxsim_vars['depx'] = max(depx, dep18) + if "depx" not in taxsim_vars or taxsim_vars["depx"] is None: + taxsim_vars["depx"] = max(depx, dep18) else: # Ensure depx is at least as large as dep18 - taxsim_vars['depx'] = max(int(taxsim_vars['depx']), dep18) - + taxsim_vars["depx"] = max(int(taxsim_vars["depx"]), dep18) + # Generate individual ages based on the counts # We'll use typical ages for each bracket dep_counter = 1 - + # Add dependents under 13 (use age 10 as default) for _ in range(num_under_13): if dep_counter <= 11: # TAXSIM supports up to 11 dependents - taxsim_vars[f'age{dep_counter}'] = 10 + taxsim_vars[f"age{dep_counter}"] = 10 dep_counter += 1 - + # Add dependents aged 13-16 (use age 15 as default) for _ in range(num_13_to_16): if dep_counter <= 11: - taxsim_vars[f'age{dep_counter}'] = 15 + taxsim_vars[f"age{dep_counter}"] = 15 dep_counter += 1 - + # Add dependents aged 17 (use age 17) for _ in range(num_17): if dep_counter <= 11: - taxsim_vars[f'age{dep_counter}'] = 17 + taxsim_vars[f"age{dep_counter}"] = 17 dep_counter += 1 - + # Add dependents aged 18 or older (use age 21 as default for adult dependents) for _ in range(num_18_or_older): if dep_counter <= 11: - taxsim_vars[f'age{dep_counter}'] = 21 + taxsim_vars[f"age{dep_counter}"] = 21 dep_counter += 1 - + # Handle NaN and age 0 inputs - default them to age 10 # Check all age fields up to age11 for i in range(1, 12): - age_field = f'age{i}' + age_field = f"age{i}" if age_field in taxsim_vars: age_value = taxsim_vars[age_field] # Check for NaN (using numpy's isnan if it's a numpy type, or math.isnan for regular floats) is_nan = False try: import math + if isinstance(age_value, (float, np.floating)): - is_nan = math.isnan(age_value) if not isinstance(age_value, np.ndarray) else np.isnan(age_value) + is_nan = ( + math.isnan(age_value) + if not isinstance(age_value, np.ndarray) + else np.isnan(age_value) + ) except (TypeError, ValueError): pass - + # If age is NaN or 0, default to 10 if is_nan or age_value == 0 or age_value == 0.0: taxsim_vars[age_field] = 10 - + return taxsim_vars diff --git a/policyengine_taxsim/core/yaml_generator.py b/policyengine_taxsim/core/yaml_generator.py index a3d67965..ff02ab3b 100644 --- a/policyengine_taxsim/core/yaml_generator.py +++ b/policyengine_taxsim/core/yaml_generator.py @@ -164,8 +164,12 @@ def generate_yaml( "ssi": person_data.get("ssi", {}).get(year_str, 0), "wic": person_data.get("wic", {}).get(year_str, 0), "head_start": person_data.get("head_start", {}).get(year_str, 0), - "early_head_start": person_data.get("early_head_start", {}).get(year_str, 0), - "commodity_supplemental_food_program": person_data.get("commodity_supplemental_food_program", {}).get(year_str, 0), + "early_head_start": person_data.get("early_head_start", {}).get( + year_str, 0 + ), + "commodity_supplemental_food_program": person_data.get( + "commodity_supplemental_food_program", {} + ).get(year_str, 0), } # Add optional fields only if they have non-zero values diff --git a/policyengine_taxsim/runners/base_runner.py b/policyengine_taxsim/runners/base_runner.py index 120d2e14..bd36e963 100644 --- a/policyengine_taxsim/runners/base_runner.py +++ b/policyengine_taxsim/runners/base_runner.py @@ -32,9 +32,7 @@ def _validate_input(self): """Validate that input data has required structure""" # year is required by TAXSIM (1960-2023) and PolicyEngine if "year" not in self.input_df.columns: - raise ValueError( - "Input data must contain a 'year' column" - ) + raise ValueError("Input data must contain a 'year' column") # Auto-assign taxsimid if not present if "taxsimid" not in self.input_df.columns: diff --git a/policyengine_taxsim/runners/policyengine_runner.py b/policyengine_taxsim/runners/policyengine_runner.py index dcee759b..0c257f92 100644 --- a/policyengine_taxsim/runners/policyengine_runner.py +++ b/policyengine_taxsim/runners/policyengine_runner.py @@ -427,7 +427,9 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d # For each dependent, figure out which dep number they are dep_position = position_in_hh[dep_mask_indices] dep_has_spouse = has_spouse_expanded[dep_mask_indices] - dep_num = dep_position - np.where(dep_has_spouse, 2, 1) + 1 # 1-indexed + dep_num = ( + dep_position - np.where(dep_has_spouse, 2, 1) + 1 + ) # 1-indexed dep_hh_idx = household_ids[dep_mask_indices] for dep_slot in range(1, 12): @@ -448,7 +450,10 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d default_val = mapping.get("default", 0.0) # Primary values - if isinstance(primary_source, str) and primary_source in year_data.columns: + if ( + isinstance(primary_source, str) + and primary_source in year_data.columns + ): prim_vals = year_data[primary_source].fillna(0).values.astype(float) values[is_primary] = np.repeat(prim_vals, people_per_hh)[is_primary] elif callable(primary_source): @@ -462,7 +467,10 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d values[is_primary] = primary_source # Spouse values - if isinstance(spouse_source, str) and spouse_source in year_data.columns: + if ( + isinstance(spouse_source, str) + and spouse_source in year_data.columns + ): sp_vals = year_data[spouse_source].fillna(0).values.astype(float) values[is_spouse] = np.repeat(sp_vals, people_per_hh)[is_spouse] elif callable(spouse_source): @@ -578,8 +586,12 @@ def _apply_defaults_vectorized(self, df: pd.DataFrame) -> pd.DataFrame: age_cols[col] = np.zeros(len(df), dtype=int) mask = (dep_counter == dep_slot) & (count_series > 0) age_cols[col] = np.where(mask, age_val, age_cols[col]) - dep_counter = np.where(count_series > 0, dep_counter + 1, dep_counter) - count_series = np.where(count_series > 0, count_series - 1, count_series) + dep_counter = np.where( + count_series > 0, dep_counter + 1, dep_counter + ) + count_series = np.where( + count_series > 0, count_series - 1, count_series + ) for col, vals in age_cols.items(): if col not in df.columns: @@ -731,10 +743,18 @@ def generate(self) -> None: # SPM unit data data["spm_unit_id"][year_int] = year_spm_unit_ids data["spm_unit_weight"][year_int] = np.ones(n_year_records) - data["snap"][year_int] = np.zeros(n_year_records) # Set SNAP to 0 to match TAXSIM (which doesn't model SNAP) - data["tanf"][year_int] = np.zeros(n_year_records) # Set TANF to 0 to match TAXSIM (which doesn't model TANF) - data["free_school_meals"][year_int] = np.zeros(n_year_records) # Set free school meals to 0 to match TAXSIM (which doesn't model free school meals) - data["reduced_price_school_meals"][year_int] = np.zeros(n_year_records) # Set reduced price school meals to 0 to match TAXSIM (which doesn't model reduced price school meals) + data["snap"][year_int] = np.zeros( + n_year_records + ) # Set SNAP to 0 to match TAXSIM (which doesn't model SNAP) + data["tanf"][year_int] = np.zeros( + n_year_records + ) # Set TANF to 0 to match TAXSIM (which doesn't model TANF) + data["free_school_meals"][year_int] = np.zeros( + n_year_records + ) # Set free school meals to 0 to match TAXSIM (which doesn't model free school meals) + data["reduced_price_school_meals"][year_int] = np.zeros( + n_year_records + ) # Set reduced price school meals to 0 to match TAXSIM (which doesn't model reduced price school meals) # Marital unit data data["marital_unit_id"][year_int] = year_marital_unit_ids @@ -761,7 +781,11 @@ class PolicyEngineRunner(BaseTaxRunner): """ def __init__( - self, input_df: pd.DataFrame, logs: bool = False, disable_salt: bool = False, assume_w2_wages: bool = False + self, + input_df: pd.DataFrame, + logs: bool = False, + disable_salt: bool = False, + assume_w2_wages: bool = False, ): super().__init__(input_df) self.logs = logs @@ -917,14 +941,10 @@ def _calc_tax_unit(self, sim, var_name, period): values = np.array(sim.calculate(var_name, period=period)) entity_key = var_obj.entity.key if entity_key == "person": - return np.array( - sim.map_result(values, "person", "tax_unit", how="sum") - ) + return np.array(sim.map_result(values, "person", "tax_unit", how="sum")) elif entity_key != "tax_unit": # For household/spm_unit etc., project to person then sum to tax_unit - return np.array( - sim.map_result(values, entity_key, "tax_unit") - ) + return np.array(sim.map_result(values, entity_key, "tax_unit")) return values def _extract_vectorized_results( @@ -1024,8 +1044,7 @@ def _extract_vectorized_results( ) unified_vars_list = ( all( - sim.tax_benefit_system.variables.get(v) - is not None + sim.tax_benefit_system.variables.get(v) is not None for v in variables_list ) if variables_list @@ -1063,7 +1082,9 @@ def _extract_vectorized_results( ): continue try: - arr = self._calc_tax_unit(sim, resolved, year_str) + arr = self._calc_tax_unit( + sim, resolved, year_str + ) var_sum += arr except Exception as e: if "does not exist" in str(e): @@ -1081,7 +1102,9 @@ def _extract_vectorized_results( ): continue try: - arr = self._calc_tax_unit(sim, resolved, year_str) + arr = self._calc_tax_unit( + sim, resolved, year_str + ) result_array[state_mask] = arr[state_mask] except Exception as e: if "does not exist" in str(e): @@ -1126,9 +1149,7 @@ def _extract_vectorized_results( else: fiitax_arr = self._calc_tax_unit( sim, "income_tax", year_str - ) + self._calc_tax_unit( - sim, "additional_medicare_tax", year_str - ) + ) + self._calc_tax_unit(sim, "additional_medicare_tax", year_str) columns["fiitax"] = np.round(fiitax_arr, 2) # Apply idtl filtering: mask out columns not requested by each row's idtl diff --git a/policyengine_taxsim/runners/stitched_runner.py b/policyengine_taxsim/runners/stitched_runner.py index 52d7dd5f..16194090 100644 --- a/policyengine_taxsim/runners/stitched_runner.py +++ b/policyengine_taxsim/runners/stitched_runner.py @@ -36,8 +36,7 @@ def run(self, show_progress: bool = True) -> pd.DataFrame: # Warn if PE-only kwargs are set but some rows go to TAXSIM if taxsim_mask.any(): active_pe_kwargs = { - k for k, v in self._pe_kwargs.items() - if k in self._PE_ONLY_KWARGS and v + k for k, v in self._pe_kwargs.items() if k in self._PE_ONLY_KWARGS and v } if active_pe_kwargs: logger.warning( @@ -51,9 +50,7 @@ def run(self, show_progress: bool = True) -> pd.DataFrame: frames = [] if pe_mask.any(): - pe_runner = PolicyEngineRunner( - self.input_df[pe_mask], **self._pe_kwargs - ) + pe_runner = PolicyEngineRunner(self.input_df[pe_mask], **self._pe_kwargs) frames.append(pe_runner.run(show_progress=show_progress)) if taxsim_mask.any(): diff --git a/policyengine_taxsim/runners/taxsim_runner.py b/policyengine_taxsim/runners/taxsim_runner.py index 4eab53b7..35cc0a1b 100644 --- a/policyengine_taxsim/runners/taxsim_runner.py +++ b/policyengine_taxsim/runners/taxsim_runner.py @@ -37,7 +37,7 @@ class TaxsimRunner(BaseTaxRunner): "age10", "age11", ] - + # TAXSIM32 format columns for dependent counts by age bracket TAXSIM32_COLUMNS = [ "dep13", # Number of dependents under 13 @@ -71,7 +71,9 @@ class TaxsimRunner(BaseTaxRunner): "idtl", # Output control ] - ALL_COLUMNS = REQUIRED_COLUMNS + DEPENDENT_AGE_COLUMNS + TAXSIM32_COLUMNS + INCOME_COLUMNS + ALL_COLUMNS = ( + REQUIRED_COLUMNS + DEPENDENT_AGE_COLUMNS + TAXSIM32_COLUMNS + INCOME_COLUMNS + ) def __init__(self, input_df: pd.DataFrame, taxsim_path: str = None): super().__init__(input_df) @@ -98,15 +100,27 @@ def _detect_taxsim_executable(self) -> Path: # 1. Relative path (for running from repo during development) Path("resources") / "taxsimtest" / exe_name, # 2. Shared data location (for pip-installed packages) - Path(sys.prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name, + Path(sys.prefix) + / "share" + / "policyengine_taxsim" + / "taxsimtest" + / exe_name, # 3. User site-packages shared data - Path(sys.base_prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name, + Path(sys.base_prefix) + / "share" + / "policyengine_taxsim" + / "taxsimtest" + / exe_name, ] # Also check virtualenv locations if hasattr(sys, "real_prefix"): # virtualenv search_paths.append( - Path(sys.real_prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name + Path(sys.real_prefix) + / "share" + / "policyengine_taxsim" + / "taxsimtest" + / exe_name ) for taxsim_path in search_paths: @@ -154,7 +168,7 @@ def _format_input_for_taxsim(self, df: pd.DataFrame) -> pd.DataFrame: # Add only age columns for actual dependents (up to 11 max) for i in range(min(depx, 11)): - age_col = f"age{i+1}" + age_col = f"age{i + 1}" dynamic_columns.append(age_col) # Add income columns (but exclude TAXSIM32 columns since TAXSIM-35 uses individual ages) diff --git a/pyproject.toml b/pyproject.toml index 1f4f1cb8..e46b71e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ include = [ dependencies = [ "pytest", "pytest-cov", + "ruff>=0.9.0", ] [tool.hatch.envs.default.scripts] diff --git a/tests/test_assume_w2_wages.py b/tests/test_assume_w2_wages.py index c27fbb7c..fc6d7750 100644 --- a/tests/test_assume_w2_wages.py +++ b/tests/test_assume_w2_wages.py @@ -125,9 +125,7 @@ def test_scorp_above_threshold_qbid_changes(self): ) # At least one record should show a meaningful difference - fiitax_diff = ( - result_default["fiitax"].values - result_w2["fiitax"].values - ) + fiitax_diff = result_default["fiitax"].values - result_w2["fiitax"].values assert fiitax_diff.max() > 100, ( f"Expected meaningful QBID difference for high-income S-Corp filers, " f"but max fiitax reduction was only ${fiitax_diff.max():.2f}" diff --git a/tests/test_cli_entry_point.py b/tests/test_cli_entry_point.py index 7c5898b2..51169f0d 100644 --- a/tests/test_cli_entry_point.py +++ b/tests/test_cli_entry_point.py @@ -12,9 +12,7 @@ def test_cli_entry_point_resolves(): """The console script entry point must import and be callable.""" dist = distribution("policyengine-taxsim") - console_scripts = [ - ep for ep in dist.entry_points if ep.group == "console_scripts" - ] + console_scripts = [ep for ep in dist.entry_points if ep.group == "console_scripts"] assert len(console_scripts) == 1, "Expected exactly one console script" ep = console_scripts[0] assert ep.name == "policyengine-taxsim" diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 9faf7fb2..3c752e97 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -9,7 +9,6 @@ class E2ETest(unittest.TestCase): - def setUp(self) -> None: import importlib.resources as pkg_resources import policyengine_taxsim @@ -236,7 +235,7 @@ def test_match_single_household_output(self): # Compare standard_output_cols = ["year", "fiitax", "siitax"] full_output_cols = standard_output_cols + [ - "tfica" "v10", # state_agi + "tficav10", # state_agi "v13", "v18", "v19", @@ -350,17 +349,20 @@ def test_match_joint_household_output(self): # "fiitax", "siitax", ] - full_output_cols = standard_output_cols + [ - # "tfica" - # "v10", # state_agi - # "v13", - # "v18", - # "v19", - # "v26", - # "v28", - # "v34", - # "v45", - ] + full_output_cols = ( + standard_output_cols + + [ + # "tfica" + # "v10", # state_agi + # "v13", + # "v18", + # "v19", + # "v26", + # "v28", + # "v34", + # "v45", + ] + ) # Determine which columns to check based on idtl value columns_to_check = ( @@ -461,17 +463,20 @@ def test_match_household_with_dependent_output(self): # "fiitax", "siitax", ] - full_output_cols = standard_output_cols + [ - # "tfica" - # "v10", # state_agi - # "v13", - # "v18", - # "v19", - # "v26", - # "v28", - # "v34", - # "v45", - ] + full_output_cols = ( + standard_output_cols + + [ + # "tfica" + # "v10", # state_agi + # "v13", + # "v18", + # "v19", + # "v26", + # "v28", + # "v34", + # "v45", + ] + ) # Determine which columns to check based on idtl value columns_to_check = ( @@ -581,17 +586,20 @@ def test_match_household_with_dependent_single_parent_output(self): # "fiitax", "siitax", ] - full_output_cols = standard_output_cols + [ - # "tfica" - # "v10", # state_agi - # "v13", - # "v18", - # "v19", - # "v26", - # "v28", - # "v34", - # "v45", - ] + full_output_cols = ( + standard_output_cols + + [ + # "tfica" + # "v10", # state_agi + # "v13", + # "v18", + # "v19", + # "v26", + # "v28", + # "v34", + # "v45", + ] + ) # Determine which columns to check based on idtl value columns_to_check = ( diff --git a/tests/test_mappers.py b/tests/test_mappers.py index 1d23c93d..a24792a1 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -191,7 +191,6 @@ def test_export_single_household(sample_taxsim_input): "you": { "age": {"2021": 35}, "employment_income": {"2021": 50000}, - "is_tax_unit_head": {"2021": True}, } }, @@ -412,7 +411,7 @@ def test_household_with_dependent(sample_taxsim_input_for_household_with_depende "your first dependent", "your second dependent", ], - "pa_use_tax": {"2023": 0}, + "pa_use_tax": {"2023": 0}, } }, } @@ -492,7 +491,7 @@ def test_household_with_dependent_single_parent( "tax_units": { "your tax unit": { "members": ["you", "your first dependent", "your second dependent"], - "pa_use_tax": {"2023": 0}, + "pa_use_tax": {"2023": 0}, } }, } diff --git a/tests/test_multi_state.py b/tests/test_multi_state.py index 66ab20b9..ef60fdf6 100644 --- a/tests/test_multi_state.py +++ b/tests/test_multi_state.py @@ -81,11 +81,46 @@ def _mixed_contiguous_and_noncontiguous(): """Mix of contiguous (KY, CA, NY) and non-contiguous (AK, HI) states.""" return pd.DataFrame( [ - {"taxsimid": 1, "year": 2023, "state": 18, "mstat": 1, "pwages": 50000, "idtl": 0}, # KY - {"taxsimid": 2, "year": 2023, "state": 5, "mstat": 1, "pwages": 50000, "idtl": 0}, # CA - {"taxsimid": 3, "year": 2023, "state": 33, "mstat": 1, "pwages": 50000, "idtl": 0}, # NY - {"taxsimid": 4, "year": 2023, "state": 2, "mstat": 1, "pwages": 50000, "idtl": 0}, # AK - {"taxsimid": 5, "year": 2023, "state": 12, "mstat": 1, "pwages": 50000, "idtl": 0}, # HI + { + "taxsimid": 1, + "year": 2023, + "state": 18, + "mstat": 1, + "pwages": 50000, + "idtl": 0, + }, # KY + { + "taxsimid": 2, + "year": 2023, + "state": 5, + "mstat": 1, + "pwages": 50000, + "idtl": 0, + }, # CA + { + "taxsimid": 3, + "year": 2023, + "state": 33, + "mstat": 1, + "pwages": 50000, + "idtl": 0, + }, # NY + { + "taxsimid": 4, + "year": 2023, + "state": 2, + "mstat": 1, + "pwages": 50000, + "idtl": 0, + }, # AK + { + "taxsimid": 5, + "year": 2023, + "state": 12, + "mstat": 1, + "pwages": 50000, + "idtl": 0, + }, # HI ] ) diff --git a/tests/test_performance.py b/tests/test_performance.py index fd0b01be..c3838d20 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -103,9 +103,7 @@ def test_generate_does_not_scale_linearly(self): times = {} for n in [100, 500]: records = _make_synthetic_records(n, seed=42) - runner = PolicyEngineRunner( - records.copy(), logs=False, disable_salt=True - ) + runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True) runner.input_df["year"] = runner.input_df["year"].apply( lambda x: int(float(x)) ) @@ -133,9 +131,7 @@ def test_extract_builds_dataframe_without_row_loop(self): this would be slow at higher record counts. """ records = _make_synthetic_records(200, seed=55) - runner = PolicyEngineRunner( - records.copy(), logs=False, disable_salt=True - ) + runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True) orig_extract = runner._extract_vectorized_results.__func__ extract_time = {} @@ -146,9 +142,7 @@ def timed_extract(self_runner, sim, input_df): extract_time["t"] = time.time() - t0 return result - runner._extract_vectorized_results = types.MethodType( - timed_extract, runner - ) + runner._extract_vectorized_results = types.MethodType(timed_extract, runner) result = runner.run(show_progress=False) assert len(result) == 200 @@ -172,9 +166,7 @@ def test_benchmark_500_records(self): elapsed = time.time() - start assert len(result) == 500 - assert elapsed < 60, ( - f"500 records took {elapsed:.1f}s, expected < 60s" - ) + assert elapsed < 60, f"500 records took {elapsed:.1f}s, expected < 60s" print(f"\nBenchmark: 500 records in {elapsed:.1f}s") def test_benchmark_cps_like(self): @@ -190,29 +182,31 @@ def test_benchmark_cps_like(self): mstat = rng.choice([1, 2], size=n, p=[0.55, 0.45]) depx = rng.choice([0, 1, 2, 3, 4], size=n, p=[0.35, 0.25, 0.2, 0.15, 0.05]) - records = pd.DataFrame({ - "taxsimid": np.arange(1, n + 1), - "year": 2023, - "state": rng.choice(all_states, size=n), - "mstat": mstat, - "depx": depx, - "page": rng.randint(20, 75, size=n), - "sage": np.where(mstat == 2, rng.randint(20, 75, size=n), 0), - "pwages": rng.lognormal(10.5, 1.0, size=n).round(2), - "swages": np.where( - mstat == 2, rng.lognormal(10.0, 1.2, size=n).round(2), 0 - ), - "dividends": np.where( - rng.random(n) < 0.15, rng.lognormal(8, 2, size=n).round(2), 0 - ), - "intrec": np.where( - rng.random(n) < 0.25, rng.lognormal(7, 1.5, size=n).round(2), 0 - ), - "pensions": np.where( - rng.random(n) < 0.20, rng.lognormal(9.5, 1.0, size=n).round(2), 0 - ), - "idtl": 2, - }) + records = pd.DataFrame( + { + "taxsimid": np.arange(1, n + 1), + "year": 2023, + "state": rng.choice(all_states, size=n), + "mstat": mstat, + "depx": depx, + "page": rng.randint(20, 75, size=n), + "sage": np.where(mstat == 2, rng.randint(20, 75, size=n), 0), + "pwages": rng.lognormal(10.5, 1.0, size=n).round(2), + "swages": np.where( + mstat == 2, rng.lognormal(10.0, 1.2, size=n).round(2), 0 + ), + "dividends": np.where( + rng.random(n) < 0.15, rng.lognormal(8, 2, size=n).round(2), 0 + ), + "intrec": np.where( + rng.random(n) < 0.25, rng.lognormal(7, 1.5, size=n).round(2), 0 + ), + "pensions": np.where( + rng.random(n) < 0.20, rng.lognormal(9.5, 1.0, size=n).round(2), 0 + ), + "idtl": 2, + } + ) # Add dependent ages for i in range(1, 5): @@ -229,11 +223,11 @@ def test_benchmark_cps_like(self): elapsed = time.time() - start assert len(result) == n - print(f"\nBenchmark (CPS-like): {n} records, {records['state'].nunique()} states, idtl=2") - print(f" Total: {elapsed:.1f}s") - assert elapsed < 120, ( - f"CPS-like benchmark took {elapsed:.1f}s, expected < 120s" + print( + f"\nBenchmark (CPS-like): {n} records, {records['state'].nunique()} states, idtl=2" ) + print(f" Total: {elapsed:.1f}s") + assert elapsed < 120, f"CPS-like benchmark took {elapsed:.1f}s, expected < 120s" class TestStateVariableEfficiency: @@ -249,23 +243,23 @@ def test_extract_does_not_iterate_states(self): rng = np.random.RandomState(88) all_states = list(range(1, 52)) n = 50 - records = pd.DataFrame({ - "taxsimid": np.arange(1, n + 1), - "year": 2023, - "state": rng.choice(all_states, size=n), - "mstat": 1, - "depx": 0, - "page": 40, - "sage": 0, - "pwages": rng.uniform(30000, 100000, size=n).round(2), - "swages": 0.0, - "idtl": 2, # full output to trigger all state vars - }) - - runner = PolicyEngineRunner( - records.copy(), logs=False, disable_salt=True + records = pd.DataFrame( + { + "taxsimid": np.arange(1, n + 1), + "year": 2023, + "state": rng.choice(all_states, size=n), + "mstat": 1, + "depx": 0, + "page": 40, + "sage": 0, + "pwages": rng.uniform(30000, 100000, size=n).round(2), + "swages": 0.0, + "idtl": 2, # full output to trigger all state vars + } ) + runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True) + # Count _calc_tax_unit calls orig_calc_tu = runner._calc_tax_unit.__func__ calc_count = {"n": 0} @@ -291,21 +285,21 @@ def test_state_variable_values_match(self): Verify that unified state variable results match expected values. state_income_tax (already unified) should match siitax column. """ - records = pd.DataFrame({ - "taxsimid": [1, 2], - "year": 2023, - "state": [5, 33], # CA, NY - "mstat": 1, - "depx": 0, - "page": 40, - "sage": 0, - "pwages": [80000.0, 60000.0], - "swages": 0.0, - "idtl": 2, - }) - runner = PolicyEngineRunner( - records.copy(), logs=False, disable_salt=True + records = pd.DataFrame( + { + "taxsimid": [1, 2], + "year": 2023, + "state": [5, 33], # CA, NY + "mstat": 1, + "depx": 0, + "page": 40, + "sage": 0, + "pwages": [80000.0, 60000.0], + "swages": 0.0, + "idtl": 2, + } ) + runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True) result = runner.run(show_progress=False) # siitax should be nonzero for CA and NY diff --git a/tests/test_stitched_runner.py b/tests/test_stitched_runner.py index be6e6fe8..7ce2bdfe 100644 --- a/tests/test_stitched_runner.py +++ b/tests/test_stitched_runner.py @@ -68,12 +68,8 @@ def test_all_taxsim_years(self, MockPE, MockTaxsim): def test_mixed_years(self, MockPE, MockTaxsim): """Mixed years split correctly between engines.""" df = _make_input([(1, 2019), (2, 2022), (3, 1990), (4, 2025)]) - MockPE.return_value.run.return_value = _make_result( - [(2, 2022), (4, 2025)] - ) - MockTaxsim.return_value.run.return_value = _make_result( - [(1, 2019), (3, 1990)] - ) + MockPE.return_value.run.return_value = _make_result([(2, 2022), (4, 2025)]) + MockTaxsim.return_value.run.return_value = _make_result([(1, 2019), (3, 1990)]) runner = StitchedRunner(df) runner.run(show_progress=False) @@ -135,12 +131,8 @@ class TestOutputOrdering: @patch("policyengine_taxsim.runners.stitched_runner.PolicyEngineRunner") def test_order_preserved(self, MockPE, MockTaxsim): df = _make_input([(3, 2019), (1, 2023), (4, 1990), (2, 2025)]) - MockPE.return_value.run.return_value = _make_result( - [(1, 2023), (2, 2025)] - ) - MockTaxsim.return_value.run.return_value = _make_result( - [(3, 2019), (4, 1990)] - ) + MockPE.return_value.run.return_value = _make_result([(1, 2023), (2, 2025)]) + MockTaxsim.return_value.run.return_value = _make_result([(3, 2019), (4, 1990)]) runner = StitchedRunner(df) result = runner.run(show_progress=False) @@ -256,6 +248,7 @@ def test_pe_kwargs_warning_for_taxsim_rows(self, MockPE, MockTaxsim, caplog): MockTaxsim.return_value.run.return_value = _make_result([(1, 2020)]) import logging + with caplog.at_level(logging.WARNING): runner = StitchedRunner(df, logs=True, disable_salt=True) runner.run(show_progress=False)