From 064582b23d2892d32498510a84a5ee44f5f79ca0 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 7 Mar 2026 08:58:46 -0500
Subject: [PATCH] Add ruff formatter with CI lint job and Makefile target

Formats all Python files with ruff, adds ruff>=0.9.0 to dev dependencies,
adds a `ruff format --check .` CI step, and a `make format` target.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/ci.yml                      |   9 ++
 Makefile                                      |   3 +
 changelog.d/add-ruff.added.md                 |   1 +
 policyengine_taxsim/cli.py                    |  25 +++-
 policyengine_taxsim/core/input_mapper.py      |  32 +++--
 policyengine_taxsim/core/utils.py             |  77 ++++++-----
 policyengine_taxsim/core/yaml_generator.py    |   8 +-
 policyengine_taxsim/runners/base_runner.py    |   4 +-
 .../runners/policyengine_runner.py            |  67 +++++----
 .../runners/stitched_runner.py                |   7 +-
 policyengine_taxsim/runners/taxsim_runner.py  |  26 +++-
 pyproject.toml                                |   1 +
 tests/test_assume_w2_wages.py                 |   4 +-
 tests/test_cli_entry_point.py                 |   4 +-
 tests/test_e2e.py                             |  78 ++++++-----
 tests/test_mappers.py                         |   5 +-
 tests/test_multi_state.py                     |  45 +++++-
 tests/test_performance.py                     | 130 +++++++++---------
 tests/test_stitched_runner.py                 |  17 +--
 19 files changed, 323 insertions(+), 220 deletions(-)
 create mode 100644 changelog.d/add-ruff.added.md

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index aa878dd3..90fd655e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,15 @@ on:
     branches: [main]
 
 jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install ruff
+        run: pip install "ruff>=0.9.0"
+      - name: Check formatting
+        run: ruff format --check .
+
   test:
     runs-on: ${{ matrix.os }}
     strategy:
diff --git a/Makefile b/Makefile
index 9fccb96e..acd9ef06 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,7 @@
 
+format:
+	ruff format .
+
 changelog:
 	python .github/bump_version.py
 	towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
diff --git a/changelog.d/add-ruff.added.md b/changelog.d/add-ruff.added.md
new file mode 100644
index 00000000..993e55e3
--- /dev/null
+++ b/changelog.d/add-ruff.added.md
@@ -0,0 +1 @@
+Added ruff formatter with CI check and Makefile target.
\ No newline at end of file
diff --git a/policyengine_taxsim/cli.py b/policyengine_taxsim/cli.py
index cdb17d7a..ada17a40 100644
--- a/policyengine_taxsim/cli.py
+++ b/policyengine_taxsim/cli.py
@@ -23,7 +23,10 @@
     from policyengine_taxsim.comparison.statistics import ComparisonStatistics
     from policyengine_taxsim.core.yaml_generator import generate_pe_tests_yaml
     from policyengine_taxsim.core.input_mapper import form_household_situation
-    from policyengine_taxsim.core.utils import get_state_code, convert_taxsim32_dependents
+    from policyengine_taxsim.core.utils import (
+        get_state_code,
+        convert_taxsim32_dependents,
+    )
 
 
 def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):
@@ -66,7 +69,7 @@ def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):
 
             # Generate YAML file
             # Use taxsimid from row if available, otherwise use index + 1
-            taxsim_id = int(row['taxsimid']) if 'taxsimid' in row else idx + 1
+            taxsim_id = int(row["taxsimid"]) if "taxsimid" in row else idx + 1
             yaml_filename = f"taxsim_record_{taxsim_id}_{year}.yaml"
             generate_pe_tests_yaml(household, outputs, yaml_filename, logs=True)
 
@@ -151,7 +154,10 @@ def cli(ctx, logs, disable_salt, sample):
     "--disable-salt", is_flag=True, default=False, help="Set SALT Deduction to 0"
 )
 @click.option(
-    "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
+    "--assume-w2-wages",
+    is_flag=True,
+    default=False,
+    help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
 )
 @click.option("--sample", type=int, help="Sample N records from input")
 def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample):
@@ -171,7 +177,9 @@ def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample
             df = df.sample(n=sample, random_state=42)
 
         # Use StitchedRunner: routes to PE (2021+) or TAXSIM (pre-2021)
-        runner = StitchedRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
+        runner = StitchedRunner(
+            df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
+        )
         results_df = runner.run(show_progress=True)
 
         # Use the runner's input_df which has taxsimid (auto-assigned if needed)
@@ -241,7 +249,10 @@ def taxsim(input_file, output, sample, taxsim_path):
 )
 @click.option("--logs", is_flag=True, help="Generate PolicyEngine YAML logs")
 @click.option(
-    "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
+    "--assume-w2-wages",
+    is_flag=True,
+    default=False,
+    help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
 )
 def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_wages):
     """Compare PolicyEngine and TAXSIM results"""
@@ -274,7 +285,9 @@ def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_
 
         # Run PolicyEngine
         click.echo("Running PolicyEngine...")
-        pe_runner = PolicyEngineRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
+        pe_runner = PolicyEngineRunner(
+            df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
+        )
         pe_results = pe_runner.run()
 
         # Use the runner's input_df which has taxsimid (auto-assigned if needed)
diff --git a/policyengine_taxsim/core/input_mapper.py b/policyengine_taxsim/core/input_mapper.py
index 1c9efeff..1d36227d 100644
--- a/policyengine_taxsim/core/input_mapper.py
+++ b/policyengine_taxsim/core/input_mapper.py
@@ -185,7 +185,11 @@ def form_household_situation(year, state, taxsim_vars):
     for i in range(1, depx + 1):
         dep_name = f"your {get_ordinal(i)} dependent"
         people[dep_name] = {
-            "age": {str(year): int(taxsim_vars.get(f"age{i}", 10)) if taxsim_vars.get(f"age{i}") is not None else 10},
+            "age": {
+                str(year): int(taxsim_vars.get(f"age{i}", 10))
+                if taxsim_vars.get(f"age{i}") is not None
+                else 10
+            },
             "employment_income": {str(year): 0},
             "is_tax_unit_dependent": {str(year): True},
             "is_tax_unit_spouse": {str(year): False},
@@ -195,38 +199,44 @@ def form_household_situation(year, state, taxsim_vars):
     household_situation = add_additional_units(
         state.lower(), year, household_situation, taxsim_vars
     )
-    
+
     # Explicitly set SSI to 0 for all people to prevent PolicyEngine from imputing SSI benefits
     # TAXSIM does not model SSI, so we need to ensure it's not automatically calculated
     for person_name in household_situation["people"]:
         household_situation["people"][person_name]["ssi"] = {str(year): 0}
-    
+
     # Explicitly set person-level benefit programs to 0 to prevent PolicyEngine from imputing these benefits
     # TAXSIM does not model these programs, so we need to ensure they're not automatically calculated
     for person_name in household_situation["people"]:
         household_situation["people"][person_name]["head_start"] = {str(year): 0}
         household_situation["people"][person_name]["early_head_start"] = {str(year): 0}
-        household_situation["people"][person_name]["commodity_supplemental_food_program"] = {str(year): 0}
-    
+        household_situation["people"][person_name][
+            "commodity_supplemental_food_program"
+        ] = {str(year): 0}
+
     # Explicitly set SNAP to 0 for all SPM units to prevent PolicyEngine from imputing SNAP benefits
     # TAXSIM does not model SNAP, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
         household_situation["spm_units"][spm_unit_name]["snap"] = {str(year): 0}
-    
+
     # Explicitly set TANF to 0 for all SPM units to prevent PolicyEngine from imputing TANF benefits
     # TAXSIM does not model TANF, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
         household_situation["spm_units"][spm_unit_name]["tanf"] = {str(year): 0}
-    
+
     # Explicitly set free_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing free school meal benefits
     # TAXSIM does not model free school meals, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
-        household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {str(year): 0}
-    
+        household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {
+            str(year): 0
+        }
+
     # Explicitly set reduced_price_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing reduced price school meal benefits
     # TAXSIM does not model reduced price school meals, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
-        household_situation["spm_units"][spm_unit_name]["reduced_price_school_meals"] = {str(year): 0}
+        household_situation["spm_units"][spm_unit_name][
+            "reduced_price_school_meals"
+        ] = {str(year): 0}
 
     return household_situation
 
@@ -308,7 +318,7 @@ def generate_household(taxsim_vars):
 
     # Convert TAXSIM32 dependent format if present
     taxsim_vars = convert_taxsim32_dependents(taxsim_vars)
-    
+
     taxsim_vars = set_taxsim_defaults(taxsim_vars, int(year))
 
     state = get_state_code(taxsim_vars["state"])
diff --git a/policyengine_taxsim/core/utils.py b/policyengine_taxsim/core/utils.py
index b616520f..0a29b69d 100644
--- a/policyengine_taxsim/core/utils.py
+++ b/policyengine_taxsim/core/utils.py
@@ -97,112 +97,119 @@ def to_roundedup_number(value):
 
 def convert_taxsim32_dependents(taxsim_vars):
     """
-    Convert TAXSIM32 dependent count format (dep13, dep17, dep18) 
+    Convert TAXSIM32 dependent count format (dep13, dep17, dep18)
     to individual age format (age1, age2, etc.).
-    
+
     TAXSIM32 format uses cumulative counts:
     - dep13: Number of dependents under 13
     - dep17: Number of dependents under 17 (includes those under 13)
     - dep18: Number of dependents under 18 (includes those under 17 and 13)
-    
+
     This function infers individual ages based on these counts.
     If depx exceeds dep18, additional dependents are assigned age 21.
-    
+
     Args:
         taxsim_vars (dict): Dictionary containing TAXSIM input variables
-        
+
     Returns:
         dict: Updated dictionary with age1, age2, etc. fields added
     """
     # Check if we have the TAXSIM32 format fields present
     # Just check for presence, not values, since all three could be 0 with depx > 0 (all dependents 18+)
-    has_taxsim32_fields = 'dep13' in taxsim_vars or 'dep17' in taxsim_vars or 'dep18' in taxsim_vars
-    
+    has_taxsim32_fields = (
+        "dep13" in taxsim_vars or "dep17" in taxsim_vars or "dep18" in taxsim_vars
+    )
+
     # Check if we already have individual age fields explicitly set (including 0 for newborns)
     # We consider age fields as explicitly set if they exist in the input
     has_individual_age_fields = any(
-        f'age{i}' in taxsim_vars and taxsim_vars[f'age{i}'] is not None
+        f"age{i}" in taxsim_vars and taxsim_vars[f"age{i}"] is not None
         for i in range(1, 12)
     )
-    
+
     # Get depx value
-    depx = int(taxsim_vars.get('depx', 0) or 0)
-    
+    depx = int(taxsim_vars.get("depx", 0) or 0)
+
     # Only convert if:
     # 1. We have TAXSIM32 fields (dep13/17/18) with meaningful values
     # 2. AND we don't already have individual age fields set
     # This ensures we only convert when TAXSIM32 format is actually being used
     if has_taxsim32_fields and not has_individual_age_fields:
-        dep13 = int(taxsim_vars.get('dep13', 0) or 0)
-        dep17 = int(taxsim_vars.get('dep17', 0) or 0)
-        dep18 = int(taxsim_vars.get('dep18', 0) or 0)
-        depx = int(taxsim_vars.get('depx', 0) or 0)
-        
+        dep13 = int(taxsim_vars.get("dep13", 0) or 0)
+        dep17 = int(taxsim_vars.get("dep17", 0) or 0)
+        dep18 = int(taxsim_vars.get("dep18", 0) or 0)
+        depx = int(taxsim_vars.get("depx", 0) or 0)
+
         # Calculate the number of dependents in each age bracket
         # Note: These are cumulative, so we need to subtract to get individual counts
         num_under_13 = dep13
         num_13_to_16 = dep17 - dep13  # Those under 17 but not under 13
         num_17 = dep18 - dep17  # Those under 18 but not under 17 (i.e., exactly 17)
-        
+
         # Calculate number of dependents 18 or older
         num_18_or_older = 0
         if depx > dep18:
             num_18_or_older = depx - dep18  # These will be assigned age 21
-        
+
         # Set depx to the total number of dependents if not already set
-        if 'depx' not in taxsim_vars or taxsim_vars['depx'] is None:
-            taxsim_vars['depx'] = max(depx, dep18)
+        if "depx" not in taxsim_vars or taxsim_vars["depx"] is None:
+            taxsim_vars["depx"] = max(depx, dep18)
         else:
             # Ensure depx is at least as large as dep18
-            taxsim_vars['depx'] = max(int(taxsim_vars['depx']), dep18)
-        
+            taxsim_vars["depx"] = max(int(taxsim_vars["depx"]), dep18)
+
         # Generate individual ages based on the counts
         # We'll use typical ages for each bracket
         dep_counter = 1
-        
+
         # Add dependents under 13 (use age 10 as default)
         for _ in range(num_under_13):
             if dep_counter <= 11:  # TAXSIM supports up to 11 dependents
-                taxsim_vars[f'age{dep_counter}'] = 10
+                taxsim_vars[f"age{dep_counter}"] = 10
                 dep_counter += 1
-        
+
         # Add dependents aged 13-16 (use age 15 as default)
         for _ in range(num_13_to_16):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 15
+                taxsim_vars[f"age{dep_counter}"] = 15
                 dep_counter += 1
-        
+
         # Add dependents aged 17 (use age 17)
         for _ in range(num_17):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 17
+                taxsim_vars[f"age{dep_counter}"] = 17
                 dep_counter += 1
-        
+
         # Add dependents aged 18 or older (use age 21 as default for adult dependents)
         for _ in range(num_18_or_older):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 21
+                taxsim_vars[f"age{dep_counter}"] = 21
                 dep_counter += 1
-    
+
     # Handle NaN and age 0 inputs - default them to age 10
     # Check all age fields up to age11
     for i in range(1, 12):
-        age_field = f'age{i}'
+        age_field = f"age{i}"
         if age_field in taxsim_vars:
             age_value = taxsim_vars[age_field]
             # Check for NaN (using numpy's isnan if it's a numpy type, or math.isnan for regular floats)
             is_nan = False
             try:
                 import math
+
                 if isinstance(age_value, (float, np.floating)):
-                    is_nan = math.isnan(age_value) if not isinstance(age_value, np.ndarray) else np.isnan(age_value)
+                    is_nan = (
+                        math.isnan(age_value)
+                        if not isinstance(age_value, np.ndarray)
+                        else np.isnan(age_value)
+                    )
             except (TypeError, ValueError):
                 pass
-            
+
             # If age is NaN or 0, default to 10
             if is_nan or age_value == 0 or age_value == 0.0:
                 taxsim_vars[age_field] = 10
-    
+
     return taxsim_vars
 
 
diff --git a/policyengine_taxsim/core/yaml_generator.py b/policyengine_taxsim/core/yaml_generator.py
index a3d67965..ff02ab3b 100644
--- a/policyengine_taxsim/core/yaml_generator.py
+++ b/policyengine_taxsim/core/yaml_generator.py
@@ -164,8 +164,12 @@ def generate_yaml(
                 "ssi": person_data.get("ssi", {}).get(year_str, 0),
                 "wic": person_data.get("wic", {}).get(year_str, 0),
                 "head_start": person_data.get("head_start", {}).get(year_str, 0),
-                "early_head_start": person_data.get("early_head_start", {}).get(year_str, 0),
-                "commodity_supplemental_food_program": person_data.get("commodity_supplemental_food_program", {}).get(year_str, 0),
+                "early_head_start": person_data.get("early_head_start", {}).get(
+                    year_str, 0
+                ),
+                "commodity_supplemental_food_program": person_data.get(
+                    "commodity_supplemental_food_program", {}
+                ).get(year_str, 0),
             }
 
             # Add optional fields only if they have non-zero values
diff --git a/policyengine_taxsim/runners/base_runner.py b/policyengine_taxsim/runners/base_runner.py
index 120d2e14..bd36e963 100644
--- a/policyengine_taxsim/runners/base_runner.py
+++ b/policyengine_taxsim/runners/base_runner.py
@@ -32,9 +32,7 @@ def _validate_input(self):
         """Validate that input data has required structure"""
         # year is required by TAXSIM (1960-2023) and PolicyEngine
         if "year" not in self.input_df.columns:
-            raise ValueError(
-                "Input data must contain a 'year' column"
-            )
+            raise ValueError("Input data must contain a 'year' column")
 
         # Auto-assign taxsimid if not present
         if "taxsimid" not in self.input_df.columns:
diff --git a/policyengine_taxsim/runners/policyengine_runner.py b/policyengine_taxsim/runners/policyengine_runner.py
index dcee759b..0c257f92 100644
--- a/policyengine_taxsim/runners/policyengine_runner.py
+++ b/policyengine_taxsim/runners/policyengine_runner.py
@@ -427,7 +427,9 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d
                     # For each dependent, figure out which dep number they are
                     dep_position = position_in_hh[dep_mask_indices]
                     dep_has_spouse = has_spouse_expanded[dep_mask_indices]
-                    dep_num = dep_position - np.where(dep_has_spouse, 2, 1) + 1  # 1-indexed
+                    dep_num = (
+                        dep_position - np.where(dep_has_spouse, 2, 1) + 1
+                    )  # 1-indexed
 
                     dep_hh_idx = household_ids[dep_mask_indices]
                     for dep_slot in range(1, 12):
@@ -448,7 +450,10 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d
                 default_val = mapping.get("default", 0.0)
 
                 # Primary values
-                if isinstance(primary_source, str) and primary_source in year_data.columns:
+                if (
+                    isinstance(primary_source, str)
+                    and primary_source in year_data.columns
+                ):
                     prim_vals = year_data[primary_source].fillna(0).values.astype(float)
                     values[is_primary] = np.repeat(prim_vals, people_per_hh)[is_primary]
                 elif callable(primary_source):
@@ -462,7 +467,10 @@ def _process_person_data_for_year(self, year_data: pd.DataFrame, year: int) -> d
                     values[is_primary] = primary_source
 
                 # Spouse values
-                if isinstance(spouse_source, str) and spouse_source in year_data.columns:
+                if (
+                    isinstance(spouse_source, str)
+                    and spouse_source in year_data.columns
+                ):
                     sp_vals = year_data[spouse_source].fillna(0).values.astype(float)
                     values[is_spouse] = np.repeat(sp_vals, people_per_hh)[is_spouse]
                 elif callable(spouse_source):
@@ -578,8 +586,12 @@ def _apply_defaults_vectorized(self, df: pd.DataFrame) -> pd.DataFrame:
                             age_cols[col] = np.zeros(len(df), dtype=int)
                         mask = (dep_counter == dep_slot) & (count_series > 0)
                         age_cols[col] = np.where(mask, age_val, age_cols[col])
-                    dep_counter = np.where(count_series > 0, dep_counter + 1, dep_counter)
-                    count_series = np.where(count_series > 0, count_series - 1, count_series)
+                    dep_counter = np.where(
+                        count_series > 0, dep_counter + 1, dep_counter
+                    )
+                    count_series = np.where(
+                        count_series > 0, count_series - 1, count_series
+                    )
 
             for col, vals in age_cols.items():
                 if col not in df.columns:
@@ -731,10 +743,18 @@ def generate(self) -> None:
             # SPM unit data
             data["spm_unit_id"][year_int] = year_spm_unit_ids
             data["spm_unit_weight"][year_int] = np.ones(n_year_records)
-            data["snap"][year_int] = np.zeros(n_year_records)  # Set SNAP to 0 to match TAXSIM (which doesn't model SNAP)
-            data["tanf"][year_int] = np.zeros(n_year_records)  # Set TANF to 0 to match TAXSIM (which doesn't model TANF)
-            data["free_school_meals"][year_int] = np.zeros(n_year_records)  # Set free school meals to 0 to match TAXSIM (which doesn't model free school meals)
-            data["reduced_price_school_meals"][year_int] = np.zeros(n_year_records)  # Set reduced price school meals to 0 to match TAXSIM (which doesn't model reduced price school meals)
+            data["snap"][year_int] = np.zeros(
+                n_year_records
+            )  # Set SNAP to 0 to match TAXSIM (which doesn't model SNAP)
+            data["tanf"][year_int] = np.zeros(
+                n_year_records
+            )  # Set TANF to 0 to match TAXSIM (which doesn't model TANF)
+            data["free_school_meals"][year_int] = np.zeros(
+                n_year_records
+            )  # Set free school meals to 0 to match TAXSIM (which doesn't model free school meals)
+            data["reduced_price_school_meals"][year_int] = np.zeros(
+                n_year_records
+            )  # Set reduced price school meals to 0 to match TAXSIM (which doesn't model reduced price school meals)
 
             # Marital unit data
             data["marital_unit_id"][year_int] = year_marital_unit_ids
@@ -761,7 +781,11 @@ class PolicyEngineRunner(BaseTaxRunner):
     """
 
     def __init__(
-        self, input_df: pd.DataFrame, logs: bool = False, disable_salt: bool = False, assume_w2_wages: bool = False
+        self,
+        input_df: pd.DataFrame,
+        logs: bool = False,
+        disable_salt: bool = False,
+        assume_w2_wages: bool = False,
     ):
         super().__init__(input_df)
         self.logs = logs
@@ -917,14 +941,10 @@ def _calc_tax_unit(self, sim, var_name, period):
         values = np.array(sim.calculate(var_name, period=period))
         entity_key = var_obj.entity.key
         if entity_key == "person":
-            return np.array(
-                sim.map_result(values, "person", "tax_unit", how="sum")
-            )
+            return np.array(sim.map_result(values, "person", "tax_unit", how="sum"))
         elif entity_key != "tax_unit":
             # For household/spm_unit etc., project to person then sum to tax_unit
-            return np.array(
-                sim.map_result(values, entity_key, "tax_unit")
-            )
+            return np.array(sim.map_result(values, entity_key, "tax_unit"))
         return values
 
     def _extract_vectorized_results(
@@ -1024,8 +1044,7 @@ def _extract_vectorized_results(
                         )
                         unified_vars_list = (
                             all(
-                                sim.tax_benefit_system.variables.get(v)
-                                is not None
+                                sim.tax_benefit_system.variables.get(v) is not None
                                 for v in variables_list
                             )
                             if variables_list
@@ -1063,7 +1082,9 @@ def _extract_vectorized_results(
                                         ):
                                             continue
                                         try:
-                                            arr = self._calc_tax_unit(sim, resolved, year_str)
+                                            arr = self._calc_tax_unit(
+                                                sim, resolved, year_str
+                                            )
                                             var_sum += arr
                                         except Exception as e:
                                             if "does not exist" in str(e):
@@ -1081,7 +1102,9 @@ def _extract_vectorized_results(
                                     ):
                                         continue
                                     try:
-                                        arr = self._calc_tax_unit(sim, resolved, year_str)
+                                        arr = self._calc_tax_unit(
+                                            sim, resolved, year_str
+                                        )
                                         result_array[state_mask] = arr[state_mask]
                                     except Exception as e:
                                         if "does not exist" in str(e):
@@ -1126,9 +1149,7 @@ def _extract_vectorized_results(
             else:
                 fiitax_arr = self._calc_tax_unit(
                     sim, "income_tax", year_str
-                ) + self._calc_tax_unit(
-                    sim, "additional_medicare_tax", year_str
-                )
+                ) + self._calc_tax_unit(sim, "additional_medicare_tax", year_str)
                 columns["fiitax"] = np.round(fiitax_arr, 2)
 
             # Apply idtl filtering: mask out columns not requested by each row's idtl
diff --git a/policyengine_taxsim/runners/stitched_runner.py b/policyengine_taxsim/runners/stitched_runner.py
index 52d7dd5f..16194090 100644
--- a/policyengine_taxsim/runners/stitched_runner.py
+++ b/policyengine_taxsim/runners/stitched_runner.py
@@ -36,8 +36,7 @@ def run(self, show_progress: bool = True) -> pd.DataFrame:
         # Warn if PE-only kwargs are set but some rows go to TAXSIM
         if taxsim_mask.any():
             active_pe_kwargs = {
-                k for k, v in self._pe_kwargs.items()
-                if k in self._PE_ONLY_KWARGS and v
+                k for k, v in self._pe_kwargs.items() if k in self._PE_ONLY_KWARGS and v
             }
             if active_pe_kwargs:
                 logger.warning(
@@ -51,9 +50,7 @@ def run(self, show_progress: bool = True) -> pd.DataFrame:
         frames = []
 
         if pe_mask.any():
-            pe_runner = PolicyEngineRunner(
-                self.input_df[pe_mask], **self._pe_kwargs
-            )
+            pe_runner = PolicyEngineRunner(self.input_df[pe_mask], **self._pe_kwargs)
             frames.append(pe_runner.run(show_progress=show_progress))
 
         if taxsim_mask.any():
diff --git a/policyengine_taxsim/runners/taxsim_runner.py b/policyengine_taxsim/runners/taxsim_runner.py
index 4eab53b7..35cc0a1b 100644
--- a/policyengine_taxsim/runners/taxsim_runner.py
+++ b/policyengine_taxsim/runners/taxsim_runner.py
@@ -37,7 +37,7 @@ class TaxsimRunner(BaseTaxRunner):
         "age10",
         "age11",
     ]
-    
+
     # TAXSIM32 format columns for dependent counts by age bracket
     TAXSIM32_COLUMNS = [
         "dep13",  # Number of dependents under 13
@@ -71,7 +71,9 @@ class TaxsimRunner(BaseTaxRunner):
         "idtl",  # Output control
     ]
 
-    ALL_COLUMNS = REQUIRED_COLUMNS + DEPENDENT_AGE_COLUMNS + TAXSIM32_COLUMNS + INCOME_COLUMNS
+    ALL_COLUMNS = (
+        REQUIRED_COLUMNS + DEPENDENT_AGE_COLUMNS + TAXSIM32_COLUMNS + INCOME_COLUMNS
+    )
 
     def __init__(self, input_df: pd.DataFrame, taxsim_path: str = None):
         super().__init__(input_df)
@@ -98,15 +100,27 @@ def _detect_taxsim_executable(self) -> Path:
             # 1. Relative path (for running from repo during development)
             Path("resources") / "taxsimtest" / exe_name,
             # 2. Shared data location (for pip-installed packages)
-            Path(sys.prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name,
+            Path(sys.prefix)
+            / "share"
+            / "policyengine_taxsim"
+            / "taxsimtest"
+            / exe_name,
             # 3. User site-packages shared data
-            Path(sys.base_prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name,
+            Path(sys.base_prefix)
+            / "share"
+            / "policyengine_taxsim"
+            / "taxsimtest"
+            / exe_name,
         ]
 
         # Also check virtualenv locations
         if hasattr(sys, "real_prefix"):  # virtualenv
             search_paths.append(
-                Path(sys.real_prefix) / "share" / "policyengine_taxsim" / "taxsimtest" / exe_name
+                Path(sys.real_prefix)
+                / "share"
+                / "policyengine_taxsim"
+                / "taxsimtest"
+                / exe_name
             )
 
         for taxsim_path in search_paths:
@@ -154,7 +168,7 @@ def _format_input_for_taxsim(self, df: pd.DataFrame) -> pd.DataFrame:
 
             # Add only age columns for actual dependents (up to 11 max)
             for i in range(min(depx, 11)):
-                age_col = f"age{i+1}"
+                age_col = f"age{i + 1}"
                 dynamic_columns.append(age_col)
 
             # Add income columns (but exclude TAXSIM32 columns since TAXSIM-35 uses individual ages)
diff --git a/pyproject.toml b/pyproject.toml
index 1f4f1cb8..e46b71e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,7 @@ include = [
 dependencies = [
     "pytest",
     "pytest-cov",
+    "ruff>=0.9.0",
 ]
 
 [tool.hatch.envs.default.scripts]
diff --git a/tests/test_assume_w2_wages.py b/tests/test_assume_w2_wages.py
index c27fbb7c..fc6d7750 100644
--- a/tests/test_assume_w2_wages.py
+++ b/tests/test_assume_w2_wages.py
@@ -125,9 +125,7 @@ def test_scorp_above_threshold_qbid_changes(self):
             )
 
         # At least one record should show a meaningful difference
-        fiitax_diff = (
-            result_default["fiitax"].values - result_w2["fiitax"].values
-        )
+        fiitax_diff = result_default["fiitax"].values - result_w2["fiitax"].values
         assert fiitax_diff.max() > 100, (
             f"Expected meaningful QBID difference for high-income S-Corp filers, "
             f"but max fiitax reduction was only ${fiitax_diff.max():.2f}"
diff --git a/tests/test_cli_entry_point.py b/tests/test_cli_entry_point.py
index 7c5898b2..51169f0d 100644
--- a/tests/test_cli_entry_point.py
+++ b/tests/test_cli_entry_point.py
@@ -12,9 +12,7 @@
 def test_cli_entry_point_resolves():
     """The console script entry point must import and be callable."""
     dist = distribution("policyengine-taxsim")
-    console_scripts = [
-        ep for ep in dist.entry_points if ep.group == "console_scripts"
-    ]
+    console_scripts = [ep for ep in dist.entry_points if ep.group == "console_scripts"]
     assert len(console_scripts) == 1, "Expected exactly one console script"
     ep = console_scripts[0]
     assert ep.name == "policyengine-taxsim"
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 9faf7fb2..3c752e97 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -9,7 +9,6 @@
 
 
 class E2ETest(unittest.TestCase):
-
     def setUp(self) -> None:
         import importlib.resources as pkg_resources
         import policyengine_taxsim
@@ -236,7 +235,8 @@ def test_match_single_household_output(self):
         # Compare
         standard_output_cols = ["year", "fiitax", "siitax"]
         full_output_cols = standard_output_cols + [
-            "tfica" "v10",  # state_agi
+            "tfica",
+            "v10",  # state_agi
             "v13",
             "v18",
             "v19",
@@ -350,17 +349,20 @@ def test_match_joint_household_output(self):
             # "fiitax",
             "siitax",
         ]
-        full_output_cols = standard_output_cols + [
-            # "tfica"
-            # "v10",  # state_agi
-            # "v13",
-            # "v18",
-            # "v19",
-            # "v26",
-            # "v28",
-            # "v34",
-            # "v45",
-        ]
+        full_output_cols = (
+            standard_output_cols
+            + [
+                # "tfica"
+                # "v10",  # state_agi
+                # "v13",
+                # "v18",
+                # "v19",
+                # "v26",
+                # "v28",
+                # "v34",
+                # "v45",
+            ]
+        )
 
         # Determine which columns to check based on idtl value
         columns_to_check = (
@@ -461,17 +463,20 @@ def test_match_household_with_dependent_output(self):
             # "fiitax",
             "siitax",
         ]
-        full_output_cols = standard_output_cols + [
-            # "tfica"
-            # "v10",  # state_agi
-            # "v13",
-            # "v18",
-            # "v19",
-            # "v26",
-            # "v28",
-            # "v34",
-            # "v45",
-        ]
+        full_output_cols = (
+            standard_output_cols
+            + [
+                # "tfica"
+                # "v10",  # state_agi
+                # "v13",
+                # "v18",
+                # "v19",
+                # "v26",
+                # "v28",
+                # "v34",
+                # "v45",
+            ]
+        )
 
         # Determine which columns to check based on idtl value
         columns_to_check = (
@@ -581,17 +586,20 @@ def test_match_household_with_dependent_single_parent_output(self):
             # "fiitax",
             "siitax",
         ]
-        full_output_cols = standard_output_cols + [
-            # "tfica"
-            # "v10",  # state_agi
-            # "v13",
-            # "v18",
-            # "v19",
-            # "v26",
-            # "v28",
-            # "v34",
-            # "v45",
-        ]
+        full_output_cols = (
+            standard_output_cols
+            + [
+                # "tfica"
+                # "v10",  # state_agi
+                # "v13",
+                # "v18",
+                # "v19",
+                # "v26",
+                # "v28",
+                # "v34",
+                # "v45",
+            ]
+        )
 
         # Determine which columns to check based on idtl value
         columns_to_check = (
diff --git a/tests/test_mappers.py b/tests/test_mappers.py
index 1d23c93d..a24792a1 100644
--- a/tests/test_mappers.py
+++ b/tests/test_mappers.py
@@ -191,7 +191,6 @@ def test_export_single_household(sample_taxsim_input):
             "you": {
                 "age": {"2021": 35},
                 "employment_income": {"2021": 50000},
-                
                 "is_tax_unit_head": {"2021": True},
             }
         },
@@ -412,7 +411,7 @@ def test_household_with_dependent(sample_taxsim_input_for_household_with_depende
                     "your first dependent",
                     "your second dependent",
                 ],
-                                    "pa_use_tax": {"2023": 0},
+                "pa_use_tax": {"2023": 0},
             }
         },
     }
@@ -492,7 +491,7 @@ def test_household_with_dependent_single_parent(
         "tax_units": {
             "your tax unit": {
                 "members": ["you", "your first dependent", "your second dependent"],
-                                    "pa_use_tax": {"2023": 0},
+                "pa_use_tax": {"2023": 0},
             }
         },
     }
diff --git a/tests/test_multi_state.py b/tests/test_multi_state.py
index 66ab20b9..ef60fdf6 100644
--- a/tests/test_multi_state.py
+++ b/tests/test_multi_state.py
@@ -81,11 +81,46 @@ def _mixed_contiguous_and_noncontiguous():
     """Mix of contiguous (KY, CA, NY) and non-contiguous (AK, HI) states."""
     return pd.DataFrame(
         [
-            {"taxsimid": 1, "year": 2023, "state": 18, "mstat": 1, "pwages": 50000, "idtl": 0},  # KY
-            {"taxsimid": 2, "year": 2023, "state": 5, "mstat": 1, "pwages": 50000, "idtl": 0},   # CA
-            {"taxsimid": 3, "year": 2023, "state": 33, "mstat": 1, "pwages": 50000, "idtl": 0},  # NY
-            {"taxsimid": 4, "year": 2023, "state": 2, "mstat": 1, "pwages": 50000, "idtl": 0},   # AK
-            {"taxsimid": 5, "year": 2023, "state": 12, "mstat": 1, "pwages": 50000, "idtl": 0},  # HI
+            {
+                "taxsimid": 1,
+                "year": 2023,
+                "state": 18,
+                "mstat": 1,
+                "pwages": 50000,
+                "idtl": 0,
+            },  # KY
+            {
+                "taxsimid": 2,
+                "year": 2023,
+                "state": 5,
+                "mstat": 1,
+                "pwages": 50000,
+                "idtl": 0,
+            },  # CA
+            {
+                "taxsimid": 3,
+                "year": 2023,
+                "state": 33,
+                "mstat": 1,
+                "pwages": 50000,
+                "idtl": 0,
+            },  # NY
+            {
+                "taxsimid": 4,
+                "year": 2023,
+                "state": 2,
+                "mstat": 1,
+                "pwages": 50000,
+                "idtl": 0,
+            },  # AK
+            {
+                "taxsimid": 5,
+                "year": 2023,
+                "state": 12,
+                "mstat": 1,
+                "pwages": 50000,
+                "idtl": 0,
+            },  # HI
         ]
     )
 
diff --git a/tests/test_performance.py b/tests/test_performance.py
index fd0b01be..c3838d20 100644
--- a/tests/test_performance.py
+++ b/tests/test_performance.py
@@ -103,9 +103,7 @@ def test_generate_does_not_scale_linearly(self):
         times = {}
         for n in [100, 500]:
             records = _make_synthetic_records(n, seed=42)
-            runner = PolicyEngineRunner(
-                records.copy(), logs=False, disable_salt=True
-            )
+            runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True)
             runner.input_df["year"] = runner.input_df["year"].apply(
                 lambda x: int(float(x))
             )
@@ -133,9 +131,7 @@ def test_extract_builds_dataframe_without_row_loop(self):
         this would be slow at higher record counts.
         """
         records = _make_synthetic_records(200, seed=55)
-        runner = PolicyEngineRunner(
-            records.copy(), logs=False, disable_salt=True
-        )
+        runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True)
 
         orig_extract = runner._extract_vectorized_results.__func__
         extract_time = {}
@@ -146,9 +142,7 @@ def timed_extract(self_runner, sim, input_df):
             extract_time["t"] = time.time() - t0
             return result
 
-        runner._extract_vectorized_results = types.MethodType(
-            timed_extract, runner
-        )
+        runner._extract_vectorized_results = types.MethodType(timed_extract, runner)
         result = runner.run(show_progress=False)
 
         assert len(result) == 200
@@ -172,9 +166,7 @@ def test_benchmark_500_records(self):
         elapsed = time.time() - start
 
         assert len(result) == 500
-        assert elapsed < 60, (
-            f"500 records took {elapsed:.1f}s, expected < 60s"
-        )
+        assert elapsed < 60, f"500 records took {elapsed:.1f}s, expected < 60s"
         print(f"\nBenchmark: 500 records in {elapsed:.1f}s")
 
     def test_benchmark_cps_like(self):
@@ -190,29 +182,31 @@ def test_benchmark_cps_like(self):
         mstat = rng.choice([1, 2], size=n, p=[0.55, 0.45])
         depx = rng.choice([0, 1, 2, 3, 4], size=n, p=[0.35, 0.25, 0.2, 0.15, 0.05])
 
-        records = pd.DataFrame({
-            "taxsimid": np.arange(1, n + 1),
-            "year": 2023,
-            "state": rng.choice(all_states, size=n),
-            "mstat": mstat,
-            "depx": depx,
-            "page": rng.randint(20, 75, size=n),
-            "sage": np.where(mstat == 2, rng.randint(20, 75, size=n), 0),
-            "pwages": rng.lognormal(10.5, 1.0, size=n).round(2),
-            "swages": np.where(
-                mstat == 2, rng.lognormal(10.0, 1.2, size=n).round(2), 0
-            ),
-            "dividends": np.where(
-                rng.random(n) < 0.15, rng.lognormal(8, 2, size=n).round(2), 0
-            ),
-            "intrec": np.where(
-                rng.random(n) < 0.25, rng.lognormal(7, 1.5, size=n).round(2), 0
-            ),
-            "pensions": np.where(
-                rng.random(n) < 0.20, rng.lognormal(9.5, 1.0, size=n).round(2), 0
-            ),
-            "idtl": 2,
-        })
+        records = pd.DataFrame(
+            {
+                "taxsimid": np.arange(1, n + 1),
+                "year": 2023,
+                "state": rng.choice(all_states, size=n),
+                "mstat": mstat,
+                "depx": depx,
+                "page": rng.randint(20, 75, size=n),
+                "sage": np.where(mstat == 2, rng.randint(20, 75, size=n), 0),
+                "pwages": rng.lognormal(10.5, 1.0, size=n).round(2),
+                "swages": np.where(
+                    mstat == 2, rng.lognormal(10.0, 1.2, size=n).round(2), 0
+                ),
+                "dividends": np.where(
+                    rng.random(n) < 0.15, rng.lognormal(8, 2, size=n).round(2), 0
+                ),
+                "intrec": np.where(
+                    rng.random(n) < 0.25, rng.lognormal(7, 1.5, size=n).round(2), 0
+                ),
+                "pensions": np.where(
+                    rng.random(n) < 0.20, rng.lognormal(9.5, 1.0, size=n).round(2), 0
+                ),
+                "idtl": 2,
+            }
+        )
 
         # Add dependent ages
         for i in range(1, 5):
@@ -229,11 +223,11 @@ def test_benchmark_cps_like(self):
         elapsed = time.time() - start
 
         assert len(result) == n
-        print(f"\nBenchmark (CPS-like): {n} records, {records['state'].nunique()} states, idtl=2")
-        print(f"  Total: {elapsed:.1f}s")
-        assert elapsed < 120, (
-            f"CPS-like benchmark took {elapsed:.1f}s, expected < 120s"
+        print(
+            f"\nBenchmark (CPS-like): {n} records, {records['state'].nunique()} states, idtl=2"
         )
+        print(f"  Total: {elapsed:.1f}s")
+        assert elapsed < 120, f"CPS-like benchmark took {elapsed:.1f}s, expected < 120s"
 
 
 class TestStateVariableEfficiency:
@@ -249,23 +243,23 @@ def test_extract_does_not_iterate_states(self):
         rng = np.random.RandomState(88)
         all_states = list(range(1, 52))
         n = 50
-        records = pd.DataFrame({
-            "taxsimid": np.arange(1, n + 1),
-            "year": 2023,
-            "state": rng.choice(all_states, size=n),
-            "mstat": 1,
-            "depx": 0,
-            "page": 40,
-            "sage": 0,
-            "pwages": rng.uniform(30000, 100000, size=n).round(2),
-            "swages": 0.0,
-            "idtl": 2,  # full output to trigger all state vars
-        })
-
-        runner = PolicyEngineRunner(
-            records.copy(), logs=False, disable_salt=True
+        records = pd.DataFrame(
+            {
+                "taxsimid": np.arange(1, n + 1),
+                "year": 2023,
+                "state": rng.choice(all_states, size=n),
+                "mstat": 1,
+                "depx": 0,
+                "page": 40,
+                "sage": 0,
+                "pwages": rng.uniform(30000, 100000, size=n).round(2),
+                "swages": 0.0,
+                "idtl": 2,  # full output to trigger all state vars
+            }
         )
 
+        runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True)
+
         # Count _calc_tax_unit calls
         orig_calc_tu = runner._calc_tax_unit.__func__
         calc_count = {"n": 0}
@@ -291,21 +285,21 @@ def test_state_variable_values_match(self):
         Verify that unified state variable results match expected values.
         state_income_tax (already unified) should match siitax column.
         """
-        records = pd.DataFrame({
-            "taxsimid": [1, 2],
-            "year": 2023,
-            "state": [5, 33],  # CA, NY
-            "mstat": 1,
-            "depx": 0,
-            "page": 40,
-            "sage": 0,
-            "pwages": [80000.0, 60000.0],
-            "swages": 0.0,
-            "idtl": 2,
-        })
-        runner = PolicyEngineRunner(
-            records.copy(), logs=False, disable_salt=True
+        records = pd.DataFrame(
+            {
+                "taxsimid": [1, 2],
+                "year": 2023,
+                "state": [5, 33],  # CA, NY
+                "mstat": 1,
+                "depx": 0,
+                "page": 40,
+                "sage": 0,
+                "pwages": [80000.0, 60000.0],
+                "swages": 0.0,
+                "idtl": 2,
+            }
         )
+        runner = PolicyEngineRunner(records.copy(), logs=False, disable_salt=True)
         result = runner.run(show_progress=False)
 
         # siitax should be nonzero for CA and NY
diff --git a/tests/test_stitched_runner.py b/tests/test_stitched_runner.py
index be6e6fe8..7ce2bdfe 100644
--- a/tests/test_stitched_runner.py
+++ b/tests/test_stitched_runner.py
@@ -68,12 +68,8 @@ def test_all_taxsim_years(self, MockPE, MockTaxsim):
     def test_mixed_years(self, MockPE, MockTaxsim):
         """Mixed years split correctly between engines."""
         df = _make_input([(1, 2019), (2, 2022), (3, 1990), (4, 2025)])
-        MockPE.return_value.run.return_value = _make_result(
-            [(2, 2022), (4, 2025)]
-        )
-        MockTaxsim.return_value.run.return_value = _make_result(
-            [(1, 2019), (3, 1990)]
-        )
+        MockPE.return_value.run.return_value = _make_result([(2, 2022), (4, 2025)])
+        MockTaxsim.return_value.run.return_value = _make_result([(1, 2019), (3, 1990)])
 
         runner = StitchedRunner(df)
         runner.run(show_progress=False)
@@ -135,12 +131,8 @@ class TestOutputOrdering:
     @patch("policyengine_taxsim.runners.stitched_runner.PolicyEngineRunner")
     def test_order_preserved(self, MockPE, MockTaxsim):
         df = _make_input([(3, 2019), (1, 2023), (4, 1990), (2, 2025)])
-        MockPE.return_value.run.return_value = _make_result(
-            [(1, 2023), (2, 2025)]
-        )
-        MockTaxsim.return_value.run.return_value = _make_result(
-            [(3, 2019), (4, 1990)]
-        )
+        MockPE.return_value.run.return_value = _make_result([(1, 2023), (2, 2025)])
+        MockTaxsim.return_value.run.return_value = _make_result([(3, 2019), (4, 1990)])
 
         runner = StitchedRunner(df)
         result = runner.run(show_progress=False)
@@ -256,6 +248,7 @@ def test_pe_kwargs_warning_for_taxsim_rows(self, MockPE, MockTaxsim, caplog):
         MockTaxsim.return_value.run.return_value = _make_result([(1, 2020)])
 
         import logging
+
         with caplog.at_level(logging.WARNING):
             runner = StitchedRunner(df, logs=True, disable_salt=True)
             runner.run(show_progress=False)