PolicyEngine · MaxGhenis · Mar 7, 2026 · Mar 7, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -7,6 +7,15 @@ on:
     branches: [main]
 
 jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install ruff
+        run: pip install "ruff>=0.9.0"  # quoted: a bare >= is parsed by the shell as a redirect to a file named "=0.9.0"
+      - name: Check formatting
+        run: ruff format --check .
+
   test:
     runs-on: ${{ matrix.os }}
     strategy:

diff --git a/Makefile b/Makefile
@@ -1,4 +1,7 @@
 
+format:
+	ruff format .
+
 changelog:
 	python .github/bump_version.py
 	towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
diff --git a/changelog.d/add-ruff.added.md b/changelog.d/add-ruff.added.md
@@ -0,0 +1 @@
+Added ruff formatter with CI check and Makefile target.
diff --git a/policyengine_taxsim/cli.py b/policyengine_taxsim/cli.py
@@ -23,7 +23,10 @@
     from policyengine_taxsim.comparison.statistics import ComparisonStatistics
     from policyengine_taxsim.core.yaml_generator import generate_pe_tests_yaml
     from policyengine_taxsim.core.input_mapper import form_household_situation
-    from policyengine_taxsim.core.utils import get_state_code, convert_taxsim32_dependents
+    from policyengine_taxsim.core.utils import (
+        get_state_code,
+        convert_taxsim32_dependents,
+    )
 
 
 def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):
@@ -66,7 +69,7 @@ def _generate_yaml_files(input_df: pd.DataFrame, results_df: pd.DataFrame):
 
             # Generate YAML file
             # Use taxsimid from row if available, otherwise use index + 1
-            taxsim_id = int(row['taxsimid']) if 'taxsimid' in row else idx + 1
+            taxsim_id = int(row["taxsimid"]) if "taxsimid" in row else idx + 1
             yaml_filename = f"taxsim_record_{taxsim_id}_{year}.yaml"
             generate_pe_tests_yaml(household, outputs, yaml_filename, logs=True)
 
@@ -151,7 +154,10 @@ def cli(ctx, logs, disable_salt, sample):
     "--disable-salt", is_flag=True, default=False, help="Set SALT Deduction to 0"
 )
 @click.option(
-    "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
+    "--assume-w2-wages",
+    is_flag=True,
+    default=False,
+    help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
 )
 @click.option("--sample", type=int, help="Sample N records from input")
 def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample):
@@ -171,7 +177,9 @@ def policyengine(input_file, output, logs, disable_salt, assume_w2_wages, sample
             df = df.sample(n=sample, random_state=42)
 
         # Use StitchedRunner: routes to PE (2021+) or TAXSIM (pre-2021)
-        runner = StitchedRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
+        runner = StitchedRunner(
+            df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
+        )
         results_df = runner.run(show_progress=True)
 
         # Use the runner's input_df which has taxsimid (auto-assigned if needed)
@@ -241,7 +249,10 @@ def taxsim(input_file, output, sample, taxsim_path):
 )
 @click.option("--logs", is_flag=True, help="Generate PolicyEngine YAML logs")
 @click.option(
-    "--assume-w2-wages", is_flag=True, default=False, help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)"
+    "--assume-w2-wages",
+    is_flag=True,
+    default=False,
+    help="Assume large W-2 wages for QBID (aligns with TAXSIM S-Corp handling)",
 )
 def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_wages):
     """Compare PolicyEngine and TAXSIM results"""
@@ -274,7 +285,9 @@ def compare(input_file, sample, output_dir, year, disable_salt, logs, assume_w2_
 
         # Run PolicyEngine
         click.echo("Running PolicyEngine...")
-        pe_runner = PolicyEngineRunner(df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages)
+        pe_runner = PolicyEngineRunner(
+            df, logs=logs, disable_salt=disable_salt, assume_w2_wages=assume_w2_wages
+        )
         pe_results = pe_runner.run()
 
         # Use the runner's input_df which has taxsimid (auto-assigned if needed)

diff --git a/policyengine_taxsim/core/input_mapper.py b/policyengine_taxsim/core/input_mapper.py
@@ -185,7 +185,11 @@ def form_household_situation(year, state, taxsim_vars):
     for i in range(1, depx + 1):
         dep_name = f"your {get_ordinal(i)} dependent"
         people[dep_name] = {
-            "age": {str(year): int(taxsim_vars.get(f"age{i}", 10)) if taxsim_vars.get(f"age{i}") is not None else 10},
+            "age": {
+                str(year): int(taxsim_vars.get(f"age{i}", 10))
+                if taxsim_vars.get(f"age{i}") is not None
+                else 10
+            },
             "employment_income": {str(year): 0},
             "is_tax_unit_dependent": {str(year): True},
             "is_tax_unit_spouse": {str(year): False},
@@ -195,38 +199,44 @@ def form_household_situation(year, state, taxsim_vars):
     household_situation = add_additional_units(
         state.lower(), year, household_situation, taxsim_vars
     )
-    
+
     # Explicitly set SSI to 0 for all people to prevent PolicyEngine from imputing SSI benefits
     # TAXSIM does not model SSI, so we need to ensure it's not automatically calculated
     for person_name in household_situation["people"]:
         household_situation["people"][person_name]["ssi"] = {str(year): 0}
-    
+
     # Explicitly set person-level benefit programs to 0 to prevent PolicyEngine from imputing these benefits
     # TAXSIM does not model these programs, so we need to ensure they're not automatically calculated
     for person_name in household_situation["people"]:
         household_situation["people"][person_name]["head_start"] = {str(year): 0}
         household_situation["people"][person_name]["early_head_start"] = {str(year): 0}
-        household_situation["people"][person_name]["commodity_supplemental_food_program"] = {str(year): 0}
-
+        household_situation["people"][person_name][
+            "commodity_supplemental_food_program"
+        ] = {str(year): 0}
+
     # Explicitly set SNAP to 0 for all SPM units to prevent PolicyEngine from imputing SNAP benefits
     # TAXSIM does not model SNAP, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
         household_situation["spm_units"][spm_unit_name]["snap"] = {str(year): 0}
-    
+
     # Explicitly set TANF to 0 for all SPM units to prevent PolicyEngine from imputing TANF benefits
     # TAXSIM does not model TANF, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
         household_situation["spm_units"][spm_unit_name]["tanf"] = {str(year): 0}
-    
+
     # Explicitly set free_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing free school meal benefits
     # TAXSIM does not model free school meals, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
-        household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {str(year): 0}
-
+        household_situation["spm_units"][spm_unit_name]["free_school_meals"] = {
+            str(year): 0
+        }
+
     # Explicitly set reduced_price_school_meals to 0 for all SPM units to prevent PolicyEngine from imputing reduced price school meal benefits
     # TAXSIM does not model reduced price school meals, so we need to ensure it's not automatically calculated
     for spm_unit_name in household_situation["spm_units"]:
-        household_situation["spm_units"][spm_unit_name]["reduced_price_school_meals"] = {str(year): 0}
+        household_situation["spm_units"][spm_unit_name][
+            "reduced_price_school_meals"
+        ] = {str(year): 0}
 
     return household_situation
 
@@ -308,7 +318,7 @@ def generate_household(taxsim_vars):
 
     # Convert TAXSIM32 dependent format if present
     taxsim_vars = convert_taxsim32_dependents(taxsim_vars)
-    
+
     taxsim_vars = set_taxsim_defaults(taxsim_vars, int(year))
 
     state = get_state_code(taxsim_vars["state"])

diff --git a/policyengine_taxsim/core/utils.py b/policyengine_taxsim/core/utils.py
@@ -97,112 +97,119 @@ def to_roundedup_number(value):
 
 def convert_taxsim32_dependents(taxsim_vars):
     """
-    Convert TAXSIM32 dependent count format (dep13, dep17, dep18) 
+    Convert TAXSIM32 dependent count format (dep13, dep17, dep18)
     to individual age format (age1, age2, etc.).
-    
+
     TAXSIM32 format uses cumulative counts:
     - dep13: Number of dependents under 13
     - dep17: Number of dependents under 17 (includes those under 13)
     - dep18: Number of dependents under 18 (includes those under 17 and 13)
-    
+
     This function infers individual ages based on these counts.
     If depx exceeds dep18, additional dependents are assigned age 21.
-    
+
     Args:
         taxsim_vars (dict): Dictionary containing TAXSIM input variables
-        
+
     Returns:
         dict: Updated dictionary with age1, age2, etc. fields added
     """
     # Check if we have the TAXSIM32 format fields present
     # Just check for presence, not values, since all three could be 0 with depx > 0 (all dependents 18+)
-    has_taxsim32_fields = 'dep13' in taxsim_vars or 'dep17' in taxsim_vars or 'dep18' in taxsim_vars
-
+    has_taxsim32_fields = (
+        "dep13" in taxsim_vars or "dep17" in taxsim_vars or "dep18" in taxsim_vars
+    )
+
     # Check if we already have individual age fields explicitly set (including 0 for newborns)
     # We consider age fields as explicitly set if they exist in the input
     has_individual_age_fields = any(
-        f'age{i}' in taxsim_vars and taxsim_vars[f'age{i}'] is not None
+        f"age{i}" in taxsim_vars and taxsim_vars[f"age{i}"] is not None
         for i in range(1, 12)
     )
-    
+
     # Get depx value
-    depx = int(taxsim_vars.get('depx', 0) or 0)
-    
+    depx = int(taxsim_vars.get("depx", 0) or 0)
+
     # Only convert if:
     # 1. We have TAXSIM32 fields (dep13/17/18) with meaningful values
     # 2. AND we don't already have individual age fields set
     # This ensures we only convert when TAXSIM32 format is actually being used
     if has_taxsim32_fields and not has_individual_age_fields:
-        dep13 = int(taxsim_vars.get('dep13', 0) or 0)
-        dep17 = int(taxsim_vars.get('dep17', 0) or 0)
-        dep18 = int(taxsim_vars.get('dep18', 0) or 0)
-        depx = int(taxsim_vars.get('depx', 0) or 0)
-        
+        dep13 = int(taxsim_vars.get("dep13", 0) or 0)
+        dep17 = int(taxsim_vars.get("dep17", 0) or 0)
+        dep18 = int(taxsim_vars.get("dep18", 0) or 0)
+        depx = int(taxsim_vars.get("depx", 0) or 0)
+
         # Calculate the number of dependents in each age bracket
         # Note: These are cumulative, so we need to subtract to get individual counts
         num_under_13 = dep13
         num_13_to_16 = dep17 - dep13  # Those under 17 but not under 13
         num_17 = dep18 - dep17  # Those under 18 but not under 17 (i.e., exactly 17)
-        
+
         # Calculate number of dependents 18 or older
         num_18_or_older = 0
         if depx > dep18:
             num_18_or_older = depx - dep18  # These will be assigned age 21
-        
+
         # Set depx to the total number of dependents if not already set
-        if 'depx' not in taxsim_vars or taxsim_vars['depx'] is None:
-            taxsim_vars['depx'] = max(depx, dep18)
+        if "depx" not in taxsim_vars or taxsim_vars["depx"] is None:
+            taxsim_vars["depx"] = max(depx, dep18)
         else:
             # Ensure depx is at least as large as dep18
-            taxsim_vars['depx'] = max(int(taxsim_vars['depx']), dep18)
-        
+            taxsim_vars["depx"] = max(int(taxsim_vars["depx"]), dep18)
+
         # Generate individual ages based on the counts
         # We'll use typical ages for each bracket
         dep_counter = 1
-        
+
         # Add dependents under 13 (use age 10 as default)
         for _ in range(num_under_13):
             if dep_counter <= 11:  # TAXSIM supports up to 11 dependents
-                taxsim_vars[f'age{dep_counter}'] = 10
+                taxsim_vars[f"age{dep_counter}"] = 10
                 dep_counter += 1
-        
+
         # Add dependents aged 13-16 (use age 15 as default)
         for _ in range(num_13_to_16):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 15
+                taxsim_vars[f"age{dep_counter}"] = 15
                 dep_counter += 1
-        
+
         # Add dependents aged 17 (use age 17)
         for _ in range(num_17):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 17
+                taxsim_vars[f"age{dep_counter}"] = 17
                 dep_counter += 1
-        
+
         # Add dependents aged 18 or older (use age 21 as default for adult dependents)
         for _ in range(num_18_or_older):
             if dep_counter <= 11:
-                taxsim_vars[f'age{dep_counter}'] = 21
+                taxsim_vars[f"age{dep_counter}"] = 21
                 dep_counter += 1
-    
+
     # Handle NaN and age 0 inputs - default them to age 10
     # Check all age fields up to age11
     for i in range(1, 12):
-        age_field = f'age{i}'
+        age_field = f"age{i}"
         if age_field in taxsim_vars:
             age_value = taxsim_vars[age_field]
             # Check for NaN (using numpy's isnan if it's a numpy type, or math.isnan for regular floats)
             is_nan = False
             try:
                 import math
+
                 if isinstance(age_value, (float, np.floating)):
-                    is_nan = math.isnan(age_value) if not isinstance(age_value, np.ndarray) else np.isnan(age_value)
+                    is_nan = (
+                        math.isnan(age_value)
+                        if not isinstance(age_value, np.ndarray)
+                        else np.isnan(age_value)
+                    )
             except (TypeError, ValueError):
                 pass
-            
+
             # If age is NaN or 0, default to 10
             if is_nan or age_value == 0 or age_value == 0.0:
                 taxsim_vars[age_field] = 10
-    
+
     return taxsim_vars
 
 

diff --git a/policyengine_taxsim/core/yaml_generator.py b/policyengine_taxsim/core/yaml_generator.py
@@ -164,8 +164,12 @@ def generate_yaml(
                 "ssi": person_data.get("ssi", {}).get(year_str, 0),
                 "wic": person_data.get("wic", {}).get(year_str, 0),
                 "head_start": person_data.get("head_start", {}).get(year_str, 0),
-                "early_head_start": person_data.get("early_head_start", {}).get(year_str, 0),
-                "commodity_supplemental_food_program": person_data.get("commodity_supplemental_food_program", {}).get(year_str, 0),
+                "early_head_start": person_data.get("early_head_start", {}).get(
+                    year_str, 0
+                ),
+                "commodity_supplemental_food_program": person_data.get(
+                    "commodity_supplemental_food_program", {}
+                ).get(year_str, 0),
             }
 
             # Add optional fields only if they have non-zero values

diff --git a/policyengine_taxsim/runners/base_runner.py b/policyengine_taxsim/runners/base_runner.py
@@ -32,9 +32,7 @@ def _validate_input(self):
         """Validate that input data has required structure"""
         # year is required by TAXSIM (1960-2023) and PolicyEngine
         if "year" not in self.input_df.columns:
-            raise ValueError(
-                "Input data must contain a 'year' column"
-            )
+            raise ValueError("Input data must contain a 'year' column")
 
         # Auto-assign taxsimid if not present
         if "taxsimid" not in self.input_df.columns:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Added ruff formatter with CI check and Makefile target.