diff --git a/analysis/all_reforms_hf_2026.py b/analysis/all_reforms_hf_2026.py new file mode 100644 index 0000000..2e0faa5 --- /dev/null +++ b/analysis/all_reforms_hf_2026.py @@ -0,0 +1,74 @@ +import sys +import os +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from policyengine_us import Microsimulation +from reforms import REFORMS +from tqdm import tqdm + +print('='*80) +print('ALL REFORMS - HF TEST 2026 DATASET') +print('='*80) +print() + +# Load baseline once +print('Loading baseline...') +baseline = Microsimulation(dataset='hf://policyengine/test/2026.h5') +baseline_tax = baseline.calculate('income_tax', period=2026, map_to='household') +print(f'✓ Baseline: ${baseline_tax.sum() / 1e9:.1f}B') +print() + +results = [] + +for reform_id, reform_config in tqdm(REFORMS.items(), desc='Processing reforms'): + reform_name = reform_config['name'] + reform_func = reform_config['func'] + + print(f'\nProcessing {reform_id}: {reform_name[:50]}...') + + try: + reform = reform_func() + reform_sim = Microsimulation(dataset='hf://policyengine/test/2026.h5', reform=reform) + reform_tax = reform_sim.calculate('income_tax', period=2026, map_to='household') + + impact = (reform_tax.sum() - baseline_tax.sum()) / 1e9 + + results.append({ + 'Reform ID': reform_id, + 'Reform Name': reform_name, + 'Revenue Impact ($B)': impact # Keep full precision + }) + + print(f' ✓ Impact: ${impact:.1f}B') + + except Exception as e: + print(f' ✗ Error: {e}') + results.append({ + 'Reform ID': reform_id, + 'Reform Name': reform_name, + 'Revenue Impact ($B)': 'ERROR' + }) + +print() +print('='*80) +print('SUMMARY OF ALL REFORMS (2026)') +print('='*80) +print() + +import pandas as pd +df = pd.DataFrame(results) +print(df.to_string(index=False)) +print() + +# Save to CSV +output_file = '../data/all_reforms_hf_test_2026.csv' +df.to_csv(output_file, index=False) +print(f'✓ Results saved to: {output_file}') 
+print() + +print('='*80) +print('Dataset: hf://policyengine/test/2026.h5') +print('='*80) diff --git a/analysis/check_age_distribution_2054.py b/analysis/check_age_distribution_2054.py new file mode 100644 index 0000000..7a4a6c5 --- /dev/null +++ b/analysis/check_age_distribution_2054.py @@ -0,0 +1,88 @@ +""" +Check age distribution in the 2054 dataset +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation + +print("="*80) +print("AGE DISTRIBUTION ANALYSIS - 2054 Dataset") +print("="*80) +print() + +# Load the 2054 dataset +print("Loading 2054 dataset...") +sim = Microsimulation(dataset="hf://policyengine/test/2054.h5") +print("✓ Dataset loaded") +print() + +# Get age and person weight +age = sim.calculate("age", period=2054) + +# Get person weight - need to check if this variable exists +try: + person_weight = sim.calculate("person_weight", period=2054) +except: + # If person_weight doesn't exist, use household_weight mapped to persons + print("Note: Using household weight mapped to persons") + person_weight = sim.calculate("household_weight", period=2054, map_to="person") + +# Filter valid ages and weights +valid = (age > 0) & (person_weight > 0) & np.isfinite(age) & np.isfinite(person_weight) +age = age[valid] +person_weight = person_weight[valid] + +print(f"Total people in sample: {len(age):,}") +print(f"Total weighted population: {person_weight.sum():,.0f}") +print() + +# Calculate age statistics +print("Age Distribution:") +print("-" * 60) + +# Age groups +age_groups = [ + ("Under 18", 0, 17), + ("18-24", 18, 24), + ("25-34", 25, 34), + ("35-44", 35, 44), + ("45-54", 45, 54), + ("55-64", 55, 64), + ("65-74", 65, 74), + ("75-84", 75, 84), + ("85+", 85, 150) +] + +for group_name, min_age, max_age in age_groups: + mask = (age >= min_age) & (age <= 
max_age) + count = mask.sum() + weighted = person_weight[mask].sum() + pct = (weighted / person_weight.sum()) * 100 + print(f"{group_name:12s}: {count:>8,} people ({weighted:>15,.0f} weighted, {pct:>5.1f}%)") + +print() +print("="*60) + +# People over 65 +over_65 = age >= 65 +count_over_65 = over_65.sum() +weighted_over_65 = person_weight[over_65].sum() +pct_over_65 = (weighted_over_65 / person_weight.sum()) * 100 + +print(f"People aged 65+:") +print(f" Sample count: {count_over_65:,}") +print(f" Weighted count: {weighted_over_65:,.0f}") +print(f" Percentage of population: {pct_over_65:.1f}%") + +print() +print("✓ Analysis complete!") diff --git a/analysis/check_available_datasets.py b/analysis/check_available_datasets.py new file mode 100644 index 0000000..9b4856c --- /dev/null +++ b/analysis/check_available_datasets.py @@ -0,0 +1,84 @@ +""" +Check available PolicyEngine US datasets +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from policyengine_us import Microsimulation + +print("="*80) +print("AVAILABLE POLICYENGINE US DATASETS") +print("="*80) +print() + +# Try to get dataset list +try: + datasets = list(Microsimulation.datasets.keys()) + print(f"Total datasets available: {len(datasets)}") + print() + + # Group by type + enhanced_cps = [d for d in datasets if 'enhanced_cps' in d] + cps = [d for d in datasets if d.startswith('cps_') and 'enhanced' not in d] + test = [d for d in datasets if 'test' in d or 'hf://' in d] + other = [d for d in datasets if d not in enhanced_cps + cps + test] + + print("Enhanced CPS datasets (recommended):") + for d in sorted(enhanced_cps): + print(f" - {d}") + + print() + print("Raw CPS datasets:") + for d in sorted(cps): + print(f" - {d}") + + if test: + print() + print("Test/Projection datasets:") + for d in sorted(test): + print(f" - {d}") + + if other: + print() + print("Other datasets:") + 
for d in sorted(other): + print(f" - {d}") + +except Exception as e: + print(f"Could not retrieve dataset list: {e}") + print() + print("Common datasets you can try:") + print(" - enhanced_cps_2026") + print(" - enhanced_cps_2027") + print(" - enhanced_cps_2028") + print(" - enhanced_cps_2029") + print(" - enhanced_cps_2030") + print(" - enhanced_cps_2031") + print(" - enhanced_cps_2032") + print(" - enhanced_cps_2033") + print(" - enhanced_cps_2034") + +print() +print("="*80) +print() + +# Test loading enhanced_cps_2034 +print("Testing enhanced_cps_2034...") +try: + sim = Microsimulation(dataset="enhanced_cps_2034") + hh_weight = sim.calculate("household_weight", period=2034) + print(f"✓ enhanced_cps_2034 loaded successfully!") + print(f" Households: {len(hh_weight):,}") + print(f" Weighted: {hh_weight.sum():,.0f}") +except Exception as e: + print(f"✗ Could not load enhanced_cps_2034: {e}") + +print() +print("✓ Check complete!") diff --git a/analysis/check_top01_seniors_2054.py b/analysis/check_top01_seniors_2054.py new file mode 100644 index 0000000..24dfc30 --- /dev/null +++ b/analysis/check_top01_seniors_2054.py @@ -0,0 +1,138 @@ +""" +Check how many seniors (65+) are in the top 0.1% income group in 2054 dataset +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation + +print("="*80) +print("TOP 0.1% SENIORS ANALYSIS - 2054 Dataset") +print("="*80) +print() + +# Load the 2054 dataset +print("Loading 2054 dataset...") +sim = Microsimulation(dataset="hf://policyengine/test/2054.h5") +print("✓ Dataset loaded") +print() + +# Get household-level data +household_weight = sim.calculate("household_weight", period=2054) +household_net_income = sim.calculate("household_net_income", period=2054, map_to="household") + +# Get person-level data +age = 
sim.calculate("age", period=2054) +person_id = sim.calculate("person_id", period=2054) +household_id = sim.calculate("household_id", period=2054) + +# Get Social Security data +ss_benefits = sim.calculate("social_security", period=2054, map_to="household") +taxable_ss_benefits = sim.calculate("taxable_social_security", period=2054, map_to="household") + +print("Dataset Statistics:") +print(f" Total households: {len(household_weight):,}") +print(f" Total people: {len(age):,}") +print() + +# Create household DataFrame +df_hh = pd.DataFrame({ + 'household_id': range(len(household_weight)), + 'household_net_income': household_net_income, + 'weight': household_weight, + 'ss_benefits': ss_benefits, + 'taxable_ss_benefits': taxable_ss_benefits +}) + +# Remove invalid households +df_hh = df_hh[np.isfinite(df_hh['household_net_income'])] +df_hh = df_hh[df_hh['household_net_income'] > 0] +df_hh = df_hh[df_hh['weight'] > 0] + +# Calculate income percentile +df_hh['income_percentile'] = df_hh['household_net_income'].rank(pct=True) * 100 + +# Identify top 0.1% +df_hh['is_top_01'] = df_hh['income_percentile'] > 99.9 + +# Create person DataFrame +df_person = pd.DataFrame({ + 'person_id': person_id, + 'household_id': household_id, + 'age': age +}) + +# Filter valid ages +df_person = df_person[np.isfinite(df_person['age'])] +df_person = df_person[df_person['age'] > 0] + +# Identify seniors +df_person['is_senior'] = df_person['age'] >= 65 + +# Count seniors per household +seniors_per_hh = df_person[df_person['is_senior']].groupby('household_id').size() +df_hh['num_seniors'] = df_hh['household_id'].map(seniors_per_hh).fillna(0) +df_hh['has_seniors'] = df_hh['num_seniors'] > 0 + +print("="*80) +print("TOP 0.1% INCOME GROUP ANALYSIS") +print("="*80) +print() + +# Overall top 0.1% +top_01 = df_hh[df_hh['is_top_01']] +print(f"Households in top 0.1%:") +print(f" Sample count: {len(top_01):,}") +print(f" Weighted count: {top_01['weight'].sum():,.0f}") +print(f" Income threshold: 
${top_01['household_net_income'].min():,.0f}") +print(f" Average income: ${top_01['household_net_income'].mean():,.0f}") +print() + +# Top 0.1% with seniors +top_01_with_seniors = top_01[top_01['has_seniors']] +print(f"Top 0.1% households WITH seniors (65+):") +print(f" Sample count: {len(top_01_with_seniors):,}") +print(f" Weighted count: {top_01_with_seniors['weight'].sum():,.0f}") +print(f" Percentage of top 0.1%: {len(top_01_with_seniors) / len(top_01) * 100:.1f}%") +print(f" Average # of seniors: {top_01_with_seniors['num_seniors'].mean():.1f}") +print() + +# Top 0.1% receiving SS benefits +top_01_with_ss = top_01[top_01['ss_benefits'] > 0] +print(f"Top 0.1% households receiving Social Security:") +print(f" Sample count: {len(top_01_with_ss):,}") +print(f" Weighted count: {top_01_with_ss['weight'].sum():,.0f}") +print(f" Percentage of top 0.1%: {len(top_01_with_ss) / len(top_01) * 100:.1f}%") +if len(top_01_with_ss) > 0: + print(f" Average SS benefit: ${top_01_with_ss['ss_benefits'].mean():,.0f}") +print() + +# Top 0.1% with taxable SS benefits +top_01_with_taxable_ss = top_01[top_01['taxable_ss_benefits'] > 0] +print(f"Top 0.1% households with TAXABLE Social Security:") +print(f" Sample count: {len(top_01_with_taxable_ss):,}") +print(f" Weighted count: {top_01_with_taxable_ss['weight'].sum():,.0f}") +print(f" Percentage of top 0.1%: {len(top_01_with_taxable_ss) / len(top_01) * 100:.1f}%") +if len(top_01_with_taxable_ss) > 0: + print(f" Average taxable SS: ${top_01_with_taxable_ss['taxable_ss_benefits'].mean():,.0f}") +print() + +# Summary comparison +print("="*80) +print("SUMMARY") +print("="*80) +print(f"Top 0.1% households: {len(top_01):,}") +print(f" - With seniors (65+): {len(top_01_with_seniors):,} ({len(top_01_with_seniors) / len(top_01) * 100:.1f}%)") +print(f" - Receiving SS: {len(top_01_with_ss):,} ({len(top_01_with_ss) / len(top_01) * 100:.1f}%)") +print(f" - With taxable SS: {len(top_01_with_taxable_ss):,} ({len(top_01_with_taxable_ss) / 
len(top_01) * 100:.1f}%)") +print() +print("✓ Analysis complete!") diff --git a/analysis/create_wharton_comparison_excel.py b/analysis/create_wharton_comparison_excel.py new file mode 100644 index 0000000..254d833 --- /dev/null +++ b/analysis/create_wharton_comparison_excel.py @@ -0,0 +1,227 @@ +""" +Create Excel spreadsheet with Wharton Budget Model comparison +in the clean table format requested - one sheet with three tables +""" + +import pandas as pd +import os +from openpyxl import Workbook +from openpyxl.styles import Font, Alignment, PatternFill, Border, Side +from openpyxl.utils.dataframe import dataframe_to_rows + +# PolicyEngine and Wharton data +data_2026 = { + 'Income Group': ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%'], + 'PolicyEngine': [-24, -65, -417, -763, -2148, -2907, -1972, -1608, 0], + 'Wharton': [0, -15, -340, -1135, -1625, -1590, -2020, -2205, -2450], + 'Difference': [-24, -50, -77, 372, -523, -1317, 48, 597, 2450], + '% Diff': ['N/A', '333%', '23%', '-33%', '32%', '83%', '-2%', '-27%', '-100%'] +} + +data_2034 = { + 'Income Group': ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%'], + 'PolicyEngine': [-39, -195, -769, -1291, -3053, -3388, -2325, -2250, 0], + 'Wharton': [0, -45, -615, -1630, -2160, -2160, -2605, -2715, -2970], + 'Difference': [-39, -150, -154, 339, -893, -1228, 280, 465, 2970], + '% Diff': ['N/A', '333%', '25%', '-21%', '41%', '57%', '-11%', '-17%', '-100%'] +} + +data_2054 = { + 'Income Group': ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%'], + 'PolicyEngine': [-5, -242, -757, -1558, -3518, -5094, -5183, -3231, 0], + 'Wharton': [-5, -275, -1730, -3560, -4075, -4385, -4565, -4820, -5080], + 'Difference': [0, 33, 973, 2002, 557, -709, -618, 1589, 5080], + '% Diff': ['0% ✓', 
'-12%', '-56%', '-56%', '-14%', '16%', '14%', '-33%', '-100%'] +} + +data_2054_local = { + 'Income Group': ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%'], + 'PolicyEngine': [-312, -1119, -2982, -4342, -9064, -13974, -6113, -6406, -280], + 'Wharton': [-5, -275, -1730, -3560, -4075, -4385, -4565, -4820, -5080], + 'Difference': [-307, -844, -1252, -782, -4989, -9589, -1548, -1586, 4800], + '% Diff': ['6240%', '307%', '72%', '22%', '122%', '219%', '34%', '33%', '-94%'] +} + +data_2054_new = { + 'Income Group': ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%'], + 'PolicyEngine': [-134, -868, -1946, -2644, -4067, -6741, -3097, -4098, -188], + 'Wharton': [-5, -275, -1730, -3560, -4075, -4385, -4565, -4820, -5080], + 'Difference': [-129, -593, -216, 916, 8, -2356, 1468, 722, 4892], + '% Diff': ['2580%', '216%', '12% ✓', '-26%', '~0% ✓✓', '54%', '-32%', '-15%', '-96%'] +} + +# Create workbook +wb = Workbook() +ws = wb.active +ws.title = "Wharton Comparison" + +# Define styles +header_font = Font(bold=True, size=14) +table_header_font = Font(bold=True, size=11) +regular_font = Font(size=11) +centered = Alignment(horizontal='center', vertical='center') +left_aligned = Alignment(horizontal='left', vertical='center') +right_aligned = Alignment(horizontal='right', vertical='center') + +# Border style +thin_border = Border( + left=Side(style='thin'), + right=Side(style='thin'), + top=Side(style='thin'), + bottom=Side(style='thin') +) + +# Fill styles +gray_fill = PatternFill(start_color='F0F0F0', end_color='F0F0F0', fill_type='solid') +header_fill = PatternFill(start_color='D9D9D9', end_color='D9D9D9', fill_type='solid') + +current_row = 1 + +# Helper function to add a table +def add_table(ws, start_row, year, data, title): + """Add a formatted table to the worksheet""" + row = start_row + + # Add title + 
+    ws.cell(row=row, column=1, value=title)
+    ws.cell(row=row, column=1).font = header_font
+    ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=5)
+    row += 1
+
+    # Add header row
+    headers = ['Income Group', 'PolicyEngine', 'Wharton', 'Difference', '% Diff']
+    for col, header in enumerate(headers, 1):
+        cell = ws.cell(row=row, column=col, value=header)
+        cell.font = table_header_font
+        cell.alignment = centered
+        cell.fill = header_fill
+        cell.border = thin_border
+    row += 1
+
+    # Add data rows
+    for i, group in enumerate(data['Income Group']):
+        # Determine if this row should be gray
+        is_gray = i % 2 == 1
+
+        # Income Group
+        cell = ws.cell(row=row, column=1, value=group)
+        cell.font = regular_font
+        cell.alignment = left_aligned
+        cell.border = thin_border
+        if is_gray:
+            cell.fill = gray_fill
+
+        # PolicyEngine: negatives as -$, positives as $ (mirrors Wharton column)
+        cell = ws.cell(row=row, column=2, value=f"-${abs(data['PolicyEngine'][i]):,}" if data['PolicyEngine'][i] < 0 else f"${data['PolicyEngine'][i]:,}" if data['PolicyEngine'][i] > 0 else "$0")
+        cell.font = regular_font
+        cell.alignment = right_aligned
+        cell.border = thin_border
+        if is_gray:
+            cell.fill = gray_fill
+
+        # Wharton
+        wh_val = data['Wharton'][i]
+        cell = ws.cell(row=row, column=3, value=f"-${abs(wh_val):,}" if wh_val < 0 else f"${wh_val:,}" if wh_val > 0 else "$0")
+        cell.font = regular_font
+        cell.alignment = right_aligned
+        cell.border = thin_border
+        if is_gray:
+            cell.fill = gray_fill
+
+        # Difference
+        diff_val = data['Difference'][i]
+        cell = ws.cell(row=row, column=4, value=f"+${abs(diff_val):,}" if diff_val > 0 else f"-${abs(diff_val):,}" if diff_val < 0 else "$0")
+        cell.font = regular_font
+        cell.alignment = right_aligned
+        cell.border = thin_border
+        if is_gray:
+            cell.fill = gray_fill
+
+        # % Diff
+        cell = ws.cell(row=row, column=5, value=data['% Diff'][i])
+        cell.font = regular_font
+        cell.alignment = centered
+        cell.border = thin_border
+        if is_gray:
+            cell.fill = gray_fill
+
+        row += 1
+
+    return row + 1  # Return next available row with spacing
+
+# Set column widths
+ws.column_dimensions['A'].width = 20 +ws.column_dimensions['B'].width = 18 +ws.column_dimensions['C'].width = 18 +ws.column_dimensions['D'].width = 18 +ws.column_dimensions['E'].width = 12 + +# Add revenue summary at top +ws.cell(row=current_row, column=1, value="AGGREGATE REVENUE IMPACT (Billions)") +ws.cell(row=current_row, column=1).font = header_font +ws.merge_cells(start_row=current_row, start_column=1, end_row=current_row, end_column=5) +current_row += 1 + +# Revenue data +revenue_data = [ + ['Year 2026:', '-$85.4B', '(Enhanced CPS 2024)'], + ['Year 2034:', '-$131.7B', '(Enhanced CPS 2024)'], + ['Year 2054:', '-$176.3B', '(Enhanced CPS 2024)'], + ['Year 2054 (Old Local):', '-$588.1B', '(2054.h5 - old local)'], + ['Year 2054 (New):', '-$284.3B', '(2054 (1).h5 - best Wharton match)'], +] +for row_data in revenue_data: + ws.cell(row=current_row, column=1, value=row_data[0]).font = Font(bold=True, size=11) + ws.cell(row=current_row, column=2, value=row_data[1]).font = Font(size=11) + if len(row_data) > 2: + ws.cell(row=current_row, column=3, value=row_data[2]).font = Font(italic=True, size=10) + current_row += 1 + +current_row += 2 # Add spacing + +# Add 2026 table +current_row = add_table(ws, current_row, 2026, data_2026, "Average Tax Change per Household (Dollars) - Year 2026") + +# Add 2034 table +current_row = add_table(ws, current_row, 2034, data_2034, "Average Tax Change per Household (Dollars) - Year 2034") + +# Add 2054 table (Enhanced CPS) +current_row = add_table(ws, current_row, 2054, data_2054, "Average Tax Change per Household (Dollars) - Year 2054 (Enhanced CPS 2024)") + +# Add 2054 old local table +current_row = add_table(ws, current_row, 2054, data_2054_local, "Average Tax Change per Household (Dollars) - Year 2054 (Old Local Dataset: 2054.h5)") + +# Add 2054 new table +current_row = add_table(ws, current_row, 2054, data_2054_new, "Average Tax Change per Household (Dollars) - Year 2054 (New Dataset: 2054 (1).h5 - BEST MATCH)") + +# Add dataset 
note at bottom +ws.cell(row=current_row, column=1, value="Dataset: Enhanced CPS 2024 (reweighted to target years)") +ws.cell(row=current_row, column=1).font = Font(italic=True, size=10) +ws.merge_cells(start_row=current_row, start_column=1, end_row=current_row, end_column=5) + +# Save workbook +output_file = '../data/wharton_comparison_enhanced_cps_2024.xlsx' +wb.save(output_file) + +print(f"✓ Excel file created: {output_file}") +print() +print("Single sheet with formatted tables:") +print(" - Revenue summary (2026, 2034, 2054, 2054 new)") +print(" - 2026 comparison table (formatted)") +print(" - 2034 comparison table (formatted)") +print(" - 2054 comparison table - Enhanced CPS 2024 (formatted)") +print(" - 2054 comparison table - New dataset 2054 (1).h5 (formatted)") +print() +print("Formatting includes:") +print(" - Bold headers with gray background") +print(" - Alternating row colors (white/light gray)") +print(" - Borders on all cells") +print(" - Proper currency formatting") +print(" - Centered/aligned text") +print() +print("✓ Complete!") diff --git a/analysis/option1_analysis_2026.py b/analysis/option1_analysis_2026.py new file mode 100644 index 0000000..4706ee9 --- /dev/null +++ b/analysis/option1_analysis_2026.py @@ -0,0 +1,172 @@ +""" +Calculate Option 1 (Full Repeal of SS Benefits Taxation) impacts for 2026 +using enhanced_cps_2026 dataset for comparison with Wharton Budget Model 2026 benchmark. 
+""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 ANALYSIS - 2026 (Enhanced CPS)") +print("Full Repeal of Social Security Benefits Taxation") +print("="*80) +print() + +# Load baseline and reform simulations +print("Loading enhanced_cps_2026 dataset...") +try: + baseline = Microsimulation(dataset="enhanced_cps_2026") + print("✓ Baseline loaded") +except Exception as e: + print(f"✗ Failed to load enhanced_cps_2026: {e}") + print("Trying without specifying dataset (will use default)...") + baseline = Microsimulation() + print("✓ Using default dataset") + +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(reform=option1_reform) +print("✓ Reform simulation loaded") +print() + +# Calculate aggregate revenue impact +print("="*80) +print("AGGREGATE REVENUE IMPACT (2026)") +print("="*80) +print() + +baseline_income_tax = baseline.calculate("income_tax", period=2026, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2026, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print(f"Baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax: ${reform_income_tax.sum() / 1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() + +# Save aggregate result +os.makedirs('../data', exist_ok=True) +agg_df = pd.DataFrame([{ + 'reform_id': 'option1', + 'reform_name': 'Full Repeal of Social Security Benefits Taxation', + 'year': 2026, + 'revenue_impact': revenue_impact, + 'revenue_impact_billions': revenue_impact_billions, + 'scoring_type': 'static', + 'dataset': 'enhanced_cps_2026' +}]) 
+agg_df.to_csv('../data/option1_aggregate_2026.csv', index=False) +print("✓ Saved aggregate results to data/option1_aggregate_2026.csv") +print() + +# Calculate distributional impacts +print("="*80) +print("DISTRIBUTIONAL ANALYSIS (2026)") +print("="*80) +print() + +# Get household-level data +household_weight = baseline.calculate("household_weight", period=2026) +household_net_income_baseline = baseline.calculate("household_net_income", period=2026, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2026, map_to="household") +income_tax_baseline = baseline.calculate("income_tax", period=2026, map_to="household") +income_tax_reform = reform.calculate("income_tax", period=2026, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, + 'income_baseline': household_net_income_baseline, + 'income_reform': household_net_income_reform +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] +df = df[df['weight'] > 0] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile 
<= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate weighted averages by group +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income, after taxes and transfers': f"{avg_income_change_pct:.1f}%" + }) + +results_df = pd.DataFrame(results) + +print("RESULTS: Option 1 Distributional Impacts - 2026") +print("-" * 80) +print(results_df.to_string(index=False)) +print() + +# Save results +results_df.to_csv('../data/option1_distributional_2026.csv', index=False) +print("✓ Saved distributional results to data/option1_distributional_2026.csv") +print() + +print("="*80) +print("✓ Analysis complete!") +print("="*80) diff --git a/analysis/option1_analysis_2034.py b/analysis/option1_analysis_2034.py new file mode 100644 index 0000000..a931cba --- /dev/null +++ b/analysis/option1_analysis_2034.py @@ -0,0 +1,174 @@ +""" +Calculate Option 1 (Full Repeal of SS Benefits Taxation) impacts for 2034 +using enhanced_cps_2024 (reweighted to 2034) for Wharton comparison. 
+""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 ANALYSIS - 2034 (Enhanced CPS 2024 → 2034)") +print("Full Repeal of Social Security Benefits Taxation") +print("="*80) +print() + +# Use default dataset (enhanced_cps_2024) and let PolicyEngine reweight to 2034 +print("Loading enhanced_cps_2024 (will be reweighted to 2034)...") +baseline = Microsimulation() +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(reform=option1_reform) +print("✓ Simulations loaded") +print() + +# Check dataset size +household_weight = baseline.calculate("household_weight", period=2034) +print(f"Dataset info for 2034:") +print(f" Households in sample: {len(household_weight):,}") +print(f" Weighted households: {household_weight.sum():,.0f}") +print() + +# Calculate aggregate revenue impact +print("="*80) +print("AGGREGATE REVENUE IMPACT (2034)") +print("="*80) +print() + +baseline_income_tax = baseline.calculate("income_tax", period=2034, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2034, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print(f"Baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax: ${reform_income_tax.sum() / 1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() + +# Save aggregate result +os.makedirs('../data', exist_ok=True) +agg_df = pd.DataFrame([{ + 'reform_id': 'option1', + 'reform_name': 'Full Repeal of Social Security Benefits Taxation', + 'year': 2034, + 'revenue_impact': revenue_impact, + 'revenue_impact_billions': revenue_impact_billions, + 'scoring_type': 'static', + 
'dataset': 'enhanced_cps_2024_reweighted_to_2034' +}]) +agg_df.to_csv('../data/option1_aggregate_2034.csv', index=False) +print("✓ Saved aggregate results to data/option1_aggregate_2034.csv") +print() + +# Calculate distributional impacts +print("="*80) +print("DISTRIBUTIONAL ANALYSIS (2034)") +print("="*80) +print() + +# Get household-level data +household_net_income_baseline = baseline.calculate("household_net_income", period=2034, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2034, map_to="household") +income_tax_baseline = baseline.calculate("income_tax", period=2034, map_to="household") +income_tax_reform = reform.calculate("income_tax", period=2034, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] +df = df[df['weight'] > 0] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + 
else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate weighted averages by group +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income, after taxes and transfers': f"{avg_income_change_pct:.1f}%", + 'Sample size': len(group_data), + 'Weighted count': round(total_weight) + }) + +results_df = pd.DataFrame(results) + +print("RESULTS: Option 1 Distributional Impacts - 2034") +print("-" * 80) +print(results_df[['Income group', 'Average tax change', 'Percent change in income, after taxes and transfers']].to_string(index=False)) +print() +print("Sample sizes by group:") +for _, row in results_df.iterrows(): + print(f" {row['Income group']:15s}: {row['Sample size']:>6,} households ({row['Weighted count']:>15,.0f} weighted)") +print() + +# Save results +results_df.to_csv('../data/option1_distributional_2034.csv', index=False) +print("✓ Saved distributional results to data/option1_distributional_2034.csv") +print() + +print("="*80) +print("✓ Analysis complete!") +print("="*80) diff --git a/analysis/option1_analysis_2054_enhanced.py b/analysis/option1_analysis_2054_enhanced.py new file mode 100644 index 0000000..2a468d0 --- /dev/null +++ b/analysis/option1_analysis_2054_enhanced.py @@ -0,0 +1,170 @@ +""" +Calculate Option 1 (Full Repeal of SS Benefits Taxation) impacts for 2054 +using enhanced_cps_2024 (reweighted to 2054) for Wharton 
comparison. +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 ANALYSIS - 2054 (Enhanced CPS 2024 → 2054)") +print("Full Repeal of Social Security Benefits Taxation") +print("="*80) +print() + +# Use default dataset (enhanced_cps_2024) and let PolicyEngine reweight to 2054 +print("Loading enhanced_cps_2024 (will be reweighted to 2054)...") +baseline = Microsimulation() +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(reform=option1_reform) +print("✓ Simulations loaded") +print() + +# Check dataset size +household_weight = baseline.calculate("household_weight", period=2054) +print(f"Dataset info for 2054:") +print(f" Households in sample: {len(household_weight):,}") +print(f" Weighted households: {household_weight.sum():,.0f}") +print() + +# Calculate aggregate revenue impact +print("="*80) +print("AGGREGATE REVENUE IMPACT (2054)") +print("="*80) +print() + +baseline_income_tax = baseline.calculate("income_tax", period=2054, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2054, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print(f"Baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax: ${reform_income_tax.sum() / 1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() + +# Save aggregate result +os.makedirs('../data', exist_ok=True) +agg_df = pd.DataFrame([{ + 'reform_id': 'option1', + 'reform_name': 'Full Repeal of Social Security Benefits Taxation', + 'year': 2054, + 'revenue_impact': revenue_impact, + 'revenue_impact_billions': revenue_impact_billions, + 'scoring_type': 
'static', + 'dataset': 'enhanced_cps_2024_reweighted_to_2054' +}]) +agg_df.to_csv('../data/option1_aggregate_2054_enhanced.csv', index=False) +print("✓ Saved aggregate results to data/option1_aggregate_2054_enhanced.csv") +print() + +# Calculate distributional impacts +print("="*80) +print("DISTRIBUTIONAL ANALYSIS (2054)") +print("="*80) +print() + +# Get household-level data +household_net_income_baseline = baseline.calculate("household_net_income", period=2054, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2054, map_to="household") +income_tax_baseline = baseline.calculate("income_tax", period=2054, map_to="household") +income_tax_reform = reform.calculate("income_tax", period=2054, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] +df = df[df['weight'] > 0] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile 
<= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate weighted averages by group +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income, after taxes and transfers': f"{avg_income_change_pct:.1f}%", + 'Sample size': len(group_data), + 'Weighted count': round(total_weight) + }) + +results_df = pd.DataFrame(results) + +print("RESULTS: Option 1 Distributional Impacts - 2054") +print("-" * 80) +print(results_df[['Income group', 'Average tax change', 'Percent change in income, after taxes and transfers']].to_string(index=False)) +print() + +# Save results +results_df.to_csv('../data/option1_distributional_2054_enhanced.csv', index=False) +print("✓ Saved distributional results to data/option1_distributional_2054_enhanced.csv") +print() + +print("="*80) +print("✓ Analysis complete!") +print("="*80) diff --git a/analysis/option1_analysis_2054_local.py b/analysis/option1_analysis_2054_local.py new file mode 100644 index 0000000..4f2ad59 --- /dev/null +++ b/analysis/option1_analysis_2054_local.py @@ -0,0 +1,180 @@ +""" +Calculate Option 1 (Full Repeal of SS Benefits Taxation) impacts for 2054 +using LOCAL 2054.h5 dataset for comparison with Wharton Budget Model 2054 benchmark. 
+""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 ANALYSIS - 2054 (LOCAL DATASET)") +print("Full Repeal of Social Security Benefits Taxation") +print("="*80) +print() + +# Load baseline and reform simulations using local dataset +local_dataset_path = "/Users/ziminghua/Downloads/2054.h5" + +print(f"Loading local dataset: {local_dataset_path}") +baseline = Microsimulation(dataset=local_dataset_path) +print("✓ Baseline loaded") + +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(dataset=local_dataset_path, reform=option1_reform) +print("✓ Reform simulation loaded") +print() + +# Check dataset size +household_weight = baseline.calculate("household_weight", period=2054) +print(f"Dataset info:") +print(f" Households in sample: {len(household_weight):,}") +print(f" Weighted households: {household_weight.sum():,.0f}") +print() + +# Calculate aggregate revenue impact +print("="*80) +print("AGGREGATE REVENUE IMPACT (2054)") +print("="*80) +print() + +baseline_income_tax = baseline.calculate("income_tax", period=2054, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2054, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print(f"Baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax: ${reform_income_tax.sum() / 1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() + +# Save aggregate result +os.makedirs('../data', exist_ok=True) +agg_df = pd.DataFrame([{ + 'reform_id': 'option1', + 'reform_name': 'Full Repeal of Social Security Benefits Taxation', + 'year': 2054, + 'revenue_impact': 
revenue_impact, + 'revenue_impact_billions': revenue_impact_billions, + 'scoring_type': 'static', + 'dataset': 'local_2054.h5' +}]) +agg_df.to_csv('../data/option1_aggregate_2054_local.csv', index=False) +print("✓ Saved aggregate results to data/option1_aggregate_2054_local.csv") +print() + +# Calculate distributional impacts +print("="*80) +print("DISTRIBUTIONAL ANALYSIS (2054)") +print("="*80) +print() + +# Get household-level data +household_net_income_baseline = baseline.calculate("household_net_income", period=2054, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2054, map_to="household") +income_tax_baseline = baseline.calculate("income_tax", period=2054, map_to="household") +income_tax_reform = reform.calculate("income_tax", period=2054, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, + 'income_baseline': household_net_income_baseline, + 'income_reform': household_net_income_reform +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] +df = df[df['weight'] > 0] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth 
quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate weighted averages by group +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income, after taxes and transfers': f"{avg_income_change_pct:.1f}%", + 'Sample size': len(group_data), + 'Weighted count': round(total_weight) + }) + +results_df = pd.DataFrame(results) + +print("RESULTS: Option 1 Distributional Impacts - 2054 (Local Dataset)") +print("-" * 80) +print(results_df[['Income group', 'Average tax change', 'Percent change in income, after taxes and transfers']].to_string(index=False)) +print() +print("Sample sizes by group:") +for _, row in results_df.iterrows(): + print(f" {row['Income group']:15s}: {row['Sample size']:>6,} households ({row['Weighted count']:>15,.0f} weighted)") +print() + +# Save results +results_df.to_csv('../data/option1_distributional_2054_local.csv', index=False) +print("✓ Saved distributional results to data/option1_distributional_2054_local.csv") +print() + +print("="*80) +print("✓ Analysis complete!") +print("="*80) diff --git a/analysis/option1_analysis_2054_new_uprating.py b/analysis/option1_analysis_2054_new_uprating.py new file mode 100644 index 0000000..13adff5 
--- /dev/null +++ b/analysis/option1_analysis_2054_new_uprating.py @@ -0,0 +1,198 @@ +""" +Calculate Option 1 impacts for 2054 using NEW dataset with SSA Trustees uprating +This dataset was generated with PR #6744 uprating parameters. +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 ANALYSIS - 2054 (NEW SSA TRUSTEES UPRATING)") +print("Full Repeal of Social Security Benefits Taxation") +print("Dataset: Generated with PR #6744 uprating parameters") +print("="*80) +print() + +# Load baseline and reform simulations using new dataset +new_dataset_path = "/Users/ziminghua/Downloads/2054 (1).h5" + +print(f"Loading new dataset: {new_dataset_path}") +baseline = Microsimulation(dataset=new_dataset_path) +print("✓ Baseline loaded") + +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(dataset=new_dataset_path, reform=option1_reform) +print("✓ Reform simulation loaded") +print() + +# Check dataset size +household_weight = baseline.calculate("household_weight", period=2054) +print(f"Dataset info:") +print(f" Households in sample: {len(household_weight):,}") +print(f" Weighted households: {household_weight.sum():,.0f}") +print() + +# Calculate aggregate revenue impact +print("="*80) +print("AGGREGATE REVENUE IMPACT (2054)") +print("="*80) +print() + +baseline_income_tax = baseline.calculate("income_tax", period=2054, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2054, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print(f"Baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax: ${reform_income_tax.sum() / 
1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() + +# Calculate distributional impacts +print("="*80) +print("DISTRIBUTIONAL ANALYSIS (2054)") +print("="*80) +print() + +# Get household-level data +household_net_income_baseline = baseline.calculate("household_net_income", period=2054, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2054, map_to="household") +income_tax_baseline = baseline.calculate("income_tax", period=2054, map_to="household") +income_tax_reform = reform.calculate("income_tax", period=2054, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] +df = df[df['weight'] > 0] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate 
weighted averages by group +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income': f"{avg_income_change_pct:.1f}%", + 'Sample size': len(group_data), + 'Weighted count': round(total_weight) + }) + +results_df = pd.DataFrame(results) + +print("RESULTS: Option 1 Distributional Impacts - 2054 (New Uprating)") +print("-" * 80) +print(results_df[['Income group', 'Average tax change', 'Percent change in income']].to_string(index=False)) +print() +print("Sample sizes by group:") +for _, row in results_df.iterrows(): + print(f" {row['Income group']:15s}: {row['Sample size']:>6,} households ({row['Weighted count']:>15,.0f} weighted)") +print() + +# Comparison with Wharton +wharton_2054 = { + 'First quintile': -5, + 'Second quintile': -275, + 'Middle quintile': -1730, + 'Fourth quintile': -3560, + '80-90%': -4075, + '90-95%': -4385, + '95-99%': -4565, + '99-99.9%': -4820, + 'Top 0.1%': -5080 +} + +print("="*80) +print("COMPARISON WITH WHARTON 2054") +print("="*80) +print() + +comparison = [] +for _, row in results_df.iterrows(): + group = row['Income group'] + pe_val = row['Average tax change'] + wh_val = wharton_2054[group] + diff = pe_val - wh_val + pct_diff = (diff / wh_val * 100) if wh_val != 0 else None + + comparison.append({ + 'Income Group': group, + 'PE (New Uprating)': pe_val, + 'Wharton': wh_val, + 'Difference': diff, + '% Diff': f"{pct_diff:.0f}%" if pct_diff is not None else 'N/A' + }) 
+ +comp_df = pd.DataFrame(comparison) +print(comp_df.to_string(index=False)) +print() + +print("="*80) +print(f"Revenue Impact: ${revenue_impact_billions:.1f}B") +print("Dataset: 2054 (1).h5 - Generated with SSA Trustees uprating (PR #6744)") +print("="*80) diff --git a/analysis/option1_analysis_2100_enhanced_cps.py b/analysis/option1_analysis_2100_enhanced_cps.py new file mode 100644 index 0000000..54a56bc --- /dev/null +++ b/analysis/option1_analysis_2100_enhanced_cps.py @@ -0,0 +1,47 @@ +""" +Run Option 1 with enhanced_cps_2024 for year 2026 +(Don't commit this - just for testing) +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("OPTION 1 - YEAR 2026 (Enhanced CPS 2024)") +print("="*80) +print() + +print("Loading enhanced_cps_2024...") +baseline = Microsimulation() # Uses enhanced_cps_2024 by default +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(reform=option1_reform) +print("✓ Simulations loaded") +print() + +# Calculate for year 2026 +print("Calculating revenue impact for year 2026...") +baseline_income_tax = baseline.calculate("income_tax", period=2026, map_to="household") +reform_income_tax = reform.calculate("income_tax", period=2026, map_to="household") + +revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum() +revenue_impact_billions = revenue_impact / 1e9 + +print() +print("="*80) +print("RESULTS") +print("="*80) +print(f"Baseline income tax (2026): ${baseline_income_tax.sum() / 1e9:,.1f}B") +print(f"Reform income tax (2026): ${reform_income_tax.sum() / 1e9:,.1f}B") +print(f"Revenue impact: ${revenue_impact_billions:,.1f}B") +print() +print("Dataset: Enhanced CPS 2024") +print("="*80) diff --git a/analysis/option1_distributional_2054.py b/analysis/option1_distributional_2054.py new 
file mode 100644 index 0000000..8cf2beb --- /dev/null +++ b/analysis/option1_distributional_2054.py @@ -0,0 +1,147 @@ +""" +Calculate distributional impacts of Option 1 (Full Repeal of SS Benefits Taxation) for 2054 +to compare with Wharton Budget Model benchmark. + +This script calculates: +1. Average tax change by income group +2. Percent change in income after taxes and transfers by income group + +Income groups match Wharton benchmark: +- Quintiles (First through Fourth) +- 80-90%, 90-95%, 95-99%, 99-99.9%, Top 0.1% +""" + +import sys +import os + +# Setup path +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in sys.path: + sys.path.insert(0, src_path) + +import pandas as pd +import numpy as np +from policyengine_us import Microsimulation +from reforms import REFORMS + +print("="*80) +print("DISTRIBUTIONAL ANALYSIS: Option 1 - Year 2054") +print("Full Repeal of Social Security Benefits Taxation") +print("="*80) +print() + +# Load baseline and reform simulations +print("Loading 2054 dataset...") +baseline = Microsimulation(dataset="hf://policyengine/test/2054.h5") +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(dataset="hf://policyengine/test/2054.h5", reform=option1_reform) +print("✓ Simulations loaded") +print() + +# Calculate key variables for baseline +print("Calculating baseline values...") +household_weight = baseline.calculate("household_weight", period=2054) +income_tax_baseline = baseline.calculate("income_tax", period=2054, map_to="household") +household_net_income_baseline = baseline.calculate("household_net_income", period=2054, map_to="household") + +# Calculate reform values +print("Calculating reform values...") +income_tax_reform = reform.calculate("income_tax", period=2054, map_to="household") +household_net_income_reform = reform.calculate("household_net_income", period=2054, map_to="household") + +# Calculate changes +tax_change = income_tax_reform - income_tax_baseline # 
Negative = tax cut +# household_net_income already accounts for taxes and transfers +income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + +print("✓ Calculations complete") +print() + +# Create DataFrame +df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, + 'income_baseline': household_net_income_baseline, + 'income_reform': household_net_income_reform +}) + +# Remove invalid values +df = df[np.isfinite(df['household_net_income'])] +df = df[df['household_net_income'] > 0] +df = df[np.isfinite(df['income_change_pct'])] + +print(f"Analyzing {len(df):,} households (weighted: {df['weight'].sum():,.0f})") +print() + +# Calculate income percentiles +df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + +# Define income groups matching Wharton +def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + +df['income_group'] = df['income_percentile'].apply(assign_income_group) + +# Calculate weighted averages by group +print("Calculating distributional impacts...") +print() + +results = [] +group_order = [ + 'First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%' +] + +for group in group_order: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + # Weighted averages + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * 
group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'Income group': group, + 'Average tax change': round(avg_tax_change), + 'Percent change in income, after taxes and transfers': f"{avg_income_change_pct:.1f}%" + }) + +results_df = pd.DataFrame(results) + +print("="*80) +print("RESULTS: Option 1 Distributional Impacts - 2054") +print("="*80) +print() +print(results_df.to_string(index=False)) +print() +print("="*80) + +# Save results +output_file = '../data/option1_distributional_2054.csv' +results_df.to_csv(output_file, index=False) +print(f"✓ Results saved to: {output_file}") +print() + +print("✓ Analysis complete!") diff --git a/analysis/policy-impacts-2100.ipynb b/analysis/policy-impacts-2100.ipynb new file mode 100644 index 0000000..199cd6a --- /dev/null +++ b/analysis/policy-impacts-2100.ipynb @@ -0,0 +1,574 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# Policy Impacts for Year 2054 (Wharton Benchmark)\n", + "\n", + "This notebook calculates static budgetary impacts for Option 1 (Full Repeal of Social Security Benefits Taxation) using the 2054 dataset.\n", + "This allows comparison with Wharton Budget Model estimates for the same year.\n", + "\n", + "**Dataset**: `hf://policyengine/test/2054.h5` \n", + "**Year**: 2054 \n", + "**Scoring**: Static (no behavioral responses)\n", + "**Reform**: Option 1 only (for Wharton benchmark comparison)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "imports", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:32:28.688612Z", + "iopub.status.busy": "2025-10-30T16:32:28.688533Z", + "iopub.status.idle": "2025-10-30T16:32:36.764350Z", + "shell.execute_reply": "2025-10-30T16:32:36.764009Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Working directory: 
/Users/ziminghua/vscode/crfb-tob-impacts\n", + "Source path: /Users/ziminghua/vscode/crfb-tob-impacts/src\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/miniconda3/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Libraries imported\n", + "✓ Found 8 reforms\n" + ] + } + ], + "source": [ + "# Import necessary libraries\n", + "import sys\n", + "import os\n", + "\n", + "# Determine repo root and add src to path\n", + "if os.path.basename(os.getcwd()) == 'analysis':\n", + " repo_root = os.path.abspath('..')\n", + " os.chdir(repo_root)\n", + "else:\n", + " repo_root = os.getcwd()\n", + "\n", + "# Add src directory to Python path\n", + "src_path = os.path.join(repo_root, 'src')\n", + "if src_path not in sys.path:\n", + " sys.path.insert(0, src_path)\n", + "\n", + "print(f\"Working directory: {os.getcwd()}\")\n", + "print(f\"Source path: {src_path}\")\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_core.reforms import Reform\n", + "from reforms import REFORMS\n", + "from tqdm import tqdm\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "print(f\"✓ Libraries imported\")\n", + "print(f\"✓ Found {len(REFORMS)} reforms\")" + ] + }, + { + "cell_type": "markdown", + "id": "dataset", + "metadata": {}, + "source": [ + "## Load 2054 Dataset\n", + "\n", + "Load the 2054 projection dataset to compare with Wharton benchmark." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "load_dataset", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:32:36.765572Z", + "iopub.status.busy": "2025-10-30T16:32:36.765493Z", + "iopub.status.idle": "2025-10-30T16:32:38.033872Z", + "shell.execute_reply": "2025-10-30T16:32:38.033536Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading 2054 dataset...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Dataset loaded successfully\n", + "\n", + "Dataset statistics:\n", + " Number of households in sample: 21,108\n", + " Weighted household count: 170,807,832\n" + ] + } + ], + "source": [ + "# Load the 2054 dataset\n", + "print(\"Loading 2054 dataset...\")\n", + "sim = Microsimulation(dataset=\"hf://policyengine/test/2054.h5\")\n", + "print(\"✓ Dataset loaded successfully\")\n", + "\n", + "# Check dataset size\n", + "household_weight = sim.calculate(\"household_weight\", period=2054)\n", + "household_count = sim.calculate(\"household_count\", period=2054, map_to=\"household\")\n", + "\n", + "print(f\"\\nDataset statistics:\")\n", + "print(f\" Number of households in sample: {len(household_weight):,}\")\n", + "print(f\" Weighted household count: {household_count.sum():,.0f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "baseline", + "metadata": {}, + "source": [ + "## Compute Baseline\n", + "\n", + "Calculate baseline income tax for year 2054 using the 2054 dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "compute_baseline", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:32:38.035227Z", + "iopub.status.busy": "2025-10-30T16:32:38.035134Z", + "iopub.status.idle": "2025-10-30T16:32:46.309849Z", + "shell.execute_reply": "2025-10-30T16:32:46.309527Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing baseline for 2054...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Baseline computed\n", + " Total baseline income tax: $3,289.7B\n" + ] + } + ], + "source": [ + "print(\"Computing baseline for 2054...\")\n", + "baseline_2054 = Microsimulation(dataset=\"hf://policyengine/test/2054.h5\")\n", + "baseline_income_tax = baseline_2054.calculate(\"income_tax\", map_to=\"household\", period=2054)\n", + "\n", + "print(f\"✓ Baseline computed\")\n", + "print(f\" Total baseline income tax: ${baseline_income_tax.sum() / 1e9:,.1f}B\")" + ] + }, + { + "cell_type": "markdown", + "id": "functions", + "metadata": {}, + "source": [ + "## Helper Function\n", + "\n", + "Define function to calculate revenue impact for a given reform." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "helper_function", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:32:46.310977Z", + "iopub.status.busy": "2025-10-30T16:32:46.310911Z", + "iopub.status.idle": "2025-10-30T16:32:46.312618Z", + "shell.execute_reply": "2025-10-30T16:32:46.312351Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Helper function defined\n" + ] + } + ], + "source": [ + "def calculate_revenue_impact_2054(reform):\n", + " \"\"\"\n", + " Calculate revenue impact for a given reform in year 2054.\n", + " \n", + " Args:\n", + " reform: Reform object\n", + " \n", + " Returns:\n", + " Revenue impact in dollars (positive = revenue gain, negative = revenue loss)\n", + " \"\"\"\n", + " # Create reformed simulation with 2054 dataset\n", + " reform_sim = Microsimulation(dataset=\"hf://policyengine/test/2054.h5\", reform=reform)\n", + " \n", + " # Calculate reformed income tax\n", + " reform_income_tax = reform_sim.calculate(\"income_tax\", map_to=\"household\", period=2054)\n", + " \n", + " # JCT convention: reformed - baseline (positive = more revenue)\n", + " revenue_impact = reform_income_tax.sum() - baseline_income_tax.sum()\n", + " \n", + " return revenue_impact\n", + "\n", + "print(\"✓ Helper function defined\")" + ] + }, + { + "cell_type": "markdown", + "id": "calculate", + "metadata": {}, + "source": [ + "## Calculate Reform Impacts for 2054\n", + "\n", + "Test Option 1 with the 2054 dataset for Wharton benchmark comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "calculate_impacts", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:32:46.313458Z", + "iopub.status.busy": "2025-10-30T16:32:46.313406Z", + "iopub.status.idle": "2025-10-30T16:37:28.288931Z", + "shell.execute_reply": "2025-10-30T16:37:28.288472Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "CALCULATING REFORM IMPACTS FOR YEAR 2054\n", + "================================================================================\n", + "Testing Option 1 only (for Wharton benchmark comparison)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "Processing reforms: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
reform_idreform_namerevenue_impact_billions
0option1Full Repeal of Social Security Benefits Taxation-239.612969
\n", + "" + ], + "text/plain": [ + " reform_id reform_name \\\n", + "0 option1 Full Repeal of Social Security Benefits Taxation \n", + "\n", + " revenue_impact_billions \n", + "0 -239.612969 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Convert to DataFrame\n", + "results_df = pd.DataFrame(results_2054)\n", + "\n", + "if len(results_df) > 0:\n", + " print(\"\\n2054 Reform Impacts (Billions):\")\n", + " print(\"=\"*80)\n", + " \n", + " for _, row in results_df.iterrows():\n", + " print(f\"{row['reform_id']:8s}: {row['reform_name']:55s} ${row['revenue_impact_billions']:>8,.1f}B\")\n", + " \n", + " print(\"=\"*80)\n", + " print(f\"\\nTotal reforms calculated: {len(results_df)}\")\n", + " \n", + " # Display as table\n", + " display(results_df[['reform_id', 'reform_name', 'revenue_impact_billions']].sort_values('revenue_impact_billions', ascending=False))\n", + "else:\n", + " print(\"⚠ No results to display\")" + ] + }, + { + "cell_type": "markdown", + "id": "export", + "metadata": {}, + "source": [ + "## Export Results to CSV\n", + "\n", + "Save the 2054 impact estimates to a CSV file for Wharton benchmark comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "export_csv", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-30T16:37:28.302988Z", + "iopub.status.busy": "2025-10-30T16:37:28.302927Z", + "iopub.status.idle": "2025-10-30T16:37:28.307613Z", + "shell.execute_reply": "2025-10-30T16:37:28.307348Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Exported results to: data/policy_impacts_2054_wharton.csv\n", + " Records: 1\n", + " Columns: reform_id, reform_name, year, revenue_impact, revenue_impact_billions, scoring_type, dataset\n", + "✓ Exported summary to: data/policy_impacts_2054_wharton_summary.csv\n", + "\n", + "✓ Analysis complete!\n" + ] + } + ], + "source": [ + "# Create data directory if it doesn't exist\n", + "os.makedirs('data', exist_ok=True)\n", + "\n", + "if len(results_df) > 0:\n", + " # Export full results\n", + " output_file = 'data/policy_impacts_2054_wharton.csv'\n", + " results_df.to_csv(output_file, index=False)\n", + " print(f\"✓ Exported results to: {output_file}\")\n", + " print(f\" Records: {len(results_df)}\")\n", + " print(f\" Columns: {', '.join(results_df.columns)}\")\n", + " \n", + " # Also create a summary version\n", + " summary_df = results_df[['reform_id', 'reform_name', 'revenue_impact_billions']].copy()\n", + " summary_df = summary_df.sort_values('revenue_impact_billions', ascending=False)\n", + " summary_file = 'data/policy_impacts_2054_wharton_summary.csv'\n", + " summary_df.to_csv(summary_file, index=False)\n", + " print(f\"✓ Exported summary to: {summary_file}\")\n", + "else:\n", + " print(\"⚠ No results to export\")\n", + "\n", + "print(\"\\n✓ Analysis complete!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/test_enhanced_datasets.py b/analysis/test_enhanced_datasets.py new file mode 100644 index 0000000..d078ccd --- /dev/null +++ b/analysis/test_enhanced_datasets.py @@ -0,0 +1,57 @@ +""" +Test loading enhanced CPS datasets for 2026, 2034, and other years +""" + +from policyengine_us import Microsimulation +import traceback + +datasets_to_test = [ + "enhanced_cps_2026", + "enhanced_cps_2027", + "enhanced_cps_2028", + "enhanced_cps_2029", + "enhanced_cps_2030", + "enhanced_cps_2031", + "enhanced_cps_2032", + "enhanced_cps_2033", + "enhanced_cps_2034", +] + +print("Testing enhanced CPS datasets...") +print("="*80) + +working_datasets = [] +failed_datasets = [] + +for dataset_name in datasets_to_test: + year = int(dataset_name.split('_')[-1]) + print(f"\nTesting {dataset_name}...") + + try: + # Try to create simulation with this dataset + sim = Microsimulation(dataset=dataset_name) + + # Try to calculate something to verify it works + hh_weight = sim.calculate("household_weight", period=year) + + print(f" ✓ SUCCESS!") + print(f" Households: {len(hh_weight):,}") + print(f" Weighted: {hh_weight.sum():,.0f}") + working_datasets.append(dataset_name) + + except Exception as e: + print(f" ✗ FAILED: {type(e).__name__}: {e}") + failed_datasets.append(dataset_name) + +print() +print("="*80) +print("SUMMARY") +print("="*80) +print(f"Working datasets: {len(working_datasets)}") +for ds in working_datasets: + print(f" ✓ {ds}") + +print() +print(f"Failed datasets: {len(failed_datasets)}") +for ds in failed_datasets: + print(f" ✗ {ds}") diff --git a/analysis/test_hf_2026.py b/analysis/test_hf_2026.py new file mode 100644 index 0000000..ec4da69 --- /dev/null +++ b/analysis/test_hf_2026.py @@ -0,0 +1,31 @@ +import sys +import os +repo_root = os.path.abspath('..') +src_path = os.path.join(repo_root, 'src') +if src_path not in 
sys.path: + sys.path.insert(0, src_path) + +from policyengine_us import Microsimulation +from reforms import REFORMS + +print('Running Option 1 with hf://policyengine/test/2026.h5') +print('='*80) + +baseline = Microsimulation(dataset='hf://policyengine/test/2026.h5') +option1_reform = REFORMS['option1']['func']() +reform = Microsimulation(dataset='hf://policyengine/test/2026.h5', reform=option1_reform) + +print('Calculating for 2026...') +baseline_tax = baseline.calculate('income_tax', period=2026, map_to='household') +reform_tax = reform.calculate('income_tax', period=2026, map_to='household') + +revenue_impact = (reform_tax.sum() - baseline_tax.sum()) / 1e9 + +print() +print('RESULTS') +print('='*80) +print(f'Baseline: ${baseline_tax.sum() / 1e9:.1f}B') +print(f'Reform: ${reform_tax.sum() / 1e9:.1f}B') +print(f'Impact: ${revenue_impact:.1f}B') +print() +print('Dataset: hf://policyengine/test/2026.h5') diff --git a/analysis/wharton_benchmark_comparison.md b/analysis/wharton_benchmark_comparison.md new file mode 100644 index 0000000..511fa7d --- /dev/null +++ b/analysis/wharton_benchmark_comparison.md @@ -0,0 +1,122 @@ +# Wharton Budget Model Benchmark Comparison +## Option 1: Full Repeal of Social Security Benefits Taxation - Year 2054 + +This analysis compares PolicyEngine US estimates with the Wharton Budget Model for eliminating income taxes on Social Security benefits. 
+ +--- + +## Aggregate Revenue Impact + +| Source | Revenue Impact (2054) | +|--------|----------------------| +| **PolicyEngine US** | **-$239.6 billion** | +| Wharton Budget Model | *(Not provided in benchmark)* | + +--- + +## Distributional Impacts by Income Group + +### Average Tax Change (2054) + +| Income Group | PolicyEngine US | Wharton Budget Model | Difference | % Difference | +|--------------|-----------------|---------------------|------------|--------------| +| First quintile | -$6 | -$5 | -$1 | 20% | +| Second quintile | -$236 | -$275 | +$39 | -14% | +| Middle quintile | -$880 | -$1,730 | +$850 | -49% | +| Fourth quintile | -$1,629 | -$3,560 | +$1,931 | -54% | +| 80-90% | -$3,594 | -$4,075 | +$481 | -12% | +| 90-95% | -$6,297 | -$4,385 | -$1,912 | 44% | +| 95-99% | -$7,987 | -$4,565 | -$3,422 | 75% | +| 99-99.9% | -$4,984 | -$4,820 | -$164 | 3% | +| Top 0.1% | $0 | -$5,080 | +$5,080 | -100% | + +*Negative values indicate tax cuts (benefit to taxpayers)* + +### Percent Change in Income, After Taxes and Transfers (2054) + +| Income Group | PolicyEngine US | Wharton Budget Model | Difference (pp) | +|--------------|-----------------|---------------------|-----------------| +| First quintile | 0.0% | 0.0% | 0.0 pp | +| Second quintile | 0.3% | 0.3% | 0.0 pp | +| Middle quintile | 0.6% | 1.3% | -0.7 pp | +| Fourth quintile | 0.8% | 1.6% | -0.8 pp | +| 80-90% | 1.2% | 1.2% | 0.0 pp | +| 90-95% | 1.5% | 0.9% | 0.6 pp | +| 95-99% | 1.4% | 0.6% | 0.8 pp | +| 99-99.9% | 0.3% | 0.2% | 0.1 pp | +| Top 0.1% | 0.0% | 0.0% | 0.0 pp | + +*pp = percentage points* + +--- + +## Key Findings + +### Areas of Agreement +1. **Bottom quintiles**: Both models show minimal impact on the first quintile and similar impacts on the second quintile +2. **Upper-middle income (80-90%)**: Very similar average tax changes (~$3,600-$4,100) and identical percentage income changes (1.2%) +3. 
**General pattern**: Both models show the policy benefits middle-to-upper-middle income households most
+
+### Notable Differences
+
+1. **Middle & Fourth Quintiles**:
+   - PolicyEngine shows smaller tax cuts (-$880 and -$1,629) than Wharton (-$1,730 and -$3,560)
+   - This translates to smaller income changes in PolicyEngine (0.6% and 0.8%) vs Wharton (1.3% and 1.6%)
+
+2. **High Income (90-99th percentiles)**:
+   - PolicyEngine shows **larger** tax cuts for the 90-95% (-$6,297) and 95-99% (-$7,987) groups
+   - Wharton shows more uniform benefits across high-income groups (-$4,385 to -$4,820)
+
+3. **Top 0.1%**:
+   - **Major discrepancy**: PolicyEngine shows $0 benefit, Wharton shows -$5,080 tax cut
+   - This suggests different treatment or data for very high earners receiving Social Security benefits
+
+---
+
+## Methodology Notes
+
+### PolicyEngine US (2054)
+- **Dataset**: PolicyEngine US 2054 projection (`hf://policyengine/test/2054.h5`)
+- **Sample**: 20,895 households (weighted: 166,973,936)
+- **Scoring**: Static (no behavioral responses)
+- **Reform**: Complete elimination of federal income taxation on Social Security benefits
+- **Income grouping**: Based on household net income percentiles
+
+### Wharton Budget Model (2054)
+- Source: Wharton Budget Model - "Conventional Annual Distributional Effects of Eliminating Income Taxes on Social Security Benefits"
+- Methodology details not provided in benchmark table
+
+---
+
+## Technical Implementation
+
+### Files Generated
+1. **Aggregate revenue impact (2054)**: `/data/policy_impacts_2054_wharton_summary.csv`
+2. **Distributional analysis**: `/data/option1_distributional_2054.csv`
+3. **Analysis scripts**:
+   - `/analysis/policy-impacts-2100.ipynb` (modified for 2054 dataset)
+   - `/analysis/option1_distributional_2054.py`
+
+### Branch
+All analysis conducted on the `wharton-benchmark` branch.
+
+---
+
+## Conclusions
+
+1. 
**Overall pattern alignment**: Both models agree that eliminating SS benefit taxation primarily benefits middle-to-upper-middle income households
+
+2. **Magnitude differences**: PolicyEngine generally shows smaller benefits to middle quintiles but larger benefits to the 90-99th percentiles
+
+3. **Top 0.1% discrepancy**: Requires further investigation - could be due to:
+   - Different assumptions about Social Security benefit receipt by very high earners
+   - Different treatment of the benefit cap
+   - Dataset differences in top income representation
+
+4. **Revenue estimate**: PolicyEngine estimates -$239.6B revenue loss in 2054 (Wharton aggregate revenue not provided in benchmark)
+
+---
+
+*Analysis Date: October 30, 2025*
+*PolicyEngine US Version: Current*
+*Branch: wharton-benchmark*
diff --git a/analysis/wharton_comparison_pipeline.py b/analysis/wharton_comparison_pipeline.py
new file mode 100644
index 0000000..9a570ce
--- /dev/null
+++ b/analysis/wharton_comparison_pipeline.py
@@ -0,0 +1,214 @@
+"""
+Quick Pipeline: Generate Wharton Benchmark Comparison for any dataset
+
+Usage:
+    python wharton_comparison_pipeline.py <dataset_path> <year>
+
+Example:
+    python wharton_comparison_pipeline.py /Users/ziminghua/Downloads/2054.h5 2054
+"""
+
+import sys
+import os
+import pandas as pd
+import numpy as np
+
+# Setup path
+repo_root = os.path.abspath('..')
+src_path = os.path.join(repo_root, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+from policyengine_us import Microsimulation
+from reforms import REFORMS
+
+# Wharton benchmark data (from Excel file)
+WHARTON_BENCHMARKS = {
+    2026: {
+        'First quintile': {'tax_change': 0, 'pct_change': 0.0},
+        'Second quintile': {'tax_change': -15, 'pct_change': 0.0},
+        'Middle quintile': {'tax_change': -340, 'pct_change': 0.5},
+        'Fourth quintile': {'tax_change': -1135, 'pct_change': 1.1},
+        '80-90%': {'tax_change': -1625, 'pct_change': 1.0},
+        '90-95%': {'tax_change': -1590, 'pct_change': 0.7},
+        '95-99%': {'tax_change': 
-2020, 'pct_change': 0.5}, + '99-99.9%': {'tax_change': -2205, 'pct_change': 0.2}, + 'Top 0.1%': {'tax_change': -2450, 'pct_change': 0.0}, + }, + 2034: { + 'First quintile': {'tax_change': 0, 'pct_change': 0.0}, + 'Second quintile': {'tax_change': -45, 'pct_change': 0.1}, + 'Middle quintile': {'tax_change': -615, 'pct_change': 0.8}, + 'Fourth quintile': {'tax_change': -1630, 'pct_change': 1.2}, + '80-90%': {'tax_change': -2160, 'pct_change': 1.1}, + '90-95%': {'tax_change': -2160, 'pct_change': 0.7}, + '95-99%': {'tax_change': -2605, 'pct_change': 0.6}, + '99-99.9%': {'tax_change': -2715, 'pct_change': 0.2}, + 'Top 0.1%': {'tax_change': -2970, 'pct_change': 0.0}, + }, + 2054: { + 'First quintile': {'tax_change': -5, 'pct_change': 0.0}, + 'Second quintile': {'tax_change': -275, 'pct_change': 0.3}, + 'Middle quintile': {'tax_change': -1730, 'pct_change': 1.3}, + 'Fourth quintile': {'tax_change': -3560, 'pct_change': 1.6}, + '80-90%': {'tax_change': -4075, 'pct_change': 1.2}, + '90-95%': {'tax_change': -4385, 'pct_change': 0.9}, + '95-99%': {'tax_change': -4565, 'pct_change': 0.6}, + '99-99.9%': {'tax_change': -4820, 'pct_change': 0.2}, + 'Top 0.1%': {'tax_change': -5080, 'pct_change': 0.0}, + }, +} + +def run_analysis(dataset_path, year): + """Run Option 1 analysis for given dataset and year""" + + print(f"Loading dataset: {dataset_path}") + baseline = Microsimulation(dataset=dataset_path) + + option1_reform = REFORMS['option1']['func']() + reform = Microsimulation(dataset=dataset_path, reform=option1_reform) + + # Get household data + household_weight = baseline.calculate("household_weight", period=year) + household_net_income_baseline = baseline.calculate("household_net_income", period=year, map_to="household") + household_net_income_reform = reform.calculate("household_net_income", period=year, map_to="household") + income_tax_baseline = baseline.calculate("income_tax", period=year, map_to="household") + income_tax_reform = reform.calculate("income_tax", 
period=year, map_to="household") + + # Calculate changes + tax_change = income_tax_reform - income_tax_baseline + income_change_pct = ((household_net_income_reform - household_net_income_baseline) / household_net_income_baseline) * 100 + + # Create DataFrame + df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, + }) + + # Remove invalid values + df = df[np.isfinite(df['household_net_income'])] + df = df[df['household_net_income'] > 0] + df = df[np.isfinite(df['income_change_pct'])] + df = df[df['weight'] > 0] + + # Calculate percentiles + df['income_percentile'] = df['household_net_income'].rank(pct=True) * 100 + + # Assign income groups + def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + + df['income_group'] = df['income_percentile'].apply(assign_income_group) + + # Calculate aggregate revenue + revenue_impact = (income_tax_reform.sum() - income_tax_baseline.sum()) / 1e9 + + # Calculate by group + results = [] + for group in ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%']: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'group': group, + 'pe_tax_change': 
round(avg_tax_change), + 'pe_pct_change': round(avg_income_change_pct, 1), + }) + + return pd.DataFrame(results), revenue_impact + +def generate_comparison_table(pe_results, year): + """Generate comparison table with Wharton benchmark""" + + if year not in WHARTON_BENCHMARKS: + print(f"Warning: No Wharton benchmark available for year {year}") + return pe_results + + wharton_data = WHARTON_BENCHMARKS[year] + + comparison = [] + for _, row in pe_results.iterrows(): + group = row['group'] + wharton = wharton_data.get(group, {'tax_change': None, 'pct_change': None}) + + pe_tax = row['pe_tax_change'] + wh_tax = wharton['tax_change'] + + comparison.append({ + 'Income Group': group, + 'PolicyEngine': f"${pe_tax:,}", + 'Wharton': f"${wh_tax:,}" if wh_tax is not None else 'N/A', + 'Difference': f"${(pe_tax - wh_tax):,}" if wh_tax is not None else 'N/A', + 'PE %': f"{row['pe_pct_change']}%", + 'Wharton %': f"{wharton['pct_change']}%" if wharton['pct_change'] is not None else 'N/A', + }) + + return pd.DataFrame(comparison) + +if __name__ == "__main__": + if len(sys.argv) != 3: + print(__doc__) + sys.exit(1) + + dataset_path = sys.argv[1] + year = int(sys.argv[2]) + + print("="*80) + print(f"WHARTON COMPARISON PIPELINE - YEAR {year}") + print("="*80) + print() + + # Run analysis + print("Running PolicyEngine analysis...") + pe_results, revenue_impact = run_analysis(dataset_path, year) + print(f"✓ Analysis complete") + print(f" Revenue impact: ${revenue_impact:.1f}B") + print() + + # Generate comparison table + print("Generating comparison table...") + comparison_table = generate_comparison_table(pe_results, year) + + print() + print("="*80) + print(f"COMPARISON TABLE: {year}") + print("="*80) + print() + print("Average Tax Change (per household):") + print(comparison_table[['Income Group', 'PolicyEngine', 'Wharton', 'Difference']].to_string(index=False)) + print() + print("Percent Change in Income:") + print(comparison_table[['Income Group', 'PE %', 'Wharton 
%']].to_string(index=False)) + print() + + # Save to file + output_file = f"../data/wharton_comparison_{year}.csv" + comparison_table.to_csv(output_file, index=False) + print(f"✓ Saved to: {output_file}") + print() + print("="*80) diff --git a/data/all_reforms_hf_test_2026.csv b/data/all_reforms_hf_test_2026.csv new file mode 100644 index 0000000..b96c046 --- /dev/null +++ b/data/all_reforms_hf_test_2026.csv @@ -0,0 +1,9 @@ +Reform ID,Reform Name,Revenue Impact ($B) +option1,Full Repeal of Social Security Benefits Taxation,-90.4 +option2,Taxation of 85% of Social Security Benefits,25.7 +option3,85% Taxation with Permanent Senior Deduction Extension,25.7 +option4,Social Security Tax Credit System ($500),32.8 +option5,Roth-Style Swap,54.0 +option6,Phased Roth-Style Swap,18.7 +option7,Eliminate Bonus Senior Deduction,23.1 +option8,Full Taxation of Social Security Benefits,54.1 diff --git a/data/option1_aggregate_2026.csv b/data/option1_aggregate_2026.csv new file mode 100644 index 0000000..32b1051 --- /dev/null +++ b/data/option1_aggregate_2026.csv @@ -0,0 +1,2 @@ +reform_id,reform_name,year,revenue_impact,revenue_impact_billions,scoring_type,dataset +option1,Full Repeal of Social Security Benefits Taxation,2026,-85386229066.45117,-85.38622906645118,static,enhanced_cps_2026 diff --git a/data/option1_aggregate_2034.csv b/data/option1_aggregate_2034.csv new file mode 100644 index 0000000..6b8c7fb --- /dev/null +++ b/data/option1_aggregate_2034.csv @@ -0,0 +1,2 @@ +reform_id,reform_name,year,revenue_impact,revenue_impact_billions,scoring_type,dataset +option1,Full Repeal of Social Security Benefits Taxation,2034,-131706383571.92188,-131.70638357192186,static,enhanced_cps_2024_reweighted_to_2034 diff --git a/data/option1_aggregate_2054_enhanced.csv b/data/option1_aggregate_2054_enhanced.csv new file mode 100644 index 0000000..8befc68 --- /dev/null +++ b/data/option1_aggregate_2054_enhanced.csv @@ -0,0 +1,2 @@ 
+reform_id,reform_name,year,revenue_impact,revenue_impact_billions,scoring_type,dataset +option1,Full Repeal of Social Security Benefits Taxation,2054,-176340917437.51514,-176.34091743751515,static,enhanced_cps_2024_reweighted_to_2054 diff --git a/data/option1_aggregate_2054_local.csv b/data/option1_aggregate_2054_local.csv new file mode 100644 index 0000000..bb21f3b --- /dev/null +++ b/data/option1_aggregate_2054_local.csv @@ -0,0 +1,2 @@ +reform_id,reform_name,year,revenue_impact,revenue_impact_billions,scoring_type,dataset +option1,Full Repeal of Social Security Benefits Taxation,2054,-588065432383.9199,-588.0654323839199,static,local_2054.h5 diff --git a/data/option1_distributional_2026.csv b/data/option1_distributional_2026.csv new file mode 100644 index 0000000..8c19823 --- /dev/null +++ b/data/option1_distributional_2026.csv @@ -0,0 +1,10 @@ +Income group,Average tax change,"Percent change in income, after taxes and transfers" +First quintile,-24,0.1% +Second quintile,-65,0.1% +Middle quintile,-417,0.4% +Fourth quintile,-763,0.5% +80-90%,-2148,1.1% +90-95%,-2907,1.0% +95-99%,-1972,0.5% +99-99.9%,-1608,0.1% +Top 0.1%,0,0.0% diff --git a/data/option1_distributional_2034.csv b/data/option1_distributional_2034.csv new file mode 100644 index 0000000..095da11 --- /dev/null +++ b/data/option1_distributional_2034.csv @@ -0,0 +1,10 @@ +Income group,Average tax change,"Percent change in income, after taxes and transfers",Sample size,Weighted count +First quintile,-5,0.0%,4178,35505336 +Second quintile,-242,0.3%,4178,33075940 +Middle quintile,-757,0.5%,4179,27372536 +Fourth quintile,-1558,0.7%,4178,30579978 +80-90%,-3518,1.2%,2089,11800340 +90-95%,-5094,1.2%,1045,6391109 +95-99%,-5183,0.9%,836,4102413 +99-99.9%,-3231,0.2%,188,1198984 +Top 0.1%,0,0.0%,21,31152 diff --git a/data/option1_distributional_2054.csv b/data/option1_distributional_2054.csv new file mode 100644 index 0000000..fbeb2b0 --- /dev/null +++ b/data/option1_distributional_2054.csv @@ -0,0 +1,10 @@ 
+Income group,Average tax change,"Percent change in income, after taxes and transfers" +First quintile,-6,0.0% +Second quintile,-236,0.3% +Middle quintile,-880,0.6% +Fourth quintile,-1629,0.8% +80-90%,-3594,1.2% +90-95%,-6297,1.5% +95-99%,-7987,1.4% +99-99.9%,-4984,0.3% +Top 0.1%,0,0.0% diff --git a/data/option1_distributional_2054_local.csv b/data/option1_distributional_2054_local.csv new file mode 100644 index 0000000..63a15dc --- /dev/null +++ b/data/option1_distributional_2054_local.csv @@ -0,0 +1,10 @@ +Income group,Average tax change,"Percent change in income, after taxes and transfers",Sample size,Weighted count +First quintile,-312,3.7%,4080,36732832 +Second quintile,-1119,0.8%,4082,37962540 +Middle quintile,-2982,1.3%,4081,33545236 +Fourth quintile,-4342,1.2%,4081,34003848 +80-90%,-9064,1.7%,2040,14292485 +90-95%,-13974,1.9%,1020,8448726 +95-99%,-6113,0.5%,816,4634287 +99-99.9%,-6406,0.2%,184,1485465 +Top 0.1%,-280,0.0%,21,28933 diff --git a/data/policy_impacts_2054_wharton.csv b/data/policy_impacts_2054_wharton.csv new file mode 100644 index 0000000..673baa9 --- /dev/null +++ b/data/policy_impacts_2054_wharton.csv @@ -0,0 +1,2 @@ +reform_id,reform_name,year,revenue_impact,revenue_impact_billions,scoring_type,dataset +option1,Full Repeal of Social Security Benefits Taxation,2054,-239612969375.03955,-239.61296937503954,static,2054.h5 diff --git a/data/policy_impacts_2054_wharton_summary.csv b/data/policy_impacts_2054_wharton_summary.csv new file mode 100644 index 0000000..d307617 --- /dev/null +++ b/data/policy_impacts_2054_wharton_summary.csv @@ -0,0 +1,2 @@ +reform_id,reform_name,revenue_impact_billions +option1,Full Repeal of Social Security Benefits Taxation,-239.61296937503954 diff --git a/data/wharton_comparison_enhanced_cps_2024.xlsx b/data/wharton_comparison_enhanced_cps_2024.xlsx new file mode 100644 index 0000000..208fdc8 Binary files /dev/null and b/data/wharton_comparison_enhanced_cps_2024.xlsx differ