Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions analysis/all_reforms_hf_2026.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import sys
import os
repo_root = os.path.abspath('..')
src_path = os.path.join(repo_root, 'src')
if src_path not in sys.path:
sys.path.insert(0, src_path)

from policyengine_us import Microsimulation
from reforms import REFORMS
from tqdm import tqdm

print('='*80)
print('ALL REFORMS - HF TEST 2026 DATASET')
print('='*80)
print()

# Load baseline once
print('Loading baseline...')
baseline = Microsimulation(dataset='hf://policyengine/test/2026.h5')
baseline_tax = baseline.calculate('income_tax', period=2026, map_to='household')
print(f'βœ“ Baseline: ${baseline_tax.sum() / 1e9:.1f}B')
print()

results = []

for reform_id, reform_config in tqdm(REFORMS.items(), desc='Processing reforms'):
reform_name = reform_config['name']
reform_func = reform_config['func']

print(f'\nProcessing {reform_id}: {reform_name[:50]}...')

try:
reform = reform_func()
reform_sim = Microsimulation(dataset='hf://policyengine/test/2026.h5', reform=reform)
reform_tax = reform_sim.calculate('income_tax', period=2026, map_to='household')

impact = (reform_tax.sum() - baseline_tax.sum()) / 1e9

results.append({
'Reform ID': reform_id,
'Reform Name': reform_name,
'Revenue Impact ($B)': impact # Keep full precision
})

print(f' βœ“ Impact: ${impact:.1f}B')

except Exception as e:
print(f' βœ— Error: {e}')
results.append({
'Reform ID': reform_id,
'Reform Name': reform_name,
'Revenue Impact ($B)': 'ERROR'
})

print()
print('='*80)
print('SUMMARY OF ALL REFORMS (2026)')
print('='*80)
print()

import pandas as pd
df = pd.DataFrame(results)
print(df.to_string(index=False))
print()

# Save to CSV
output_file = '../data/all_reforms_hf_test_2026.csv'
df.to_csv(output_file, index=False)
print(f'βœ“ Results saved to: {output_file}')
print()

print('='*80)
print('Dataset: hf://policyengine/test/2026.h5')
print('='*80)
88 changes: 88 additions & 0 deletions analysis/check_age_distribution_2054.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
Check age distribution in the 2054 dataset
"""

import sys
import os

# Setup path
repo_root = os.path.abspath('..')
src_path = os.path.join(repo_root, 'src')
if src_path not in sys.path:
sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
from policyengine_us import Microsimulation

print("="*80)
print("AGE DISTRIBUTION ANALYSIS - 2054 Dataset")
print("="*80)
print()

# Load the 2054 dataset
print("Loading 2054 dataset...")
sim = Microsimulation(dataset="hf://policyengine/test/2054.h5")
print("βœ“ Dataset loaded")
print()

# Get age and person weight
age = sim.calculate("age", period=2054)

# Get person weight - need to check if this variable exists
try:
person_weight = sim.calculate("person_weight", period=2054)
except:
# If person_weight doesn't exist, use household_weight mapped to persons
print("Note: Using household weight mapped to persons")
person_weight = sim.calculate("household_weight", period=2054, map_to="person")

# Filter valid ages and weights
valid = (age > 0) & (person_weight > 0) & np.isfinite(age) & np.isfinite(person_weight)
age = age[valid]
person_weight = person_weight[valid]

print(f"Total people in sample: {len(age):,}")
print(f"Total weighted population: {person_weight.sum():,.0f}")
print()

# Calculate age statistics
print("Age Distribution:")
print("-" * 60)

# Age groups
age_groups = [
("Under 18", 0, 17),
("18-24", 18, 24),
("25-34", 25, 34),
("35-44", 35, 44),
("45-54", 45, 54),
("55-64", 55, 64),
("65-74", 65, 74),
("75-84", 75, 84),
("85+", 85, 150)
]

for group_name, min_age, max_age in age_groups:
mask = (age >= min_age) & (age <= max_age)
count = mask.sum()
weighted = person_weight[mask].sum()
pct = (weighted / person_weight.sum()) * 100
print(f"{group_name:12s}: {count:>8,} people ({weighted:>15,.0f} weighted, {pct:>5.1f}%)")

print()
print("="*60)

# People over 65
over_65 = age >= 65
count_over_65 = over_65.sum()
weighted_over_65 = person_weight[over_65].sum()
pct_over_65 = (weighted_over_65 / person_weight.sum()) * 100

print(f"People aged 65+:")
print(f" Sample count: {count_over_65:,}")
print(f" Weighted count: {weighted_over_65:,.0f}")
print(f" Percentage of population: {pct_over_65:.1f}%")

print()
print("βœ“ Analysis complete!")
84 changes: 84 additions & 0 deletions analysis/check_available_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Check available PolicyEngine US datasets
"""

import sys
import os

# Setup path
repo_root = os.path.abspath('..')
src_path = os.path.join(repo_root, 'src')
if src_path not in sys.path:
sys.path.insert(0, src_path)

from policyengine_us import Microsimulation

print("="*80)
print("AVAILABLE POLICYENGINE US DATASETS")
print("="*80)
print()

# Try to get dataset list
try:
datasets = list(Microsimulation.datasets.keys())
print(f"Total datasets available: {len(datasets)}")
print()

# Group by type
enhanced_cps = [d for d in datasets if 'enhanced_cps' in d]
cps = [d for d in datasets if d.startswith('cps_') and 'enhanced' not in d]
test = [d for d in datasets if 'test' in d or 'hf://' in d]
other = [d for d in datasets if d not in enhanced_cps + cps + test]

print("Enhanced CPS datasets (recommended):")
for d in sorted(enhanced_cps):
print(f" - {d}")

print()
print("Raw CPS datasets:")
for d in sorted(cps):
print(f" - {d}")

if test:
print()
print("Test/Projection datasets:")
for d in sorted(test):
print(f" - {d}")

if other:
print()
print("Other datasets:")
for d in sorted(other):
print(f" - {d}")

except Exception as e:
print(f"Could not retrieve dataset list: {e}")
print()
print("Common datasets you can try:")
print(" - enhanced_cps_2026")
print(" - enhanced_cps_2027")
print(" - enhanced_cps_2028")
print(" - enhanced_cps_2029")
print(" - enhanced_cps_2030")
print(" - enhanced_cps_2031")
print(" - enhanced_cps_2032")
print(" - enhanced_cps_2033")
print(" - enhanced_cps_2034")

print()
print("="*80)
print()

# Test loading enhanced_cps_2034
print("Testing enhanced_cps_2034...")
try:
sim = Microsimulation(dataset="enhanced_cps_2034")
hh_weight = sim.calculate("household_weight", period=2034)
print(f"βœ“ enhanced_cps_2034 loaded successfully!")
print(f" Households: {len(hh_weight):,}")
print(f" Weighted: {hh_weight.sum():,.0f}")
except Exception as e:
print(f"βœ— Could not load enhanced_cps_2034: {e}")

print()
print("βœ“ Check complete!")
138 changes: 138 additions & 0 deletions analysis/check_top01_seniors_2054.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
Check how many seniors (65+) are in the top 0.1% income group in 2054 dataset
"""

import sys
import os

# Setup path
repo_root = os.path.abspath('..')
src_path = os.path.join(repo_root, 'src')
if src_path not in sys.path:
sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
from policyengine_us import Microsimulation

print("="*80)
print("TOP 0.1% SENIORS ANALYSIS - 2054 Dataset")
print("="*80)
print()

# Load the 2054 dataset
print("Loading 2054 dataset...")
sim = Microsimulation(dataset="hf://policyengine/test/2054.h5")
print("βœ“ Dataset loaded")
print()

# Get household-level data
household_weight = sim.calculate("household_weight", period=2054)
household_net_income = sim.calculate("household_net_income", period=2054, map_to="household")

# Get person-level data
age = sim.calculate("age", period=2054)
person_id = sim.calculate("person_id", period=2054)
household_id = sim.calculate("household_id", period=2054)

# Get Social Security data
ss_benefits = sim.calculate("social_security", period=2054, map_to="household")
taxable_ss_benefits = sim.calculate("taxable_social_security", period=2054, map_to="household")

print("Dataset Statistics:")
print(f" Total households: {len(household_weight):,}")
print(f" Total people: {len(age):,}")
print()

# Create household DataFrame
df_hh = pd.DataFrame({
'household_id': range(len(household_weight)),
'household_net_income': household_net_income,
'weight': household_weight,
'ss_benefits': ss_benefits,
'taxable_ss_benefits': taxable_ss_benefits
})

# Remove invalid households
df_hh = df_hh[np.isfinite(df_hh['household_net_income'])]
df_hh = df_hh[df_hh['household_net_income'] > 0]
df_hh = df_hh[df_hh['weight'] > 0]

# Calculate income percentile
df_hh['income_percentile'] = df_hh['household_net_income'].rank(pct=True) * 100

# Identify top 0.1%
df_hh['is_top_01'] = df_hh['income_percentile'] > 99.9

# Create person DataFrame
df_person = pd.DataFrame({
'person_id': person_id,
'household_id': household_id,
'age': age
})

# Filter valid ages
df_person = df_person[np.isfinite(df_person['age'])]
df_person = df_person[df_person['age'] > 0]

# Identify seniors
df_person['is_senior'] = df_person['age'] >= 65

# Count seniors per household
seniors_per_hh = df_person[df_person['is_senior']].groupby('household_id').size()
df_hh['num_seniors'] = df_hh['household_id'].map(seniors_per_hh).fillna(0)
df_hh['has_seniors'] = df_hh['num_seniors'] > 0

print("="*80)
print("TOP 0.1% INCOME GROUP ANALYSIS")
print("="*80)
print()

# Overall top 0.1%
top_01 = df_hh[df_hh['is_top_01']]
print(f"Households in top 0.1%:")
print(f" Sample count: {len(top_01):,}")
print(f" Weighted count: {top_01['weight'].sum():,.0f}")
print(f" Income threshold: ${top_01['household_net_income'].min():,.0f}")
print(f" Average income: ${top_01['household_net_income'].mean():,.0f}")
print()

# Top 0.1% with seniors
top_01_with_seniors = top_01[top_01['has_seniors']]
print(f"Top 0.1% households WITH seniors (65+):")
print(f" Sample count: {len(top_01_with_seniors):,}")
print(f" Weighted count: {top_01_with_seniors['weight'].sum():,.0f}")
print(f" Percentage of top 0.1%: {len(top_01_with_seniors) / len(top_01) * 100:.1f}%")
print(f" Average # of seniors: {top_01_with_seniors['num_seniors'].mean():.1f}")
print()

# Top 0.1% receiving SS benefits
top_01_with_ss = top_01[top_01['ss_benefits'] > 0]
print(f"Top 0.1% households receiving Social Security:")
print(f" Sample count: {len(top_01_with_ss):,}")
print(f" Weighted count: {top_01_with_ss['weight'].sum():,.0f}")
print(f" Percentage of top 0.1%: {len(top_01_with_ss) / len(top_01) * 100:.1f}%")
if len(top_01_with_ss) > 0:
print(f" Average SS benefit: ${top_01_with_ss['ss_benefits'].mean():,.0f}")
print()

# Top 0.1% with taxable SS benefits
top_01_with_taxable_ss = top_01[top_01['taxable_ss_benefits'] > 0]
print(f"Top 0.1% households with TAXABLE Social Security:")
print(f" Sample count: {len(top_01_with_taxable_ss):,}")
print(f" Weighted count: {top_01_with_taxable_ss['weight'].sum():,.0f}")
print(f" Percentage of top 0.1%: {len(top_01_with_taxable_ss) / len(top_01) * 100:.1f}%")
if len(top_01_with_taxable_ss) > 0:
print(f" Average taxable SS: ${top_01_with_taxable_ss['taxable_ss_benefits'].mean():,.0f}")
print()

# Summary comparison
print("="*80)
print("SUMMARY")
print("="*80)
print(f"Top 0.1% households: {len(top_01):,}")
print(f" - With seniors (65+): {len(top_01_with_seniors):,} ({len(top_01_with_seniors) / len(top_01) * 100:.1f}%)")
print(f" - Receiving SS: {len(top_01_with_ss):,} ({len(top_01_with_ss) / len(top_01) * 100:.1f}%)")
print(f" - With taxable SS: {len(top_01_with_taxable_ss):,} ({len(top_01_with_taxable_ss) / len(top_01) * 100:.1f}%)")
print()
print("βœ“ Analysis complete!")
Loading