PolicyEngine · hua7450 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/analysis/all_reforms_hf_2026.py b/analysis/all_reforms_hf_2026.py
@@ -0,0 +1,74 @@
+import sys
+import os
+repo_root = os.path.abspath('..')
+src_path = os.path.join(repo_root, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+from policyengine_us import Microsimulation
+from reforms import REFORMS
+from tqdm import tqdm
+
+print('='*80)
+print('ALL REFORMS - HF TEST 2026 DATASET')
+print('='*80)
+print()
+
+# Load baseline once
+print('Loading baseline...')
+baseline = Microsimulation(dataset='hf://policyengine/test/2026.h5')
+baseline_tax = baseline.calculate('income_tax', period=2026, map_to='household')
+print(f'✓ Baseline: ${baseline_tax.sum() / 1e9:.1f}B')
+print()
+
+results = []
+
+for reform_id, reform_config in tqdm(REFORMS.items(), desc='Processing reforms'):
+    reform_name = reform_config['name']
+    reform_func = reform_config['func']
+
+    print(f'\nProcessing {reform_id}: {reform_name[:50]}...')
+
+    try:
+        reform = reform_func()
+        reform_sim = Microsimulation(dataset='hf://policyengine/test/2026.h5', reform=reform)
+        reform_tax = reform_sim.calculate('income_tax', period=2026, map_to='household')
+
+        impact = (reform_tax.sum() - baseline_tax.sum()) / 1e9
+
+        results.append({
+            'Reform ID': reform_id,
+            'Reform Name': reform_name,
+            'Revenue Impact ($B)': impact  # Keep full precision
+        })
+
+        print(f'  ✓ Impact: ${impact:.1f}B')
+
+    except Exception as e:
+        print(f'  ✗ Error: {e}')
+        results.append({
+            'Reform ID': reform_id,
+            'Reform Name': reform_name,
+            'Revenue Impact ($B)': 'ERROR'
+        })
+
+print()
+print('='*80)
+print('SUMMARY OF ALL REFORMS (2026)')
+print('='*80)
+print()
+
+import pandas as pd
+df = pd.DataFrame(results)
+print(df.to_string(index=False))
+print()
+
+# Save to CSV
+output_file = '../data/all_reforms_hf_test_2026.csv'
+df.to_csv(output_file, index=False)
+print(f'✓ Results saved to: {output_file}')
+print()
+
+print('='*80)
+print('Dataset: hf://policyengine/test/2026.h5')
+print('='*80)
diff --git a/analysis/check_age_distribution_2054.py b/analysis/check_age_distribution_2054.py
@@ -0,0 +1,88 @@
+"""
+Check age distribution in the 2054 dataset
+"""
+
+import sys
+import os
+
+# Setup path
+repo_root = os.path.abspath('..')
+src_path = os.path.join(repo_root, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+import pandas as pd
+import numpy as np
+from policyengine_us import Microsimulation
+
+print("="*80)
+print("AGE DISTRIBUTION ANALYSIS - 2054 Dataset")
+print("="*80)
+print()
+
+# Load the 2054 dataset
+print("Loading 2054 dataset...")
+sim = Microsimulation(dataset="hf://policyengine/test/2054.h5")
+print("✓ Dataset loaded")
+print()
+
+# Get age and person weight
+age = sim.calculate("age", period=2054)
+
+# Get person weight - need to check if this variable exists
+try:
+    person_weight = sim.calculate("person_weight", period=2054)
+except:
+    # If person_weight doesn't exist, use household_weight mapped to persons
+    print("Note: Using household weight mapped to persons")
+    person_weight = sim.calculate("household_weight", period=2054, map_to="person")
+
+# Filter valid ages and weights
+valid = (age > 0) & (person_weight > 0) & np.isfinite(age) & np.isfinite(person_weight)
+age = age[valid]
+person_weight = person_weight[valid]
+
+print(f"Total people in sample: {len(age):,}")
+print(f"Total weighted population: {person_weight.sum():,.0f}")
+print()
+
+# Calculate age statistics
+print("Age Distribution:")
+print("-" * 60)
+
+# Age groups
+age_groups = [
+    ("Under 18", 0, 17),
+    ("18-24", 18, 24),
+    ("25-34", 25, 34),
+    ("35-44", 35, 44),
+    ("45-54", 45, 54),
+    ("55-64", 55, 64),
+    ("65-74", 65, 74),
+    ("75-84", 75, 84),
+    ("85+", 85, 150)
+]
+
+for group_name, min_age, max_age in age_groups:
+    mask = (age >= min_age) & (age <= max_age)
+    count = mask.sum()
+    weighted = person_weight[mask].sum()
+    pct = (weighted / person_weight.sum()) * 100
+    print(f"{group_name:12s}: {count:>8,} people ({weighted:>15,.0f} weighted, {pct:>5.1f}%)")
+
+print()
+print("="*60)
+
+# People over 65
+over_65 = age >= 65
+count_over_65 = over_65.sum()
+weighted_over_65 = person_weight[over_65].sum()
+pct_over_65 = (weighted_over_65 / person_weight.sum()) * 100
+
+print(f"People aged 65+:")
+print(f"  Sample count: {count_over_65:,}")
+print(f"  Weighted count: {weighted_over_65:,.0f}")
+print(f"  Percentage of population: {pct_over_65:.1f}%")
+
+print()
+print("✓ Analysis complete!")
diff --git a/analysis/check_available_datasets.py b/analysis/check_available_datasets.py
@@ -0,0 +1,84 @@
+"""
+Check available PolicyEngine US datasets
+"""
+
+import sys
+import os
+
+# Setup path
+repo_root = os.path.abspath('..')
+src_path = os.path.join(repo_root, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+from policyengine_us import Microsimulation
+
+print("="*80)
+print("AVAILABLE POLICYENGINE US DATASETS")
+print("="*80)
+print()
+
+# Try to get dataset list
+try:
+    datasets = list(Microsimulation.datasets.keys())
+    print(f"Total datasets available: {len(datasets)}")
+    print()
+
+    # Group by type
+    enhanced_cps = [d for d in datasets if 'enhanced_cps' in d]
+    cps = [d for d in datasets if d.startswith('cps_') and 'enhanced' not in d]
+    test = [d for d in datasets if 'test' in d or 'hf://' in d]
+    other = [d for d in datasets if d not in enhanced_cps + cps + test]
+
+    print("Enhanced CPS datasets (recommended):")
+    for d in sorted(enhanced_cps):
+        print(f"  - {d}")
+
+    print()
+    print("Raw CPS datasets:")
+    for d in sorted(cps):
+        print(f"  - {d}")
+
+    if test:
+        print()
+        print("Test/Projection datasets:")
+        for d in sorted(test):
+            print(f"  - {d}")
+
+    if other:
+        print()
+        print("Other datasets:")
+        for d in sorted(other):
+            print(f"  - {d}")
+
+except Exception as e:
+    print(f"Could not retrieve dataset list: {e}")
+    print()
+    print("Common datasets you can try:")
+    print("  - enhanced_cps_2026")
+    print("  - enhanced_cps_2027")
+    print("  - enhanced_cps_2028")
+    print("  - enhanced_cps_2029")
+    print("  - enhanced_cps_2030")
+    print("  - enhanced_cps_2031")
+    print("  - enhanced_cps_2032")
+    print("  - enhanced_cps_2033")
+    print("  - enhanced_cps_2034")
+
+print()
+print("="*80)
+print()
+
+# Test loading enhanced_cps_2034
+print("Testing enhanced_cps_2034...")
+try:
+    sim = Microsimulation(dataset="enhanced_cps_2034")
+    hh_weight = sim.calculate("household_weight", period=2034)
+    print(f"✓ enhanced_cps_2034 loaded successfully!")
+    print(f"  Households: {len(hh_weight):,}")
+    print(f"  Weighted: {hh_weight.sum():,.0f}")
+except Exception as e:
+    print(f"✗ Could not load enhanced_cps_2034: {e}")
+
+print()
+print("✓ Check complete!")
diff --git a/analysis/check_top01_seniors_2054.py b/analysis/check_top01_seniors_2054.py
@@ -0,0 +1,138 @@
+"""
+Check how many seniors (65+) are in the top 0.1% income group in 2054 dataset
+"""
+
+import sys
+import os
+
+# Setup path
+repo_root = os.path.abspath('..')
+src_path = os.path.join(repo_root, 'src')
+if src_path not in sys.path:
+    sys.path.insert(0, src_path)
+
+import pandas as pd
+import numpy as np
+from policyengine_us import Microsimulation
+
+print("="*80)
+print("TOP 0.1% SENIORS ANALYSIS - 2054 Dataset")
+print("="*80)
+print()
+
+# Load the 2054 dataset
+print("Loading 2054 dataset...")
+sim = Microsimulation(dataset="hf://policyengine/test/2054.h5")
+print("✓ Dataset loaded")
+print()
+
+# Get household-level data
+household_weight = sim.calculate("household_weight", period=2054)
+household_net_income = sim.calculate("household_net_income", period=2054, map_to="household")
+
+# Get person-level data
+age = sim.calculate("age", period=2054)
+person_id = sim.calculate("person_id", period=2054)
+household_id = sim.calculate("household_id", period=2054)
+
+# Get Social Security data
+ss_benefits = sim.calculate("social_security", period=2054, map_to="household")
+taxable_ss_benefits = sim.calculate("taxable_social_security", period=2054, map_to="household")
+
+print("Dataset Statistics:")
+print(f"  Total households: {len(household_weight):,}")
+print(f"  Total people: {len(age):,}")
+print()
+
+# Create household DataFrame
+df_hh = pd.DataFrame({
+    'household_id': range(len(household_weight)),
+    'household_net_income': household_net_income,
+    'weight': household_weight,
+    'ss_benefits': ss_benefits,
+    'taxable_ss_benefits': taxable_ss_benefits
+})
+
+# Remove invalid households
+df_hh = df_hh[np.isfinite(df_hh['household_net_income'])]
+df_hh = df_hh[df_hh['household_net_income'] > 0]
+df_hh = df_hh[df_hh['weight'] > 0]
+
+# Calculate income percentile
+df_hh['income_percentile'] = df_hh['household_net_income'].rank(pct=True) * 100
+
+# Identify top 0.1%
+df_hh['is_top_01'] = df_hh['income_percentile'] > 99.9
+
+# Create person DataFrame
+df_person = pd.DataFrame({
+    'person_id': person_id,
+    'household_id': household_id,
+    'age': age
+})
+
+# Filter valid ages
+df_person = df_person[np.isfinite(df_person['age'])]
+df_person = df_person[df_person['age'] > 0]
+
+# Identify seniors
+df_person['is_senior'] = df_person['age'] >= 65
+
+# Count seniors per household
+seniors_per_hh = df_person[df_person['is_senior']].groupby('household_id').size()
+df_hh['num_seniors'] = df_hh['household_id'].map(seniors_per_hh).fillna(0)
+df_hh['has_seniors'] = df_hh['num_seniors'] > 0
+
+print("="*80)
+print("TOP 0.1% INCOME GROUP ANALYSIS")
+print("="*80)
+print()
+
+# Overall top 0.1%
+top_01 = df_hh[df_hh['is_top_01']]
+print(f"Households in top 0.1%:")
+print(f"  Sample count: {len(top_01):,}")
+print(f"  Weighted count: {top_01['weight'].sum():,.0f}")
+print(f"  Income threshold: ${top_01['household_net_income'].min():,.0f}")
+print(f"  Average income: ${top_01['household_net_income'].mean():,.0f}")
+print()
+
+# Top 0.1% with seniors
+top_01_with_seniors = top_01[top_01['has_seniors']]
+print(f"Top 0.1% households WITH seniors (65+):")
+print(f"  Sample count: {len(top_01_with_seniors):,}")
+print(f"  Weighted count: {top_01_with_seniors['weight'].sum():,.0f}")
+print(f"  Percentage of top 0.1%: {len(top_01_with_seniors) / len(top_01) * 100:.1f}%")
+print(f"  Average # of seniors: {top_01_with_seniors['num_seniors'].mean():.1f}")
+print()
+
+# Top 0.1% receiving SS benefits
+top_01_with_ss = top_01[top_01['ss_benefits'] > 0]
+print(f"Top 0.1% households receiving Social Security:")
+print(f"  Sample count: {len(top_01_with_ss):,}")
+print(f"  Weighted count: {top_01_with_ss['weight'].sum():,.0f}")
+print(f"  Percentage of top 0.1%: {len(top_01_with_ss) / len(top_01) * 100:.1f}%")
+if len(top_01_with_ss) > 0:
+    print(f"  Average SS benefit: ${top_01_with_ss['ss_benefits'].mean():,.0f}")
+print()
+
+# Top 0.1% with taxable SS benefits
+top_01_with_taxable_ss = top_01[top_01['taxable_ss_benefits'] > 0]
+print(f"Top 0.1% households with TAXABLE Social Security:")
+print(f"  Sample count: {len(top_01_with_taxable_ss):,}")
+print(f"  Weighted count: {top_01_with_taxable_ss['weight'].sum():,.0f}")
+print(f"  Percentage of top 0.1%: {len(top_01_with_taxable_ss) / len(top_01) * 100:.1f}%")
+if len(top_01_with_taxable_ss) > 0:
+    print(f"  Average taxable SS: ${top_01_with_taxable_ss['taxable_ss_benefits'].mean():,.0f}")
+print()
+
+# Summary comparison
+print("="*80)
+print("SUMMARY")
+print("="*80)
+print(f"Top 0.1% households: {len(top_01):,}")
+print(f"  - With seniors (65+): {len(top_01_with_seniors):,} ({len(top_01_with_seniors) / len(top_01) * 100:.1f}%)")
+print(f"  - Receiving SS: {len(top_01_with_ss):,} ({len(top_01_with_ss) / len(top_01) * 100:.1f}%)")
+print(f"  - With taxable SS: {len(top_01_with_taxable_ss):,} ({len(top_01_with_taxable_ss) / len(top_01) * 100:.1f}%)")
+print()
+print("✓ Analysis complete!")