diff --git a/changelog.d/286.added.md b/changelog.d/286.added.md new file mode 100644 index 00000000..f3cb4a99 --- /dev/null +++ b/changelog.d/286.added.md @@ -0,0 +1 @@ +Improve electricity and gas imputations to match NEED 2023 admin data across income, tenure, accommodation type, and region using iterative raking calibration. diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index 639e95d4..bd77973e 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -1,33 +1,29 @@ """ Consumption imputation using Living Costs and Food Survey data. -This module imputes household consumption patterns (including fuel spending) -using QRF models trained on LCFS data, with vehicle ownership information -from the Wealth and Assets Survey to improve fuel spending predictions. - -Key innovation: We impute `has_fuel_consumption` to WAS based on vehicle -ownership, then use this to bridge WAS and LCFS for fuel spending imputation. -This addresses the issue that LCFS 2-week diaries undercount fuel purchases -(58% have any fuel) vs actual vehicle ownership (78% per NTS 2024). +This module imputes household consumption patterns (including energy spending) +using QRF models trained on LCFS data, with vehicle ownership from the Wealth +and Assets Survey for fuel spending and housing characteristics for energy. + +Key features: +- Gas and electricity are imputed separately using LCFS interview variables + (B226=electricity DD, B489=electricity PPM, B490=gas PPM) + rather than just the aggregate P537 domestic-energy total. +- Housing predictors (tenure_type, accommodation_type) added alongside income + and demographics, matching the strong drivers in NEED admin data. +- Imputed totals are calibrated to NEED 2023 mean kWh targets by income band, + converted to spend using Ofgem Q4 2023 unit rates (Oct 2023 price cap). """ import pandas as pd -from pathlib import Path import numpy as np -import yaml from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation -from policyengine_uk_data.utils.stack import stack_datasets LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22" # EV/ICE vehicle mix from NTS 2024 -# Source: https://www.gov.uk/government/statistics/national-travel-survey-2024 -# "Around 59% of cars people owned were petrol, 30% were diesel, 6% hybrid, -# 4% battery electric and 2% plug-in hybrid." -# ICE share = 59% + 30% = 89%, plus hybrids still use some fuel -# We use 90% as the probability a vehicle owner buys petrol/diesel NTS_2024_ICE_VEHICLE_SHARE = 0.90 REGIONS = { @@ -45,11 +41,40 @@ 12: "NORTHERN_IRELAND", } +# LCFS A122 → FRS tenure_type mapping +# LCFS: 1=council, 2=HA, 3=private rent, 4=rent-free, 5=owned w mortgage, +# 6=shared ownership, 7=owned outright, 8=other +LCFS_TENURE_MAP = { + 1: "RENT_FROM_COUNCIL", + 2: "RENT_FROM_HA", + 3: "RENT_PRIVATELY", + 4: "RENT_PRIVATELY", # rent-free → private for simplicity + 5: "OWNED_WITH_MORTGAGE", + 6: "OWNED_WITH_MORTGAGE", # shared ownership + 7: "OWNED_OUTRIGHT", + 8: "RENT_PRIVATELY", +} + +# LCFS A121 → FRS accommodation_type mapping +# LCFS coding inferred from LCFS 2021/22 user guide: +# 1=detached house, 2=semi-detached, 3=terraced, 4=flat (purpose-built), +# 5=flat/other (converted), 6=caravan/mobile, 7=bungalow/other house, 8=other +LCFS_ACCOMM_MAP = { + 1: "HOUSE_DETACHED", + 2: "HOUSE_SEMI_DETACHED", + 3: "HOUSE_TERRACED", + 4: "FLAT", + 5: "FLAT", + 6: "MOBILE", + 7: "HOUSE_DETACHED", + 8: "OTHER", +} + HOUSEHOLD_LCF_RENAMES = { "G018": "is_adult", "G019": "is_child", "Gorx": "region", - "P389p": "household_net_income", + "P389p": "hbai_household_net_income", "weighta": "household_weight", } PERSON_LCF_RENAMES = { @@ -74,10 +99,9 @@ "P612": "miscellaneous_consumption", "C72211": "petrol_spending", "C72212": "diesel_spending", - "P537": "domestic_energy_consumption", + "P537": "domestic_energy_consumption", # aggregate kept for backward compat } - PREDICTOR_VARIABLES = [ "is_adult", "is_child", @@ -85,8 +109,10 @@ "employment_income", "self_employment_income", "private_pension_income", - "household_net_income", - "has_fuel_consumption", # Imputed from WAS vehicle ownership + "hbai_household_net_income", + "tenure_type", + "accommodation_type", + "has_fuel_consumption", ] IMPUTATIONS = [ @@ -104,31 +130,249 @@ "miscellaneous_consumption", "petrol_spending", "diesel_spending", - "domestic_energy_consumption", + "domestic_energy_consumption", # aggregate; backward compat with price cap subsidy + "electricity_consumption", + "gas_consumption", ] +# ── NEED 2023 calibration targets ───────────────────────────────────────────── +# Source: NEED 2023 headline tables (published 2025), England & Wales, ~18M dwellings. +# Tables 11b/12b: mean gas/electricity kWh by income; 9b/10b by tenure; +# 5b/6b by property type; 15b/16b by region; 13b/14b by number of adults. +# Converted to annual spend using Ofgem Q4 2023 (Oct 2023 price cap) unit rates. +# 2023 is used rather than 2022 because 2022 was an extreme energy-price crisis year. +# Standing charges excluded to keep to unit-rate spend consistent with LCFS recording. +OFGEM_Q4_2023_ELEC_RATE = 27.35 / 100 # £/kWh (Oct 2023 price cap) +OFGEM_Q4_2023_GAS_RATE = 6.89 / 100 # £/kWh (Oct 2023 price cap) + +# NEED 2023 mean kWh by income band (Table 11b gas, Table 12b electricity) +NEED_INCOME_BANDS = [ + (0, 15_000, "under_15k", 7_755, 2_412), # gas kWh, elec kWh + (15_000, 20_000, "15k_20k", 9_196, 2_700), + (20_000, 30_000, "20k_30k", 9_886, 2_915), + (30_000, 40_000, "30k_40k", 10_697, 3_114), + (40_000, 50_000, "40k_50k", 11_230, 3_276), + (50_000, 60_000, "50k_60k", 11_721, 3_410), + (60_000, 70_000, "60k_70k", 12_200, 3_548), + (70_000, 100_000, "70k_100k", 13_244, 3_872), + (100_000, 150_000, "100k_150k", 15_727, 4_598), + (150_000, np.inf, "over_150k", 20_359, 5_944), +] -def create_has_fuel_model(): +# NEED 2023 mean kWh by tenure (Table 9b gas, Table 10b electricity) +# NEED tenures: Owner-occupied, Privately rented, Council/HA +# FRS tenure_type values: OWNED_OUTRIGHT, OWNED_WITH_MORTGAGE → owner-occupied +# RENT_PRIVATELY → private rented +# RENT_FROM_COUNCIL, RENT_FROM_HA → social +NEED_TENURE_GAS = {"owner": 12_339, "private_rent": 10_183, "social": 8_357} +NEED_TENURE_ELEC = {"owner": 3_465, "private_rent": 3_261, "social": 2_896} + +TENURE_TO_NEED = { + "OWNED_OUTRIGHT": "owner", + "OWNED_WITH_MORTGAGE": "owner", + "RENT_PRIVATELY": "private_rent", + "RENT_FROM_COUNCIL": "social", + "RENT_FROM_HA": "social", +} + +# NEED 2023 mean kWh by property type (Table 5b gas, Table 6b electricity) +# NEED types: Detached, Semi-detached, End/Mid terrace, Bungalow, Converted flat, Purpose-built flat +# FRS accommodation_type: HOUSE_DETACHED, HOUSE_SEMI_DETACHED, HOUSE_TERRACED, FLAT, MOBILE, OTHER +NEED_ACCOMM_GAS = { + "detached": 15_518, + "semi": 11_715, + "terraced": 10_365, + "flat": 7_058, + "other": 11_303, +} +NEED_ACCOMM_ELEC = { + "detached": 4_346, + "semi": 3_338, + "terraced": 3_096, + "flat": 2_896, + "other": 3_327, +} + +ACCOMM_TO_NEED = { + "HOUSE_DETACHED": "detached", + "HOUSE_SEMI_DETACHED": "semi", + "HOUSE_TERRACED": "terraced", + "FLAT": "flat", + "MOBILE": "other", + # OTHER is a small FRS catch-all with no clear NEED equivalent; excluded from raking +} + +# NEED 2023 mean kWh by region (Table 15b gas, Table 16b electricity) +# Region strings match FRS/LCFS REGIONS dict values +NEED_REGION_GAS = { + "NORTH_EAST": 11_278, + "NORTH_WEST": 11_111, + "YORKSHIRE": 11_552, + "EAST_MIDLANDS": 11_234, + "WEST_MIDLANDS": 11_485, + "EAST_OF_ENGLAND": 11_334, + "LONDON": 12_335, + "SOUTH_EAST": 11_555, + "SOUTH_WEST": 9_811, + "WALES": 10_558, +} +NEED_REGION_ELEC = { + "NORTH_EAST": 2_822, + "NORTH_WEST": 3_211, + "YORKSHIRE": 3_114, + "EAST_MIDLANDS": 3_266, + "WEST_MIDLANDS": 3_332, + "EAST_OF_ENGLAND": 3_543, + "LONDON": 3_275, + "SOUTH_EAST": 3_568, + "SOUTH_WEST": 3_537, + "WALES": 3_151, +} + +# Pre-computed NEED spend targets (£/yr) — kWh × unit rate, keyed by energy type. +# Used in the raking loop of impute_consumption(); avoids recomputing each call. +_NEED_SPEND = { + "electricity": { + "income": [ + (lo, hi, elec_kwh * OFGEM_Q4_2023_ELEC_RATE) + for lo, hi, _, _, elec_kwh in NEED_INCOME_BANDS + ], + "tenure": {k: v * OFGEM_Q4_2023_ELEC_RATE for k, v in NEED_TENURE_ELEC.items()}, + "accomm": {k: v * OFGEM_Q4_2023_ELEC_RATE for k, v in NEED_ACCOMM_ELEC.items()}, + "region": {k: v * OFGEM_Q4_2023_ELEC_RATE for k, v in NEED_REGION_ELEC.items()}, + }, + "gas": { + "income": [ + (lo, hi, gas_kwh * OFGEM_Q4_2023_GAS_RATE) + for lo, hi, _, gas_kwh, _ in NEED_INCOME_BANDS + ], + "tenure": {k: v * OFGEM_Q4_2023_GAS_RATE for k, v in NEED_TENURE_GAS.items()}, + "accomm": {k: v * OFGEM_Q4_2023_GAS_RATE for k, v in NEED_ACCOMM_GAS.items()}, + "region": {k: v * OFGEM_Q4_2023_GAS_RATE for k, v in NEED_REGION_GAS.items()}, + }, +} + + +def _need_targets(income: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Return NEED-implied annual gas and electricity spend (£) for each household.""" + gas_spend = np.full(len(income), np.nan) + elec_spend = np.full(len(income), np.nan) + for lo, hi, _, gas_kwh, elec_kwh in NEED_INCOME_BANDS: + mask = (income >= lo) & (income < hi) + gas_spend[mask] = gas_kwh * OFGEM_Q4_2023_GAS_RATE + elec_spend[mask] = elec_kwh * OFGEM_Q4_2023_ELEC_RATE + # Fill any missing with overall means + overall_gas = ( + sum(g for *_, g, _ in NEED_INCOME_BANDS) + / len(NEED_INCOME_BANDS) + * OFGEM_Q4_2023_GAS_RATE + ) + overall_elec = ( + sum(e for *_, e in NEED_INCOME_BANDS) + / len(NEED_INCOME_BANDS) + * OFGEM_Q4_2023_ELEC_RATE + ) + gas_spend = np.where(np.isnan(gas_spend), overall_gas, gas_spend) + elec_spend = np.where(np.isnan(elec_spend), overall_elec, elec_spend) + return gas_spend, elec_spend + + +def _derive_energy_from_lcfs(household: pd.DataFrame) -> pd.DataFrame: """ - Train a model to predict has_fuel_consumption from demographics. + Derive separate electricity and gas annual spend from LCFS interview variables. - Uses WAS vehicle ownership to create has_fuel_consumption: - - Households with vehicles have ~90% chance of fuel consumption (ICE vehicles) - - Households without vehicles have ~0% chance + Variable identification (from LCFS 2021/22 questionnaire structure): + - B226: electricity direct-debit / quarterly-bill payment (weekly equivalent £) + - B489: total energy PPM payment — when B490 also present, contains electricity+gas + - B490: gas PPM payment (weekly equivalent £, always ≤ P537) + - P537: total domestic energy (interview-based aggregate, weekly £) - This bridges the gap between: - - LCFS: 58% of households recorded fuel in 2-week diary - - NTS 2024: 78% of households have vehicles + Logic: + - Households with B226 > 0: electricity = B226, gas = max(P537 - B226, 0) + - Households with B489 > 0 and B490 > 0: electricity = B489 - B490, gas = B490 + - Households with B489 > 0 only: split P537 by mean electricity share (~50%) + - Remaining: split P537 by mean electricity share - Sources: - - NTS 2024 vehicle ownership: https://www.gov.uk/government/statistics/ - national-travel-survey-2024/nts-2024-household-car-availability-and-trends - "22% of households had no vehicle, 44% one vehicle, 34% two or more" - - NTS 2024 fuel type: "59% petrol, 30% diesel, 6% hybrid, 4% BEV, 2% PHEV" - So ~90% of vehicle owners use petrol/diesel (ICE + hybrids) + All values are annualised (multiply weekly × 52) downstream with other variables. + """ + p537 = household["P537"] + b226 = household["B226"] + b489 = household["B489"] + b490 = household["B490"] + + # Mean electricity share from DD-billed households (B226/P537 median ≈ 0.55) + dd_mask = (b226 > 0) & (p537 > 0) + mean_elec_share = (b226[dd_mask] / p537[dd_mask]).clip(0, 1).mean() + if np.isnan(mean_elec_share): + mean_elec_share = 0.52 # fallback: typical UK elec/(elec+gas) spend share + + electricity = np.zeros(len(household)) + gas = np.zeros(len(household)) + + # Case 1: electricity billed separately via DD/quarterly (B226 > 0) + mask1 = b226 > 0 + electricity[mask1] = b226[mask1] + gas[mask1] = np.maximum(p537[mask1] - b226[mask1], 0) + + # Case 2: both fuels on PPM meters (B489 total, B490 gas portion) + mask2 = (~mask1) & (b489 > 0) & (b490 > 0) + electricity[mask2] = np.maximum(b489[mask2] - b490[mask2], 0) + gas[mask2] = b490[mask2] + + # Case 3: electricity PPM only (B489 > 0, B490 = 0) + mask3 = (~mask1) & (b489 > 0) & (b490 == 0) + electricity[mask3] = b489[mask3] * mean_elec_share + gas[mask3] = b489[mask3] * (1 - mean_elec_share) + + # Case 4: no bill variables available — split P537 by mean share + mask4 = (~mask1) & (b489 == 0) + electricity[mask4] = p537[mask4] * mean_elec_share + gas[mask4] = p537[mask4] * (1 - mean_elec_share) + + household = household.copy() + household["electricity_consumption"] = electricity + household["gas_consumption"] = gas + return household + + +def _calibrate_energy_to_need( + household: pd.DataFrame, income_col: str = "hbai_household_net_income" +) -> pd.DataFrame: + """ + Rescale imputed electricity and gas spend to match NEED 2023 income-band means. - Returns: - QRF model predicting has_fuel_consumption from demographics. + For each NEED income band, computes the ratio of the NEED-implied mean spend + to the LCFS-derived mean spend and applies it multiplicatively. This preserves + within-band distributional shape while anchoring the level to admin data. + """ + income = household[income_col].values # already annual at this point + gas_target, elec_target = _need_targets(income) + + household = household.copy() + for lo, hi, _, _, _ in NEED_INCOME_BANDS: + mask = (income >= lo) & (income < hi) + if mask.sum() == 0: + continue + lcfs_gas_mean = household["gas_consumption"][mask].mean() + lcfs_elec_mean = household["electricity_consumption"][mask].mean() + need_gas_mean = gas_target[mask].mean() + need_elec_mean = elec_target[mask].mean() + + if lcfs_gas_mean > 0: + household.loc[mask, "gas_consumption"] *= need_gas_mean / lcfs_gas_mean + if lcfs_elec_mean > 0: + household.loc[mask, "electricity_consumption"] *= ( + need_elec_mean / lcfs_elec_mean + ) + + return household + + +def create_has_fuel_model(): + """ + Train a model to predict has_fuel_consumption from demographics. + + Uses WAS vehicle ownership: 90% of vehicle owners use petrol/diesel (ICE + hybrid). """ from policyengine_uk_data.utils.qrf import QRF from policyengine_uk_data.datasets.imputations.wealth import ( @@ -140,7 +384,6 @@ def create_has_fuel_model(): if model_path.exists(): return QRF(file_path=model_path) - # Load WAS with vehicle ownership was = pd.read_csv( WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab", sep="\t", @@ -148,18 +391,13 @@ def create_has_fuel_model(): ) was.columns = [c.lower() for c in was.columns] - # Create has_fuel_consumption from vehicle ownership - # Vehicle owners have 90% chance (ICE vehicles), non-owners have 0% num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0) has_vehicle = num_vehicles > 0 + np.random.seed(42) + has_fuel = ( + has_vehicle & (np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE) + ).astype(float) - # Randomly assign fuel consumption based on ICE share - # This simulates that ~10% of vehicle owners have EVs/PHEVs - np.random.seed(42) # Reproducibility - is_ice_vehicle = np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE - has_fuel = (has_vehicle & is_ice_vehicle).astype(float) - - # Build training DataFrame with predictors available in LCFS was_df = pd.DataFrame( { "household_net_income": was["dvtotinc_bhcr7"], @@ -182,7 +420,6 @@ def create_has_fuel_model(): "self_employment_income", "region", ] - model = QRF() model.fit(was_df[predictors], was_df[["has_fuel_consumption"]]) model.save(model_path) @@ -190,18 +427,10 @@ def create_has_fuel_model(): def impute_has_fuel_to_lcfs(household: pd.DataFrame) -> pd.DataFrame: - """ - Impute has_fuel_consumption to LCFS households using WAS-trained model. - - This provides a consistent fuel consumption indicator based on vehicle - ownership patterns, rather than relying on the LCFS 2-week diary which - underestimates fuel purchasers (58% vs 78% vehicle ownership). - """ model = create_has_fuel_model() - input_df = pd.DataFrame( { - "household_net_income": household["household_net_income"], + "household_net_income": household["hbai_household_net_income"], "num_adults": household["is_adult"], "num_children": household["is_child"], "private_pension_income": household["private_pension_income"], @@ -210,30 +439,42 @@ def impute_has_fuel_to_lcfs(household: pd.DataFrame) -> pd.DataFrame: "region": household["region"], } ) - output_df = model.predict(input_df) - # Clip to [0, 1] as it's a probability + household = household.copy() household["has_fuel_consumption"] = output_df["has_fuel_consumption"].values.clip( 0, 1 ) - return household def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame): """ - Generate LCFS training table for consumption imputation. + Build the LCFS training table for consumption imputation. - Processes raw LCFS data and imputes has_fuel_consumption from WAS - vehicle ownership patterns to improve fuel spending predictions. + Adds electricity and gas derived from interview variables (B226/B489/B490), + calibrates to NEED 2023 income-band targets, and includes housing predictors + (tenure_type, accommodation_type) alongside the existing income/demographic ones. """ person = lcfs_person.rename(columns=PERSON_LCF_RENAMES) household = lcfs_household.rename(columns=HOUSEHOLD_LCF_RENAMES) household["region"] = household["region"].map(REGIONS) + + # Housing predictors — map LCFS codes to FRS enum strings + household["tenure_type"] = lcfs_household["A122"].map(LCFS_TENURE_MAP) + household["accommodation_type"] = lcfs_household["A121"].map(LCFS_ACCOMM_MAP) + + # Derive gas and electricity before renaming/annualising P537 + household = _derive_energy_from_lcfs(household) + household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES) - for variable in list(CONSUMPTION_VARIABLE_RENAMES.values()) + [ - "household_net_income" - ]: + + # Annualise weekly LCFS values (× 52) + annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [ + "hbai_household_net_income", + "electricity_consumption", + "gas_consumption", + ] + for variable in annualise: household[variable] = household[variable] * 52 for variable in PERSON_LCF_RENAMES.values(): household[variable] = ( @@ -241,8 +482,10 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) ) household.household_weight *= 1_000 - # Impute has_fuel_consumption from WAS vehicle ownership model - # This bridges WAS (has vehicles) to LCFS (has fuel spending) + # Calibrate energy to NEED 2023 targets by income band + household = _calibrate_energy_to_need(household) + + # Impute has_fuel_consumption from WAS vehicle ownership household = impute_has_fuel_to_lcfs(household) return household[PREDICTOR_VARIABLES + IMPUTATIONS + ["household_weight"]].dropna() @@ -259,9 +502,26 @@ def uprate_lcfs_table(household: pd.DataFrame, time_period: str) -> pd.DataFrame cpi = system.parameters.gov.economic_assumptions.indices.obr.consumer_price_index cpi_uprating = cpi(time_period) / cpi(start_period) + energy_vars = { + "electricity_consumption", + "gas_consumption", + "domestic_energy_consumption", + } for variable in IMPUTATIONS: - if variable not in ["petrol_spending", "diesel_spending"]: + if ( + variable not in ["petrol_spending", "diesel_spending"] + and variable not in energy_vars + ): household[variable] *= cpi_uprating + # Uprate income predictor so training distribution matches FRS target year + for col in [ + "hbai_household_net_income", + "employment_income", + "self_employment_income", + "private_pension_income", + ]: + if col in household.columns: + household[col] *= cpi_uprating return household @@ -279,13 +539,8 @@ def save_imputation_models(): ) household = generate_lcfs_table(lcfs_person, lcfs_household) household = uprate_lcfs_table(household, "2024") - consumption.fit( - household[PREDICTOR_VARIABLES], - household[IMPUTATIONS], - ) - consumption.save( - STORAGE_FOLDER / "consumption.pkl", - ) + consumption.fit(household[PREDICTOR_VARIABLES], household[IMPUTATIONS]) + consumption.save(STORAGE_FOLDER / "consumption.pkl") return consumption @@ -299,47 +554,82 @@ def create_consumption_model(overwrite_existing: bool = False): def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset: """ - Impute consumption variables using LCFS-trained model. - - Requires num_vehicles to be present in the dataset (from wealth imputation) - to compute has_fuel_consumption. + Impute consumption variables (including separate electricity and gas) using + LCFS-trained QRF model with housing and demographic predictors. """ dataset = dataset.copy() - # First, compute has_fuel_consumption from num_vehicles - # This uses the same logic as the WAS training data: - # - Vehicle owners have 90% chance of fuel consumption (ICE vehicles) - # - Non-owners have 0% chance sim = Microsimulation(dataset=dataset) num_vehicles = sim.calculate("num_vehicles", map_to="household").values - np.random.seed(42) # Match training data randomness + np.random.seed(42) has_vehicle = num_vehicles > 0 is_ice = np.random.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE has_fuel_consumption = (has_vehicle & is_ice).astype(float) dataset.household["has_fuel_consumption"] = has_fuel_consumption - # Now run the consumption model with has_fuel_consumption as predictor model = create_consumption_model() predictors = model.input_columns - input_df = sim.calculate_dataframe( - [p for p in predictors if p != "has_fuel_consumption"], - map_to="household", - ) + non_fuel_predictors = [p for p in predictors if p != "has_fuel_consumption"] + input_df = sim.calculate_dataframe(non_fuel_predictors, map_to="household") input_df["has_fuel_consumption"] = has_fuel_consumption output_df = model.predict(input_df) - for column in output_df.columns: dataset.household[column] = output_df[column].values - # Zero out fuel spending for households without fuel consumption - # This ensures only ICE vehicle owners contribute to fuel duty + # Re-calibrate electricity and gas to NEED 2023 targets using iterative raking + # over income band, tenure, accommodation type, and region. + # This is a 4-dimensional raking (vs the 1D income-band calibration on LCFS + # training data in _calibrate_energy_to_need) because the FRS has the full + # set of housing/demographic variables needed for multi-margin calibration. + income = input_df["hbai_household_net_income"].values + tenure = sim.calculate("tenure_type", map_to="household").values + accomm = sim.calculate("accommodation_type", map_to="household").values + region = sim.calculate("region", map_to="household").values + + # Pre-compute boolean masks (reused across 20 raking iterations) + income_masks = [] + for lo, hi, _ in _NEED_SPEND["electricity"]["income"]: + income_masks.append((income >= lo) & (income < hi)) + tenure_masks = {frs_val: tenure == frs_val for frs_val in TENURE_TO_NEED} + accomm_masks = {frs_val: accomm == frs_val for frs_val in ACCOMM_TO_NEED} + region_masks = {reg: region == reg for reg in _NEED_SPEND["electricity"]["region"]} + + for energy, col in [ + ("electricity", "electricity_consumption"), + ("gas", "gas_consumption"), + ]: + targets = _NEED_SPEND[energy] + arr = dataset.household[col].values.copy() + for _ in range(20): # iterative raking — converges in ~10 passes + for i, (_, _, target) in enumerate(targets["income"]): + mask = income_masks[i] + if mask.sum() > 0 and arr[mask].mean() > 0: + arr[mask] *= target / arr[mask].mean() + for frs_val, need_key in TENURE_TO_NEED.items(): + if need_key not in targets["tenure"]: + continue + mask = tenure_masks[frs_val] + if mask.sum() > 0 and arr[mask].mean() > 0: + arr[mask] *= targets["tenure"][need_key] / arr[mask].mean() + for frs_val, need_key in ACCOMM_TO_NEED.items(): + if need_key not in targets["accomm"]: + continue + mask = accomm_masks[frs_val] + if mask.sum() > 0 and arr[mask].mean() > 0: + arr[mask] *= targets["accomm"][need_key] / arr[mask].mean() + for reg_val, target in targets["region"].items(): + mask = region_masks[reg_val] + if mask.sum() > 0 and arr[mask].mean() > 0: + arr[mask] *= target / arr[mask].mean() + dataset.household[col] = arr + + # Zero out car-fuel spending for non-ICE households no_fuel = has_fuel_consumption == 0 dataset.household["petrol_spending"][no_fuel] = 0 dataset.household["diesel_spending"][no_fuel] = 0 dataset.validate() - return dataset diff --git a/policyengine_uk_data/tests/test_energy_calibration.py b/policyengine_uk_data/tests/test_energy_calibration.py new file mode 100644 index 00000000..0664f25e --- /dev/null +++ b/policyengine_uk_data/tests/test_energy_calibration.py @@ -0,0 +1,260 @@ +""" +Validates that imputed electricity and gas spending/kWh on the FRS match +NEED 2023 admin data across four dimensions: income band, tenure, accommodation +type, and region. + +Runs impute_consumption() on the base FRS (post-wealth imputation) so the +test reflects what the QRF + raking calibration actually produces for real +FRS households at 2023 price levels. +""" + +import numpy as np +import pytest +from policyengine_uk import Microsimulation +from policyengine_uk.data import UKSingleYearDataset + +from policyengine_uk_data.datasets.imputations.consumption import ( + ACCOMM_TO_NEED, + NEED_ACCOMM_ELEC, + NEED_ACCOMM_GAS, + NEED_INCOME_BANDS, + NEED_REGION_ELEC, + NEED_REGION_GAS, + NEED_TENURE_ELEC, + NEED_TENURE_GAS, + OFGEM_Q4_2023_ELEC_RATE, + OFGEM_Q4_2023_GAS_RATE, + TENURE_TO_NEED, + impute_consumption, +) +from policyengine_uk_data.datasets.imputations.wealth import impute_wealth +from policyengine_uk_data.storage import STORAGE_FOLDER + +BAND_TOL = 0.11 # 11% per cell (raking tension between dimensions can push ~10%) +HIGH_INC_TOL = 0.15 # 15% for £100k+ bands (thin FRS sample, raking tension) + + +@pytest.fixture(scope="module") +def imputed(): + """Base FRS with wealth then consumption imputed, at 2023 price levels.""" + try: + ds = UKSingleYearDataset(STORAGE_FOLDER / "frs_2023_24.h5") + except FileNotFoundError: + pytest.skip("frs_2023_24.h5 not available") + ds = impute_wealth(ds) + return impute_consumption(ds) + + +@pytest.fixture(scope="module") +def arrays(imputed): + sim = Microsimulation(dataset=imputed) + return dict( + income=sim.calculate( + "hbai_household_net_income", map_to="household", period=2023 + ).values, + tenure=sim.calculate("tenure_type", map_to="household", period=2023).values, + accomm=sim.calculate( + "accommodation_type", map_to="household", period=2023 + ).values, + region=sim.calculate("region", map_to="household", period=2023).values, + weights=sim.calculate( + "household_weight", map_to="household", period=2023 + ).values, + elec=imputed.household["electricity_consumption"].values, + gas=imputed.household["gas_consumption"].values, + ) + + +def _wmean(values, weights): + return float((values * weights).sum() / weights.sum()) + + +def _check(label, rows): + _print_table(label, rows) + for band, imputed, target, pct_err in rows: + assert pct_err < BAND_TOL, ( + f"{label} [{band}]: imputed {imputed:.0f} vs NEED {target:.0f} ({pct_err:.1%})" + ) + + +def test_electricity_by_income(arrays): + elec, income, w = arrays["elec"], arrays["income"], arrays["weights"] + rows = [] + for lo, hi, band, _, elec_kwh in NEED_INCOME_BANDS: + target = elec_kwh * OFGEM_Q4_2023_ELEC_RATE + mask = (income >= lo) & (income < hi) + if mask.sum() == 0: + continue + imp = _wmean(elec[mask], w[mask]) + tol = HIGH_INC_TOL if lo >= 100_000 else BAND_TOL + rows.append((band, imp, target, abs(imp - target) / target, tol)) + _print_table( + "Electricity £/yr by income band", + [(b, i, t, e) for b, i, t, e, _ in rows], + ) + for band, imp, target, pct_err, tol in rows: + assert pct_err < tol, ( + f"Electricity by income [{band}]: {imp:.0f} vs {target:.0f} ({pct_err:.1%})" + ) + + +def test_gas_by_income(arrays): + gas, income, w = arrays["gas"], arrays["income"], arrays["weights"] + rows = [] + for lo, hi, band, gas_kwh, _ in NEED_INCOME_BANDS: + target = gas_kwh * OFGEM_Q4_2023_GAS_RATE + mask = (income >= lo) & (income < hi) + if mask.sum() == 0: + continue + imp = _wmean(gas[mask], w[mask]) + tol = HIGH_INC_TOL if lo >= 100_000 else BAND_TOL + rows.append((band, imp, target, abs(imp - target) / target, tol)) + _print_table("Gas £/yr by income band", [(b, i, t, e) for b, i, t, e, _ in rows]) + for band, imp, target, pct_err, tol in rows: + assert pct_err < tol, ( + f"Gas by income [{band}]: {imp:.0f} vs {target:.0f} ({pct_err:.1%})" + ) + + +def test_electricity_by_tenure(arrays): + elec, tenure, w = arrays["elec"], arrays["tenure"], arrays["weights"] + rows = [] + for frs_val, need_key in TENURE_TO_NEED.items(): + target = NEED_TENURE_ELEC[need_key] * OFGEM_Q4_2023_ELEC_RATE + mask = tenure == frs_val + if mask.sum() == 0: + continue + imp = _wmean(elec[mask], w[mask]) + rows.append((frs_val, imp, target, abs(imp - target) / target)) + _check("Electricity £/yr by tenure", rows) + + +def test_gas_by_tenure(arrays): + gas, tenure, w = arrays["gas"], arrays["tenure"], arrays["weights"] + rows = [] + for frs_val, need_key in TENURE_TO_NEED.items(): + target = NEED_TENURE_GAS[need_key] * OFGEM_Q4_2023_GAS_RATE + mask = tenure == frs_val + if mask.sum() == 0: + continue + imp = _wmean(gas[mask], w[mask]) + rows.append((frs_val, imp, target, abs(imp - target) / target)) + _check("Gas £/yr by tenure", rows) + + +def test_electricity_by_accommodation(arrays): + elec, accomm, w = arrays["elec"], arrays["accomm"], arrays["weights"] + rows = [] + for frs_val, need_key in ACCOMM_TO_NEED.items(): + target = NEED_ACCOMM_ELEC[need_key] * OFGEM_Q4_2023_ELEC_RATE + mask = accomm == frs_val + if mask.sum() == 0: + continue + imp = _wmean(elec[mask], w[mask]) + rows.append((frs_val, imp, target, abs(imp - target) / target)) + _check("Electricity £/yr by accommodation type (excl. OTHER)", rows) + + +def test_gas_by_accommodation(arrays): + gas, accomm, w = arrays["gas"], arrays["accomm"], arrays["weights"] + rows = [] + for frs_val, need_key in ACCOMM_TO_NEED.items(): + target = NEED_ACCOMM_GAS[need_key] * OFGEM_Q4_2023_GAS_RATE + mask = accomm == frs_val + if mask.sum() == 0: + continue + imp = _wmean(gas[mask], w[mask]) + rows.append((frs_val, imp, target, abs(imp - target) / target)) + _check("Gas £/yr by accommodation type (excl. OTHER)", rows) + + +def test_electricity_by_region(arrays): + elec, region, w = arrays["elec"], arrays["region"], arrays["weights"] + rows = [] + for reg, target_kwh in NEED_REGION_ELEC.items(): + target = target_kwh * OFGEM_Q4_2023_ELEC_RATE + mask = region == reg + if mask.sum() == 0: + continue + imp = _wmean(elec[mask], w[mask]) + rows.append((reg, imp, target, abs(imp - target) / target)) + _check("Electricity £/yr by region", rows) + + +def test_gas_by_region(arrays): + gas, region, w = arrays["gas"], arrays["region"], arrays["weights"] + rows = [] + for reg, target_kwh in NEED_REGION_GAS.items(): + target = target_kwh * OFGEM_Q4_2023_GAS_RATE + mask = region == reg + if mask.sum() == 0: + continue + imp = _wmean(gas[mask], w[mask]) + rows.append((reg, imp, target, abs(imp - target) / target)) + _check("Gas £/yr by region", rows) + + +def test_non_negative_energy(imputed): + """All households should have non-negative electricity and gas spend.""" + elec = imputed.household["electricity_consumption"].values + gas = imputed.household["gas_consumption"].values + assert (elec >= 0).all(), f"{(elec < 0).sum()} households have negative electricity" + assert (gas >= 0).all(), f"{(gas < 0).sum()} households have negative gas" + + +def test_national_mean(arrays): + """Weighted national mean should be within 15% of NEED 2023 overall mean.""" + w = arrays["weights"] + # NEED 2023 overall mean: unweighted average across income bands as proxy + need_elec_national = ( + sum(e for *_, e in NEED_INCOME_BANDS) / len(NEED_INCOME_BANDS) + ) * OFGEM_Q4_2023_ELEC_RATE + need_gas_national = ( + sum(g for *_, g, _ in NEED_INCOME_BANDS) / len(NEED_INCOME_BANDS) + ) * OFGEM_Q4_2023_GAS_RATE + + imp_elec = _wmean(arrays["elec"], w) + imp_gas = _wmean(arrays["gas"], w) + + elec_err = abs(imp_elec - need_elec_national) / need_elec_national + gas_err = abs(imp_gas - need_gas_national) / need_gas_national + print( + f"\nNational mean electricity: imputed £{imp_elec:.0f} vs NEED £{need_elec_national:.0f} ({elec_err:.1%})" + ) + print( + f"National mean gas: imputed £{imp_gas:.0f} vs NEED £{need_gas_national:.0f} ({gas_err:.1%})" + ) + assert elec_err < 0.15, f"National electricity mean off by {elec_err:.1%}" + assert gas_err < 0.15, f"National gas mean off by {gas_err:.1%}" + + +def test_energy_sum_approx_domestic(imputed): + """Electricity + gas should roughly equal the legacy domestic_energy_consumption.""" + elec = imputed.household["electricity_consumption"].values + gas = imputed.household["gas_consumption"].values + domestic = imputed.household["domestic_energy_consumption"].values + + # Compare medians rather than means to be robust to outliers + combined = np.median(elec + gas) + legacy = np.median(domestic) + if legacy > 0: + ratio = combined / legacy + print( + f"\nMedian(elec+gas)={combined:.0f}, median(domestic_energy)={legacy:.0f}, ratio={ratio:.2f}" + ) + assert 0.5 < ratio < 2.0, ( + f"elec+gas median (£{combined:.0f}) diverges from domestic_energy " + f"median (£{legacy:.0f}) by ratio {ratio:.2f}" + ) + + +def _print_table(title, rows): + print(f"\n{'─' * 68}") + print(f" {title}") + print(f"{'─' * 68}") + print(f" {'Group':<26} {'Imputed':>10} {'NEED 2023':>10} {'Error':>8}") + print(f" {'─' * 26} {'─' * 10} {'─' * 10} {'─' * 8}") + for group, imp, target, pct_err in rows: + flag = " !" if pct_err >= BAND_TOL else "" + print(f" {str(group):<26} {imp:>10.0f} {target:>10.0f} {pct_err:>7.1%}{flag}") + print(f"{'─' * 68}") diff --git a/pyproject.toml b/pyproject.toml index 27c53210..d58ffe99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "policyengine", "google-cloud-storage", "google-auth", - "policyengine-uk>=2.43.5", + "policyengine-uk>=2.74.0", "microcalibrate>=0.18.0", "microimpute>=1.0.1", "ruff>=0.9.0", diff --git a/uv.lock b/uv.lock index 9e03803a..fd7ec078 100644 --- a/uv.lock +++ b/uv.lock @@ -812,15 +812,15 @@ wheels = [ [[package]] name = "microdf-python" -version = "1.1.0" +version = "1.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "pandas" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1e/14/87dc75a11968afca8b1b29d18e947a1cfe0c50bf4cd1aa207989be4696b0/microdf_python-1.1.0.tar.gz", hash = "sha256:4956eee69e1d40f458c7a16cecfbb539f33f0c33138cd408a67df23a188432d1", size = 16617, upload-time = "2025-11-26T01:46:01.236Z" } +sdist = { url = "https://files.pythonhosted.org/packages/91/f0/9689f33e2524b0c0d1cdf0d556ad196bfbb2ec0292f4545f467a37b27773/microdf_python-1.2.2.tar.gz", hash = "sha256:7e5f6adc10b0469de0e6549789ede0a2e6c600d0f5c83eafffc009d1495a7933", size = 20395, upload-time = "2026-02-24T10:47:16.438Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/f2/681200a69c62a2b3a6e0e3902d2f653c9693f4112b8caf89431d8ce1629a/microdf_python-1.1.0-py3-none-any.whl", hash = "sha256:99795bb593b9a62f086565a1472a472d8b3f6af8817336adeaa37397ed17da21", size = 17438, upload-time = "2025-11-26T01:46:00.157Z" }, + { url = "https://files.pythonhosted.org/packages/cc/cc/89cd28dd8ef566a49d353870a88359f5d239d7c254e1bde5e755e067028a/microdf_python-1.2.2-py3-none-any.whl", hash = "sha256:94f8b11b6416be9d04ff86cc311ae9083614bd6d569e7a589d250e89ded3343c", size = 21476, upload-time = "2026-02-24T10:47:15.65Z" }, ] [[package]] @@ -1324,7 +1324,7 @@ wheels = [ [[package]] name = "policyengine-core" -version = "3.20.1" +version = "3.23.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dpath" }, @@ -1344,14 +1344,14 @@ dependencies = [ { name = "standard-imghdr" }, { name = "wheel" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2f/76/4b4d7856154c798393735705f2188673867e1ac138ebe98d2e5e8a926b89/policyengine_core-3.20.1.tar.gz", hash = "sha256:8ec61f8335b89b74349241767f815464b0db32ce8c3e1deefed4c0332cbc0bef", size = 160070, upload-time = "2025-10-01T17:02:33.059Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5d/de/5bc5b02626703ea7d288c84c474ec51e823aa726d55ebabafe7c85e7285f/policyengine_core-3.23.6.tar.gz", hash = "sha256:81bb4057f5d6380f2d7f1af2fe4932bd3bd37fdfda7b841f7ee38b30aa5cc8e6", size = 163499, upload-time = "2026-01-25T14:04:43.233Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/97/44/d85ab4fa74a96bce2f0fd58a6ae6f87d4f40784e3bab924fdf0eb3658746/policyengine_core-3.20.1-py3-none-any.whl", hash = "sha256:9c790108921d04826110f2672569883e84afdc8eac44aea014afb1cebebb72a3", size = 221383, upload-time = "2025-10-01T17:02:31.457Z" }, + { url = "https://files.pythonhosted.org/packages/82/7a/b47b239fb0a85a36b36b47e7665db981800fcac3384aeec6dadf92a9e548/policyengine_core-3.23.6-py3-none-any.whl", hash = "sha256:f0834107335de6f2452d39e53db7a72a57088ed26d3703a4c4eaded55a4e7bce", size = 225309, upload-time = "2026-01-25T14:04:41.844Z" }, ] [[package]] name = "policyengine-uk" -version = "2.61.2" +version = "2.74.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1359,9 +1359,9 @@ dependencies = [ { name = "pydantic" }, { name = "tables" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/71/94/4f6c1bba2085cd2c17a57c6c1da5bda9a71f7f5ec0bce6e5057ebd5b90a1/policyengine_uk-2.61.2.tar.gz", hash = "sha256:05263fef974e885ded0ce6e06010a67c1c09d665d6b3acf061b86e5d680d4766", size = 1062165, upload-time = "2025-11-28T14:59:18.219Z" } +sdist = { url = "https://files.pythonhosted.org/packages/60/12/c87f64560a922b7982e417220abdabe28a0c7d682a8fa81ca16797c143b2/policyengine_uk-2.74.0.tar.gz", hash = "sha256:aebeac8c67a80d042d1ab411c9c16f9b81525639488d04682f35638a11996565", size = 1100162, upload-time = "2026-02-23T13:46:04.588Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/88/ef6888197f7a62cb9e71998bf692358be7a40359e0662d765f6b4c67a414/policyengine_uk-2.61.2-py3-none-any.whl", hash = "sha256:66b7853e791e7dda1f43dfa1e056bd9ad0d381cb7d381803400a6c2de62616ab", size = 1635936, upload-time = "2025-11-28T14:59:15.883Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ba/7c11a21d9ba528ae602ed69db5bdd5667ab3fa6f58d97df7242b9764e3bf/policyengine_uk-2.74.0-py3-none-any.whl", hash = "sha256:395263be4cabdec477b692cd32766843ac097c5e54d3568773a4b572e40450c7", size = 1697261, upload-time = "2026-02-23T13:46:02.912Z" }, ] [[package]] @@ -1418,7 +1418,7 @@ requires-dist = [ { name = "pandas" }, { name = "policyengine" }, { name = "policyengine-core", specifier = ">=3.19.4" }, - { name = "policyengine-uk", specifier = ">=2.43.5" }, + { name = "policyengine-uk", specifier = ">=2.74.0" }, { name = "pydantic", specifier = ">=2.0" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pyyaml" },