From 3d81aa510becee3ca5bb47378e22377e5c074a2f Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 15:17:46 +0100 Subject: [PATCH 01/20] Add Hyland et al.-style respiratory failure --- icu_features/feature_engineering.py | 63 +++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index f05add1..7dd51b6 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -5,6 +5,7 @@ import click import numpy as np import polars as pl +from icu_benchmarks.load import features from icu_features.constants import CAT_MISSING_NAME, HORIZONS, VARIABLE_REFERENCE_PATH @@ -353,7 +354,7 @@ def treatment_continuous_features( return expressions -def eep_label(events: pl.Expr, horizon: int): +def eep_label(events: pl.Expr, horizon: int, switches_only: bool = True): """ From an event series, create a label for the early event prediction (eep) task. @@ -377,7 +378,11 @@ def eep_label(events: pl.Expr, horizon: int): positive_labels: 1 1 - - - - - - - - - - 1 1 1 1 1 1 1 - 1 1 1 1 1 1 1 negative_labels: - 0 0 0 0 0 - 0 0 0 0 0 0 0 - - 0 0 0 0 0 0 0 0 0 0 - coalesced_labels: 1 1 0 0 0 0 - 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 + + if switches_only is True: label: 1 - 0 0 0 - - 0 0 0 0 0 1 1 1 1 - - - 0 1 1 1 1 - 1 - + else: + label: 1 - 0 0 0 - - 0 0 0 0 0 1 1 1 1 - 1 - 0 1 1 1 1 - 1 - Note that at the time step of a positive event, the label is always missing. At the time step of a negative event, the label could be true, false, or missing. @@ -391,16 +396,16 @@ def eep_label(events: pl.Expr, horizon: int): horizon : int The horizon for the early event prediction task. """ - events_ffilled = events.forward_fill().replace(None, False) - positive_labels = events.replace(False, None).backward_fill(horizon) # shift(-1) and backward_fill(horizon - 1) excludes the last zero. 
negative_labels = events.replace(True, None).shift(-1).backward_fill(horizon - 1) - coalesced_label = pl.coalesce(positive_labels, negative_labels) - return pl.when(coalesced_label.eq(False) | events_ffilled.eq(False)).then( - coalesced_label - ) + + if switches_only: + events = events.forward_fill() + events = events.replace(None, False) + + return pl.when(coalesced_label.eq(False) | events.eq(False)).then(coalesced_label) def polars_nan_or(*args: pl.Expr): @@ -527,6 +532,42 @@ def outcomes(): ) circulatory_failure_at_8h = eep_label(event, 8).alias("circulatory_failure_at_8h") + # Attempt to reimplement Hyland et al.'s circulatory failure label. + # https://www.nature.com/articles/s41591-020-0789-4 + # First, we interpolate lactate values. If the time difference between two + # consecutive lactate measurements is less than 6 hours, or if both lactate values + # are either above or below 2, we linearly interpolate the lactate value. Else, we + # fill the value forward and backward for 6 hours. + time = pl.when(pl.col("lact").notna()).then(pl.col("time_hours")) + interp = (time.bfill() - time.ffill() < 6) | (bad_lact.ffill() == bad_lact.bfill()) + lact_interp = ( + pl.when(interp) + .then(pl.col("lact").interpolate(method="linear")) + .otherwise(pl.col("lact").bfill(3).ffill(3)) + ) + # On the boundary, if the first value of lactate is above the threshold ("bad"), we + # fill backwards indefinitely. If the last value is above the threshold, fill + # forward indefinitely. 
+ lact_interp = pl.coalesce( + lact_interp, + pl.when(pl.col("time").le(time.min()) & bad_lact.bfill()).then( + lact_interp.ffill() + ), + pl.when(pl.col("time").ge(time.max()) & bad_lact.ffill()).then( + lact_interp.bfill() + ), + ) + bad_lact_interp = lact_interp >= HIGH_LACT_TSH + event = ( + pl.when(bad_map & bad_lact_interp) + .then(True) + .when(~bad_map & ~bad_lact_interp) + .then(False) + ) + circulatory_failure_at_8h_hyland = eep_label(event, 8, switches_only=False).alias( + "circulatory_failure_at_8h_hyland" + ) + # kidney_failure_at_48h # The patient has a kidney failure if they are in stage 3 according to # https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf @@ -609,6 +650,7 @@ def outcomes(): resp_failure_at_24h, remaining_los, circulatory_failure_at_8h, + circulatory_failure_at_8h_hyland, kidney_failure_at_48h, los_at_24h, log_los_at_24h, @@ -729,6 +771,13 @@ def main(dataset: str, data_dir: str | Path): # noqa D pl.col("time_hours").log1p().alias("log_time_hours"), ) + feature_names = set(features()) + schema_names = set(q.collect_schema().keys()) + missing_features = feature_names - schema_names + + if missing_features: + raise ValueError(f"Missing features: {missing_features}") + tic = perf_counter() out = q.collect() toc = perf_counter() From a8d738a3a0e026a92f1e8a785959d8c87e89b2da Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 15:24:22 +0100 Subject: [PATCH 02/20] Cleanup --- icu_features/feature_engineering.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 7dd51b6..9da009c 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -362,9 +362,9 @@ def eep_label(events: pl.Expr, horizon: int, switches_only: bool = True): - If the last event in the history was positive, and there is a positive event within the next `horizon` hours, the 
label is missing. - If there was no event in the history or the last event in the history was - negative, and there is a positive event within the next `horizon` hours, the - label is true. This holds even if there is a negative event at the current time - step. + negative or switches_only is False, and there is a positive event within the next + `horizon` hours, the label is true. This holds even if there is a negative event + at the current time step. - Else, if there is a negative event within the next `horizon` hours (and no positive event within the next `horizon` hours or at the current time step), the label is false. @@ -395,6 +395,9 @@ def eep_label(events: pl.Expr, horizon: int, switches_only: bool = True): An expression for an event series. Boolean with possibly missing values. horizon : int The horizon for the early event prediction task. + switches_only : bool, optional, default = True + Whether to only assign True labels for switches from stable to unstable. I.e., + no positive labels for events `1 - 1`. """ positive_labels = events.replace(False, None).backward_fill(horizon) # shift(-1) and backward_fill(horizon - 1) excludes the last zero. @@ -437,7 +440,7 @@ def outcomes(): These are: - mortality_at_24h: A single label at time 24h after entry to the ICU whether the - patient dies in the ICU. THis is a "once per patient" prediction task. + patient dies in the ICU. This is a "once per patient" prediction task. - decompensation_at_24h: Whether the patient decompensates within the next 24 hours. This has label is true if the patient dies within the next 24 hours. Else, this is false. This does not have missing values. @@ -456,10 +459,6 @@ def outcomes(): hours. The patient has a kidney failure if they are in stage 3 according to the KDIGO guidelines: https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf - - los_at_24h: The length of stay in the ICU at 24 hours after entry. 
- - log_creatine_in_1h: The log of the creatinine value in 1 hour. - - log_lactate_in_1h: The log of the lactate value in 1 hour. - - log_po2: The logarithm of the PaO2 value. """ # mortality_at_24h # This is a "once per patient" prediction task. At time step 24h, a label is @@ -615,11 +614,6 @@ def outcomes(): aki_3 = pl.when(pl.col("weight").is_null()).then(None).otherwise(aki_3) kidney_failure_at_48h = eep_label(aki_3, 48).alias("kidney_failure_at_48h") - # total length of stay, predicted at 24h after entry to the ICU. - los_at_24h = pl.when(pl.col("time_hours").eq(24)).then(pl.col("los_icu")) - los_at_24h = pl.when(los_at_24h > 0.1).then(los_at_24h).alias("los_at_24h") - log_los_at_24h = los_at_24h.log().alias("log_los_at_24h") - # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. log_lactate_in_4h = ( (pl.col("lact") + 0.1).log().shift(-4).alias("log_lactate_in_4h") @@ -652,8 +646,6 @@ def outcomes(): circulatory_failure_at_8h, circulatory_failure_at_8h_hyland, kidney_failure_at_48h, - los_at_24h, - log_los_at_24h, log_lactate_in_4h, log_rel_urine_rate_in_2h, log_pf_ratio_in_12h, From 27b891321f713a3271e31b804033dc63c78642bd Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 15:37:25 +0100 Subject: [PATCH 03/20] Add resp_failure_24h hueser --- icu_features/feature_engineering.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 9da009c..9a79fa1 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -481,7 +481,8 @@ def outcomes(): # respiratory_failure_at_24h # If the PaO2/FiO2 ratio is below 200, the patient is considered to have a - # respiratory failure (event). This used pf_ratio from other_variables(). + # respiratory failure (event). This used pf_ratio from other_variables(). This uses + # a fio2 which was imputed by 21% if the patient was not ventilated. 
RESP_PF_DEF_TSH = 200.0 events = pl.col("pf_ratio") < RESP_PF_DEF_TSH resp_failure_at_24h = eep_label(events, 24).alias("respiratory_failure_at_24h") @@ -491,6 +492,14 @@ def outcomes(): remaining_los = pl.when(remaining_los > 0).then(remaining_los).otherwise(None) remaining_los = remaining_los.alias("remaining_los") + # respiratory failure label with simple imputation of po2 and fio2, related to + # Hueser et al., 2024 https://www.medrxiv.org/content/10.1101/2024.01.23.24301516v1. + pf_ratio = 100 * pl.col("po2").ffill(1).bfill(1) / pl.col("fio2").ffill(1).bfill(1) + events = pf_ratio < RESP_PF_DEF_TSH + resp_failure_at_24h_hueser = eep_label(events, 24, switches_only=False).alias( + "respiratory_failure_at_24h_hueser" + ) + # circulatory_failure_at_8h # A patient is considered to have a circulatory failure if the mean arterial # is low (below 65, or being raised by a drug) and the lactate is high (above 2). @@ -642,6 +651,7 @@ def outcomes(): mortality_at_24h, decompensation_at_24h, resp_failure_at_24h, + resp_failure_at_24h_hueser, remaining_los, circulatory_failure_at_8h, circulatory_failure_at_8h_hyland, From 8b4cacb3018919443de6d1ec31820dbd2c2d0267 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 16:17:54 +0100 Subject: [PATCH 04/20] Add kidney_failure_at_48h_lyu --- icu_features/feature_engineering.py | 113 ++++++++++++++++++---------- 1 file changed, 74 insertions(+), 39 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 9a79fa1..f48e309 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -579,50 +579,84 @@ def outcomes(): # kidney_failure_at_48h # The patient has a kidney failure if they are in stage 3 according to # https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf - relative_creatine = pl.col("crea") / pl.col("crea").shift(1).rolling_min( - window_size=7 * 24, min_samples=1 - ) + def kdigo_3(crea, 
crea_baseline, urine_rate, weight, crrt): + """Compute the KDIGO stage 3 kidney failure label.""" + # AKI 1 is + # - max absolute creatinine increase of 0.3 within 48h or + # - a relative creatinine increase of 1.5. + creatine_min_48 = crea.rolling_min(window_size=48, min_samples=1) + creatine_max_48 = crea.rolling_max(window_size=48, min_samples=1) + creatine_change_48 = creatine_max_48 - creatine_min_48 + aki_1 = polars_nan_or(creatine_change_48 >= 0.3, crea / crea_baseline >= 1.5) + + # AKI 3 is any of the following: + # - a relative creatine increase of 3.0 x baseline + # - AKI 1 and creatinine >= 4.0 + # - not more than 0.3ml/kg/h urine rate for 24h + # - no urine for 12h + # - initiation of renal replacement therapy (crrt) + good_urine_rate = ((urine_rate / weight) >= 0.3).cast(pl.Int32) + low_urine_rate_24 = ~(good_urine_rate.rolling_sum(24, min_samples=1).gt(0)) + + # high_creatine is True if aki_1 is True and creatine >= 4 (neither missing). + # False if either aki_1 is False or creatine < 4. Else, missing. + high_creatine = ~polars_nan_or(~aki_1, ~crea.gt(4)) + + # anuria True if the urine_rate is consistently equal to 0 for 12 hours. False + # if it is ever above 0. If all values are missing, the result is missing. + not_anuria = pl.col("urine_rate").gt(0).cast(pl.Int32) + anuria = ~(not_anuria.rolling_sum(window_size=12, min_samples=1).gt(0)) + + aki_3 = polars_nan_or( + crrt.replace(False, None).ffill(), + (crea / crea_baseline) >= 3.0, + high_creatine, + low_urine_rate_24, + anuria, + ) - # AKI 1 is - # - max absolute creatinine increase of 0.3 within 48h or - # - a relative creatinine increase of 1.5. 
- creatine_min_48 = pl.col("crea").rolling_min(window_size=48, min_samples=1) - creatine_max_48 = pl.col("crea").rolling_max(window_size=48, min_samples=1) - creatine_change_48 = creatine_max_48 - creatine_min_48 - aki_1 = polars_nan_or(creatine_change_48 >= 0.3, relative_creatine >= 1.5) - - # AKI 3 is any of - # - a relative creatine increase of 3.0 x baseline - # - AKI 1 and creatinine >= 4.0 - # - not more than 0.3ml/kg/h urine rate for 24h - # - no urine for 12h - low_urine_rate = ((pl.col("urine_rate") / pl.col("weight")) < 0.3).cast(pl.Int32) - low_urine_rate_24 = low_urine_rate.rolling_sum(window_size=24, min_samples=1).eq(24) - - # True if aki_1 is True and creatine >= 4 (neither missing). False if either aki_1 - # is False or creatine < 4. Else, missing. - high_creatine = ~polars_nan_or(~aki_1, ~pl.col("crea").gt(4)) - # True if the urine_rate is consistently equal to 0 for 12 hours. False if it is - # ever above 0. If all values are missing, the result is missing. - anuria = ( - pl.col("urine_rate") - .eq(0) - .cast(pl.Int32) - .rolling_sum(window_size=12, min_samples=1) - .eq(12) - ) + # If the weight is missing, the patient could only ever have a positive label, as + # urine related conditions are always missing. We thus set the label to missing. + aki_3 = pl.when(weight.is_null()).then(None).otherwise(aki_3) + + return aki_3 - aki_3 = polars_nan_or( - relative_creatine >= 3.0, - high_creatine, - low_urine_rate_24, - anuria, + aki_3 = kdigo_3( + pl.col("crea"), + pl.col("crea").shift(1).rolling_min(7 * 24, min_samples=1), + pl.col("urine_rate"), + pl.col("weight"), + pl.col("ufilt_ind"), ) - # If the weight is missing, the patient could only ever have a positive label, as - # urine related conditions are always missing. We thus set the label to missing. - aki_3 = pl.when(pl.col("weight").is_null()).then(None).otherwise(aki_3) kidney_failure_at_48h = eep_label(aki_3, 48).alias("kidney_failure_at_48h") + # Kidney failure, motivated by Lyu et al. 
2024: + # https://www.medrxiv.org/content/10.1101/2024.02.01.24302063v1 + # Similarly to circulatory_failure_8h_hyland, we linearly interpolate creatine + # values. We only interpolate if the time difference between two consecutive + # creatine measurements is less than 48 hours. We ffill and bfill the first and last + # measurements. + time = pl.when(pl.col("crea").notna()).then(pl.col("time_hours")) + interpolate = time.bfill() - time.ffill() < 48 + crea = pl.coalesce( + pl.when(interpolate).then(pl.col("crea").interpolate(method="linear")), + pl.when(pl.col("time").le(time.min())).then(pl.col("crea").bfill(48)), + pl.when(pl.col("time").ge(time.max())).then(pl.col("crea").ffill(48)), + ) + crea_baseline = pl.when( + pl.col("time").le(7 * 24).then(crea.head(7 * 24).min()) + ).otherwise(crea.rolling_min(7 * 24, min_samples=1)) + urine_rate = pl.col("urine_rate").bfill(48) + + aki_3 = kdigo_3( + crea, + crea_baseline, + urine_rate, + pl.col("weight"), + pl.col("ufilt_ind"), + ) + kidney_failure_at_48h_lyu = eep_label(aki_3, 48).alias("kidney_failure_at_48h_lyu") + # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. 
log_lactate_in_4h = ( (pl.col("lact") + 0.1).log().shift(-4).alias("log_lactate_in_4h") @@ -656,6 +690,7 @@ def outcomes(): circulatory_failure_at_8h, circulatory_failure_at_8h_hyland, kidney_failure_at_48h, + kidney_failure_at_48h_lyu, log_lactate_in_4h, log_rel_urine_rate_in_2h, log_pf_ratio_in_12h, From 103e025782160a2fd708328b0dd140b64507ddb5 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 16:19:44 +0100 Subject: [PATCH 05/20] Remove 'only assign kidney failure if weight is not None' condition --- icu_features/feature_engineering.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index f48e309..cc91db2 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -579,7 +579,7 @@ def outcomes(): # kidney_failure_at_48h # The patient has a kidney failure if they are in stage 3 according to # https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf - def kdigo_3(crea, crea_baseline, urine_rate, weight, crrt): + def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): """Compute the KDIGO stage 3 kidney failure label.""" # AKI 1 is # - max absolute creatinine increase of 0.3 within 48h or @@ -595,7 +595,7 @@ def kdigo_3(crea, crea_baseline, urine_rate, weight, crrt): # - not more than 0.3ml/kg/h urine rate for 24h # - no urine for 12h # - initiation of renal replacement therapy (crrt) - good_urine_rate = ((urine_rate / weight) >= 0.3).cast(pl.Int32) + good_urine_rate = (rel_urine_rate >= 0.3).cast(pl.Int32) low_urine_rate_24 = ~(good_urine_rate.rolling_sum(24, min_samples=1).gt(0)) # high_creatine is True if aki_1 is True and creatine >= 4 (neither missing). @@ -615,17 +615,12 @@ def kdigo_3(crea, crea_baseline, urine_rate, weight, crrt): anuria, ) - # If the weight is missing, the patient could only ever have a positive label, as - # urine related conditions are always missing. 
We thus set the label to missing. - aki_3 = pl.when(weight.is_null()).then(None).otherwise(aki_3) - return aki_3 aki_3 = kdigo_3( pl.col("crea"), pl.col("crea").shift(1).rolling_min(7 * 24, min_samples=1), - pl.col("urine_rate"), - pl.col("weight"), + pl.col("rel_urine_rate"), pl.col("ufilt_ind"), ) kidney_failure_at_48h = eep_label(aki_3, 48).alias("kidney_failure_at_48h") @@ -651,8 +646,7 @@ def kdigo_3(crea, crea_baseline, urine_rate, weight, crrt): aki_3 = kdigo_3( crea, crea_baseline, - urine_rate, - pl.col("weight"), + urine_rate / pl.col("weight"), pl.col("ufilt_ind"), ) kidney_failure_at_48h_lyu = eep_label(aki_3, 48).alias("kidney_failure_at_48h_lyu") From ea26babc10a016c4b96d9ddaf7f941fa328c56a1 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 16:24:14 +0100 Subject: [PATCH 06/20] Add hyperglycemia and hypoglycemia at 8h --- icu_features/feature_engineering.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index cc91db2..c2c2f85 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -651,6 +651,13 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): ) kidney_failure_at_48h_lyu = eep_label(aki_3, 48).alias("kidney_failure_at_48h_lyu") + # hyperglycemia_at_8h and hypoglycemia_at_8h according to Mehdizavareha et al., + # https://arxiv.org/pdf/2411.01418 + hyperglycemia_event = pl.col("glucose") > 180 # mg/dl + hypoglycemia_event = pl.col("glucose") < 70 # mg/dl + hyperglycemia_at_8h = eep_label(hyperglycemia_event, 8).alias("hyperglycemia_at_8h") + hypoglycemia_at_8h = eep_label(hypoglycemia_event, 8).alias("hypoglycemia_at_8h") + # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. 
log_lactate_in_4h = ( (pl.col("lact") + 0.1).log().shift(-4).alias("log_lactate_in_4h") @@ -660,21 +667,6 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): pl.col("pf_ratio").log().shift(-12).alias("log_pf_ratio_in_12h") ) - # The "raw" ICU data contains urine measurements from a bag at specific times (ml). - # In `ricu`, we divide these measurement values by the time distance to the last - # measurement. This gives us a urine rate (ml/h). This divided by the patient's - # weight is the relative urine rate (ml/h/kg). These "relative rate" measurements - # are only non-missing at the timepoint of the measurement. - # We assign a label if there is a (positive) measurement in 2 hours. - log_rel_urine_rate_in_2h = ( - pl.when( - pl.col("rel_urine_rate").is_not_null() & pl.col("rel_urine_rate").ge(0.01) - ) - .then(pl.col("rel_urine_rate").log()) - .shift(-2) - .alias("log_rel_urine_rate_in_2h") - ) - return [ mortality_at_24h, decompensation_at_24h, @@ -685,8 +677,9 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): circulatory_failure_at_8h_hyland, kidney_failure_at_48h, kidney_failure_at_48h_lyu, + hyperglycemia_at_8h, + hypoglycemia_at_8h, log_lactate_in_4h, - log_rel_urine_rate_in_2h, log_pf_ratio_in_12h, ] From 9c34198306e28c77f3385e4a83a4ce516159fe78 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 16:37:01 +0100 Subject: [PATCH 07/20] meld and sofa3 --- icu_features/feature_engineering.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index c2c2f85..6a80919 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -658,6 +658,23 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): hyperglycemia_at_8h = eep_label(hyperglycemia_event, 8).alias("hyperglycemia_at_8h") hypoglycemia_at_8h = eep_label(hypoglycemia_event, 8).alias("hypoglycemia_at_8h") + # Liver failure according to + # - MELD 
score > 30: https://en.wikipedia.org/wiki/Model_for_End-Stage_Liver_Disease + # - SOFA score >= 3: https://en.wikipedia.org/wiki/SOFA_score + crea = pl.col("crea").ffill(1).bfill(1) + crea = pl.coalesce( + pl.when(pl.col("ufilt_ind").replace(False, None).ffill(7 * 24)).then(4), + pl.col("crea").ffill(1).bfill(1), + ) + bili = pl.col("bili").ffill(1).bfill(1).clip(1, None) + inr = pl.col("inr").ffill(1).bfill(1).clip(1, None) + meld_score = 3.78 * crea.log() + 11.2 * inr.log() + 9.57 * bili.log() + 6.43 + meld_event = meld_score > 30 + severe_meld_in_48h = eep_label(meld_event, 48).alias("severe_meld_in_48h") + + sofa3 = pl.col("bili").ffill(1).bfill(1) > 6.0 + sofa3_in_48h = eep_label(sofa3, 48).alias("sofa3_in_48h") + # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. log_lactate_in_4h = ( (pl.col("lact") + 0.1).log().shift(-4).alias("log_lactate_in_4h") @@ -679,6 +696,8 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): kidney_failure_at_48h_lyu, hyperglycemia_at_8h, hypoglycemia_at_8h, + severe_meld_in_48h, + sofa3_in_48h, log_lactate_in_4h, log_pf_ratio_in_12h, ] From cd759001c25be59e5043772c6d0c99e88915fbda Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 16:39:24 +0100 Subject: [PATCH 08/20] Sepsis --- icu_features/feature_engineering.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 6a80919..7be7ed0 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -670,10 +670,13 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): inr = pl.col("inr").ffill(1).bfill(1).clip(1, None) meld_score = 3.78 * crea.log() + 11.2 * inr.log() + 9.57 * bili.log() + 6.43 meld_event = meld_score > 30 - severe_meld_in_48h = eep_label(meld_event, 48).alias("severe_meld_in_48h") + severe_meld_at_48h = eep_label(meld_event, 48).alias("severe_meld_at_48h") sofa3 = 
pl.col("bili").ffill(1).bfill(1) > 6.0 - sofa3_in_48h = eep_label(sofa3, 48).alias("sofa3_in_48h") + sofa3_at_48h = eep_label(sofa3, 48).alias("sofa3_at_48h") + + # Sepsis + sepsis_at_8h = eep_label(pl.col("sepsis").fill_null(False), 8).alias("sepsis_at_8h") # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. log_lactate_in_4h = ( @@ -696,8 +699,9 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): kidney_failure_at_48h_lyu, hyperglycemia_at_8h, hypoglycemia_at_8h, - severe_meld_in_48h, - sofa3_in_48h, + severe_meld_at_48h, + sofa3_at_48h, + sepsis_at_8h, log_lactate_in_4h, log_pf_ratio_in_12h, ] From 15de3e143c6ba01d3df9ff87eefcbaeebb3fe617 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 17:14:56 +0100 Subject: [PATCH 09/20] Simpler baseline --- icu_features/feature_engineering.py | 72 +++++++++++++++-------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 7be7ed0..2603ce2 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -494,7 +494,11 @@ def outcomes(): # respiratory failure label with simple imputation of po2 and fio2, related to # Hueser et al., 2024 https://www.medrxiv.org/content/10.1101/2024.01.23.24301516v1. - pf_ratio = 100 * pl.col("po2").ffill(1).bfill(1) / pl.col("fio2").ffill(1).bfill(1) + pf_ratio = ( + 100 + * pl.col("po2").forward_fill(1).backward_fill(1) + / pl.col("fio2").forward_fill(1).backward_fill(1) + ) events = pf_ratio < RESP_PF_DEF_TSH resp_failure_at_24h_hueser = eep_label(events, 24, switches_only=False).alias( "respiratory_failure_at_24h_hueser" @@ -546,23 +550,25 @@ def outcomes(): # consecutive lactate measurements is less than 6 hours, or if both lactate values # are either above or below 2, we linearly interpolate the lactate value. Else, we # fill the value forward and backward for 6 hours. 
- time = pl.when(pl.col("lact").notna()).then(pl.col("time_hours")) - interp = (time.bfill() - time.ffill() < 6) | (bad_lact.ffill() == bad_lact.bfill()) + time = pl.when(pl.col("lact").is_not_null()).then(pl.col("time_hours")) + interp = (time.backward_fill() - time.forward_fill() < 6) | ( + bad_lact.forward_fill() == bad_lact.backward_fill() + ) lact_interp = ( pl.when(interp) .then(pl.col("lact").interpolate(method="linear")) - .otherwise(pl.col("lact").bfill(3).ffill(3)) + .otherwise(pl.col("lact").backward_fill(3).forward_fill(3)) ) # On the boundary, if the first value of lactate is above the threshold ("bad"), we # fill backwards indefinitely. If the last value is above the threshold, fill # forward indefinitely. lact_interp = pl.coalesce( lact_interp, - pl.when(pl.col("time").le(time.min()) & bad_lact.bfill()).then( - lact_interp.ffill() + pl.when(pl.col("time").le(time.min()) & bad_lact.backward_fill()).then( + lact_interp.forward_fill() ), - pl.when(pl.col("time").ge(time.max()) & bad_lact.ffill()).then( - lact_interp.bfill() + pl.when(pl.col("time").ge(time.max()) & bad_lact.forward_fill()).then( + lact_interp.backward_fill() ), ) bad_lact_interp = lact_interp >= HIGH_LACT_TSH @@ -581,6 +587,8 @@ def outcomes(): # https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): """Compute the KDIGO stage 3 kidney failure label.""" + crea_baseline = crea.shift(1).rolling_min(7 * 24, min_samples=1) + # AKI 1 is # - max absolute creatinine increase of 0.3 within 48h or # - a relative creatinine increase of 1.5. 
@@ -607,19 +615,16 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): not_anuria = pl.col("urine_rate").gt(0).cast(pl.Int32) anuria = ~(not_anuria.rolling_sum(window_size=12, min_samples=1).gt(0)) - aki_3 = polars_nan_or( - crrt.replace(False, None).ffill(), + return polars_nan_or( + crrt.cast(pl.Boolean).replace(False, None).forward_fill(), (crea / crea_baseline) >= 3.0, high_creatine, low_urine_rate_24, anuria, ) - return aki_3 - aki_3 = kdigo_3( pl.col("crea"), - pl.col("crea").shift(1).rolling_min(7 * 24, min_samples=1), pl.col("rel_urine_rate"), pl.col("ufilt_ind"), ) @@ -631,21 +636,17 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): # values. We only interpolate if the time difference between two consecutive # creatine measurements is less than 48 hours. We ffill and bfill the first and last # measurements. - time = pl.when(pl.col("crea").notna()).then(pl.col("time_hours")) - interpolate = time.bfill() - time.ffill() < 48 + time = pl.when(pl.col("crea").is_not_null()).then(pl.col("time_hours")) + interpolate = time.backward_fill() - time.forward_fill() < 48 crea = pl.coalesce( pl.when(interpolate).then(pl.col("crea").interpolate(method="linear")), - pl.when(pl.col("time").le(time.min())).then(pl.col("crea").bfill(48)), - pl.when(pl.col("time").ge(time.max())).then(pl.col("crea").ffill(48)), + pl.when(pl.col("time").le(time.min())).then(pl.col("crea").backward_fill(48)), + pl.when(pl.col("time").ge(time.max())).then(pl.col("crea").forward_fill(48)), ) - crea_baseline = pl.when( - pl.col("time").le(7 * 24).then(crea.head(7 * 24).min()) - ).otherwise(crea.rolling_min(7 * 24, min_samples=1)) - urine_rate = pl.col("urine_rate").bfill(48) + urine_rate = pl.col("urine_rate").backward_fill(48) aki_3 = kdigo_3( crea, - crea_baseline, urine_rate / pl.col("weight"), pl.col("ufilt_ind"), ) @@ -653,31 +654,33 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): # hyperglycemia_at_8h and hypoglycemia_at_8h according to Mehdizavareha et al., # 
https://arxiv.org/pdf/2411.01418 - hyperglycemia_event = pl.col("glucose") > 180 # mg/dl - hypoglycemia_event = pl.col("glucose") < 70 # mg/dl + hyperglycemia_event = pl.col("glu") > 180 # mg/dl + hypoglycemia_event = pl.col("glu") < 70 # mg/dl hyperglycemia_at_8h = eep_label(hyperglycemia_event, 8).alias("hyperglycemia_at_8h") hypoglycemia_at_8h = eep_label(hypoglycemia_event, 8).alias("hypoglycemia_at_8h") # Liver failure according to # - MELD score > 30: https://en.wikipedia.org/wiki/Model_for_End-Stage_Liver_Disease # - SOFA score >= 3: https://en.wikipedia.org/wiki/SOFA_score - crea = pl.col("crea").ffill(1).bfill(1) - crea = pl.coalesce( - pl.when(pl.col("ufilt_ind").replace(False, None).ffill(7 * 24)).then(4), - pl.col("crea").ffill(1).bfill(1), + crea = ( + pl.when( + pl.col("ufilt_ind") + .cast(pl.Boolean) + .replace(False, None) + .forward_fill(7 * 24) + ) + .then(4.0) + .otherwise(pl.col("crea").forward_fill(1).backward_fill(1)) ) - bili = pl.col("bili").ffill(1).bfill(1).clip(1, None) - inr = pl.col("inr").ffill(1).bfill(1).clip(1, None) + bili = pl.col("bili").forward_fill(1).backward_fill(1).clip(1, None) + inr = pl.col("inr_pt").forward_fill(1).backward_fill(1).clip(1, None) meld_score = 3.78 * crea.log() + 11.2 * inr.log() + 9.57 * bili.log() + 6.43 meld_event = meld_score > 30 severe_meld_at_48h = eep_label(meld_event, 48).alias("severe_meld_at_48h") - sofa3 = pl.col("bili").ffill(1).bfill(1) > 6.0 + sofa3 = pl.col("bili").forward_fill(1).backward_fill(1) > 6.0 sofa3_at_48h = eep_label(sofa3, 48).alias("sofa3_at_48h") - # Sepsis - sepsis_at_8h = eep_label(pl.col("sepsis").fill_null(False), 8).alias("sepsis_at_8h") - # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. 
log_lactate_in_4h = ( (pl.col("lact") + 0.1).log().shift(-4).alias("log_lactate_in_4h") @@ -701,7 +704,6 @@ def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): hypoglycemia_at_8h, severe_meld_at_48h, sofa3_at_48h, - sepsis_at_8h, log_lactate_in_4h, log_pf_ratio_in_12h, ] From d6834ae9c2b238540aad0ea5c32c3c18c7609b51 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 17:17:21 +0100 Subject: [PATCH 10/20] Fix import --- icu_features/feature_engineering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 2603ce2..0f299c5 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -5,9 +5,9 @@ import click import numpy as np import polars as pl -from icu_benchmarks.load import features from icu_features.constants import CAT_MISSING_NAME, HORIZONS, VARIABLE_REFERENCE_PATH +from icu_features.load import features logger = logging.getLogger(__name__) From e7c8f0b5ccb607c41a30da09cc97fc92d8f281b6 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 27 Mar 2025 17:37:52 +0100 Subject: [PATCH 11/20] Remove crea_baseline from kdigo_3 args --- icu_features/feature_engineering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 0f299c5..e53eab9 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -585,7 +585,7 @@ def outcomes(): # kidney_failure_at_48h # The patient has a kidney failure if they are in stage 3 according to # https://kdigo.org/wp-content/uploads/2016/10/KDIGO-2012-AKI-Guideline-English.pdf - def kdigo_3(crea, crea_baseline, rel_urine_rate, crrt): + def kdigo_3(crea, rel_urine_rate, crrt): """Compute the KDIGO stage 3 kidney failure label.""" crea_baseline = crea.shift(1).rolling_min(7 * 24, min_samples=1) From 49053153e76b032bc731022adc1d0ecbc6492c43 Mon Sep 17 00:00:00 
2001 From: Malte Londschien Date: Thu, 27 Mar 2025 17:54:08 +0100 Subject: [PATCH 12/20] Fix tests --- icu_features/feature_engineering.py | 4 ++-- tests/test_outcomes.py | 27 ++++++++++++--------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index e53eab9..9de7536 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -612,11 +612,11 @@ def kdigo_3(crea, rel_urine_rate, crrt): # anuria True if the urine_rate is consistently equal to 0 for 12 hours. False # if it is ever above 0. If all values are missing, the result is missing. - not_anuria = pl.col("urine_rate").gt(0).cast(pl.Int32) + not_anuria = rel_urine_rate.gt(0).cast(pl.Int32) anuria = ~(not_anuria.rolling_sum(window_size=12, min_samples=1).gt(0)) return polars_nan_or( - crrt.cast(pl.Boolean).replace(False, None).forward_fill(), + crrt.cast(pl.Boolean).replace(False, None).forward_fill().fill_null(False), (crea / crea_baseline) >= 3.0, high_creatine, low_urine_rate_24, diff --git a/tests/test_outcomes.py b/tests/test_outcomes.py index 8fe4971..1f81ec6 100644 --- a/tests/test_outcomes.py +++ b/tests/test_outcomes.py @@ -13,22 +13,29 @@ def to_bool(x): @pytest.mark.parametrize( - "events, expected, horizon", + "events, expected, horizon, switches_only", [ ( "- 1 - - 0 0 - - - - - 0 0 0 - - 1 - 1 - 0 0 0 - 1 0 1", "1 - 0 0 0 - - 0 0 0 0 0 1 1 1 1 - - - 0 1 1 1 1 - 1 -", 4, + True, + ), + ( + "- 1 - - 0 0 - - - - - 0 0 0 - - 1 - 1 - 0 0 0 - 1 0 1", + "1 - 0 0 0 - - 0 0 0 0 0 1 1 1 1 - 1 - 0 1 1 1 1 - 1 -", + 4, + False, ), ], ) -def test_eep_labels(events, expected, horizon): +def test_eep_labels(events, expected, horizon, switches_only): df = pl.DataFrame( { "events": to_bool(events), "expected": to_bool(expected), } - ).with_columns(eep_label(pl.col("events"), horizon).alias("labels")) + ).with_columns(eep_label(pl.col("events"), horizon, switches_only).alias("labels")) 
assert_series_equal(df["labels"], df["expected"], check_names=False) @@ -95,16 +102,6 @@ def test_polars_nan_or(args, expected): ), pl.Series(np.arange(4 * 24, 0, -1) / 24), ), - ( - "los_at_24h", - pl.DataFrame( - { - "los_icu": 4.0, - "time_hours": np.arange(0, 4 * 24), - } - ), - pl.Series(24 * [None] + [4.0] + 71 * [None]), - ), ( "log_lactate_in_4h", pl.DataFrame( @@ -192,8 +189,8 @@ def test_outcomes(outcome_name, input, expected): pl.DataFrame( { "crea": [1.0] * 24 + [None] * 72 + [2.0] * 72 + [4.0] * 48, - "urine_rate": 70, - "weight": 70, + "rel_urine_rate": 1, + "ufilt_ind": [False] * 216, } ), pl.Series( From 9628bc34c87db7b81e96200cddc487bb98e81d0b Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 18:03:36 +0200 Subject: [PATCH 13/20] Fix eep_labels docstring. --- icu_features/feature_engineering.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 9de7536..694dbe3 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -359,12 +359,13 @@ def eep_label(events: pl.Expr, horizon: int, switches_only: bool = True): From an event series, create a label for the early event prediction (eep) task. - If there is a positive event at the current time step, the label is missing. - - If the last event in the history was positive, and there is a positive event - within the next `horizon` hours, the label is missing. + - If `switches_only` is `True`, and if the last event in the history was positive, + and there is a positive event within the next `horizon` hours, the label is + missing. That is, we only predict switches from stable to unstable. - If there was no event in the history or the last event in the history was - negative or switches_only is False, and there is a positive event within the next - `horizon` hours, the label is true. 
This holds even if there is a negative event - at the current time step. + negative or `switches_only` is `False`, and there is a positive event within the + next `horizon` hours, the label is true. This holds even if there is a negative + event at the current time step. - Else, if there is a negative event within the next `horizon` hours (and no positive event within the next `horizon` hours or at the current time step), the label is false. From 217d86d2052c74e8f9057ee382174e9cf9384b5a Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 18:05:17 +0200 Subject: [PATCH 14/20] Rename to respiratory_failure_at_24h_severe_imputed, use 100 threshold. --- icu_features/feature_engineering.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 694dbe3..4ec2851 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -495,15 +495,16 @@ def outcomes(): # respiratory failure label with simple imputation of po2 and fio2, related to # Hueser et al., 2024 https://www.medrxiv.org/content/10.1101/2024.01.23.24301516v1. 
+ SEVERE_RESP_PF_DEF_TSH = 100 pf_ratio = ( 100 * pl.col("po2").forward_fill(1).backward_fill(1) / pl.col("fio2").forward_fill(1).backward_fill(1) ) - events = pf_ratio < RESP_PF_DEF_TSH - resp_failure_at_24h_hueser = eep_label(events, 24, switches_only=False).alias( - "respiratory_failure_at_24h_hueser" - ) + events = pf_ratio < SEVERE_RESP_PF_DEF_TSH + respiratory_failure_at_24h_severe_imputed = eep_label( + events, 24, switches_only=False + ).alias("respiratory_failure_at_24h_severe_imputed") # circulatory_failure_at_8h # A patient is considered to have a circulatory failure if the mean arterial @@ -695,7 +696,7 @@ def kdigo_3(crea, rel_urine_rate, crrt): mortality_at_24h, decompensation_at_24h, resp_failure_at_24h, - resp_failure_at_24h_hueser, + respiratory_failure_at_24h_severe_imputed, remaining_los, circulatory_failure_at_8h, circulatory_failure_at_8h_hyland, From 7feab890bfdb141a6f94a9db6db13878646cfe26 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:39:25 +0200 Subject: [PATCH 15/20] Update polars_nan_or docs. 
--- icu_features/feature_engineering.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 4ec2851..3679a4c 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -421,11 +421,11 @@ def polars_nan_or(*args: pl.Expr): Examples -------- - >>> import polars as pl - >>> a = pl.Series("a", [1, 2, None]) - >>> b = pl.Series("b", [0, None, 2]) - >>> polars_nan_or(a < 0, b == 2) - [ False, None, True ] + >>> polars_nan_or( + >>> pl.Series("a", [True, True, True, False, False, False]), + >>> pl.Series("b", [True, None, False, True, None, False]), + >>> ) + [True, True, True, True, None, False] """ return ( pl.when(pl.max_horizontal(*args)) # This ignores nans @@ -493,8 +493,8 @@ def outcomes(): remaining_los = pl.when(remaining_los > 0).then(remaining_los).otherwise(None) remaining_los = remaining_los.alias("remaining_los") - # respiratory failure label with simple imputation of po2 and fio2, related to - # Hueser et al., 2024 https://www.medrxiv.org/content/10.1101/2024.01.23.24301516v1. + # Severe respiratory failure label with simple imputation of po2 and fio2, related + # to Hueser et al. https://www.medrxiv.org/content/10.1101/2024.01.23.24301516v1. 
SEVERE_RESP_PF_DEF_TSH = 100 pf_ratio = ( 100 From 74aa7a28a972c969be64974316e9426ebcc09a60 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:47:48 +0200 Subject: [PATCH 16/20] Fix circ_failure and rename to _imputed --- icu_features/feature_engineering.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 3679a4c..88d0694 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -546,12 +546,12 @@ def outcomes(): ) circulatory_failure_at_8h = eep_label(event, 8).alias("circulatory_failure_at_8h") - # Attempt to reimplement Hyland et al.'s circulatory failure label. + # Circulatory failure label using interpolated lactate, inspired by Hyland et al. # https://www.nature.com/articles/s41591-020-0789-4 # First, we interpolate lactate values. If the time difference between two # consecutive lactate measurements is less than 6 hours, or if both lactate values # are either above or below 2, we linearly interpolate the lactate value. Else, we - # fill the value forward and backward for 6 hours. + # fill the value forward and backward for 3 hours. time = pl.when(pl.col("lact").is_not_null()).then(pl.col("time_hours")) interp = (time.backward_fill() - time.forward_fill() < 6) | ( bad_lact.forward_fill() == bad_lact.backward_fill() @@ -561,16 +561,16 @@ def outcomes(): .then(pl.col("lact").interpolate(method="linear")) .otherwise(pl.col("lact").backward_fill(3).forward_fill(3)) ) - # On the boundary, if the first value of lactate is above the threshold ("bad"), we - # fill backwards indefinitely. If the last value is above the threshold, fill - # forward indefinitely. + # On the boundary, if the first value of lactate is below the threshold ("good"), we + # fill backwards indefinitely. If the last value is below the threshold, fill + # forward indefinitely. 
Else, we fill forward and backward for 3 hours. lact_interp = pl.coalesce( lact_interp, pl.when(pl.col("time").le(time.min()) & bad_lact.backward_fill()).then( - lact_interp.forward_fill() + lact_interp.backward_fill() ), pl.when(pl.col("time").ge(time.max()) & bad_lact.forward_fill()).then( - lact_interp.backward_fill() + lact_interp.forward_fill() ), ) bad_lact_interp = lact_interp >= HIGH_LACT_TSH @@ -580,8 +580,8 @@ def outcomes(): .when(~bad_map & ~bad_lact_interp) .then(False) ) - circulatory_failure_at_8h_hyland = eep_label(event, 8, switches_only=False).alias( - "circulatory_failure_at_8h_hyland" + circulatory_failure_at_8h_imputed = eep_label(event, 8, switches_only=False).alias( + "circulatory_failure_at_8h_imputed" ) # kidney_failure_at_48h @@ -699,7 +699,7 @@ def kdigo_3(crea, rel_urine_rate, crrt): respiratory_failure_at_24h_severe_imputed, remaining_los, circulatory_failure_at_8h, - circulatory_failure_at_8h_hyland, + circulatory_failure_at_8h_imputed, kidney_failure_at_48h, kidney_failure_at_48h_lyu, hyperglycemia_at_8h, From 7a19ca4e27db239b0c92be2f879782a538fc1498 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:51:30 +0200 Subject: [PATCH 17/20] Kidney switches_only=False, rename to _imputed --- icu_features/feature_engineering.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 88d0694..9c0013b 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -632,9 +632,9 @@ def kdigo_3(crea, rel_urine_rate, crrt): ) kidney_failure_at_48h = eep_label(aki_3, 48).alias("kidney_failure_at_48h") - # Kidney failure, motivated by Lyu et al. 2024: + # Kidney failure with creatine imputation motivated by Lyu et al. 
2024: # https://www.medrxiv.org/content/10.1101/2024.02.01.24302063v1 - # Similarly to circulatory_failure_8h_hyland, we linearly interpolate creatine + # Similarly to circulatory_failure_8h_imputed, we linearly interpolate creatine # values. We only interpolate if the time difference between two consecutive # creatine measurements is less than 48 hours. We ffill and bfill the first and last # measurements. @@ -652,7 +652,9 @@ def kdigo_3(crea, rel_urine_rate, crrt): urine_rate / pl.col("weight"), pl.col("ufilt_ind"), ) - kidney_failure_at_48h_lyu = eep_label(aki_3, 48).alias("kidney_failure_at_48h_lyu") + kidney_failure_at_48h_imputed = eep_label(aki_3, 48, switches_only=False).alias( + "kidney_failure_at_48h_imputed" + ) # hyperglycemia_at_8h and hypoglycemia_at_8h according to Mehdizavareha et al., # https://arxiv.org/pdf/2411.01418 @@ -701,7 +703,7 @@ def kdigo_3(crea, rel_urine_rate, crrt): circulatory_failure_at_8h, circulatory_failure_at_8h_imputed, kidney_failure_at_48h, - kidney_failure_at_48h_lyu, + kidney_failure_at_48h_imputed, hyperglycemia_at_8h, hypoglycemia_at_8h, severe_meld_at_48h, From c222b6c15f0bf04ebc137ba00982c1c1e888675d Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:52:17 +0200 Subject: [PATCH 18/20] switches_only=False for the glycemias --- icu_features/feature_engineering.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 9c0013b..a7b1532 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -660,8 +660,12 @@ def kdigo_3(crea, rel_urine_rate, crrt): # https://arxiv.org/pdf/2411.01418 hyperglycemia_event = pl.col("glu") > 180 # mg/dl hypoglycemia_event = pl.col("glu") < 70 # mg/dl - hyperglycemia_at_8h = eep_label(hyperglycemia_event, 8).alias("hyperglycemia_at_8h") - hypoglycemia_at_8h = eep_label(hypoglycemia_event, 8).alias("hypoglycemia_at_8h") + hyperglycemia_at_8h 
= eep_label(hyperglycemia_event, 8, switches_only=False).alias( + "hyperglycemia_at_8h" + ) + hypoglycemia_at_8h = eep_label(hypoglycemia_event, 8, switches_only=False).alias( + "hypoglycemia_at_8h" + ) # Liver failure according to # - MELD score > 30: https://en.wikipedia.org/wiki/Model_for_End-Stage_Liver_Disease From 03ab13d19dd06235b6bfdfa735980130d6a2a1a9 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:52:46 +0200 Subject: [PATCH 19/20] Also for meld. --- icu_features/feature_engineering.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index a7b1532..17d6842 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -684,7 +684,9 @@ def kdigo_3(crea, rel_urine_rate, crrt): inr = pl.col("inr_pt").forward_fill(1).backward_fill(1).clip(1, None) meld_score = 3.78 * crea.log() + 11.2 * inr.log() + 9.57 * bili.log() + 6.43 meld_event = meld_score > 30 - severe_meld_at_48h = eep_label(meld_event, 48).alias("severe_meld_at_48h") + severe_meld_at_48h = eep_label(meld_event, 48, switches_only=False).alias( + "severe_meld_at_48h" + ) sofa3 = pl.col("bili").forward_fill(1).backward_fill(1) > 6.0 sofa3_at_48h = eep_label(sofa3, 48).alias("sofa3_at_48h") From 91df5fc76618ef00cc03aae1b85c850f25d8ec61 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 2 Apr 2025 19:53:55 +0200 Subject: [PATCH 20/20] rename liver_sofa. 
--- icu_features/feature_engineering.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/icu_features/feature_engineering.py b/icu_features/feature_engineering.py index 17d6842..5255aca 100644 --- a/icu_features/feature_engineering.py +++ b/icu_features/feature_engineering.py @@ -688,8 +688,8 @@ def kdigo_3(crea, rel_urine_rate, crrt): "severe_meld_at_48h" ) - sofa3 = pl.col("bili").forward_fill(1).backward_fill(1) > 6.0 - sofa3_at_48h = eep_label(sofa3, 48).alias("sofa3_at_48h") + liver_sofa3 = pl.col("bili").forward_fill(1).backward_fill(1) > 6.0 + liver_sofa3_at_48h = eep_label(liver_sofa3, 48).alias("liver_sofa3_at_48h") # log(lactate) in 4 hours. This is 1/2 the forecast horizon of circ. failure eep. log_lactate_in_4h = ( @@ -713,7 +713,7 @@ def kdigo_3(crea, rel_urine_rate, crrt): hyperglycemia_at_8h, hypoglycemia_at_8h, severe_meld_at_48h, - sofa3_at_48h, + liver_sofa3_at_48h, log_lactate_in_4h, log_pf_ratio_in_12h, ]