From 8af833a9d3c6efd61590cd71ed66a73964898a6e Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 11:43:03 +0200
Subject: [PATCH 01/12] Allow `train_size` when `complete_train=True`

---
 icu_benchmarks/data/split_process_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
index 54fa5c0e..0bf9fc86 100644
--- a/icu_benchmarks/data/split_process_data.py
+++ b/icu_benchmarks/data/split_process_data.py
@@ -173,7 +173,7 @@ def preprocess_data(
         )
     else:
         # If full train is set, we use all data for training/validation
-        sanitized_data = make_train_val_polars(data, vars, train_size=None, seed=seed, debug=debug, runmode=runmode)
+        sanitized_data = make_train_val_polars(data, vars, train_size=train_size, seed=seed, debug=debug, runmode=runmode)

     # Apply preprocessing
     start = timer()

From 739f861fa91b1c2c32155ae276fd4da9734d998c Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 11:54:42 +0200
Subject: [PATCH 02/12] fix: update `_dict[key]` instead of overwriting it

---
 icu_benchmarks/data/split_process_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
index 0bf9fc86..a28f92c7 100644
--- a/icu_benchmarks/data/split_process_data.py
+++ b/icu_benchmarks/data/split_process_data.py
@@ -191,7 +191,7 @@ def preprocess_data(
             sel = _dict[key].select(pl.all().has_nulls())
             logging.debug(sel.select(col.name for col in sel if col.item(0)))
             _dict[key] = val.fill_null(strategy="zero")
-            _dict[key] = val.fill_nan(0)
+            _dict[key] = _dict[key].fill_nan(0)
             logging.debug("Dropping columns with nulls")
             sel = _dict[key].select(pl.all().has_nulls())
             logging.debug(sel.select(col.name for col in sel if col.item(0)))

From 06e8dac267403d284d09574f99fccd1112d3d251 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 12:01:55 +0200
Subject: [PATCH 03/12] "inner" join instead of "left" to prevent silent null
 rows

---
 icu_benchmarks/data/split_process_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
index a28f92c7..6a91bb51 100644
--- a/icu_benchmarks/data/split_process_data.py
+++ b/icu_benchmarks/data/split_process_data.py
@@ -391,7 +391,7 @@ def make_train_val_polars(
     for fold in split.keys():
         data_split[fold] = {
             data_type: split[fold]
-            .join(data[data_type].with_columns(pl.col(_id).cast(pl.datatypes.Int64)), on=_id, how="left")
+            .join(data[data_type].with_columns(pl.col(_id).cast(pl.datatypes.Int64)), on=_id, how="inner")
             .sort(by=_id)
             for data_type in data.keys()
         }
@@ -586,7 +586,7 @@ def make_single_split_polars(
         # set sort to true to make sure that IDs are reordered after scrambling earlier
         data_split[fold] = {
             data_type: split[fold]
-            .join(data[data_type].with_columns(pl.col(id).cast(pl.datatypes.Int64)), on=id, how="left")
+            .join(data[data_type].with_columns(pl.col(id).cast(pl.datatypes.Int64)), on=id, how="inner")
             .sort(by=id)
             for data_type in data.keys()
         }

From b3a888f7102edee28e98d382c6fbce29038e4550 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 12:18:04 +0200
Subject: [PATCH 04/12] Convenience - order features/indicators

---
 icu_benchmarks/data/loader.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index bfa944ad..8212b285 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -47,6 +47,14 @@ def __init__(
             logging.info("Using static dataset")
         self.row_indicators = data[split][DataSegment.features][self.vars["GROUP"]]
         self.features_df = data[split][DataSegment.features]
+
+        # order columns: index, features (alphabetically), indicators (alphabetically)
+        cols = self.features_df.columns
+        m_index = ["stay_id"]
+        front = sorted([c for c in cols if not c.startswith("MissingIndicator_") and c not in m_index])
+        back = sorted([c for c in cols if c.startswith("MissingIndicator_") and c not in m_index])
+        self.features_df= self.features_df[m_index + front + back]
+
         # calculate basic info for the data
         self.num_stays = self.grouping_df[self.vars["GROUP"]].unique().shape[0]
         self.maxlen = self.features_df.group_by([self.vars["GROUP"]]).len().max().item(0, 1)

From 87cfc75e7a2911ee9d6639897ce3ef570b30b45e Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 12:28:37 +0200
Subject: [PATCH 05/12] Make data loader work with more than one label

---
 icu_benchmarks/data/loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 8212b285..78a80b03 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -123,7 +123,7 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor, Tensor]:

         if len(labels) == 1:
             # only one label per stay, align with window
-            labels = np.concatenate([np.empty(window.shape[0] - 1) * np.nan, labels], axis=0)
+            labels = np.concatenate([np.full((window.shape[0] - 1, len(self.vars["LABEL"])), np.nan), labels], axis=0)

         length_diff = self.maxlen - window.shape[0]
         pad_mask = np.ones(window.shape[0])
@@ -132,7 +132,7 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor, Tensor]:
         if length_diff > 0:
             # window shorter than the longest window in dataset, pad to same length
             window = np.concatenate([window, np.ones((length_diff, window.shape[1])) * pad_value], axis=0)
-            labels = np.concatenate([labels, np.ones(length_diff) * pad_value], axis=0)
+            labels = np.concatenate([labels, np.ones((length_diff, len(self.vars["LABEL"]))) * pad_value], axis=0)
             pad_mask = np.concatenate([pad_mask, np.zeros(length_diff)], axis=0)

         not_labeled = np.argwhere(np.isnan(labels))

From 06c380cac8d068bae77ea2a79b7e3d62bbaa4228 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 14:40:56 +0200
Subject: [PATCH 06/12] fix: debug sampling (stay_id-wise, not row-wise) + no
 `id` shadowing

---
 icu_benchmarks/data/split_process_data.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
index 6a91bb51..1252645c 100644
--- a/icu_benchmarks/data/split_process_data.py
+++ b/icu_benchmarks/data/split_process_data.py
@@ -368,8 +368,12 @@ def make_train_val_polars(
         )

     if debug:
-        logging.info("Using only 1% of the data for debugging. Note that this might lead to errors for small datasets.")
-        data[DataSegment.outcome] = data[DataSegment.outcome].sample(fraction=0.01, seed=seed)
+        logging.info("Using only 1% of stay IDs for debugging. Note that this might lead to errors for small datasets.")
+        sampled_ids = data[DataSegment.outcome][_id].unique().sample(fraction=0.01, seed=seed)
+        data[DataSegment.outcome] = data[DataSegment.outcome].filter(pl.col(_id).is_in(sampled_ids))
+        data[DataSegment.dynamic] = data[DataSegment.dynamic].filter(pl.col(_id).is_in(sampled_ids))
+        if DataSegment.static in data:
+            data[DataSegment.static] = data[DataSegment.static].filter(pl.col(_id).is_in(sampled_ids))

     stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique())

@@ -535,18 +539,21 @@
     For a more detailed documentation refer to make_single_splits(...)
     """

    # ID variable
-    id = vars[VarType.group]
+    _id = vars[VarType.group]
     if debug:
-        # Only use 1% of the data
-        logging.info("Using only 1% of the data for debugging. Note that this might lead to errors for small datasets.")
-        data[DataSegment.outcome] = data[DataSegment.outcome].sample(fraction=0.01, seed=seed)
+        logging.info("Using only 1% of stay IDs for debugging. Note that this might lead to errors for small datasets.")
+        sampled_ids = data[DataSegment.outcome][_id].unique().sample(fraction=0.01, seed=seed)
+        data[DataSegment.outcome] = data[DataSegment.outcome].filter(pl.col(_id).is_in(sampled_ids))
+        data[DataSegment.dynamic] = data[DataSegment.dynamic].filter(pl.col(_id).is_in(sampled_ids))
+        if DataSegment.static in data:
+            data[DataSegment.static] = data[DataSegment.static].filter(pl.col(_id).is_in(sampled_ids))
     # Get stay IDs from outcome segment
-    stays = pl.Series(name=id, values=data[DataSegment.outcome][id].unique())
+    stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique())
     # If there are labels, and the task is classification, use stratified k-fold
     if VarType.label in vars and runmode is RunMode.classification:
         # Get labels from outcome data (takes the highest value (or True) in case seq2seq classification)
-        labels: pl.Series = data[DataSegment.outcome].group_by(id).max().sort(id)[vars[VarType.label]]
+        labels: pl.Series = data[DataSegment.outcome].group_by(_id).max().sort(_id)[vars[VarType.label]]
         if labels.value_counts().min().item(0, 1) < cv_folds:
             raise Exception(
                 f"The smallest amount of samples in a class is: {labels.value_counts().min()}, "
@@ -586,8 +593,8 @@
         # set sort to true to make sure that IDs are reordered after scrambling earlier
         data_split[fold] = {
             data_type: split[fold]
-            .join(data[data_type].with_columns(pl.col(id).cast(pl.datatypes.Int64)), on=id, how="inner")
-            .sort(by=id)
+            .join(data[data_type].with_columns(pl.col(_id).cast(pl.datatypes.Int64)), on=_id, how="inner")
+            .sort(by=_id)
             for data_type in data.keys()
         }

From a090760e5b51a452c6759146b090b2a03941f5 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 14:46:45 +0200
Subject: [PATCH 07/12] Use vectorized RAM buildup instead of sequential -
 *massive* speedup

---
 icu_benchmarks/data/loader.py | 43 ++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 78a80b03..867950c0 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -110,7 +110,7 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor, Tensor]:
             A sample from the data, consisting of data, labels and padding mask.
""" if self._cached_dataset is not None: - return self._cached_dataset[idx] + return tuple(element[idx] for element in self._cached_dataset) pad_value = 0.0 stay_id = self.outcome_df[self.vars["GROUP"]].unique()[idx] # [self.vars["GROUP"]] @@ -146,6 +146,47 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor, Tensor]: return from_numpy(data), from_numpy(labels), from_numpy(pad_mask) + def ram_cache(self, cache: bool = True) -> None: + self._cached_dataset = None + if cache: + logging.info(f"Caching {self.split} dataset in ram.") + + # Get per-stay lengths in group order + lengths = self.features_df.group_by(self.vars["GROUP"], maintain_order=True).len()["len"].to_numpy().astype(int) + offsets = np.concatenate([[0], lengths.cumsum()[:-1]]) + + n_stays = self.num_stays + n_features = len(self.features_df.columns) - 1 # exclude GROUP + n_labels = len(self.vars["LABEL"]) + + # Extract full arrays once + self.features_df = self.features_df.sort(self.vars["GROUP"]) + self.outcome_df = self.outcome_df.sort(self.vars["GROUP"]) + data_np = self.features_df.select(pl.exclude(self.vars["GROUP"])).to_numpy().astype(np.float32) + labels_np = self.outcome_df[self.vars["LABEL"]].to_numpy().astype(np.float32) + + # Pre-allocate + padded_data = np.zeros((n_stays, self.maxlen, n_features), dtype=np.float32) + padded_labels = np.zeros((n_stays, self.maxlen, n_labels), dtype=np.float32) + pad_mask = np.zeros((n_stays, self.maxlen), dtype=bool) + + for i, (offset, length) in enumerate(zip(offsets, lengths, strict=True)): + padded_data[i, :length] = data_np[offset : offset + length] + padded_labels[i, :length] = labels_np[offset : offset + length] + pad_mask[i, :length] = True + + # Replace nan labels with -1 and mask them out (mirrors __getitem__) + nan_mask = np.isnan(padded_labels) # (n_stays, maxlen, n_labels) + padded_labels[nan_mask] = -1 + # If any label is nan at a timestep, zero the mask + pad_mask[nan_mask.any(axis=-1)] = False + + self._cached_dataset = ( + from_numpy(padded_data), + from_numpy(padded_labels), + from_numpy(pad_mask), + ) + def get_balance(self) -> list: """Return the weight balance for the split of interest. 
From 178bfd03f6e800158390837d7540b73f4ddaf75c Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 14:46:59 +0200
Subject: [PATCH 08/12] linting

---
 icu_benchmarks/data/loader.py       | 2 +-
 icu_benchmarks/data/preprocessor.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 867950c0..7b3467a5 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -53,7 +53,7 @@ def __init__(
         m_index = ["stay_id"]
         front = sorted([c for c in cols if not c.startswith("MissingIndicator_") and c not in m_index])
         back = sorted([c for c in cols if c.startswith("MissingIndicator_") and c not in m_index])
-        self.features_df= self.features_df[m_index + front + back]
+        self.features_df = self.features_df[m_index + front + back]

         # calculate basic info for the data
         self.num_stays = self.grouping_df[self.vars["GROUP"]].unique().shape[0]
diff --git a/icu_benchmarks/data/preprocessor.py b/icu_benchmarks/data/preprocessor.py
index c4b0e6d4..23e88b35 100644
--- a/icu_benchmarks/data/preprocessor.py
+++ b/icu_benchmarks/data/preprocessor.py
@@ -312,7 +312,7 @@ def _process_outcome(
         outcome_rec.add_step(
             StepSklearn(
                 sklearn_transformer=FunctionTransformer(
-                    func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
+                    func=lambda x: (x - self.outcome_min) / (self.outcome_max - self.outcome_min)
                 ),
                 sel=all_outcomes(),
             )
@@ -528,7 +528,7 @@ def _process_outcome(self, data, vars, split):
         outcome_rec.add_step(
             StepSklearn(
                 sklearn_transformer=FunctionTransformer(
-                    func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
+                    func=lambda x: (x - self.outcome_min) / (self.outcome_max - self.outcome_min)
                 ),
                 sel=all_outcomes(),
             )

From 786c86a5560f70216277d6784be5fae81be2d5d1 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 15:17:46 +0200
Subject: [PATCH 09/12] Handle classification in ram_cache

---
 icu_benchmarks/data/loader.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 7b3467a5..6672d3e0 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -165,6 +165,8 @@
             data_np = self.features_df.select(pl.exclude(self.vars["GROUP"])).to_numpy().astype(np.float32)
             labels_np = self.outcome_df[self.vars["LABEL"]].to_numpy().astype(np.float32)

+            single_label_per_stay = self.outcome_df.shape[0] == n_stays
+
             # Pre-allocate
             padded_data = np.zeros((n_stays, self.maxlen, n_features), dtype=np.float32)
             padded_labels = np.zeros((n_stays, self.maxlen, n_labels), dtype=np.float32)
@@ -172,7 +174,13 @@
             pad_mask = np.zeros((n_stays, self.maxlen), dtype=bool)

             for i, (offset, length) in enumerate(zip(offsets, lengths, strict=True)):
                 padded_data[i, :length] = data_np[offset : offset + length]
-                padded_labels[i, :length] = labels_np[offset : offset + length]
+                if single_label_per_stay:
+                    # mirror __getitem__: all NaN except final timestep
+                    stay_labels = np.full((length, n_labels), np.nan, dtype=np.float32)
+                    stay_labels[-1] = labels_np[i]
+                    padded_labels[i, :length] = stay_labels
+                else:
+                    padded_labels[i, :length] = labels_np[offset : offset + length]
                 pad_mask[i, :length] = True

From a6ec88598a83b341e72e14394bbd5b9737b937d6 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 15:19:07 +0200
Subject: [PATCH 10/12] Use vars["GROUP"] instead of hardcoded stay_id
---
 icu_benchmarks/data/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 6672d3e0..47e82939 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -50,7 +50,7 @@

         # order columns: index, features (alphabetically), indicators (alphabetically)
         cols = self.features_df.columns
-        m_index = ["stay_id"]
+        m_index = [self.vars["GROUP"]]
         front = sorted([c for c in cols if not c.startswith("MissingIndicator_") and c not in m_index])
         back = sorted([c for c in cols if c.startswith("MissingIndicator_") and c not in m_index])
         self.features_df = self.features_df[m_index + front + back]

From 7dc68bf2213580ab5c8d140bb45c703c294dd587 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 15:20:23 +0200
Subject: [PATCH 11/12] Sort stays

---
 icu_benchmarks/data/split_process_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
index 1252645c..ed966981 100644
--- a/icu_benchmarks/data/split_process_data.py
+++ b/icu_benchmarks/data/split_process_data.py
@@ -375,7 +375,7 @@ def make_train_val_polars(
         if DataSegment.static in data:
             data[DataSegment.static] = data[DataSegment.static].filter(pl.col(_id).is_in(sampled_ids))

-    stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique())
+    stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique().sort())

     if VarType.label in vars and runmode is RunMode.classification:
         labels = data[DataSegment.outcome].group_by(_id).max()[label]
@@ -549,7 +549,7 @@ def make_single_split_polars(
         if DataSegment.static in data:
             data[DataSegment.static] = data[DataSegment.static].filter(pl.col(_id).is_in(sampled_ids))
     # Get stay IDs from outcome segment
-    stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique())
+    stays = pl.Series(name=_id, values=data[DataSegment.outcome][_id].unique().sort())
     # If there are labels, and the task is classification, use stratified k-fold
     if VarType.label in vars and runmode is RunMode.classification:

From 0b5ae4877c5ac583b8208bdfb42296bfd1071690 Mon Sep 17 00:00:00 2001
From: tinkpad
Date: Thu, 23 Apr 2026 15:36:58 +0200
Subject: [PATCH 12/12] Add docstring to ram_cache

---
 icu_benchmarks/data/loader.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
index 47e82939..fb2bed39 100644
--- a/icu_benchmarks/data/loader.py
+++ b/icu_benchmarks/data/loader.py
@@ -147,6 +147,9 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor, Tensor]:
         return from_numpy(data), from_numpy(labels), from_numpy(pad_mask)

     def ram_cache(self, cache: bool = True) -> None:
+        """Prepare an in-memory cache of the data, transforming the DataFrames into padded tensors.
+        Saves (padded_features, padded_labels, pad_mask) in `self._cached_dataset`.
+        """
         self._cached_dataset = None
         if cache:
             logging.info(f"Caching {self.split} dataset in ram.")
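
Annotation (not part of any commit): a runnable toy walkthrough of the NaN-label handling shared by ram_cache and __getitem__ for the single-label-per-stay case introduced in PATCH 09. Names mirror the diffs above; the sizes and label value are illustrative.

    import numpy as np

    # One stay of length 3 in a maxlen-5 batch with a single label: the label sits
    # at the final observed timestep, earlier timesteps are NaN placeholders.
    length, n_labels, maxlen = 3, 1, 5
    stay_labels = np.full((length, n_labels), np.nan, dtype=np.float32)
    stay_labels[-1] = 1.0

    padded_labels = np.zeros((maxlen, n_labels), dtype=np.float32)
    padded_labels[:length] = stay_labels
    pad_mask = np.zeros(maxlen, dtype=bool)
    pad_mask[:length] = True

    # Masking step from ram_cache: NaN labels become -1 and their timesteps are
    # removed from the mask, leaving only the final observed timestep active.
    nan_mask = np.isnan(padded_labels)
    padded_labels[nan_mask] = -1
    pad_mask[nan_mask.any(axis=-1)] = False
    assert pad_mask.tolist() == [False, False, True, False, False]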