bio-learn · marcbal77 · Dec 9, 2025 · Dec 7, 2025 · Dec 7, 2025
diff --git a/biolearn/corrections.py b/biolearn/corrections.py
@@ -0,0 +1,41 @@
+"""Post-load correction functions for GEO datasets with known issues."""
+
+import pandas as pd
+
+from biolearn.util import cached_download
+
+
+def fix_gse110554(geo_data, source_path):
+    """Overwrite cell_type for GSM2998097 and GSM2998106 from the raw file.
+
+    Their cell_type sits one metadata row below the other samples' (see
+    https://github.com/bio-learn/biolearn/issues/87). Returns geo_data.
+    """
+    file_path = cached_download(source_path)
+    # skiprows=52 makes file line 53 the header and the next line the data.
+    # NOTE(review): lookup below assumes that header holds GSM ids - confirm.
+    raw = pd.read_table(file_path, index_col=0, skiprows=52, nrows=1)
+
+    for sample in ("GSM2998097", "GSM2998106"):
+        # Samples missing from either side are skipped silently.
+        if sample not in raw.columns or sample not in geo_data.metadata.index:
+            continue
+        value = raw[sample].iloc[0]
+        if isinstance(value, str) and ":" in value:
+            # Cut at the first colon only so values containing ':' stay whole.
+            value = value.split(":", 1)[1].strip()
+        geo_data.metadata.loc[sample, "cell_type"] = value
+    return geo_data
+
+
+# Registry of post-load corrections: maps the name a library.yaml
+# `corrections:` entry uses to select one to the function that
+# implements it.
+CORRECTIONS = dict(fix_gse110554=fix_gse110554)
+
+
+def apply_correction(name, geo_data, source_path):
+    """Run the registered correction `name`; ValueError if unknown."""
+    if name in CORRECTIONS:
+        return CORRECTIONS[name](geo_data, source_path)
+    raise ValueError(f"Unknown correction: {name}")
diff --git a/biolearn/data/library.yaml b/biolearn/data/library.yaml
@@ -79,6 +79,7 @@ items:
   title: 'FlowSorted.Blood.EPIC: An optimized library for reference-based deconvolution
     of whole-blood biospecimens assayed using the Illumina HumanMethylationEPIC BeadArray
     (II)'
+  corrections: fix_gse110554
   summary: DNA methylation assessments of peripheral blood DNA can be used to accurately
     estimate the relative proportions of underlying leukocyte subtypes. Such cell
     deconvolution analysis relies on libraries of discriminating differentially methylated

diff --git a/biolearn/data_library.py b/biolearn/data_library.py
@@ -1150,6 +1150,7 @@ def __init__(self, source_definition, cache=None):
         self.tags = source_definition.get(
             "tags", []
         )  # Default empty list if tags are not provided
+        self.corrections = source_definition.get("corrections")
 
         self.parser = self._create_parser(source_definition["parser"])
 
@@ -1170,6 +1171,10 @@ def load(self):
             return cached
 
         data = self.parser.parse(self.path)
+        if self.corrections:
+            from biolearn.corrections import apply_correction
+
+            data = apply_correction(self.corrections, data, self.path)
         self.cache.store(
             self.id, data, self.CACHE_CATEGORY, self.CACHE_VERSION
         )

diff --git a/biolearn/test/test_data_library.py b/biolearn/test/test_data_library.py
@@ -372,3 +372,26 @@ def test_load_datasource_with_curated_tag(capsys):
     df = source.load()
     captured = capsys.readouterr()
     assert captured.out == ""
+
+
+def test_datasource_with_unknown_corrections():
+    """Loading a source whose correction name is unregistered raises."""
+    parser_def = {
+        "type": "geo-matrix",
+        "id-row": 33,
+        "metadata": {
+            "age": {"row": 47, "parse": "numeric"},
+            "sex": {"row": 41, "parse": "sex"},
+            "cancer": {"row": 50, "parse": "string"},
+        },
+        "matrix-start": 74,
+    }
+    definition = {
+        "id": "TestData",
+        "path": get_test_data_file("geo_dnam_test_file"),
+        "parser": parser_def,
+        "corrections": "nonexistent_correction",
+    }
+    source = DataSource(definition)
+    with pytest.raises(ValueError, match="Unknown correction"):
+        source.load()