diff --git a/biolearn/corrections.py b/biolearn/corrections.py
new file mode 100644
index 0000000..80c52fd
--- /dev/null
+++ b/biolearn/corrections.py
@@ -0,0 +1,123 @@
+"""Post-load correction functions for GEO datasets with known issues."""
+
+import csv
+import gzip
+import warnings
+
+from biolearn.util import cached_download
+
+# Row label GEO series-matrix files use for the line of sample accessions.
+_SAMPLE_ID_LABEL = "!Sample_geo_accession"
+# Marker line that ends the metadata header and starts the data table.
+_TABLE_BEGIN_LABEL = "!series_matrix_table_begin"
+# 1-indexed file line holding cell_type for the shifted GSE110554 samples.
+_GSE110554_CELL_TYPE_LINE = 54
+
+
+def _read_line_by_sample(file_path, line_number):
+    """Map each sample accession to its field on one header line.
+
+    Args:
+        file_path: Local path of a series-matrix file, plain or gzipped.
+        line_number: 1-indexed line whose per-sample fields are wanted.
+
+    Returns:
+        Dict of sample accession to raw field text. Empty, after a
+        warning, if the accession line or the requested line is not
+        found before the data table starts.
+    """
+    # Sniff the gzip magic bytes rather than trusting the file extension.
+    with open(file_path, "rb") as probe:
+        opener = gzip.open if probe.read(2) == b"\x1f\x8b" else open
+
+    sample_ids = None
+    fields = None
+    with opener(
+        file_path, "rt", encoding="utf-8", errors="replace", newline=""
+    ) as handle:
+        rows = csv.reader(handle, delimiter="\t")
+        for current, row in enumerate(rows, start=1):
+            label = row[0] if row else ""
+            if label == _SAMPLE_ID_LABEL:
+                sample_ids = row[1:]
+            if current == line_number:
+                fields = row[1:]
+            found_both = sample_ids is not None and fields is not None
+            if found_both or label == _TABLE_BEGIN_LABEL:
+                break
+
+    if sample_ids is None or fields is None:
+        warnings.warn(
+            f"Could not read line {line_number} by sample from {file_path}"
+        )
+        return {}
+    return dict(zip(sample_ids, fields))
+
+
+def fix_gse110554(geo_data, source_path):
+    """Fix cell_type metadata for GSM2998097 and GSM2998106.
+
+    For these two samples the cell type sits on file line 54 of the series
+    matrix instead of line 53, where the other samples have it, due to
+    GEO formatting issues. See https://github.com/bio-learn/biolearn/issues/87
+
+    NOTE(review): this note originally read "row 14 instead of row 13";
+    confirm which numbering the issue uses.
+
+    Values are matched to samples through the file's accession line. A
+    sample missing from the file or from ``geo_data.metadata`` is skipped.
+
+    Args:
+        geo_data: Parsed dataset; its ``metadata`` frame is edited in place.
+        source_path: Series-matrix location passed to ``cached_download``.
+
+    Returns:
+        The same ``geo_data`` object.
+    """
+    samples_to_fix = ["GSM2998097", "GSM2998106"]
+    # Assumes cached_download returns a local file path - TODO confirm.
+    file_path = cached_download(source_path)
+    shifted = _read_line_by_sample(file_path, _GSE110554_CELL_TYPE_LINE)
+
+    for sample in samples_to_fix:
+        if sample not in shifted or sample not in geo_data.metadata.index:
+            continue
+        # Fields are expected to look like "label: value". Split once so a
+        # value that itself contains ":" is kept whole.
+        label, colon, remainder = shifted[sample].partition(":")
+        value = (remainder if colon else label).strip()
+        if value:
+            geo_data.metadata.loc[sample, "cell_type"] = value
+
+    return geo_data
+
+
+# Registry maps correction names (as used in library.yaml) to functions
+CORRECTIONS = {
+    "fix_gse110554": fix_gse110554,
+}
+
+
+def apply_correction(name, geo_data, source_path):
+    """Apply one or more named corrections to GeoData.
+
+    Args:
+        name: A key of ``CORRECTIONS``, or a list/tuple of keys that are
+            applied in order.
+        geo_data: Dataset handed to each correction function.
+        source_path: Source location handed to each correction function.
+
+    Returns:
+        The dataset returned by the last correction (the input if none ran).
+
+    Raises:
+        ValueError: If a requested name is not registered; no correction
+            is applied in that case.
+    """
+    names = list(name) if isinstance(name, (list, tuple)) else [name]
+    for requested in names:
+        if requested not in CORRECTIONS:
+            raise ValueError(f"Unknown correction: {requested}")
+    for requested in names:
+        geo_data = CORRECTIONS[requested](geo_data, source_path)
+    return geo_data
diff --git a/biolearn/data/library.yaml b/biolearn/data/library.yaml index c019aa9..6f4bc26 100644 --- a/biolearn/data/library.yaml +++ b/biolearn/data/library.yaml @@ -79,6 +79,7 @@ items: title: 'FlowSorted.Blood.EPIC: An optimized library for reference-based deconvolution of whole-blood biospecimens assayed using the Illumina HumanMethylationEPIC BeadArray (II)' + corrections: fix_gse110554 summary: DNA methylation assessments of peripheral blood DNA can be used to accurately estimate the relative
proportions of underlying leukocyte subtypes. Such cell deconvolution analysis relies on libraries of discriminating differentially methylated diff --git a/biolearn/data_library.py b/biolearn/data_library.py index 8b1e8b7..dc3eeae 100644 --- a/biolearn/data_library.py +++ b/biolearn/data_library.py @@ -1150,6 +1150,7 @@ def __init__(self, source_definition, cache=None): self.tags = source_definition.get( "tags", [] ) # Default empty list if tags are not provided + self.corrections = source_definition.get("corrections") self.parser = self._create_parser(source_definition["parser"]) @@ -1170,6 +1171,10 @@ def load(self): return cached data = self.parser.parse(self.path) + if self.corrections: + from biolearn.corrections import apply_correction + + data = apply_correction(self.corrections, data, self.path) self.cache.store( self.id, data, self.CACHE_CATEGORY, self.CACHE_VERSION )
diff --git a/biolearn/test/test_data_library.py b/biolearn/test/test_data_library.py
index 2bc498d..871631d 100644
--- a/biolearn/test/test_data_library.py
+++ b/biolearn/test/test_data_library.py
@@ -372,3 +372,27 @@ def test_load_datasource_with_curated_tag(capsys):
     df = source.load()
     captured = capsys.readouterr()
     assert captured.out == ""
+
+
+def test_datasource_with_unknown_corrections():
+    """An unregistered correction name makes DataSource.load() raise."""
+    parser_config = {
+        "type": "geo-matrix",
+        "id-row": 33,
+        "metadata": {
+            "age": {"row": 47, "parse": "numeric"},
+            "sex": {"row": 41, "parse": "sex"},
+            "cancer": {"row": 50, "parse": "string"},
+        },
+        "matrix-start": 74,
+    }
+    definition = {
+        "id": "TestData",
+        "path": get_test_data_file("geo_dnam_test_file"),
+        "parser": parser_config,
+        "corrections": "nonexistent_correction",
+    }
+    source = DataSource(definition)
+    # The lookup happens in load(), after parsing, not in the constructor.
+    with pytest.raises(ValueError, match="Unknown correction"):
+        source.load()