Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions biolearn/corrections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Post-load correction functions for GEO datasets with known issues."""

import pandas as pd

from biolearn.util import cached_download


def fix_gse110554(geo_data, source_path):
    """Fix cell_type metadata for GSM2998097 and GSM2998106.

    These samples have metadata on row 14 instead of row 13 due to
    GEO formatting issues, so their cell type has to be re-read from the
    raw file. See https://github.com/bio-learn/biolearn/issues/87

    Args:
        geo_data: Parsed dataset. Its ``metadata`` table (indexed by sample
            ID) is updated in place.
        source_path: Location of the raw file, resolved to a local path
            through ``cached_download``.

    Returns:
        The same ``geo_data`` object, with ``cell_type`` overwritten for
        every affected sample present in both the raw row and the metadata.
    """
    samples_to_fix = ("GSM2998097", "GSM2998106")
    file_path = cached_download(source_path)

    # The standard cell_type is on row 53, but these samples need row 54.
    # Skipping 52 lines and reading a single data row yields that value.
    # NOTE(review): pandas takes the first unskipped line as column labels,
    # so the sample IDs must appear on that line for the lookup below to
    # match anything -- confirm against the actual file layout.
    raw = pd.read_table(file_path, index_col=0, skiprows=52, nrows=1)

    for sample in samples_to_fix:
        if sample not in raw.columns or sample not in geo_data.metadata.index:
            # Best effort: leave samples that cannot be located untouched.
            continue
        value = raw[sample].iloc[0]
        if isinstance(value, str) and ":" in value:
            # Drop the "key:" prefix. Split on the first colon only, so a
            # value that itself contains ":" is kept whole, not truncated.
            value = value.split(":", 1)[1].strip()
        geo_data.metadata.loc[sample, "cell_type"] = value

    return geo_data


# Lookup table used to resolve a correction name (as written in the data
# library definition) to the function that implements it.
CORRECTIONS = dict(
    fix_gse110554=fix_gse110554,
)


def apply_correction(name, geo_data, source_path):
    """Run the correction registered under ``name`` and return its result.

    Args:
        name: Key into ``CORRECTIONS`` identifying the correction to run.
        geo_data: Dataset object handed to the correction function.
        source_path: Source location handed to the correction function.

    Returns:
        Whatever the selected correction function returns.

    Raises:
        ValueError: If ``name`` is not a key of ``CORRECTIONS``.
    """
    correction = CORRECTIONS.get(name)
    if correction is None:
        raise ValueError(f"Unknown correction: {name}")
    return correction(geo_data, source_path)
1 change: 1 addition & 0 deletions biolearn/data/library.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ items:
title: 'FlowSorted.Blood.EPIC: An optimized library for reference-based deconvolution
of whole-blood biospecimens assayed using the Illumina HumanMethylationEPIC BeadArray
(II)'
corrections: fix_gse110554
summary: DNA methylation assessments of peripheral blood DNA can be used to accurately
estimate the relative proportions of underlying leukocyte subtypes. Such cell
deconvolution analysis relies on libraries of discriminating differentially methylated
Expand Down
5 changes: 5 additions & 0 deletions biolearn/data_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,7 @@ def __init__(self, source_definition, cache=None):
self.tags = source_definition.get(
"tags", []
) # Default empty list if tags are not provided
self.corrections = source_definition.get("corrections")

self.parser = self._create_parser(source_definition["parser"])

Expand All @@ -1170,6 +1171,10 @@ def load(self):
return cached

data = self.parser.parse(self.path)
if self.corrections:
from biolearn.corrections import apply_correction

data = apply_correction(self.corrections, data, self.path)
self.cache.store(
self.id, data, self.CACHE_CATEGORY, self.CACHE_VERSION
)
Expand Down
23 changes: 23 additions & 0 deletions biolearn/test/test_data_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,3 +372,26 @@ def test_load_datasource_with_curated_tag(capsys):
df = source.load()
captured = capsys.readouterr()
assert captured.out == ""


def test_datasource_with_unknown_corrections():
    """Loading a source with an unregistered correction raises ValueError."""
    parser_config = {
        "type": "geo-matrix",
        "id-row": 33,
        "metadata": {
            "age": {"row": 47, "parse": "numeric"},
            "sex": {"row": 41, "parse": "sex"},
            "cancer": {"row": 50, "parse": "string"},
        },
        "matrix-start": 74,
    }
    source = DataSource(
        {
            "id": "TestData",
            "path": get_test_data_file("geo_dnam_test_file"),
            "parser": parser_config,
            "corrections": "nonexistent_correction",
        }
    )

    # The failure must come from load(), and carry the registry's message.
    with pytest.raises(ValueError, match="Unknown correction"):
        source.load()