Skip to content
This repository was archived by the owner on Oct 19, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion ahcore/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dlup.backends.common import AbstractSlideBackend
from dlup.types import PathLike

from ahcore.readers import StitchingMode, ZarrFileImageReader
from ahcore.readers import StitchingMode, ZarrFileImageReader, H5FileImageReader


class ZarrSlide(AbstractSlideBackend):
Expand Down Expand Up @@ -42,3 +42,39 @@ def read_region(self, coordinates: tuple[int, int], level: int, size: tuple[int,

def close(self) -> None:
    """Release the underlying reader and any resources it holds open."""
    self._reader.close()

class H5Slide(AbstractSlideBackend):
    """dlup slide backend serving a single-level image from an H5 cache file.

    All pixel access is delegated to :class:`H5FileImageReader`; the file is
    exposed as a one-level pyramid at the reader's native mpp.
    """

    def __init__(self, filename: PathLike, stitching_mode: StitchingMode | str = StitchingMode.CROP) -> None:
        super().__init__(filename)
        self._reader: H5FileImageReader = H5FileImageReader(filename, stitching_mode=stitching_mode)
        # Isotropic spacing: the reader reports a single mpp value.
        mpp = self._reader.mpp
        self._spacings = [(mpp, mpp)]

    @property
    def size(self):
        """Dimensions of the (only) level, as reported by the reader."""
        return self._reader.size

    @property
    def level_dimensions(self) -> tuple[tuple[int, int], ...]:
        """One-level pyramid: only the native size exists."""
        return (self.size,)

    @property
    def level_downsamples(self) -> tuple[float, ...]:
        """Single level, hence a single downsample factor of 1.0."""
        return (1.0,)

    @property
    def vendor(self) -> str:
        """Identifier of the backing reader implementation."""
        return "H5FileImageReader"

    @property
    def properties(self) -> dict[str, Any]:
        """Metadata stored alongside the image data in the H5 file."""
        return self._reader.metadata

    @property
    def magnification(self):
        """Objective magnification is not recorded in H5 caches."""
        return None

    def read_region(self, coordinates: tuple[int, int], level: int, size: tuple[int, int]) -> pyvips.Image:
        """Read a ``size`` region at ``coordinates``; delegated to the reader."""
        return self._reader.read_region(coordinates, level, size)

    def close(self):
        """Close the underlying H5 reader."""
        self._reader.close()
123 changes: 119 additions & 4 deletions ahcore/cli/data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Module to write copy manifests files over to SCRATCH directory"""

from __future__ import annotations

import argparse
import hashlib
import os
Expand Down Expand Up @@ -44,8 +43,18 @@ def copy_data(args: argparse.Namespace) -> None:
for patient in all_records:
for image in patient.images:
image_fn = image.filename

get_from = base_dir / image_fn
write_to = Path(target_dir) / dataset_name / image_fn
write_to = Path(target_dir) / dataset_name / "images" / image_fn

accompanying_folder_write_to, accompanying_folder_get_from = None, None
if get_from.suffix == ".mrxs":
accompanying_folder_get_from = get_from.parent / get_from.stem
if not accompanying_folder_get_from.is_dir():
raise ValueError(
f"Image {image_fn} does not have an accompanying folder, which is expected for mrxs images"
)
accompanying_folder_write_to = write_to.parent / write_to.stem

write_to.parent.mkdir(parents=True, exist_ok=True)
if write_to.exists():
Expand All @@ -54,16 +63,93 @@ def copy_data(args: argparse.Namespace) -> None:
new_hash = _quick_hash(get_from)
if old_hash == new_hash:
# Skip if they are the same
progress.console.log("Skipping file as it already exists: {}".format(image_fn))
progress.console.log("Skipping (image) file as it already exists: {}".format(image_fn))
progress.update(task, advance=1)
continue

total_size += get_from.stat().st_size

# Copy file from get_from to write_to
shutil.copy(get_from, write_to)
if accompanying_folder_get_from is not None:
shutil.copytree(accompanying_folder_get_from, accompanying_folder_write_to, dirs_exist_ok=True)
total_size += accompanying_folder_get_from.stat().st_size
progress.update(task, advance=1)

# copy mask and annotations
if args.mask_dir is not None or args.annotations_dir is not None:
for mask in image.masks:
mask_fn = mask.filename
get_from = args.mask_dir / mask_fn
write_to = Path(target_dir) / dataset_name / "masks" / mask_fn
write_to.parent.mkdir(parents=True, exist_ok=True)
if write_to.exists():
# compute the hash of previous and new file
old_hash = _quick_hash(write_to)
new_hash = _quick_hash(get_from)
if old_hash == new_hash:
# Skip if they are the same
progress.console.log(
"Skipping (mask) file as it already exists: {}".format(mask_fn)
)
progress.update(task, advance=1)
continue

total_size += get_from.stat().st_size
shutil.copy(get_from, write_to)
progress.update(task, advance=1)

for annotation in image.annotations:
annotation_fn = annotation.filename
get_from = args.annotations_dir / annotation_fn
write_to = Path(target_dir) / dataset_name / "annotations" / annotation_fn
write_to.parent.mkdir(parents=True, exist_ok=True)
if write_to.exists():
# compute the hash of previous and new file
old_hash = _quick_hash(write_to)
new_hash = _quick_hash(get_from)
if old_hash == new_hash:
# Skip if they are the same
progress.console.log(
"Skipping (annotation) file as it already exists: {}".format(annotation_fn)
)
progress.update(task, advance=1)
continue

total_size += get_from.stat().st_size
shutil.copy(get_from, write_to)
progress.update(task, advance=1)

# copy features
if args.features_dir is not None:
if args.feature_version is not None:
feature = dm.get_image_features_by_image_and_feature_version(
image.id, args.feature_version
) # there should only be one
feature_fn = feature.filename
get_from = args.features_dir / feature_fn
write_to = Path(target_dir) / dataset_name / "features" / feature_fn
write_to.parent.mkdir(parents=True, exist_ok=True)
if write_to.exists():
# compute the hash of previous and new file
old_hash = _quick_hash(write_to)
new_hash = _quick_hash(get_from)
if old_hash == new_hash:
# Skip if they are the same
progress.console.log(
"Skipping (feature) file as it already exists: {}".format(feature_fn)
)
progress.update(task, advance=1)
continue

total_size += get_from.stat().st_size
shutil.copy(get_from, write_to)
progress.update(task, advance=1)
else:
raise ValueError(
"Feature version is not provided, but features directory is provided. Please provide both to copy features."
)

progress.console.log("Total data size copied: {:.2f} GB".format(total_size / 1024**3))


Expand All @@ -85,7 +171,7 @@ def register_parser(
_parser.add_argument(
"manifest_uri",
type=str,
help="URI that refers to the sqlalchemy supported database path.",
help="URI that refers to the sqlalchemy supported database path. If using an sqlite database this looks like 'sqlite:///your_database_path'.",
)
_parser.add_argument(
"manifest_name",
Expand All @@ -107,4 +193,33 @@ def register_parser(
type=str,
help="Name of the dataset to copy the data to. The data will be copied over to $SCRATCH / DATASET_NAME",
)

_parser.add_argument(
    "--mask_dir",
    type=dir_path(require_writable=False),
    help="Directory to which the masks paths defined in the manifest are relative to.",
    default=None,
)

_parser.add_argument(
    "--annotations_dir",
    type=dir_path(require_writable=False),
    help="Directory to which the annotations paths defined in the manifest are relative to.",
    default=None,
)

_parser.add_argument(
    "--features_dir",
    type=dir_path(require_writable=False),
    help="Directory to which the features paths defined in the manifest are relative to.",
    default=None,
)

_parser.add_argument(
    "--feature_version",
    # BUG FIX: a feature version is an identifier string (matched against
    # FeatureDescription.version), not a filesystem path; validating it with
    # dir_path() would reject every legitimate value.
    type=str,
    help="Version of the features to copy.",
    default=None,
)

_parser.set_defaults(subcommand=copy_data)
14 changes: 10 additions & 4 deletions ahcore/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,16 @@ def metadata(self) -> dict[str, Any]:
assert self._metadata
return self._metadata

def _decompress_data(self, tile: GenericNumberArray) -> GenericNumberArray:
def _decompress_and_reshape_data(self, tile: GenericNumberArray) -> GenericNumberArray:
if self._is_binary:
with PIL.Image.open(io.BytesIO(tile)) as img:
return np.array(img).transpose(2, 0, 1)
return np.array(img).transpose(2, 0, 1) # fixme: this also shouldn't work because the thing is flattened and doesn't have 3 dimensions
else:
# If handling features, we need to expand dimensions to match the expected shape.
if tile.ndim == 1: # fixme: is this the correct location for this
if not self._tile_size == [1, 1]:
raise NotImplementedError(f"Tile is single dimensional and {self._tile_size=} should be [1, 1], other cases have not been considered and cause unwanted behaviour.")
return tile.reshape(self._num_channels, *self._tile_size)
return tile

def read_region(self, location: tuple[int, int], level: int, size: tuple[int, int]) -> pyvips.Image:
Expand Down Expand Up @@ -201,7 +206,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in
total_rows = math.ceil((self._size[1] - self._tile_overlap[1]) / self._stride[1])
total_cols = math.ceil((self._size[0] - self._tile_overlap[0]) / self._stride[0])

assert total_rows * total_cols == num_tiles
assert total_rows * total_cols == num_tiles # Equality only holds if features were created without a mask

x, y = location
w, h = size
Expand Down Expand Up @@ -230,7 +235,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in
tile = (
self._empty_tile()
if tile_index_in_image_dataset == -1
else self._decompress_data(image_dataset[tile_index_in_image_dataset])
else self._decompress_and_reshape_data(image_dataset[tile_index_in_image_dataset])
)
start_y = i * self._stride[1] - y
end_y = start_y + self._tile_size[1]
Expand All @@ -242,6 +247,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in
img_start_x = max(0, start_x)
img_end_x = min(w, end_x)


if self._stitching_mode == StitchingMode.CROP:
crop_start_y = img_start_y - start_y
crop_end_y = img_end_y - start_y
Expand Down
5 changes: 3 additions & 2 deletions ahcore/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ class DataDescription(BaseModel):
manifest_database_uri: str
manifest_name: str
split_version: str
feature_version: Optional[str] = None
annotations_dir: Path
training_grid: GridDescription
inference_grid: GridDescription
training_grid: Optional[GridDescription] = None
inference_grid: Optional[GridDescription] = None
index_map: Optional[Dict[str, int]]
remap_labels: Optional[Dict[str, str]] = None
use_class_weights: Optional[bool] = False
Expand Down
41 changes: 41 additions & 0 deletions ahcore/utils/database_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Image(Base):
annotations: Mapped[List["ImageAnnotations"]] = relationship("ImageAnnotations", back_populates="image")
labels: Mapped[List["ImageLabels"]] = relationship("ImageLabels", back_populates="image")
caches: Mapped[List["ImageCache"]] = relationship("ImageCache", back_populates="image")
features: Mapped[List["ImageFeature"]] = relationship("ImageFeature", back_populates="image")


class ImageCache(Base):
Expand Down Expand Up @@ -115,6 +116,46 @@ class CacheDescription(Base):
cache: Mapped["ImageCache"] = relationship("ImageCache", back_populates="description")


class ImageFeature(Base):
    """Per-image feature-file record.

    Each row points at one extracted-features file belonging to a single
    :class:`Image` (one image may have several feature files).
    """

    __tablename__ = "image_feature"
    id = Column(Integer, primary_key=True)
    # pylint: disable=E1102
    created = Column(DateTime(timezone=True), default=func.now())
    last_updated = Column(DateTime(timezone=True), default=func.now(), onupdate=func.now())
    # Path of the feature file; unique across all images.
    filename = Column(String, unique=True, nullable=False)
    # Name of the reader used to open the feature file.
    reader = Column(String)
    num_tiles = Column(Integer)
    image_id = Column(Integer, ForeignKey("image.id"), nullable=False)

    image: Mapped["Image"] = relationship("Image", back_populates="features")
    # NOTE(review): back_populates="image_feature" requires a matching
    # `image_feature` relationship on FeatureDescription, and no foreign key
    # linking image_feature to feature_description is visible in this diff —
    # confirm the mapper actually configures.
    description: Mapped["FeatureDescription"] = relationship("FeatureDescription", back_populates="image_feature")

class FeatureDescription(Base):
    """Describes how a set of features was produced (model, tiling, version).

    The unique ``version`` string is the handle used to select which feature
    set to load (see the ``feature_version`` lookups in the CLI/data config).
    """

    __tablename__ = "feature_description"

    id = Column(Integer, primary_key=True)
    # pylint: disable=E1102
    created = Column(DateTime(timezone=True), default=func.now())
    last_updated = Column(DateTime(timezone=True), default=func.now(), onupdate=func.now())

    # Tiling parameters used during feature extraction.
    mpp = Column(Float)
    tile_size_width = Column(Integer)
    tile_size_height = Column(Integer)
    tile_overlap_width = Column(Integer)
    tile_overlap_height = Column(Integer)
    # Free-form human-readable description of this feature set.
    description = Column(String)

    version = Column(String, unique=True, nullable=False)  # use this to select which features we want to use

    # Provenance of the extractor model.
    model_name = Column(String)
    model_path = Column(String)
    feature_dimension = Column(Integer)
    # NOTE(review): ImageFeature.description declares
    # back_populates="image_feature", but no `image_feature` relationship (nor
    # a foreign key to image_feature) is defined here — the mapping looks
    # incomplete; confirm before use.
    image_transforms_description = Column(String)  # it would be nice to have a way to track which transforms the feature extractors used, but maybe this is not the best way to do it

class Mask(Base):
"""Mask table."""

Expand Down
Loading