Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
102 commits
Select commit Hold shift + click to select a range
a8d386b
update foundation models list in pyproject
EzicStar Feb 27, 2025
92f921e
add slide encoding config
EzicStar Feb 28, 2025
26238ec
fix slide config
EzicStar Feb 28, 2025
a9bcdef
add slide encoding
EzicStar Mar 3, 2025
5bfd4dd
add cobra chief and titan
EzicStar Mar 4, 2025
6a4c06c
add slide_encoding in config
EzicStar Mar 5, 2025
9fdc70f
add cobra dependencies
EzicStar Mar 5, 2025
40e953a
use cobra package directly
EzicStar Mar 5, 2025
396d6d2
add cobra
EzicStar Mar 11, 2025
777f570
fix cobra encoder
EzicStar Mar 17, 2025
44b6977
coords pixel conversion
EzicStar Mar 25, 2025
c428764
patch_lvl_0 calculation
EzicStar Mar 28, 2025
797d0ca
make patch size lvl as integer
EzicStar Mar 28, 2025
45dbc5b
encoders now have their own encode function
EzicStar Mar 31, 2025
f9d58a2
split slide and patient encoding commands
EzicStar Mar 31, 2025
f88732e
update changes from main
EzicStar Apr 10, 2025
6d85ec0
implement mpp calculation from CoordsInfo
EzicStar Apr 10, 2025
429ca58
format cobra.py
EzicStar Apr 10, 2025
b2b12a4
make chief encoder a class
EzicStar Apr 10, 2025
1746319
add eagle encoder class
EzicStar Apr 10, 2025
f34f8ff
add eagle patient encoding
EzicStar Apr 11, 2025
2734c46
add shape validation
EzicStar Apr 11, 2025
71ff930
add features validations
EzicStar Apr 15, 2025
e11d088
add eagle slide encoding
EzicStar Apr 15, 2025
151fb4a
adapt abstract functions implementations
EzicStar Apr 15, 2025
d126bb9
eagle integration to config
EzicStar Apr 15, 2025
29dd194
remove chief prints
EzicStar Apr 15, 2025
e8718fc
format tqdm logging and file naming
EzicStar Apr 15, 2025
0ab75b5
add metadata and exceptions
EzicStar Apr 16, 2025
af4e1ce
fix warnings
EzicStar Apr 16, 2025
f79bc41
make coords numpy array instead of tensor
EzicStar Apr 29, 2025
38afcda
add titan patient level encoding
EzicStar Apr 29, 2025
61fdc78
add gigapath dependency
EzicStar Apr 30, 2025
e9ed4e0
add gigapath dependency to pyproject
EzicStar Apr 30, 2025
08b0322
update available encoders
EzicStar Apr 30, 2025
506f60f
add gigapath slide encoder
EzicStar Apr 30, 2025
8413b0f
add chief slide encoding
EzicStar May 5, 2025
a57fb23
fix gigapath dependencies
EzicStar May 6, 2025
62e0f23
add gigapath patient encoding
EzicStar May 6, 2025
646cd29
fix gigapath dimensions
EzicStar May 7, 2025
220f96a
add feature saving function
EzicStar May 7, 2025
a9bc7d1
reuse reading and saving functions
EzicStar May 7, 2025
dcb0d64
add chief patient encoding
EzicStar May 7, 2025
696867b
add prism
EzicStar May 7, 2025
47d8df0
add virchow feature extractor
EzicStar May 8, 2025
a809afd
add prism dependencies
EzicStar May 8, 2025
25c161d
forgot to add pyproject and change encoder name
EzicStar May 8, 2025
6b6e225
add musk feature extractor
EzicStar May 8, 2025
5b029bb
forgot to add dep stuff
EzicStar May 8, 2025
c432237
fix name typo
EzicStar May 12, 2025
8b558ed
format musk
EzicStar May 12, 2025
09b28af
add mstar
EzicStar May 12, 2025
e89a019
add plip
EzicStar May 12, 2025
9578ceb
minor prism fixes
EzicStar May 14, 2025
185c08e
move encoding logic to base class
EzicStar May 15, 2025
4a44a36
add madeleine
EzicStar May 15, 2025
44df4b7
add test definition
EzicStar May 16, 2025
5167e86
Merge remote-tracking branch 'origin/main' into dev/encode-slides
EzicStar May 19, 2025
27ebb86
type extractor and encoder names
EzicStar May 21, 2025
6c66980
add slde encoding test
EzicStar May 21, 2025
b6c4c19
remove unused random function
EzicStar May 21, 2025
26c8401
fix string casting
EzicStar May 21, 2025
9eb6575
add titan test coverage
EzicStar May 25, 2025
04c12dc
remove hardcoded patient and filename labels
EzicStar May 25, 2025
7ca3c0e
fix eagle test
EzicStar May 25, 2025
79c6db3
fix gigapath warnings
EzicStar May 26, 2025
20efbc8
minor coords and extractor fixes
EzicStar May 26, 2025
01e0b83
add hash flag
EzicStar May 26, 2025
90d88d8
add hash check and formatting
EzicStar May 26, 2025
71982a2
fixes and validations
EzicStar May 26, 2025
26e7901
merge main changes
EzicStar May 26, 2025
1f43e0c
format and update docs
EzicStar May 26, 2025
e8027c1
forgot formatting
EzicStar May 26, 2025
c0287c5
Fix dependencies
georg-wolflein May 26, 2025
a6ca197
reduce test patients for faster testing
EzicStar May 28, 2025
48a1be8
Fix dependencies (again) (for macos)
georg-wolflein Jun 2, 2025
5543140
Debug build.yml
EzicStar Jun 3, 2025
8f3b508
skip cuda required encoders
EzicStar Jun 3, 2025
5fa5c61
Debug feature extractor tests
EzicStar Jun 4, 2025
ae23522
Update .github/workflows/build.yml
EzicStar Jun 19, 2025
94b333a
add own DeviceLikeType
EzicStar Jun 20, 2025
7c5cee2
add description and better func names
EzicStar Jun 23, 2025
fa4e02a
reformat encoder base class
EzicStar Jun 25, 2025
3843db1
add mayor formatting
EzicStar Jun 25, 2025
0e37454
use enum for extractor identifier
EzicStar Jun 25, 2025
4541642
rename extractors and encoders
EzicStar Jun 25, 2025
660157e
reformat output feat files
EzicStar Jun 26, 2025
c5020f5
Fix cobra build
FWao Jun 30, 2025
96dd3c5
add metadata
EzicStar Jun 26, 2025
dfd08be
add cobraII
EzicStar Jun 30, 2025
26180bc
debug CI pipeline
EzicStar Jun 30, 2025
98a0edd
check virchow on gpu
EzicStar Jun 30, 2025
62f63ce
check other extractors
EzicStar Jun 30, 2025
76e3dd0
test each extractor by separate
EzicStar Jun 30, 2025
185e540
set timm as a common dependency
EzicStar Jun 30, 2025
97b4b54
update uv.lock
EzicStar Jun 30, 2025
12bc772
Update COBRA hash, pyproject.toml
FWao Jun 30, 2025
ccc0d02
actions test: add verbose option
FWao Jun 30, 2025
4a0dd4e
extractor param and flash-attn
EzicStar Jun 30, 2025
3502526
fix gigapath dependency
EzicStar Jun 30, 2025
4942e4e
format and remove empty
EzicStar Jun 30, 2025
17d9edd
update README, reformat
EzicStar Jun 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 52 additions & 5 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,26 @@ on:
branches: [ main ]

jobs:
test:
test_extractors:
runs-on: ubuntu-latest
# needs: format_and_lint
strategy:
matrix:
python-version: ["3.11", "3.12"]
extractor: [
"ctranspath",
"chief-ctranspath",
"conch",
"conch1_5",
"uni",
"uni2",
"dino-bloom",
"gigapath",
"h-optimus-0",
"h-optimus-1",
"mstar",
"plip",
]

steps:
- uses: actions/checkout@v4
Expand All @@ -30,13 +44,46 @@ jobs:
# python-version: ${{ matrix.python-version }}

- name: Install the project
run: uv sync --all-extras --dev
run: uv sync --extra all --dev

- name: Build
run: uv build

- name: Run tests
run: uv run pytest tests/
- name: Run tests for extractor ${{ matrix.extractor }}
run: uv run pytest -s tests/test_feature_extractors.py -k "${{ matrix.extractor }}" --verbose
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

test_others:
runs-on: ubuntu-latest
# needs: format_and_lint
strategy:
matrix:
python-version: ["3.11", "3.12"]

steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
cache-dependency-glob: "uv.lock"

# - name: Set up Python ${{ matrix.python-version }}
# uses: actions/setup-python@v5
# with:
# python-version: ${{ matrix.python-version }}

- name: Install the project
run: uv sync --extra all --dev

- name: Build
run: uv build

- name: Run other tests
run: uv run pytest -s tests/ --ignore=tests/test_feature_extractors.py --verbose
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

Expand All @@ -56,4 +103,4 @@ jobs:
- name: Check code formatting with Ruff
run: ruff format --diff --target-version=py311
# continue-on-error: true


1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ parts/
sdist/
var/
wheels/
weights/
share/python-wheels/
*.egg-info/
.installed.cfg
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@ A Protocol for End-to-End Deep Learning in Computational Pathology".

## Installing stamp

To install stamp, run:
We recommend installing STAMP with [uv](https://docs.astral.sh/uv/):
```bash
# We recommend using a virtual environment to install stamp
python -m venv .venv
. .venv/bin/activate
uv sync --all-extras

pip install "stamp[all] @ git+https://github.com/KatherLab/STAMP"
source .venv/bin/activate
```

> [!IMPORTANT]
Expand All @@ -43,16 +41,18 @@ pip install "stamp[all] @ git+https://github.com/KatherLab/STAMP"
If the installation was successful, running `stamp` in your terminal should yield the following output:
```
$ stamp
usage: stamp [-h] [--config CONFIG_FILE_PATH] {init,setup,preprocess,train,crossval,deploy,statistics,config,heatmaps} ...
usage: stamp [-h] [--config CONFIG_FILE_PATH] {init,preprocess,encode_slides,encode_patients,train,crossval,deploy,statistics,config,heatmaps} ...

STAMP: Solid Tumor Associative Modeling in Pathology

positional arguments:
{init,setup,preprocess,train,crossval,deploy,statistics,config,heatmaps}
{init,preprocess,encode_slides,encode_patients,train,crossval,deploy,statistics,config,heatmaps}
init Create a new STAMP configuration file at the path specified by --config
preprocess Preprocess whole-slide images into feature vectors
encode_slides Encode patch-level features into slide-level embeddings
encode_patients Encode features into patient-level embeddings
train Train a Vision Transformer model
crossval Train a Vision Transformer model with cross validation
crossval Train a Vision Transformer model with cross validation for modeling.n_splits folds
deploy Deploy a trained Vision Transformer model
statistics Generate AUROCs and AUPRCs with 95%CI for a trained Vision Transformer model
config Print the loaded configuration
Expand Down
22 changes: 22 additions & 0 deletions getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,21 @@ which in turn allows us to efficiently train machine learning models with them.

Stamp currently supports the following feature extractors:
- [ctranspath][ctranspath]
- [chief_ctranspath][chief_ctranspath]
- [DinoBloom][dinobloom]
- [CONCH][conch]
- [CONCHv1.5][conch1_5]
- [UNI][uni]
- [UNI2][uni2]
- [Virchow][virchow]
- [Virchow2][virchow2]
- [Gigapath][gigapath]
- [H-optimus-0][h_optimus_0]
- [H-optimus-1][h_optimus_1]
- [mSTAR][mstar]
- [MUSK][musk]
- [PLIP][plip]


As some of the above require you to request access to the model on huggingface,
we will stick with ctranspath for this example.
Expand Down Expand Up @@ -128,8 +139,19 @@ meaning ignored that it was ignored during feature extraction.
[ctranspath]: https://www.sciencedirect.com/science/article/abs/pii/S1361841522002043 "Transformer-based unsupervised contrastive learning for histopathological image classification"
[dinobloom]: https://github.com/marrlab/DinoBloom "DinoBloom: A Foundation Model for Generalizable Cell Embeddings in Hematology"
[uni]: https://www.nature.com/articles/s41591-024-02857-3 "Towards a general-purpose foundation model for computational pathology"
[uni2]: https://huggingface.co/MahmoodLab/UNI2-h
[conch]: https://www.nature.com/articles/s41591-024-02856-4 "A visual-language foundation model for computational pathology"
[conch1_5]: https://huggingface.co/MahmoodLab/conchv1_5
[virchow]: https://huggingface.co/paige-ai/Virchow "A foundation model for clinical-grade computational pathology and rare cancers detection"
[virchow2]: https://huggingface.co/paige-ai/Virchow2
[chief_ctranspath]: https://github.com/hms-dbmi/CHIEF
[gigapath]: https://huggingface.co/prov-gigapath/prov-gigapath
[h_optimus_0]: https://huggingface.co/bioptimus/H-optimus-0
[h_optimus_1]: https://huggingface.co/bioptimus/H-optimus-1
[mstar]: https://huggingface.co/Wangyh/mSTAR
[musk]: https://huggingface.co/xiangjx/musk
[plip]: https://github.com/PathologyFoundation/plip


## Doing Cross-Validation on the Data Set

Expand Down
62 changes: 37 additions & 25 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,59 +42,56 @@ dependencies = [
"torchmetrics>=1.6.0",
"torchvision>=0.20.1",
"tqdm>=4.66.6",
"timm>=0.9.11",
]

[project.optional-dependencies]
dinobloom = [
"torchvision>=0.20.1",
"xformers>=0.0.28.post3",
]
conch = [
"huggingface-hub>=0.26.2",
"conch @ git+https://github.com/Mahmoodlab/CONCH.git@02d6ac59cc20874bff0f581de258c2b257f69a84",
]
conch1_5 = [
"transformers>=4.45.2",
"einops-exts==0.0.4",
"torch>=2.0.0"
]
ctranspath = [
"gdown>=5.2.0",
"torchvision>=0.20.1",
]
chief_ctranspath = [
"gdown>=5.2.0",
"torchvision>=0.20.1",
"torch>=2.0.0"
]
gigapath = [
"timm>=0.9.11",
"torchvision>=0.20.1",
]
h_optimus_0 = [
"timm>=0.9.11",
"torchvision>=0.20.1",
]
h_optimus_1 = [
"timm>=0.9.11",
"torchvision>=0.20.1",
"gigapath @ git+https://github.com/EzicStar/prov-gigapath.git@d4cf55321df37aaf867e24a31c61bcf490a296eb"
]
uni = [
"huggingface-hub>=0.26.2",
#TODO change the git repo back to mahmoodlab's once our pull request has been accepted
"uni @ git+https://github.com/KatherLab/uni.git@f37c299eb0bffa0e585f120974082cfec6ee6d53",
]
uni2 = [
"timm>=0.9.11",
"torch>=2.0.0",
"uni @ git+https://github.com/mahmoodlab/UNI.git",
]
virchow2 = [
"huggingface-hub>=0.27.1",
"timm>=0.9.11",
"torch>=2.0.0",
]
cobra = [
"jinja2>=3.1.4",
"cobra @ git+https://github.com/KatherLab/COBRA.git@f1a576e1133330ffc2d1df6ee110701921c7b7c9",
]
prism = [
"sacremoses==0.1.1",
"environs==11.0.0",
]
madeleine = [
"madeleine @ git+https://github.com/mahmoodlab/MADELEINE.git@de7c85acc2bdad352e6df8eee5694f8b6f288012"
]
musk = [
"musk @ git+https://github.com/lilab-stanford/MUSK.git@e1699c27687f44bbf6d4adfcbb2abe89795d347f",
]
plip = [
"transformers>=4.45.2"
]

# Blanket target
all = ["stamp[dinobloom,conch,ctranspath,uni,virchow2]"]
all = ["stamp[conch,ctranspath,uni,virchow2,chief_ctranspath,conch1_5,prism,madeleine,musk,plip]"]

[project.scripts]
"stamp" = "stamp.__main__:main"
Expand Down Expand Up @@ -127,3 +124,18 @@ markers = [

[tool.ruff]
lint.ignore = ["F722"] # https://docs.kidger.site/jaxtyping/faq/#flake8-or-ruff-are-throwing-an-error

[[tool.uv.dependency-metadata]]
name = "uni"
version = "v0.1.0"
requires-dist = [
"torch>=2.0.1",
"torchvision",
"timm>=0.9.8",
"numpy",
"pandas",
"scikit-learn",
"tqdm",
"transformers",
"xformers; sys_platform != 'darwin'" # xformers is not supported on macOS
]
51 changes: 51 additions & 0 deletions src/stamp/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,49 @@ def _run_cli(args: argparse.Namespace) -> None:
generate_hash=config.preprocessing.generate_hash,
)

case "encode_slides":
from stamp.encoding import init_slide_encoder_

if config.slide_encoding is None:
raise ValueError("no slide encoding configuration supplied")

_add_file_handle_(_logger, output_dir=config.slide_encoding.output_dir)
_logger.info(
"using the following configuration:\n"
f"{yaml.dump(config.slide_encoding.model_dump(mode='json'))}"
)
init_slide_encoder_(
encoder=config.slide_encoding.encoder,
output_dir=config.slide_encoding.output_dir,
feat_dir=config.slide_encoding.feat_dir,
device=config.slide_encoding.device,
agg_feat_dir=config.slide_encoding.agg_feat_dir,
generate_hash=config.slide_encoding.generate_hash,
)

case "encode_patients":
from stamp.encoding import init_patient_encoder_

if config.patient_encoding is None:
raise ValueError("no patient encoding configuration supplied")

_add_file_handle_(_logger, output_dir=config.patient_encoding.output_dir)
_logger.info(
"using the following configuration:\n"
f"{yaml.dump(config.patient_encoding.model_dump(mode='json'))}"
)
init_patient_encoder_(
encoder=config.patient_encoding.encoder,
output_dir=config.patient_encoding.output_dir,
feat_dir=config.patient_encoding.feat_dir,
slide_table_path=config.patient_encoding.slide_table,
patient_label=config.patient_encoding.patient_label,
filename_label=config.patient_encoding.filename_label,
device=config.patient_encoding.device,
agg_feat_dir=config.patient_encoding.agg_feat_dir,
generate_hash=config.patient_encoding.generate_hash,
)

case "train":
from stamp.modeling.train import train_categorical_model_

Expand Down Expand Up @@ -249,6 +292,14 @@ def main() -> None:
commands.add_parser(
"preprocess", help="Preprocess whole-slide images into feature vectors"
)
commands.add_parser(
"encode_slides",
help="Encode patch-level features into slide-level embeddings",
)
commands.add_parser(
"encode_patients",
help="Encode features into patient-level embeddings",
)
commands.add_parser("train", help="Train a Vision Transformer model")
commands.add_parser(
"crossval",
Expand Down
15 changes: 15 additions & 0 deletions src/stamp/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import shutil
import urllib.request
from functools import cache
from pathlib import Path
from typing import Final

Expand Down Expand Up @@ -36,3 +37,17 @@ def download_file(*, url: str, file_name: str, sha256sum: str) -> Path:
def file_digest(file: str | Path) -> str:
with open(file, "rb") as fp:
return hashlib.file_digest(fp, "sha256").hexdigest()


@cache
def get_processing_code_hash(file_path: Path) -> str:
    """The hash of the entire processing codebase.

    Hashes every ``*.py`` file in the directory containing *file_path*,
    in sorted order so the digest is deterministic. It is used to assure
    that features extracted with different versions of this code base
    can be identified as such after the fact.

    Cached because the codebase does not change while the process runs.
    """
    hasher = hashlib.sha256()
    # Use a distinct loop variable: the original shadowed `file_path`,
    # which made the function's parameter unreadable mid-body.
    for source_file in sorted(file_path.parent.glob("*.py")):
        hasher.update(source_file.read_bytes())
    return hasher.hexdigest()
5 changes: 5 additions & 0 deletions src/stamp/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pydantic import BaseModel, ConfigDict

from stamp.encoding.config import PatientEncodingConfig, SlideEncodingConfig
from stamp.heatmaps.config import HeatmapConfig
from stamp.modeling.config import CrossvalConfig, DeploymentConfig, TrainConfig
from stamp.preprocessing.config import PreprocessingConfig
Expand All @@ -18,3 +19,7 @@ class StampConfig(BaseModel):
statistics: StatsConfig | None = None

heatmaps: HeatmapConfig | None = None

slide_encoding: SlideEncodingConfig | None = None

patient_encoding: PatientEncodingConfig | None = None
Loading