-
Notifications
You must be signed in to change notification settings - Fork 3
add slidescore url and slidescore study id to download manifest. #42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
a021ff7
46ebb0a
0a47294
c5e19a6
2790c91
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,9 @@ | |
| This module contains the CLI utilities that can be used with slidescore in python. | ||
|
|
||
| """ | ||
|
|
||
| # pylint: disable=duplicate-code | ||
|
|
||
| import argparse | ||
| import csv | ||
| import json | ||
|
|
@@ -15,7 +18,7 @@ | |
| from collections import defaultdict | ||
| from enum import Enum | ||
| from pathlib import Path | ||
| from typing import Iterable, Optional | ||
| from typing import Iterable, List, Optional, Union | ||
|
|
||
| import shapely.geometry | ||
| from tqdm import tqdm | ||
|
|
@@ -70,6 +73,7 @@ def parse_api_token(data: Optional[Path] = None) -> str: | |
|
|
||
|
|
||
| def _shapely_to_slidescore(shapely_object): | ||
| # pylint: disable=too-many-branches | ||
| shapely_type = type(shapely_object) | ||
| if shapely_type == shapely.geometry.Polygon: | ||
| if len(shapely_object.interiors) != 0: | ||
|
|
@@ -351,30 +355,152 @@ def _download_labels(args: argparse.Namespace) -> None: | |
| ) | ||
|
|
||
|
|
||
| def append_to_manifest(save_dir: pathlib.Path, image_id: int, filename: pathlib.Path) -> None: | ||
| def set_study_details_to_mapping( | ||
| save_dir: pathlib.Path, mapping_format: str, slidescore_url: str, study_id: int | ||
| ) -> None: | ||
| """ | ||
| Sets the slidescore study details to the mapping file | ||
|
|
||
| Parameters | ||
| ---------- | ||
|
|
||
| save_dir: pathlib.Path | ||
| mapping_format: str | ||
| slidesore_url: str | ||
| study_id: int | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
||
| None | ||
| """ | ||
| if mapping_format == "json": | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, keys=[slidescore_url, str(study_id), "slidescore_url"], value=slidescore_url | ||
| ) | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, keys=[slidescore_url, str(study_id), "slidescore_study_id"], value=study_id | ||
| ) | ||
| elif mapping_format == "tsv": | ||
| append_to_tsv_mapping(save_dir=save_dir, items=[f"# {slidescore_url}"]) | ||
| append_to_tsv_mapping(save_dir=save_dir, items=[f"# {study_id}"]) | ||
| else: | ||
| raise ValueError(f"mapping_format should be either 'tsv' or 'json', but is {mapping_format}") | ||
|
|
||
|
|
||
| def append_to_tsv_mapping(save_dir: pathlib.Path, items: List[str]) -> None: | ||
| """ | ||
| Create a manifest mapping image id to image name | ||
|
|
||
| Creates a file that looks like | ||
| ```txt | ||
| # <slidescore_url> | ||
| # <slidescore_study_id> | ||
| <image_id_1> <image_name_1> | ||
| ... | ||
| ``` | ||
|
|
||
| Parameters | ||
| ---------- | ||
| save_dir: pathlib.Path | ||
| items: List[str] | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
| """ | ||
| if not save_dir.is_dir(): | ||
| save_dir.mkdir(parents=True) | ||
| tab = "\t" | ||
| with open(save_dir / "slidescore_mapping.tsv", "a+", encoding="utf-8") as file: | ||
| file.write(f"{tab.join(items)}\n") | ||
|
|
||
|
|
||
| def append_to_json_mapping( | ||
| save_dir: pathlib.Path, | ||
| keys: List[str], | ||
| value: Union[pathlib.Path, int, str], | ||
| filename: str = "slidescore_mapping.json", | ||
| ) -> None: | ||
| """ | ||
| Create a manifest mapping image id to the filename. | ||
| Generic method to append a hierarchical key structure with one value to a dictionary in a json file to fix | ||
| missing functionality in python dict classes | ||
|
|
||
| We wish to get | ||
| >> {}[['a', 'b']] = 1 | ||
| {'a': {'b': 1}} | ||
|
|
||
| But this is not possible in a python dict, and throws an error | ||
|
|
||
| This function is used to create a slidescore_mapping.json file, mapping slidescore ID (unique) to image name, | ||
| created when downloading WSIs from slidescores, or when just creating the slidescore_mapping. | ||
|
|
||
| Will end up a file like | ||
| { | ||
| "url": { | ||
| "study_id": { | ||
| "slidescore_url": "url", | ||
| "slidescore_study_id": study_id, | ||
| "slide_filename_to_study_image_id_mapping": { | ||
| "image_id": "image_name", | ||
| ... | ||
| ... | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Parameters | ||
| ---------- | ||
| save_dir : pathlib.Path | ||
| image_id : int | ||
| filename : pathlib.Path | ||
| keys : List[Union[str, pathlib.Path]], sets the hierarchical keys to be set. | ||
| E.g. ['slide_filename_id_mapping', 'filename'] | ||
| sets manifest['slide_filename_id_mapping']['filename'] to the given slidescore ID | ||
| value : Union[int, pathlib.Path, str], value to be set. Generally either a URL, a path, or an integer ID | ||
| filename : fileanme for json file | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
| """ | ||
| with open(save_dir / "slidescore_mapping.txt", "a", encoding="utf-8") as file: | ||
| file.write(f"{image_id} {filename.name}\n") | ||
| # Make dir if it doesn't exist. Usage in the CLI, however, places it in an existing directory | ||
| if not save_dir.is_dir(): | ||
| save_dir.mkdir(parents=True) | ||
|
|
||
| value = value.name if isinstance(value, pathlib.Path) else value | ||
| config_filepath = save_dir / filename | ||
|
|
||
| try: | ||
| # Read file if it exists | ||
| with open(config_filepath, mode="r", encoding="utf-8") as file: | ||
| obj = json.load(file) | ||
| except FileNotFoundError: | ||
| # Otherwise create new object | ||
| obj = {} | ||
|
|
||
| new_obj = obj # Create a pointer that we can update | ||
| for idx, key in enumerate(keys): | ||
| if idx == len(keys) - 1: # At the last node | ||
| new_obj[key] = value # Set the leaf | ||
| else: | ||
| if key not in new_obj.keys(): | ||
| new_obj[key] = {} # Not at the last node and the key does not exist; make a subdict | ||
| new_obj = new_obj[key] # Update pointer | ||
|
|
||
| # Save file, overwriting the old file | ||
| with open(config_filepath, mode="w", encoding="utf-8") as file: | ||
| json.dump(obj, file, ensure_ascii=False, indent=4) | ||
|
|
||
|
|
||
| # pylint: disable=too-many-arguments, too-many-branches | ||
| def download_wsis( | ||
| slidescore_url: str, | ||
| api_token: str, | ||
| study_id: int, | ||
| save_dir: pathlib.Path, | ||
| disable_certificate_check: bool = False, | ||
| disable_download: bool = False, | ||
| mapping_format: str = "json", | ||
| ) -> None: | ||
| """ | ||
| Download all WSIs for a given study from SlideScore | ||
|
|
@@ -386,6 +512,9 @@ def download_wsis( | |
| study_id : int | ||
| save_dir : pathlib.Path | ||
| disable_certificate_check : bool | ||
| disable_download : bool | ||
| mapping_format: str | ||
| either of "json" or "tsv" | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -399,23 +528,49 @@ def download_wsis( | |
| # Collect image metadata | ||
| images = client.get_images(study_id) | ||
|
|
||
| # Add study details to mapping manifest | ||
| set_study_details_to_mapping( | ||
| save_dir=save_dir, mapping_format=mapping_format, slidescore_url=slidescore_url, study_id=study_id | ||
| ) | ||
|
|
||
| # Download and save WSIs | ||
| for image in tqdm(images): | ||
| image_id = image["id"] | ||
| image_name = image["name"] | ||
| if not disable_download: | ||
| logger.info("Downloading image for id: %s", image_id) | ||
| filename = client.download_slide(study_id, image, save_dir=save_dir) | ||
| logger.info("Image with id %s has been saved to %s.", image_id, filename) | ||
| if mapping_format == "json": | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, | ||
| keys=[slidescore_url, str(study_id), "slide_filename_to_study_image_id_mapping", str(image_id)], | ||
| value=image_name, | ||
| ) | ||
| elif mapping_format == "tsv": | ||
| append_to_tsv_mapping( | ||
| save_dir=save_dir, | ||
| items=[str(image_id), image_name], | ||
| ) | ||
|
Comment on lines
+550
to
+554
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If not one of these, it's better to raise an error
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Thought this was done by the argparser, but raising error here makes the function more generic |
||
|
|
||
|
|
||
| logger.info("Downloading image for id: %s", image_id) | ||
| filename = client.download_slide(study_id, image, save_dir=save_dir) | ||
| logger.info("Image with id %s has been saved to %s.", image_id, filename) | ||
| append_to_manifest(save_dir, image_id, filename) | ||
| def _download_mapping(args: argparse.Namespace): | ||
| """Main function that downloads only the mapping from filename to slidescore slide ID | ||
|
|
||
| Calls _download_wsi while setting `disable_download=True` | ||
| """ | ||
| _download_wsi(args=args, disable_download=True) | ||
|
|
||
|
|
||
| def _download_wsi(args: argparse.Namespace): | ||
| def _download_wsi(args: argparse.Namespace, disable_download=False): | ||
| """Main function that downloads WSIs from SlideScore. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| args: argparse.Namespace | ||
| The arguments passed from the CLI. Run with `-h` to see the required parameters | ||
| disable_download: bool | ||
| If download is disabled, only the mapping is saved. Can also be used to debug. | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -429,6 +584,8 @@ def _download_wsi(args: argparse.Namespace): | |
| args.study_id, | ||
| args.output_dir, | ||
| disable_certificate_check=args.disable_certificate_check, | ||
| disable_download=disable_download, | ||
| mapping_format=args.mapping_format, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -445,6 +602,43 @@ def register_parser(parser: argparse._SubParsersAction): | |
|
|
||
| download_wsi_parser.set_defaults(subcommand=_download_wsi) | ||
|
|
||
| download_wsi_parser.add_argument( | ||
| "--mapping-format", | ||
| dest="mapping_format", | ||
| type=str, | ||
| help="Save mapping as either json or tsv", | ||
| choices=["tsv", "json"], | ||
| required=False, | ||
| default="tsv", | ||
| ) | ||
|
|
||
| download_mapping_parser = parser.add_parser( | ||
| "download-study-slide-mapping", | ||
| help="Download the download_config.json" | ||
| " with url, study id, and file to " | ||
| "slidescore ID mapping from SlideScore" | ||
| " without downloading the WSIs. " | ||
| "Useful if slides are already on disk," | ||
| "but slidescore information is not", | ||
| ) | ||
| download_mapping_parser.add_argument( | ||
| "output_dir", | ||
| type=pathlib.Path, | ||
| help="Directory to save output too.", | ||
| ) | ||
|
|
||
| download_mapping_parser.add_argument( | ||
| "--mapping-format", | ||
| dest="mapping_format", | ||
| type=str, | ||
| help="Save mapping as either json or tsv", | ||
| choices=["tsv", "json"], | ||
| required=False, | ||
| default="tsv", | ||
| ) | ||
|
|
||
| download_mapping_parser.set_defaults(subcommand=_download_mapping) | ||
|
|
||
| download_label_parser = parser.add_parser("download-labels", help="Download labels from SlideScore.") | ||
| download_label_parser.add_argument( | ||
| "-q", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there no other way to get the mapping only?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not that i can think of without duplicating a lot of code or entirely refactoring the function into less coherent parts.