-
Notifications
You must be signed in to change notification settings - Fork 33
Implement Feature/tiger-wsibulk #884
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
400dff8
37d4950
41e96c7
a23b673
b447b0d
247e629
9137aad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Plz add this config to tests/eva/vision/test_vision_cli.py (at least to test_configuration_initialization, ideally also to test_predict_fit_from_configuration), so we can test for instantiation errors
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding some feedback I got from claude / codex here: • src/eva/vision/data/datasets/tiger.py:108: _load_file_paths only glob matches *.tif. The TIGER docs you just added (docs/datasets/tiger.md, see the sample tree) show the WSIs using the .tiff extension. • src/eva/vision/data/datasets/classification/tiger_wsibulk.py:86: the dataset now emits progress via print every time annotations are built. In multi-worker loaders this will spam STDOUT and can even deadlock under some multiprocessing backends; please route this through the project logger (or make it optional). |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,134 @@ | ||
| --- | ||
| trainer: | ||
| class_path: eva.Trainer | ||
| init_args: | ||
| n_runs: &N_RUNS ${oc.env:N_RUNS, 20} | ||
| default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_wsibulk} | ||
| max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 100} | ||
| checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best} | ||
| callbacks: | ||
| - class_path: eva.callbacks.ConfigurationLogger | ||
| - class_path: lightning.pytorch.callbacks.TQDMProgressBar | ||
| init_args: | ||
| refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1} | ||
| - class_path: lightning.pytorch.callbacks.LearningRateMonitor | ||
| init_args: | ||
| logging_interval: epoch | ||
| - class_path: lightning.pytorch.callbacks.ModelCheckpoint | ||
| init_args: | ||
| filename: best | ||
| save_last: ${oc.env:SAVE_LAST, false} | ||
| save_top_k: 1 | ||
| monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy} | ||
| mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max} | ||
| - class_path: lightning.pytorch.callbacks.EarlyStopping | ||
| init_args: | ||
| min_delta: 0 | ||
| patience: ${oc.env:PATIENCE, 20} | ||
| monitor: *MONITOR_METRIC | ||
| mode: *MONITOR_METRIC_MODE | ||
| - class_path: eva.callbacks.ClassificationEmbeddingsWriter | ||
| init_args: | ||
| output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_wsibulk} | ||
| dataloader_idx_map: | ||
| 0: train | ||
| 1: val | ||
| 2: test | ||
| metadata_keys: ["wsi_id"] | ||
| backbone: | ||
| class_path: eva.vision.models.ModelFromRegistry | ||
| init_args: | ||
| model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino} | ||
| model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null} | ||
| overwrite: true | ||
| logger: | ||
| - class_path: lightning.pytorch.loggers.TensorBoardLogger | ||
| init_args: | ||
| save_dir: *OUTPUT_ROOT | ||
| name: "" | ||
| model: | ||
| class_path: eva.HeadModule | ||
| init_args: | ||
| head: | ||
| class_path: eva.vision.models.networks.ABMIL | ||
| init_args: | ||
| input_size: ${oc.env:IN_FEATURES, 384} | ||
| output_size: &NUM_CLASSES 1 | ||
| projected_input_size: 128 | ||
| criterion: torch.nn.BCEWithLogitsLoss | ||
| optimizer: | ||
| class_path: torch.optim.AdamW | ||
| init_args: | ||
| lr: ${oc.env:LR_VALUE, 0.001} | ||
| betas: [0.9, 0.999] | ||
| metrics: | ||
| common: | ||
| - class_path: eva.metrics.AverageLoss | ||
| - class_path: eva.metrics.BinaryClassificationMetrics | ||
| data: | ||
| class_path: eva.DataModule | ||
| init_args: | ||
| datasets: | ||
| train: | ||
| class_path: eva.datasets.MultiEmbeddingsClassificationDataset | ||
| init_args: &DATASET_ARGS | ||
| root: *DATASET_EMBEDDINGS_ROOT | ||
| manifest_file: manifest.csv | ||
| split: train | ||
| embeddings_transforms: | ||
| class_path: eva.core.data.transforms.Pad2DTensor | ||
| init_args: | ||
| pad_size: &N_PATCHES ${oc.env:N_PATCHES, 200} | ||
| target_transforms: | ||
| class_path: eva.core.data.transforms.dtype.ArrayToFloatTensor | ||
| val: | ||
| class_path: eva.datasets.MultiEmbeddingsClassificationDataset | ||
| init_args: | ||
| <<: *DATASET_ARGS | ||
| split: val | ||
| test: | ||
| class_path: eva.datasets.MultiEmbeddingsClassificationDataset | ||
| init_args: | ||
| <<: *DATASET_ARGS | ||
| split: test | ||
| predict: | ||
| - class_path: eva.vision.datasets.TIGERWsiBulk | ||
| init_args: &PREDICT_DATASET_ARGS | ||
| root: ${oc.env:DATA_ROOT, ./data/training/wsibulk} | ||
| sampler: | ||
| class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler | ||
| init_args: | ||
| max_samples: *N_PATCHES | ||
| embeddings_dir: *DATASET_EMBEDDINGS_ROOT | ||
| width: 224 | ||
| height: 224 | ||
| split: train | ||
| coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv | ||
| image_transforms: | ||
| class_path: eva.vision.data.transforms.common.ResizeAndCrop | ||
| init_args: | ||
| size: ${oc.env:RESIZE_DIM, 224} | ||
| mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} | ||
| std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} | ||
| - class_path: eva.vision.datasets.TIGERWsiBulk | ||
| init_args: | ||
| <<: *PREDICT_DATASET_ARGS | ||
| split: val | ||
| - class_path: eva.vision.datasets.TIGERWsiBulk | ||
| init_args: | ||
| <<: *PREDICT_DATASET_ARGS | ||
| split: test | ||
| dataloaders: | ||
| train: | ||
| batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32} | ||
| num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4} | ||
| shuffle: true | ||
| val: | ||
| batch_size: *BATCH_SIZE | ||
| num_workers: *N_DATA_WORKERS | ||
| test: | ||
| batch_size: *BATCH_SIZE | ||
| num_workers: *N_DATA_WORKERS | ||
| predict: | ||
| batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64} | ||
| num_workers: *N_DATA_WORKERS |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| # TIGER (Tumor Infiltrating Lymphocytes in breast cancER) | ||
|
|
||
| TIGER contains whole-slide images (WSIs) of Her2 positive (Her2+) and Triple Negative (TNBC) breast cancer, together with manual annotations. Training data comes from multiple sources. A subset of Her2+ and TNBC cases is provided by the Radboud University Medical Center (RUMC) (Nijmegen, Netherlands). A second subset of Her2+ and TNBC cases is provided by the Jules Bordet Institut (JB) (Bruxelles, Belgium). A third subset, containing TNBC cases only, is derived from the TCGA-BRCA archive obtained from the Genomic Data Commons Data Portal. | ||
|
|
||
| It contains 3 different datasets and thus 3 different tasks to add to eva. | ||
|
|
||
| WSIBULK - WSI level classification task: Detecting tumour presence in patches of a given slide. | ||
| WSITILS - Regression task: predicting "TIL" score of a whole slide image. | ||
| WSIROIS - Cell level segmentation task: predicting boundaries of TIL cells. | ||
|
|
||
| However, only WSIBULK and WSITILS are currently implemented. | ||
|
|
||
| Source: https://tiger.grand-challenge.org/Data/ | ||
|
|
||
|
|
||
| ## Raw data | ||
|
|
||
| ### Key stats | ||
|
|
||
| | | | | ||
| |---------------------------|----------------------------------------------------------| | ||
| | **Modality** | Vision (WSI) | | ||
| | **Tasks** | Binary Classification / Regression | | ||
| | **Cancer type** | Breast | | ||
| | **Data size** | 182 GB | | ||
| | **Image dimension** | ~20k x 20k x 3 | | ||
| | **Magnification (μm/px)** | 20x (0.5) - Level 0 | | ||
| | **Files format** | `.tif` | | ||
| | **Number of images** | 178 WSIs (96 for WSIBULK and 82 for WSITILS) | | ||
|
|
||
|
|
||
| ### Organization | ||
|
|
||
| The data `tiger.zip` from [grand challenge](https://tiger.grand-challenge.org/) is organized as follows: | ||
|
|
||
| training/ | ||
| |_wsibulk/ * Used for classification task | ||
| | |__annotations-tumor-bulk/ * Manual annotations of "tumor bulk" regions | ||
| | | |___masks/ * Binary masks in TIF format | ||
| | | |___xmls/ * Not used in eva | ||
| | |__images/ * Whole-Slide Images | ||
| | │ ├── 103S.tif | ||
| │ | └── ... | ||
| | |__tissue-masks/ * Not used in eva | ||
| | | ||
jklubienski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| |_wsirois/ * Not used in eva currently | ||
| | | ||
| |_wsitils/ * Used for regression task | ||
| | |__images/ * Whole-slide images | ||
| | │ ├── 104S.tif | ||
| │ | └── ... | ||
| | |__tissue-masks/ * Not used in eva | ||
| | |__tiger-til-scores-wsitils.csv * Target variable file | ||
|
|
||
|
|
||
| ## Download and preprocessing | ||
|
|
||
| The `TIGER` dataset class doesn't download the data during runtime; the data must be downloaded manually as follows: | ||
|
|
||
| - Make sure that the latest version of the AWS CLI is installed on your system by following [these instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) | ||
|
|
||
| With the AWS CLI installed, you can download the official training set (no AWS account required) by running: | ||
|
|
||
| `aws s3 cp s3://tiger-training/ /path/to/destination/ --recursive --no-sign-request` | ||
jklubienski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| These instructions can also be found on the official challenge page [here](https://tiger.grand-challenge.org/Data/) | ||
|
|
||
| We then generate random stratified train / validation / test splits using a 0.7 / 0.15 / 0.15 ratio. | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| """tiger_wsibulk dataset class.""" | ||
|
|
||
| import ast | ||
| import functools | ||
| import os | ||
| from pathlib import Path | ||
| from typing import Dict | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import tifffile as tiff | ||
| import torch | ||
| from typing_extensions import override | ||
|
|
||
| from eva.core.utils.progress_bar import tqdm | ||
| from eva.vision.data.datasets import _validators, tiger | ||
| from eva.vision.data.wsi.patching import PatchCoordinates, samplers | ||
|
|
||
|
|
||
class TIGERWsiBulk(tiger.TIGERBase):
    """Dataset class for the TIGER tumor detection task.

    Splits a slide-level WSI into multiple patch-level samples, dynamically
    assigning each patch a label based on its overlap with a binary mask.
    """

    _expected_dataset_lengths: Dict[str | None, int] = {
        "train": 65,
        "val": 13,
        "test": 15,
        None: 93,
    }
    """Expected number of WSIs per split, used for validation.
    Can be overridden for unit tests."""

    _tumor_mask_threshold: float = 0.5
    """Fraction of a patch that must be covered by the mask for the patch to be
    annotated as "tumor" (1)."""

    _target_mpp: float = 0.5
    """Target resolution in microns per pixel.
    Set as a constant in this implementation to ensure no mismatches with the binary mask."""

    def __init__(
        self,
        root: str,
        sampler: samplers.Sampler,
        embeddings_dir: str,
        **kwargs,
    ) -> None:
        """Initializes dataset.

        Args:
            root: Root directory of the dataset.
            sampler: The sampler to use for sampling patch coordinates.
            embeddings_dir: Directory where the patch data is stored. The
                `coords_<split>.csv` files used to build the annotations are
                read from here.
            kwargs: Key-word arguments from the base class.
        """
        # Stored before the base initializer runs so it is already available
        # should the base class touch `annotations` during its own setup.
        self._embeddings_dir = embeddings_dir
        super().__init__(root=root, sampler=sampler, **kwargs)

    @functools.cached_property
    def annotations(self) -> Dict[str, int]:
        """Builds per-patch labels from the coords CSV files and mask .tif images.

        Only the CSV of the current split is read when the split is one of
        "train", "val" or "test"; otherwise all three are read. The result is
        computed once and cached on the instance.

        Returns:
            A dict: { "img_name-patch_index": label }
        """
        known_splits = ["train", "val", "test"]
        splits_to_load = [self._split] if self._split in known_splits else known_splits
        csv_folder = os.path.normpath(self._embeddings_dir)

        annotations: Dict[str, int] = {}
        for split in splits_to_load:
            df = pd.read_csv(os.path.join(csv_folder, f"coords_{split}.csv"))

            for row in tqdm(df.itertuples(index=False), total=len(df), desc=f"[{split}]"):
                # `x_y` comes back from the CSV as the string form of a list of
                # (x, y) pairs; parse it safely with `literal_eval`.
                x_y = row.x_y
                coords = PatchCoordinates(
                    x_y=ast.literal_eval(x_y) if isinstance(x_y, str) else x_y,
                    width=int(row.width),
                    height=int(row.height),
                    level_idx=int(row.level_idx),
                    mask=None,
                )
                annotations.update(
                    self._process_patch_coordinates(row.file, coords, self._tumor_mask_threshold)
                )

        return annotations

    def _process_patch_coordinates(
        self, file: str, coords: PatchCoordinates, threshold: float
    ) -> Dict[str, int]:
        """Labels every patch of one slide using its tumor-bulk mask.

        Args:
            file: Path (or file name) of the slide; its stem selects the mask
                `<root>/annotations-tumor-bulk/masks/<stem>.tif`.
            coords: Patch coordinates (top-left `x_y` pairs, `width`, `height`).
            threshold: Fraction of non-zero mask pixels above which a patch is
                labelled 1.

        Returns:
            A dict: { "img_name-patch_index": label }
        """
        img_name = Path(file).stem
        patch_w = int(coords.width)
        patch_h = int(coords.height)

        mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif")
        mask = tiff.imread(mask_path)

        annotations: Dict[str, int] = {}
        for idx, (x, y) in enumerate(coords.x_y):
            patch_region = mask[y : y + patch_h, x : x + patch_w]
            if patch_region.size == 0:
                # A patch entirely outside the mask yields an empty slice. Label
                # it as non-tumor explicitly instead of averaging an empty array
                # (which gives NaN and emits a RuntimeWarning).
                label = 0
            else:
                # Patches that only partially overlap the mask are scored on
                # the overlapping part, since slicing truncates at the border.
                label = int(np.mean(patch_region > 0) > threshold)
            annotations[f"{img_name}-{idx}"] = label

        return annotations

    @override
    def prepare_data(self) -> None:
        """Runs `_validators.check_dataset_exists` on the dataset root."""
        _validators.check_dataset_exists(self._root, False)

    @override
    def validate(self) -> None:
        """Checks the number of slide files against the expected count for the split."""
        _validators.check_number_of_files(
            self._file_paths, self._expected_dataset_lengths[self._split], self._split
        )

    @override
    def load_target(self, index: int) -> torch.Tensor:
        """Returns the label of the sample at `index` as an int64 scalar tensor.

        Args:
            index: Index of the patch-level sample.

        Returns:
            The label stored in `annotations` for the sample's slide and patch.

        Raises:
            KeyError: If no annotation exists for the slide/patch pair.
        """
        metadata = self.load_metadata(index)
        slide_name = Path(self._file_paths[metadata["slide_idx"]]).stem
        key = f"{slide_name}-{metadata['patch_idx']}"
        return torch.tensor(self.annotations[key], dtype=torch.int64)
Uh oh!
There was an error while loading. Please reload this page.