diff --git a/configs/vision/pathology/offline/classification/tiger_wsibulk.yaml b/configs/vision/pathology/offline/classification/tiger_wsibulk.yaml new file mode 100644 index 000000000..9cabb2c1e --- /dev/null +++ b/configs/vision/pathology/offline/classification/tiger_wsibulk.yaml @@ -0,0 +1,134 @@ +--- +trainer: + class_path: eva.Trainer + init_args: + n_runs: &N_RUNS ${oc.env:N_RUNS, 20} + default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_wsibulk} + max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 100} + checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best} + callbacks: + - class_path: eva.callbacks.ConfigurationLogger + - class_path: lightning.pytorch.callbacks.TQDMProgressBar + init_args: + refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1} + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: epoch + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + filename: best + save_last: ${oc.env:SAVE_LAST, false} + save_top_k: 1 + monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy} + mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max} + - class_path: lightning.pytorch.callbacks.EarlyStopping + init_args: + min_delta: 0 + patience: ${oc.env:PATIENCE, 20} + monitor: *MONITOR_METRIC + mode: *MONITOR_METRIC_MODE + - class_path: eva.callbacks.ClassificationEmbeddingsWriter + init_args: + output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_wsibulk} + dataloader_idx_map: + 0: train + 1: val + 2: test + metadata_keys: ["wsi_id"] + backbone: + class_path: eva.vision.models.ModelFromRegistry + init_args: + model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino} + model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null} + overwrite: true + logger: + - class_path: lightning.pytorch.loggers.TensorBoardLogger + init_args: + save_dir: *OUTPUT_ROOT + name: "" 
+model: + class_path: eva.HeadModule + init_args: + head: + class_path: eva.vision.models.networks.ABMIL + init_args: + input_size: ${oc.env:IN_FEATURES, 384} + output_size: &NUM_CLASSES 1 + projected_input_size: 128 + criterion: torch.nn.BCEWithLogitsLoss + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: ${oc.env:LR_VALUE, 0.001} + betas: [0.9, 0.999] + metrics: + common: + - class_path: eva.metrics.AverageLoss + - class_path: eva.metrics.BinaryClassificationMetrics +data: + class_path: eva.DataModule + init_args: + datasets: + train: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: &DATASET_ARGS + root: *DATASET_EMBEDDINGS_ROOT + manifest_file: manifest.csv + split: train + embeddings_transforms: + class_path: eva.core.data.transforms.Pad2DTensor + init_args: + pad_size: &N_PATCHES ${oc.env:N_PATCHES, 200} + target_transforms: + class_path: eva.core.data.transforms.dtype.ArrayToFloatTensor + val: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: val + test: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: test + predict: + - class_path: eva.vision.datasets.TIGERWsiBulk + init_args: &PREDICT_DATASET_ARGS + root: ${oc.env:DATA_ROOT, ./data/training/wsibulk} + sampler: + class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler + init_args: + max_samples: *N_PATCHES + embeddings_dir: *DATASET_EMBEDDINGS_ROOT + width: 224 + height: 224 + split: train + coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv + image_transforms: + class_path: eva.vision.data.transforms.common.ResizeAndCrop + init_args: + size: ${oc.env:RESIZE_DIM, 224} + mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} + std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} + - class_path: eva.vision.datasets.TIGERWsiBulk + init_args: + <<: *PREDICT_DATASET_ARGS + split: val + - class_path: 
eva.vision.datasets.TIGERWsiBulk + init_args: + <<: *PREDICT_DATASET_ARGS + split: test + dataloaders: + train: + batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32} + num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4} + shuffle: true + val: + batch_size: *BATCH_SIZE + num_workers: *N_DATA_WORKERS + test: + batch_size: *BATCH_SIZE + num_workers: *N_DATA_WORKERS + predict: + batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64} + num_workers: *N_DATA_WORKERS diff --git a/docs/datasets/tiger.md b/docs/datasets/tiger.md new file mode 100644 index 000000000..ed0d26f21 --- /dev/null +++ b/docs/datasets/tiger.md @@ -0,0 +1,68 @@ +# TIGER (Tumor Infiltrating Lymphocytes in breast cancER) + +TIGER contains digital pathology images of Her2 positive (Her2+) and Triple Negative (TNBC) breast cancer whole-slide images, together with manual annotations. Training data comes from multiple sources. A subset of Her2+ and TNBC cases is provided by the Radboud University Medical Center (RUMC) (Nijmegen, Netherlands). A second subset of Her2+ and TNBC cases is provided by the Jules Bordet Institut (JB) (Bruxelles, Belgium). A third subset of TNBC cases only is derived from the TCGA-BRCA archive obtained from the Genomic Data Commons Data Portal. + +It contains 3 different datasets and thus 3 different tasks to add to eva. + +WSIBULK - WSI level classification task: Detecting tumour presence in patches of a given slide. +WSITILS - Regression task: predicting "TIL" score of a whole slide image. +WSIROIS - Cell level segmentation task: predicting boundaries of TIL cells. + +However only WSIBULK and WSITILS are currently implemented. 
+ +Source: https://tiger.grand-challenge.org/Data/ + + +## Raw data + +### Key stats + +| | | +|---------------------------|----------------------------------------------------------| +| **Modality** | Vision (WSI) | +| **Tasks** | Binary Classification / Regression | +| **Cancer type** | Breast | +| **Data size** | 182 GB | +| **Image dimension** | ~20k x 20k x 3 | +| **Magnification (μm/px)** | 20x (0.5) - Level 0 | +| **Files format** | `.tif` | +| **Number of images** | 178 WSIs (96 for WSIBULK and 82 for WSITILS) | + + +### Organization + +The data `tiger.zip` from [grand challenge](https://tiger.grand-challenge.org/) is organized as follows: + +training/ + |_wsibulk/ * Used for classification task + | |__annotations-tumor-bulk/ * Manual annotations of "tumor bulk" regions + | | |___masks/ * Binary masks in TIF format + | | |___xmls/ * Not used in eva + | |__images/ * Whole-Slide Images + | │ ├── 103S.tif + │ | └── ... + | |__tissue-masks/ * Not used in eva + | + |_wsirois/ * Not used in eva currently + | + |_wsitils/ * Used for regression task + | |__images/ * Whole-slide images + | │ ├── 104S.tif + │ | └── ... 
+ | |__tissue-masks/ * Not used in eva + | |__tiger-til-scores-wsitils.csv * Target variable file + + +## Download and preprocessing + +The `TIGER` dataset class doesn't download the data during runtime and must be downloaded manually as follows: + +- Make sure that the latest version of the AWS CLI is installed on your system by following [these instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) + +With the AWS CLI installed, you can download the official training set (no AWS account required) by running: + +`aws s3 cp s3://tiger-training/ /path/to/destination/ --recursive --no-sign-request` + +These instructions can also be found on the official challenge page [here](https://tiger.grand-challenge.org/Data/) + +We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio. \ No newline at end of file diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py index 95ed8d847..210c60048 100644 --- a/src/eva/vision/data/datasets/__init__.py +++ b/src/eva/vision/data/datasets/__init__.py @@ -11,6 +11,7 @@ GleasonArvaniti, PANDASmall, PatchCamelyon, + TIGERWsiBulk, UniToPatho, WsiClassificationDataset, ) @@ -49,4 +50,5 @@ "VisionDataset", "MultiWsiDataset", "WsiDataset", + "TIGERWsiBulk", ] diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index cd60020d1..e2bb3b894 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -9,6 +9,7 @@ from eva.vision.data.datasets.classification.mhist import MHIST from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon +from eva.vision.data.datasets.classification.tiger_wsibulk import TIGERWsiBulk from eva.vision.data.datasets.classification.unitopatho import 
UniToPatho from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset @@ -25,5 +26,6 @@ "WsiClassificationDataset", "PANDA", "PANDASmall", "Camelyon16", + "TIGERWsiBulk", ] diff --git a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py new file mode 100644 index 000000000..e4f9fdb5b --- /dev/null +++ b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py @@ -0,0 +1,150 @@ +"""tiger_wsibulk dataset class.""" + +import ast +import functools +import os +from pathlib import Path +from typing import Dict + +import numpy as np +import pandas as pd +import tifffile as tiff +import torch +from typing_extensions import override + +from eva.core.utils.progress_bar import tqdm +from eva.vision.data.datasets import _validators, tiger +from eva.vision.data.wsi.patching import PatchCoordinates, samplers + + +class TIGERWsiBulk(tiger.TIGERBase): + """Dataset class for the TIGER tumor detection task. + + Splits a slide-level WSI into multiple different patch level samples, + dynamically assigning them labels based on their overlaps with a binary mask. + """ + + _expected_dataset_lengths: Dict[str | None, int] = { + "train": 65, + "val": 13, + "test": 15, + None: 93, + } + """Represents the expected numbers of WSIs in the dataset for validation. + Can be overridden for unit tests""" + + _tumor_mask_threshold: float = 0.5 + """ Proportion of the patch that needs to be covered by the mask in order for it to + be annotated as a "tumor" (1)""" + + _target_mpp: float = 0.5 + """Microns per pixel, in this case stating that a pixel covers 0.5 microns per pixel + Set as a constant in this implementation to ensure no mismatches with the binary mask""" + + def __init__( + self, + root: str, + sampler: samplers.Sampler, + embeddings_dir: str, + **kwargs, + ) -> None: + """Initializes dataset. + + Args: + root: Root directory of the dataset.
+ sampler: The sampler to use for sampling patch coordinates. + embeddings_dir: Directory where the patch data is stored. Used for annotations. + kwargs: Key-word arguments from the base class. + """ + self._embeddings_dir = embeddings_dir + super().__init__(root=root, sampler=sampler, **kwargs) + + @functools.cached_property + def annotations(self) -> Dict[str, int]: + """Builds per-patch labels from the coords CSV files and mask .tif images. + + Returns: + A dict: { "img_name-patch_index": label } + """ + annotations = {} + + csv_folder = os.path.normpath(self._embeddings_dir) + + split_to_csv = { + split: os.path.join(csv_folder, f"coords_{split}.csv") + for split in ["train", "val", "test"] + } + + splits_to_load = ( + [self._split] if self._split in ["train", "val", "test"] else ["train", "val", "test"] + ) + + for split in splits_to_load: + csv_path = split_to_csv[split] + df = pd.read_csv(csv_path) + n_rows = len(df) + + for row in tqdm(df.itertuples(index=False), total=n_rows, desc=f"[{split}]"): + + file_name = row.file + + row_dict = row._asdict() + coords = PatchCoordinates( + x_y=ast.literal_eval(row_dict["x_y"]) if isinstance(row_dict["x_y"], str) else row_dict["x_y"], + width=int(row_dict["width"]), + height=int(row_dict["height"]), + level_idx=int(row_dict["level_idx"]), + mask=None, +) + + annotations.update( + self._process_patch_coordinates(file_name, coords, self._tumor_mask_threshold) + ) + + return annotations + + def _process_patch_coordinates( + self, file: str, coords: PatchCoordinates, threshold: float + ) -> dict[str, int]: + annotations: dict[str, int] = {} + img_name = Path(file).stem + patch_w = int(coords.width) + patch_h = int(coords.height) + + mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif") + mask = tiff.imread(mask_path) + + for idx, (x, y) in enumerate(coords.x_y): + patch_region = mask[y : y + patch_h, x : x + patch_w] + tumor_fraction = np.mean(patch_region > 0) + label = 1 if 
tumor_fraction > threshold else 0 + key = f"{img_name}-{idx}" + annotations[key] = label + + del mask + return annotations + + @override + def prepare_data(self) -> None: + _validators.check_dataset_exists(self._root, False) + + @override + def validate(self) -> None: + _validators.check_number_of_files( + self._file_paths, self._expected_dataset_lengths[self._split], self._split + ) + + @override + def load_target(self, index: int) -> torch.Tensor: + + metadata = self.load_metadata(index) + + slide_idx = metadata["slide_idx"] + patch_idx = metadata["patch_idx"] + + file_path = self._file_paths[slide_idx] + slide_name = Path(file_path).stem + key = f"{slide_name}-{patch_idx}" + label = self.annotations[key] + + return torch.tensor(label, dtype=torch.int64) diff --git a/src/eva/vision/data/datasets/tiger.py b/src/eva/vision/data/datasets/tiger.py new file mode 100644 index 000000000..dd035f313 --- /dev/null +++ b/src/eva/vision/data/datasets/tiger.py @@ -0,0 +1,132 @@ +"""Abstract base class for TIGER datasets spanning different task types.""" + +import abc +import glob +import os +import random +from typing import Any, Callable, Dict, List, Literal, Tuple + +import torch +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional +from typing_extensions import override + +from eva.core.data import splitting +from eva.vision.data.datasets import _validators, vision, wsi +from eva.vision.data.wsi.patching import samplers + + +class TIGERBase( + wsi.MultiWsiDataset, + vision.VisionDataset[tv_tensors.Image, torch.Tensor], + abc.ABC, +): + """Abstract base class for TIGER datasets spanning different task types.""" + + _train_split_ratio: float = 0.7 + _val_split_ratio: float = 0.15 + _test_split_ratio: float = 0.15 + + _target_mpp: float = 0.5 + """Target microns per pixel (mpp) for patches.""" + + def __init__( + self, + root: str, + sampler: samplers.Sampler, + split: Literal["train", "val", "test"] | None = None, + width: int = 224, + 
height: int = 224, + backend: str = "openslide", + image_transforms: Callable | None = None, + coords_path: str | None = None, + seed: int = 42, + ) -> None: + """Initializes the dataset. + + Args: + root: Root directory of the dataset. + sampler: The sampler to use for sampling patch coordinates. + split: Dataset split to use. If `None`, the entire dataset is used. + width: Patch width in pixels. + height: Patch height in pixels. + backend: WSI reading backend. + image_transforms: Transforms to apply to patches. + coords_path: Optional path to save patch coordinates. + seed: Random seed. + """ + self._root = root + self._split = split + self._width = width + self._height = height + self._seed = seed + + wsi.MultiWsiDataset.__init__( + self, + root=root, + file_paths=self._load_file_paths(split), + width=width, + height=height, + sampler=sampler, + target_mpp=self._target_mpp, + backend=backend, + image_transforms=image_transforms, + coords_path=coords_path, + ) + + @override + def prepare_data(self) -> None: + _validators.check_dataset_exists(self._root, False) + + @override + def __getitem__(self, index: int) -> Tuple[tv_tensors.Image, torch.Tensor, Dict[str, Any]]: + return vision.VisionDataset.__getitem__(self, index) + + @override + def load_data(self, index: int) -> tv_tensors.Image: + image_array = wsi.MultiWsiDataset.__getitem__(self, index) + return functional.to_image(image_array) + + @override + def load_metadata(self, index: int) -> Dict[str, Any]: + return wsi.MultiWsiDataset.load_metadata(self, index) + + @abc.abstractmethod + def annotations(self) -> Dict[str, Any]: + """Task-specific annotations (classification labels, regression targets, etc.).""" + raise NotImplementedError + + @abc.abstractmethod + def load_target(self, index: int): + """Task-specific target loading.""" + raise NotImplementedError + + def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: + """Loads the file paths of WSIs from 
wsibulk/images. + + Splits are assigned 70% train, 15% val, 15% test but can be dynamically defined if required. + """ + image_dir = os.path.join(self._root, "images") + all_paths = sorted(glob.glob(os.path.join(image_dir, "*.tif"))) + + if not all_paths: + raise FileNotFoundError(f"No .tif files found in {image_dir}") + + train_indices, val_indices, test_indices = splitting.random_split( + all_paths, + self._train_split_ratio, + self._val_split_ratio, + self._test_split_ratio, + self._seed, + ) + + if split == "train": + selected_paths = [all_paths[i] for i in train_indices] + elif split == "val": + selected_paths = [all_paths[i] for i in val_indices] + elif split == "test": + selected_paths = [all_paths[i] for i in test_indices] + else: + selected_paths = all_paths + + return [os.path.relpath(path, self._root) for path in selected_paths] diff --git a/src/eva/vision/data/datasets/wsi.py b/src/eva/vision/data/datasets/wsi.py index 4c1c789a3..8e31d5644 100644 --- a/src/eva/vision/data/datasets/wsi.py +++ b/src/eva/vision/data/datasets/wsi.py @@ -179,7 +179,11 @@ def load_metadata(self, index: int) -> Dict[str, Any]: """Loads the metadata for the patch at the specified index.""" dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index) patch_metadata = self.datasets[dataset_index].load_metadata(sample_index) - return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata + return { + "wsi_id": self.filename(index).split(".")[0], + "slide_idx": dataset_index, + "patch_idx": sample_index, + } | patch_metadata def _load_datasets(self) -> list[WsiDataset]: logger.info(f"Initializing dataset with {len(self._file_paths)} WSIs ...") diff --git a/src/eva/vision/data/wsi/patching/coordinates.py b/src/eva/vision/data/wsi/patching/coordinates.py index 0152115f5..54dfcc8d3 100644 --- a/src/eva/vision/data/wsi/patching/coordinates.py +++ b/src/eva/vision/data/wsi/patching/coordinates.py @@ -1,5 +1,6 @@ """A module for handling coordinates of 
patches from a whole-slide image.""" +import ast import dataclasses import functools from typing import Any, Dict, List, Tuple diff --git a/src/eva/vision/models/networks/abmil.py b/src/eva/vision/models/networks/abmil.py index bb2ca4820..20b6f9b51 100644 --- a/src/eva/vision/models/networks/abmil.py +++ b/src/eva/vision/models/networks/abmil.py @@ -111,7 +111,8 @@ def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: attention_result = torch.matmul(torch.transpose(attention_weights, 1, 2), input_tensor) # (batch_size, 1, hidden_size_attention) - attention_result = torch.squeeze(attention_result, 1) # (batch_size, hidden_size_attention) + attention_result = torch.squeeze(attention_result, 1) + # (batch_size, hidden_size_attention) return self.classifier(attention_result) # (batch_size, output_size) diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif 
b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv new file mode 100644 index 000000000..f5cf97fb6 --- /dev/null +++ b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv @@ -0,0 +1,2 @@ +file,x_y,width,height,level_idx +images/103S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1 diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv new file mode 100644 index 000000000..0c985521f --- /dev/null +++ 
b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv @@ -0,0 +1,4 @@ +file,x_y,width,height,level_idx +images/127B.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1 +images/119S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1 +images/124S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), 
(192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1 diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv new file mode 100644 index 000000000..78fc82441 --- /dev/null +++ b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv @@ -0,0 +1,2 @@ +file,x_y,width,height,level_idx +images/111S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1 diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif differ diff --git 
a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif differ diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif new file mode 100644 index 000000000..590a94558 Binary files /dev/null and b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif differ diff --git a/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py b/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py new file mode 100644 index 000000000..36351439e --- /dev/null +++ b/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py @@ -0,0 +1,106 @@ +"""Tiger WSIBULK dataset tests.""" + +import os +from typing import Any, Literal + +import pytest +import torch +import torchvision.transforms.v2 as torch_transforms +from torchvision import tv_tensors + +from eva.vision.data import datasets +from eva.vision.data import transforms as eva_transforms +from eva.vision.data.wsi.patching import samplers + +TARGET_SIZE = 224 +DEFAULT_ARGS = { + "width": 16, + "height": 16, + "sampler": samplers.GridSampler(), + "backend": "openslide", + "image_transforms": torch_transforms.Compose([eva_transforms.ResizeAndCrop(size=TARGET_SIZE)]), + "embeddings_dir": "tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir", +} + + +def test_split_and_expected_shapes(root: str, monkeypatch): + """Test loading the dataset with different splits.""" + + train_dataset = 
datasets.TIGERWsiBulk(root=root, split="train", **DEFAULT_ARGS) + val_dataset = datasets.TIGERWsiBulk(root=root, split="val", **DEFAULT_ARGS) + test_dataset = datasets.TIGERWsiBulk(root=root, split="test", **DEFAULT_ARGS) + + _setup_datasets(train_dataset, val_dataset, test_dataset, monkeypatch=monkeypatch) + + assert len(train_dataset) == 192 + assert len(val_dataset) == 64 + assert len(test_dataset) == 64 + + _check_batch_shape(train_dataset[0]) + _check_batch_shape(val_dataset[0]) + _check_batch_shape(test_dataset[0]) + + +@pytest.mark.parametrize("split", ["train", "val", "test", None]) +def test_filenames(root: str, split: Literal["train", "val", "test"], monkeypatch): + """Tests that the number of filenames matches the dataset size.""" + dataset = datasets.TIGERWsiBulk(root=root, split=split, **DEFAULT_ARGS) + _setup_datasets(dataset, monkeypatch=monkeypatch) + + filenames = set() + for i in range(len(dataset)): + filenames.add(dataset.filename(i)) + + assert len(filenames) == len(dataset.datasets) + + +def _check_batch_shape(batch: Any): + assert isinstance(batch, tuple) + assert len(batch) == 3 + + image, target, metadata = batch + assert isinstance(image, tv_tensors.Image) + assert image.shape == (3, TARGET_SIZE, TARGET_SIZE) + + assert isinstance(target, torch.Tensor) + assert isinstance(metadata, dict) + assert "wsi_id" in metadata + assert "x" in metadata + assert "y" in metadata + assert "width" in metadata + assert "height" in metadata + assert "level_idx" in metadata + + +@pytest.fixture +def root(assets_path: str) -> str: + """Fixture returning the root directory of the dataset.""" + return os.path.join(assets_path, "vision/datasets/tiger_wsibulk") + + +def _setup_datasets(*dataset_splits: datasets.TIGERWsiBulk, monkeypatch): + + monkeypatch.setattr( + datasets.TIGERWsiBulk, + "_expected_dataset_lengths", + {"train": 3, "val": 1, "test": 1, None: 5}, + ) + + split_to_file = { + "train": "coords_train.csv", + "val": "coords_val.csv", + "test": 
"coords_test.csv", + } + + for dataset in dataset_splits: + + split = dataset._split + if split is not None: + csv_file = split_to_file[split] + monkeypatch.setattr( + dataset, + "_coords_path", + f"tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/{csv_file}", + ) + + dataset.setup() diff --git a/tests/eva/vision/test_vision_cli.py b/tests/eva/vision/test_vision_cli.py index 2c30a4b27..2eca802c7 100644 --- a/tests/eva/vision/test_vision_cli.py +++ b/tests/eva/vision/test_vision_cli.py @@ -41,6 +41,7 @@ "configs/vision/pathology/offline/classification/mhist.yaml", "configs/vision/pathology/offline/classification/panda.yaml", "configs/vision/pathology/offline/classification/patch_camelyon.yaml", + "configs/vision/pathology/offline/classification/tiger_wsibulk.yaml", "configs/vision/pathology/offline/classification/unitopatho.yaml", # segmentation "configs/vision/pathology/offline/segmentation/bcss.yaml", @@ -86,6 +87,7 @@ def test_fit_from_configuration(configuration_file: str, lib_path: str) -> None: "configs/vision/tests/offline/patch_camelyon.yaml", "configs/vision/tests/offline/panda.yaml", "configs/vision/tests/offline/consep.yaml", + "configs/vision/tests/offline/tiger_wsibulk.yaml", ], ) def test_predict_fit_from_configuration(configuration_file: str, lib_path: str) -> None: