From 400dff8cb35c7249fd2e3d3085764cf944db5593 Mon Sep 17 00:00:00 2001 From: Jklubienski Date: Fri, 20 Jun 2025 08:58:11 +0100 Subject: [PATCH 1/7] Add capacity to load remote YAML URLs --- src/eva/core/callbacks/config.py | 29 +++++++++++++++++++++++------ src/eva/core/trainers/_recorder.py | 3 +++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/eva/core/callbacks/config.py b/src/eva/core/callbacks/config.py index fd7197a17..f28278998 100644 --- a/src/eva/core/callbacks/config.py +++ b/src/eva/core/callbacks/config.py @@ -8,10 +8,13 @@ import lightning.pytorch as pl import yaml +import requests from lightning_fabric.utilities import cloud_io from loguru import logger as cli_logger from omegaconf import OmegaConf from typing_extensions import TypeGuard, override +from requests import RequestException +from urllib.parse import urlparse from eva.core import loggers @@ -109,12 +112,26 @@ def _load_yaml_files(paths: List[str]) -> Dict[str, Any]: """ merged_config = {} for config_path in paths: - fs = cloud_io.get_filesystem(config_path) - with fs.open(config_path, "r") as file: - omegaconf_file = OmegaConf.load(file) # type: ignore - config_dict = OmegaConf.to_object(omegaconf_file) # type: ignore - parsed_config = _type_resolver(config_dict) # type: ignore - merged_config.update(parsed_config) + parsed_url = urlparse(config_path) + is_remote = parsed_url.scheme in ("http", "https") + + if is_remote: + try: + response = requests.get(config_path) + response.raise_for_status() + except RequestException as e: + raise RuntimeError(f"Failed to download remote config: {config_path}\n{str(e)}") + + omegaconf_file = OmegaConf.create(response.text) + else: + fs = cloud_io.get_filesystem(config_path) + with fs.open(config_path, "r") as file: + omegaconf_file = OmegaConf.load(file) # type: ignore + + config_dict = OmegaConf.to_object(omegaconf_file) # type: ignore + parsed_config = _type_resolver(config_dict) # type: ignore + merged_config.update(parsed_config) + return merged_config diff --git a/src/eva/core/trainers/_recorder.py b/src/eva/core/trainers/_recorder.py index d3e209a6c..c7bee5947 100644 --- a/src/eva/core/trainers/_recorder.py +++ b/src/eva/core/trainers/_recorder.py @@ -5,6 +5,7 @@ import os import statistics import sys +import requests from typing import Dict, List, Mapping, TypedDict from lightning.pytorch.utilities.types import _EVALUATE_OUTPUT @@ -14,6 +15,8 @@ from rich import console as rich_console from rich import table as rich_table from toolz import dicttoolz +from urllib.parse import urlparse +from requests.exceptions import RequestException SESSION_METRICS = Mapping[str, List[float]] """Session metrics type-hint.""" From 37d495037113bee6bf022e3bcd5cda1be984557c Mon Sep 17 00:00:00 2001 From: Jklubienski Date: Fri, 20 Jun 2025 13:41:36 +0100 Subject: [PATCH 2/7] Change _save_config function to use fspec --- src/eva/core/callbacks/config.py | 29 ++++++----------------------- src/eva/core/trainers/_recorder.py | 3 --- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/src/eva/core/callbacks/config.py b/src/eva/core/callbacks/config.py index f28278998..fd7197a17 100644 --- a/src/eva/core/callbacks/config.py +++ b/src/eva/core/callbacks/config.py @@ -8,13 +8,10 @@ import lightning.pytorch as pl import yaml -import requests from lightning_fabric.utilities import cloud_io from loguru import logger as cli_logger from omegaconf import OmegaConf from typing_extensions import TypeGuard, override -from requests import RequestException -from urllib.parse 
import urlparse from eva.core import loggers @@ -112,26 +109,12 @@ def _load_yaml_files(paths: List[str]) -> Dict[str, Any]: """ merged_config = {} for config_path in paths: - parsed_url = urlparse(config_path) - is_remote = parsed_url.scheme in ("http", "https") - - if is_remote: - try: - response = requests.get(config_path) - response.raise_for_status() - except RequestException as e: - raise RuntimeError(f"Failed to download remote config: {config_path}\n{str(e)}") - - omegaconf_file = OmegaConf.create(response.text) - else: - fs = cloud_io.get_filesystem(config_path) - with fs.open(config_path, "r") as file: - omegaconf_file = OmegaConf.load(file) # type: ignore - - config_dict = OmegaConf.to_object(omegaconf_file) # type: ignore - parsed_config = _type_resolver(config_dict) # type: ignore - merged_config.update(parsed_config) - + fs = cloud_io.get_filesystem(config_path) + with fs.open(config_path, "r") as file: + omegaconf_file = OmegaConf.load(file) # type: ignore + config_dict = OmegaConf.to_object(omegaconf_file) # type: ignore + parsed_config = _type_resolver(config_dict) # type: ignore + merged_config.update(parsed_config) return merged_config diff --git a/src/eva/core/trainers/_recorder.py b/src/eva/core/trainers/_recorder.py index c7bee5947..d3e209a6c 100644 --- a/src/eva/core/trainers/_recorder.py +++ b/src/eva/core/trainers/_recorder.py @@ -5,7 +5,6 @@ import os import statistics import sys -import requests from typing import Dict, List, Mapping, TypedDict from lightning.pytorch.utilities.types import _EVALUATE_OUTPUT @@ -15,8 +14,6 @@ from rich import console as rich_console from rich import table as rich_table from toolz import dicttoolz -from urllib.parse import urlparse -from requests.exceptions import RequestException SESSION_METRICS = Mapping[str, List[float]] """Session metrics type-hint.""" From 41e96c7e677eb48eb704e37b1d1346384cfd878f Mon Sep 17 00:00:00 2001 From: Jklubienski Date: Wed, 10 Sep 2025 08:16:37 +0100 Subject: [PATCH 3/7] Implement TIGER Tumour classification task --- .../offline/classification/tiger_tumour.yaml | 134 ++++++++++++ docs/datasets/tiger.md | 69 ++++++ src/eva/vision/data/datasets/__init__.py | 2 + .../data/datasets/classification/__init__.py | 2 + .../datasets/classification/tiger_tumour.py | 204 ++++++++++++++++++ src/eva/vision/models/networks/abmil.py | 3 +- 6 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 configs/vision/pathology/offline/classification/tiger_tumour.yaml create mode 100644 docs/datasets/tiger.md create mode 100644 src/eva/vision/data/datasets/classification/tiger_tumour.py diff --git a/configs/vision/pathology/offline/classification/tiger_tumour.yaml b/configs/vision/pathology/offline/classification/tiger_tumour.yaml new file mode 100644 index 000000000..a41094ed8 --- /dev/null +++ b/configs/vision/pathology/offline/classification/tiger_tumour.yaml @@ -0,0 +1,134 @@ +--- +trainer: + class_path: eva.Trainer + init_args: + n_runs: &N_RUNS ${oc.env:N_RUNS, 20} + default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_tumour} + max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 100} + checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best} + callbacks: + - class_path: eva.callbacks.ConfigurationLogger + - class_path: lightning.pytorch.callbacks.TQDMProgressBar + init_args: + refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1} + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: epoch + - class_path: 
lightning.pytorch.callbacks.ModelCheckpoint + init_args: + filename: best + save_last: ${oc.env:SAVE_LAST, false} + save_top_k: 1 + monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy} + mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max} + - class_path: lightning.pytorch.callbacks.EarlyStopping + init_args: + min_delta: 0 + patience: ${oc.env:PATIENCE, 20} + monitor: *MONITOR_METRIC + mode: *MONITOR_METRIC_MODE + - class_path: eva.callbacks.ClassificationEmbeddingsWriter + init_args: + output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_tumour} + dataloader_idx_map: + 0: train + 1: val + 2: test + metadata_keys: ["wsi_id"] + backbone: + class_path: eva.vision.models.ModelFromRegistry + init_args: + model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino} + model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null} + overwrite: false + logger: + - class_path: lightning.pytorch.loggers.TensorBoardLogger + init_args: + save_dir: *OUTPUT_ROOT + name: "" +model: + class_path: eva.HeadModule + init_args: + head: + class_path: eva.vision.models.networks.ABMIL + init_args: + input_size: ${oc.env:IN_FEATURES, 384} + output_size: &NUM_CLASSES 1 + projected_input_size: 128 + criterion: torch.nn.BCEWithLogitsLoss + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: ${oc.env:LR_VALUE, 0.001} + betas: [0.9, 0.999] + metrics: + common: + - class_path: eva.metrics.AverageLoss + - class_path: eva.metrics.BinaryClassificationMetrics +data: + class_path: eva.DataModule + init_args: + datasets: + train: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: &DATASET_ARGS + root: *DATASET_EMBEDDINGS_ROOT + manifest_file: manifest.csv + split: train + embeddings_transforms: + class_path: eva.core.data.transforms.Pad2DTensor + init_args: + pad_size: &N_PATCHES ${oc.env:N_PATCHES, 200} + target_transforms: + class_path: eva.core.data.transforms.dtype.ArrayToFloatTensor + val: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: val + test: + class_path: eva.datasets.MultiEmbeddingsClassificationDataset + init_args: + <<: *DATASET_ARGS + split: test + predict: + - class_path: eva.vision.datasets.TIGERTumour + init_args: &PREDICT_DATASET_ARGS + root: ${oc.env:DATA_ROOT, ./data/training/wsibulk} + sampler: + class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler + init_args: + max_samples: *N_PATCHES + width: 224 + height: 224 + target_mpp: 0.5 + split: train + coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv + image_transforms: + class_path: eva.vision.data.transforms.common.ResizeAndCrop + init_args: + size: ${oc.env:RESIZE_DIM, 224} + mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} + std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} + - class_path: eva.vision.datasets.TIGERTumour + init_args: + <<: *PREDICT_DATASET_ARGS + split: val + - class_path: eva.vision.datasets.TIGERTumour + init_args: + <<: *PREDICT_DATASET_ARGS + split: test + dataloaders: + train: + batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32} + num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4} + shuffle: true + val: + batch_size: *BATCH_SIZE + num_workers: *N_DATA_WORKERS + test: + batch_size: *BATCH_SIZE + num_workers: *N_DATA_WORKERS + predict: + batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64} + num_workers: *N_DATA_WORKERS diff --git a/docs/datasets/tiger.md 
b/docs/datasets/tiger.md new file mode 100644 index 000000000..5d4b4a146 --- /dev/null +++ b/docs/datasets/tiger.md @@ -0,0 +1,69 @@ +# TIGER (Tumor Infiltrating Lymphocytes in breast cancER) + +TIGER contains digital pathology images of Her2 positive (Her2+) and Triple Negative (TNBC) breast cancer whole-slide images, together with manual annotations. Training data comes from multiple sources. A subset of Her2+ and TNBC cases is provided by the Radboud University Medical Center (RUMC) (Nijmegen, Netherlands). A second subset of Her2+ and TNBC cases is provided by the Jules Bordet Institut (JB) (Bruxelles, Belgium). A third subset of TNBC cases only is derived from the TCGA-BRCA archive obtained from the Genomic Data Commons Data Portal. + +It contains 3 different datasets and thus 3 different tasks to add to eva. However only two are currently added. + +WSIBULK - WSI level classification task: Detecting tumour presence in patches of a given slide. +WSITILS - Regression task: predicting "TIL" score of a whole slide image. + +Source: https://tiger.grand-challenge.org/Data/ + + +## Raw data + +### Key stats + +| | | +|---------------------------|----------------------------------------------------------| +| **Modality** | Vision (WSI) | +| **Tasks** | Binary Classification / Regression | +| **Cancer type** | Breast | +| **Data size** | 182 GB | +| **Image dimension** | ~20k x 20k x 3 | +| **Magnification (μm/px)** | 20x (0.5) - Level 0 | +| **Files format** | `.tif` | +| **Number of images** | 178 WSIs (96 for WSIBULK and 82 for WSITILS) | + + +### Organization + +The data `tiger.zip` from [grand challenge](https://tiger.grand-challenge.org/) is organized as follows: + +training/ + |_wsibulk/ (used for classification task) + | |__annotations-tumor-bulk/ * manual annotations of "tumor bulk" regions (see https://tiger.grand-challenge.org/Data/ for details) + | | |___masks/ * annotations in multiresolution TIF format + | | |___xmls/ (not used in eva) + | |__images/ + + | |__tissue-masks/ (not used in eva) + | + |_wsirois/ (not used in eva yet) + | + |_wsitils/ (used for regression task) + | |__images/ + | │ ├── 104S.tiff + │ | └── ... * whole-slide images + | |__tissue-masks/ (not used in eva) + | |__tiger-tils-scores-wsitils.csv (target variable file) + + + + +## Download and preprocessing + +The `TIGER` dataset class doesn't download the data during runtime and must be downloaded manually as follows: + +- Make sure that the latest version of the AWS CLI is installed on your system by following [these instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) + +With the AWS CLI installed, you can download the public training set (no AWS account required) by running: + +`aws s3 cp s3://tiger-training/ /path/to/destination/ --recursive --no-sign-request` + + +We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio. 
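+
+For illustration, a minimal sketch of how the 0.7 / 0.15 / 0.15 split is derived from the sorted slide list (a hypothetical helper mirroring the `_load_file_paths` logic added in this patch):
+
+```python
+import glob
+import os
+
+
+def split_paths(image_dir: str, split: str | None) -> list[str]:
+    """Partitions the sorted WSI paths into 70/15/15 train/val/test slices."""
+    paths = sorted(glob.glob(os.path.join(image_dir, "*.tif")))
+    n_train, n_val = int(len(paths) * 0.7), int(len(paths) * 0.15)
+    slices = {
+        "train": paths[:n_train],
+        "val": paths[n_train : n_train + n_val],
+        "test": paths[n_train + n_val :],
+    }
+    return slices[split] if split else paths
+```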
+ + + + diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py index 95ed8d847..c755630b7 100644 --- a/src/eva/vision/data/datasets/__init__.py +++ b/src/eva/vision/data/datasets/__init__.py @@ -11,6 +11,7 @@ GleasonArvaniti, PANDASmall, PatchCamelyon, + TIGERTumour, UniToPatho, WsiClassificationDataset, ) @@ -49,4 +50,5 @@ "VisionDataset", "MultiWsiDataset", "WsiDataset", + "TIGERTumour", ] diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index cd60020d1..18fc3a663 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -9,6 +9,7 @@ from eva.vision.data.datasets.classification.mhist import MHIST from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon +from eva.vision.data.datasets.classification.tiger_tumour import TIGERTumour from eva.vision.data.datasets.classification.unitopatho import UniToPatho from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset @@ -26,4 +27,5 @@ "PANDA", "PANDASmall", "Camelyon16", + "TIGERTumour", ] diff --git a/src/eva/vision/data/datasets/classification/tiger_tumour.py b/src/eva/vision/data/datasets/classification/tiger_tumour.py new file mode 100644 index 000000000..ed370bfdf --- /dev/null +++ b/src/eva/vision/data/datasets/classification/tiger_tumour.py @@ -0,0 +1,204 @@ +"""Tiger_tumour dataset class.""" + +import ast +import functools +import glob +import os +from pathlib import Path +from typing import Any, Callable, Dict, List, Literal, NamedTuple, Tuple + +import numpy as np +import pandas as pd +import tifffile as tiff +import torch +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional +from typing_extensions import override + +from eva.core.utils.progress_bar import tqdm +from eva.vision.data.datasets import _validators, vision, wsi +from eva.vision.data.wsi.patching import samplers + + +class TIGERTumour(wsi.MultiWsiDataset, vision.VisionDataset[tv_tensors.Image, torch.Tensor]): + """Dataset class for the TIL tumour detection task.""" + + class ImageRow(NamedTuple): + """Represents the patch coordinates of one WSI.""" + + file: str + x_y: str + width: int + height: int + level_idx: int + + def __init__( + self, + root: str, + sampler: samplers.Sampler, + split: Literal["train", "val", "test"] | None = None, + width: int = 224, + height: int = 224, + target_mpp: float = 0.5, + backend: str = "openslide", + image_transforms: Callable | None = None, + coords_path: str | None = None, + seed: int = 42, + n_patches: int = 200, + ) -> None: + """Initializes the dataset. + + Args: + root: Root directory of the dataset. + sampler: The sampler to use for sampling patch coordinates. + split: Dataset split to use. If `None`, the entire dataset is used. + width: Width of the patches to be extracted, in pixels. + height: Height of the patches to be extracted, in pixels. + target_mpp: Target microns per pixel (mpp) for the patches. + backend: The backend to use for reading the whole-slide images. + image_transforms: Transforms to apply to the extracted image patches. + coords_path: File path to save the patch coordinates as .csv. + seed: Random seed for reproducibility. 
+ n_patches: Number of patches sampled + """ + self._split = split + self._root = root + self._width = width + self._height = height + self._target_mpp = target_mpp + self._seed = seed + self._n_patches = n_patches + + wsi.MultiWsiDataset.__init__( + self, + root=root, + file_paths=self._load_file_paths(split), + width=width, + height=height, + sampler=sampler, + target_mpp=target_mpp, + backend=backend, + image_transforms=image_transforms, + coords_path=coords_path, + ) + + @functools.cached_property + def annotations(self) -> Dict[str, int]: + """Builds per-patch labels from the coords CSV files and mask .tif images. + + Returns: + A dict: { "img_name-patch_index": label } + """ + annotations = {} + + # Proportion of the patch that needs to be covered by the mask in order for it to + # be annotated as a "tumor" (1) + THRESHOLD = 0.5 + + main_dir = os.path.dirname(os.path.dirname(self._root)) + csv_folder = os.path.join(main_dir, "embeddings", "dino_vits16", "tiger_tumour") + + split_to_csv = { + split: os.path.join(csv_folder, f"coords_{split}.csv") + for split in ["train", "val", "test"] + } + + splits_to_load = ( + [self._split] if self._split in ["train", "val", "test"] else ["train", "val", "test"] + ) + + for split in splits_to_load: + csv_path = split_to_csv[split] + df = pd.read_csv(csv_path) + n_rows = len(df) + + print(f"Annotating split '{split}' with {n_rows} images...") + + for row in tqdm(df.itertuples(index=False), total=n_rows, desc=f"[{split}]"): + image_row = TIGERTumour.ImageRow(*row) + annotations.update(self._process_image_row(image_row, THRESHOLD)) + + return annotations + + def _process_image_row(self, row: ImageRow, threshold: float) -> dict[str, int]: + annotations: dict[str, int] = {} + img_name = Path(row.file).stem + patch_coords = ast.literal_eval(row.x_y) + patch_w = int(row.width) + patch_h = int(row.height) + + mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif") + mask = tiff.imread(mask_path) + + for idx, (x, y) in enumerate(patch_coords): + patch_region = mask[y : y + patch_h, x : x + patch_w] + tumor_fraction = np.mean(patch_region > 0) + label = 1 if tumor_fraction > threshold else 0 + key = f"{img_name}-{idx}" + annotations[key] = label + + del mask + return annotations + + @override + def prepare_data(self) -> None: + _validators.check_dataset_exists(self._root, False) + + @override + def validate(self) -> None: + expected_n_files = {"train": 65, "val": 13, "test": 15, None: 93} + _validators.check_number_of_files( + self._file_paths, expected_n_files[self._split], self._split + ) + + @override + def __getitem__(self, index: int) -> Tuple[tv_tensors.Image, torch.Tensor, Dict[str, Any]]: + return vision.VisionDataset.__getitem__(self, index) + + @override + def load_data(self, index: int) -> tv_tensors.Image: + image_array = wsi.MultiWsiDataset.__getitem__(self, index) + return functional.to_image(image_array) + + @override + def load_target(self, index: int) -> torch.Tensor: + slide_idx = index // self._n_patches + patch_idx = index % self._n_patches + + file_path = self._file_paths[slide_idx] + slide_name = Path(file_path).stem + key = f"{slide_name}-{patch_idx}" + label = self.annotations[key] + + return torch.tensor(label, dtype=torch.int64) + + @override + def load_metadata(self, index: int) -> Dict[str, Any]: + return wsi.MultiWsiDataset.load_metadata(self, index) + + def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: + """Loads the file paths of WSIs from 
wsibulk/images.
+
+        Splits are assigned 70% train, 15% val, 15% test by filename sorting.
+
+        """
+        image_dir = os.path.join(self._root, "images")
+        all_paths = sorted(glob.glob(os.path.join(image_dir, "*.tif")))
+
+        if not all_paths:
+            raise FileNotFoundError(f"No .tif files found in {image_dir}")
+
+        n_total = len(all_paths)
+        n_train = int(n_total * 0.7)
+        n_val = int(n_total * 0.15)
+
+        if split == "train":
+            selected_paths = all_paths[:n_train]
+        elif split == "val":
+            selected_paths = all_paths[n_train : n_train + n_val]
+        elif split == "test":
+            selected_paths = all_paths[n_train + n_val :]
+        elif split is None:
+            selected_paths = all_paths
+
+        return [os.path.relpath(path, self._root) for path in selected_paths]
diff --git a/src/eva/vision/models/networks/abmil.py b/src/eva/vision/models/networks/abmil.py
index bb2ca4820..20b6f9b51 100644
--- a/src/eva/vision/models/networks/abmil.py
+++ b/src/eva/vision/models/networks/abmil.py
@@ -111,7 +111,8 @@ def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
         attention_result = torch.matmul(torch.transpose(attention_weights, 1, 2), input_tensor)
         # (batch_size, 1, hidden_size_attention)
 
-        attention_result = torch.squeeze(attention_result, 1)  # (batch_size, hidden_size_attention)
+        attention_result = torch.squeeze(attention_result, 1)
+        # (batch_size, hidden_size_attention)
 
         return self.classifier(attention_result)  # (batch_size, output_size)
 

From a23b6737a6fceb1090302c45be4f25282bc98c59 Mon Sep 17 00:00:00 2001
From: Jklubienski
Date: Mon, 22 Sep 2025 08:06:30 +0100
Subject: [PATCH 4/7] Refactor codebase and align with code review feedback

---
 .../{tiger_tumour.yaml => tiger_wsibulk.yaml} |  14 +-
 docs/datasets/tiger.md                        |  37 ++--
 src/eva/vision/data/datasets/__init__.py      |   4 +-
 .../data/datasets/classification/__init__.py  |   4 +-
 .../datasets/classification/tiger_tumour.py   | 204 ------------------
 .../datasets/classification/tiger_wsibulk.py  | 144 +++++++++++++
 src/eva/vision/data/datasets/tiger.py         | 129 +++++++++++
 src/eva/vision/data/datasets/wsi.py           |   6 +-
 .../vision/data/wsi/patching/coordinates.py   |  21 ++
 9 files changed, 330 insertions(+), 233 deletions(-)
 rename configs/vision/pathology/offline/classification/{tiger_tumour.yaml => tiger_wsibulk.yaml} (93%)
 delete mode 100644 src/eva/vision/data/datasets/classification/tiger_tumour.py
 create mode 100644 src/eva/vision/data/datasets/classification/tiger_wsibulk.py
 create mode 100644 src/eva/vision/data/datasets/tiger.py

diff --git a/configs/vision/pathology/offline/classification/tiger_tumour.yaml b/configs/vision/pathology/offline/classification/tiger_wsibulk.yaml
similarity index 93%
rename from configs/vision/pathology/offline/classification/tiger_tumour.yaml
rename to configs/vision/pathology/offline/classification/tiger_wsibulk.yaml
index a41094ed8..9cabb2c1e 100644
--- a/configs/vision/pathology/offline/classification/tiger_tumour.yaml
+++ b/configs/vision/pathology/offline/classification/tiger_wsibulk.yaml
@@ -3,7 +3,7 @@ trainer:
   class_path: eva.Trainer
   init_args:
     n_runs: &N_RUNS ${oc.env:N_RUNS, 20}
-    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_tumour}
+    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_wsibulk}
     max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 100}
     checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
     callbacks:
@@ -29,7 +29,7 @@ trainer:
           mode: *MONITOR_METRIC_MODE
       - class_path: eva.callbacks.ClassificationEmbeddingsWriter
         init_args:
-          
output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_tumour} + output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_wsibulk} dataloader_idx_map: 0: train 1: val @@ -40,7 +40,7 @@ trainer: init_args: model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino} model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null} - overwrite: false + overwrite: true logger: - class_path: lightning.pytorch.loggers.TensorBoardLogger init_args: @@ -92,16 +92,16 @@ data: <<: *DATASET_ARGS split: test predict: - - class_path: eva.vision.datasets.TIGERTumour + - class_path: eva.vision.datasets.TIGERWsiBulk init_args: &PREDICT_DATASET_ARGS root: ${oc.env:DATA_ROOT, ./data/training/wsibulk} sampler: class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler init_args: max_samples: *N_PATCHES + embeddings_dir: *DATASET_EMBEDDINGS_ROOT width: 224 height: 224 - target_mpp: 0.5 split: train coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv image_transforms: @@ -110,11 +110,11 @@ data: size: ${oc.env:RESIZE_DIM, 224} mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]} std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]} - - class_path: eva.vision.datasets.TIGERTumour + - class_path: eva.vision.datasets.TIGERWsiBulk init_args: <<: *PREDICT_DATASET_ARGS split: val - - class_path: eva.vision.datasets.TIGERTumour + - class_path: eva.vision.datasets.TIGERWsiBulk init_args: <<: *PREDICT_DATASET_ARGS split: test diff --git a/docs/datasets/tiger.md b/docs/datasets/tiger.md index 5d4b4a146..e7ab08022 100644 --- a/docs/datasets/tiger.md +++ b/docs/datasets/tiger.md @@ -2,10 +2,13 @@ TIGER contains digital pathology images of Her2 positive (Her2+) and Triple Negative (TNBC) breast cancer whole-slide images, together with manual annotations. Training data comes from multiple sources. A subset of Her2+ and TNBC cases is provided by the Radboud University Medical Center (RUMC) (Nijmegen, Netherlands). A second subset of Her2+ and TNBC cases is provided by the Jules Bordet Institut (JB) (Bruxelles, Belgium). A third subset of TNBC cases only is derived from the TCGA-BRCA archive obtained from the Genomic Data Commons Data Portal. -It contains 3 different datasets and thus 3 different tasks to add to eva. However only two are currently added. +It contains 3 different datasets and thus 3 different tasks to add to eva. WSIBULK - WSI level classification task: Detecting tumour presence in patches of a given slide. WSITILS - Regression task: predicting "TIL" score of a whole slide image. +WSIROIS - Cell level segmentation task: predicting boundaries of TIL cells. + +However only WSIBULK and WSITILS are currently implemented. 
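+
+As an illustration of the WSIBULK labelling rule (a sketch mirroring the `TIGERWsiBulk.annotations` logic introduced in this PR), a patch is labelled "tumor" (1) when more than half of its pixels fall inside the binary bulk mask:
+
+```python
+import numpy as np
+
+
+def patch_label(mask: np.ndarray, x: int, y: int, width: int, height: int, threshold: float = 0.5) -> int:
+    """Returns 1 if the patch's overlap with the binary mask exceeds the threshold, else 0."""
+    region = mask[y : y + height, x : x + width]
+    return int(np.mean(region > 0) > threshold)
+```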
Source: https://tiger.grand-challenge.org/Data/ @@ -31,24 +34,23 @@ Source: https://tiger.grand-challenge.org/Data/ The data `tiger.zip` from [grand challenge](https://tiger.grand-challenge.org/) is organized as follows: training/ - |_wsibulk/ (used for classification task) - | |__annotations-tumor-bulk/ * manual annotations of "tumor bulk" regions (see https://tiger.grand-challenge.org/Data/ for details) - | | |___masks/ * annotations in multiresolution TIF format - | | |___xmls/ (not used in eva) - | |__images/ - - | |__tissue-masks/ (not used in eva) + |_wsibulk/ * Used for classification task + | |__annotations-tumor-bulk/ * Manual annotations of "tumor bulk" regions + | | |___masks/ * Binary masks in TIF format + | | |___xmls/ * Not used in eva + | |__images/ * Whole-Slide Images + | │ ├── 103S.tiff + │ | └── ... + | |__tissue-masks/ * Not used in eva | - |_wsirois/ (not used in eva yet) + |_wsirois/ * Not used in eva currently | - |_wsitils/ (used for regression task) - | |__images/ + |_wsitils/ * Used for regression task + | |__images/ * Whole-slide images | │ ├── 104S.tiff - │ | └── ... * whole-slide images - | |__tissue-masks/ (not used in eva) - | |__tiger-tils-scores-wsitils.csv (target variable file) - - + │ | └── ... + | |__tissue-masks/ * Not used in eva + | |__tiger-tils-scores-wsitils.csv * Target variable file ## Download and preprocessing @@ -57,10 +59,11 @@ The `TIGER` dataset class doesn't download the data during runtime and must be d - Make sure that the latest version of the AWS CLI is installed on your system by following [these instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) -With the AWS CLI installed, you can download the public training set (no AWS account required) by running: +With the AWS CLI installed, you can download the official training set (no AWS account required) by running: `aws s3 cp s3://tiger-training/ /path/to/destination/ --recursive --no-sign-request` +These instructions can also be found on the official challenge page [here](https://tiger.grand-challenge.org/Data/) We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio. 
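+
+For reference, a minimal usage sketch of the resulting dataset class (paths are placeholders; the sampler settings follow the `tiger_wsibulk.yaml` config in this PR):
+
+```python
+from eva.vision.data.wsi.patching import samplers
+from eva.vision.datasets import TIGERWsiBulk
+
+# Assumes the wsibulk archive was downloaded to ./data/training/wsibulk and
+# that the patch-coordinate CSVs live under the embeddings directory.
+dataset = TIGERWsiBulk(
+    root="./data/training/wsibulk",
+    sampler=samplers.ForegroundGridSampler(max_samples=200),
+    embeddings_dir="./data/embeddings/dino_vits16/tiger_wsibulk",
+    split="train",
+)
+dataset.prepare_data()
+dataset.setup()  # assumption: the standard eva dataset lifecycle hook
+image, target, metadata = dataset[0]  # patch image, binary label, WSI metadata
+```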
diff --git a/src/eva/vision/data/datasets/__init__.py b/src/eva/vision/data/datasets/__init__.py index c755630b7..210c60048 100644 --- a/src/eva/vision/data/datasets/__init__.py +++ b/src/eva/vision/data/datasets/__init__.py @@ -11,7 +11,7 @@ GleasonArvaniti, PANDASmall, PatchCamelyon, - TIGERTumour, + TIGERWsiBulk, UniToPatho, WsiClassificationDataset, ) @@ -50,5 +50,5 @@ "VisionDataset", "MultiWsiDataset", "WsiDataset", - "TIGERTumour", + "TIGERWsiBulk", ] diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index 18fc3a663..8d824c4fa 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -9,7 +9,7 @@ from eva.vision.data.datasets.classification.mhist import MHIST from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon -from eva.vision.data.datasets.classification.tiger_tumour import TIGERTumour +from eva.vision.data.datasets.classification.tiger_wsibulk import TIGERWsiBulk from eva.vision.data.datasets.classification.unitopatho import UniToPatho from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset @@ -27,5 +27,5 @@ "PANDA", "PANDASmall", "Camelyon16", - "TIGERTumour", + "TIGERWsiBulk", ] diff --git a/src/eva/vision/data/datasets/classification/tiger_tumour.py b/src/eva/vision/data/datasets/classification/tiger_tumour.py deleted file mode 100644 index ed370bfdf..000000000 --- a/src/eva/vision/data/datasets/classification/tiger_tumour.py +++ /dev/null @@ -1,204 +0,0 @@ -"""Tiger_tumour dataset class.""" - -import ast -import functools -import glob -import os -from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, NamedTuple, Tuple - -import numpy as np -import pandas as pd -import tifffile as tiff -import torch -from torchvision import tv_tensors -from torchvision.transforms.v2 import functional -from typing_extensions import override - -from eva.core.utils.progress_bar import tqdm -from eva.vision.data.datasets import _validators, vision, wsi -from eva.vision.data.wsi.patching import samplers - - -class TIGERTumour(wsi.MultiWsiDataset, vision.VisionDataset[tv_tensors.Image, torch.Tensor]): - """Dataset class for the TIL tumour detection task.""" - - class ImageRow(NamedTuple): - """Represents the patch coordinates of one WSI.""" - - file: str - x_y: str - width: int - height: int - level_idx: int - - def __init__( - self, - root: str, - sampler: samplers.Sampler, - split: Literal["train", "val", "test"] | None = None, - width: int = 224, - height: int = 224, - target_mpp: float = 0.5, - backend: str = "openslide", - image_transforms: Callable | None = None, - coords_path: str | None = None, - seed: int = 42, - n_patches: int = 200, - ) -> None: - """Initializes the dataset. - - Args: - root: Root directory of the dataset. - sampler: The sampler to use for sampling patch coordinates. - split: Dataset split to use. If `None`, the entire dataset is used. - width: Width of the patches to be extracted, in pixels. - height: Height of the patches to be extracted, in pixels. - target_mpp: Target microns per pixel (mpp) for the patches. - backend: The backend to use for reading the whole-slide images. - image_transforms: Transforms to apply to the extracted image patches. - coords_path: File path to save the patch coordinates as .csv. - seed: Random seed for reproducibility. 
- n_patches: Number of patches sampled - """ - self._split = split - self._root = root - self._width = width - self._height = height - self._target_mpp = target_mpp - self._seed = seed - self._n_patches = n_patches - - wsi.MultiWsiDataset.__init__( - self, - root=root, - file_paths=self._load_file_paths(split), - width=width, - height=height, - sampler=sampler, - target_mpp=target_mpp, - backend=backend, - image_transforms=image_transforms, - coords_path=coords_path, - ) - - @functools.cached_property - def annotations(self) -> Dict[str, int]: - """Builds per-patch labels from the coords CSV files and mask .tif images. - - Returns: - A dict: { "img_name-patch_index": label } - """ - annotations = {} - - # Proportion of the patch that needs to be covered by the mask in order for it to - # be annotated as a "tumor" (1) - THRESHOLD = 0.5 - - main_dir = os.path.dirname(os.path.dirname(self._root)) - csv_folder = os.path.join(main_dir, "embeddings", "dino_vits16", "tiger_tumour") - - split_to_csv = { - split: os.path.join(csv_folder, f"coords_{split}.csv") - for split in ["train", "val", "test"] - } - - splits_to_load = ( - [self._split] if self._split in ["train", "val", "test"] else ["train", "val", "test"] - ) - - for split in splits_to_load: - csv_path = split_to_csv[split] - df = pd.read_csv(csv_path) - n_rows = len(df) - - print(f"Annotating split '{split}' with {n_rows} images...") - - for row in tqdm(df.itertuples(index=False), total=n_rows, desc=f"[{split}]"): - image_row = TIGERTumour.ImageRow(*row) - annotations.update(self._process_image_row(image_row, THRESHOLD)) - - return annotations - - def _process_image_row(self, row: ImageRow, threshold: float) -> dict[str, int]: - annotations: dict[str, int] = {} - img_name = Path(row.file).stem - patch_coords = ast.literal_eval(row.x_y) - patch_w = int(row.width) - patch_h = int(row.height) - - mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif") - mask = tiff.imread(mask_path) - - for idx, (x, y) in enumerate(patch_coords): - patch_region = mask[y : y + patch_h, x : x + patch_w] - tumor_fraction = np.mean(patch_region > 0) - label = 1 if tumor_fraction > threshold else 0 - key = f"{img_name}-{idx}" - annotations[key] = label - - del mask - return annotations - - @override - def prepare_data(self) -> None: - _validators.check_dataset_exists(self._root, False) - - @override - def validate(self) -> None: - expected_n_files = {"train": 65, "val": 13, "test": 15, None: 93} - _validators.check_number_of_files( - self._file_paths, expected_n_files[self._split], self._split - ) - - @override - def __getitem__(self, index: int) -> Tuple[tv_tensors.Image, torch.Tensor, Dict[str, Any]]: - return vision.VisionDataset.__getitem__(self, index) - - @override - def load_data(self, index: int) -> tv_tensors.Image: - image_array = wsi.MultiWsiDataset.__getitem__(self, index) - return functional.to_image(image_array) - - @override - def load_target(self, index: int) -> torch.Tensor: - slide_idx = index // self._n_patches - patch_idx = index % self._n_patches - - file_path = self._file_paths[slide_idx] - slide_name = Path(file_path).stem - key = f"{slide_name}-{patch_idx}" - label = self.annotations[key] - - return torch.tensor(label, dtype=torch.int64) - - @override - def load_metadata(self, index: int) -> Dict[str, Any]: - return wsi.MultiWsiDataset.load_metadata(self, index) - - def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: - """Loads the file paths of WSIs from 
wsibulk/images.
-
-        Splits are assigned 70% train, 15% val, 15% test by filename sorting.
-
-        """
-        image_dir = os.path.join(self._root, "images")
-        all_paths = sorted(glob.glob(os.path.join(image_dir, "*.tif")))
-
-        if not all_paths:
-            raise FileNotFoundError(f"No .tif files found in {image_dir}")
-
-        n_total = len(all_paths)
-        n_train = int(n_total * 0.7)
-        n_val = int(n_total * 0.15)
-
-        if split == "train":
-            selected_paths = all_paths[:n_train]
-        elif split == "val":
-            selected_paths = all_paths[n_train : n_train + n_val]
-        elif split == "test":
-            selected_paths = all_paths[n_train + n_val :]
-        elif split is None:
-            selected_paths = all_paths
-
-        return [os.path.relpath(path, self._root) for path in selected_paths]
diff --git a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py
new file mode 100644
index 000000000..c38a25e3d
--- /dev/null
+++ b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py
@@ -0,0 +1,144 @@
+"""tiger_wsibulk dataset class."""
+
+import functools
+import os
+from pathlib import Path
+from typing import Dict
+
+import numpy as np
+import pandas as pd
+import tifffile as tiff
+import torch
+from typing_extensions import override
+
+from eva.core.utils.progress_bar import tqdm
+from eva.vision.data.datasets import _validators, tiger
+from eva.vision.data.wsi.patching import PatchCoordinates, samplers
+
+
+class TIGERWsiBulk(tiger.TIGERBase):
+    """Dataset class for the TIGER tumor detection task.
+
+    Splits a slide-level WSI into multiple patch-level samples,
+    dynamically assigning them labels based on their overlap with a binary mask.
+    """
+
+    _expected_dataset_lengths: Dict[str | None, int] = {
+        "train": 65,
+        "val": 13,
+        "test": 15,
+        None: 93,
+    }
+    """Represents the expected number of WSIs in the dataset for validation.
+    Can be overridden for unit tests."""
+
+    _tumor_mask_threshold: float = 0.5
+    """Proportion of the patch that needs to be covered by the mask in order for it
+    to be annotated as "tumor" (1)."""
+
+    _target_mpp: float = 0.5
+    """Microns per pixel (mpp), i.e. each pixel covers 0.5 microns.
+    Set as a constant in this implementation to ensure no mismatches with the binary mask."""
+
+    def __init__(
+        self,
+        root: str,
+        sampler: samplers.Sampler,
+        embeddings_dir: str,
+        **kwargs,
+    ) -> None:
+        """Initializes the dataset.
+
+        Args:
+            root: Root directory of the dataset.
+            sampler: The sampler to use for sampling patch coordinates.
+            embeddings_dir: Directory where the patch data is stored. Used for annotations.
+            kwargs: Keyword arguments from the base class.
+        """
+        self._embeddings_dir = embeddings_dir
+        super().__init__(root=root, sampler=sampler, **kwargs)
+
+    @functools.cached_property
+    def annotations(self) -> Dict[str, int]:
+        """Builds per-patch labels from the coords CSV files and mask .tif images.
+ + Returns: + A dict: { "img_name-patch_index": label } + """ + annotations = {} + + csv_folder = os.path.normpath(self._embeddings_dir) + + split_to_csv = { + split: os.path.join(csv_folder, f"coords_{split}.csv") + for split in ["train", "val", "test"] + } + + splits_to_load = ( + [self._split] if self._split in ["train", "val", "test"] else ["train", "val", "test"] + ) + + for split in splits_to_load: + csv_path = split_to_csv[split] + df = pd.read_csv(csv_path) + n_rows = len(df) + + print(f"Annotating split '{split}' with {n_rows} images...") + + for row in tqdm(df.itertuples(index=False), total=n_rows, desc=f"[{split}]"): + + file_name = row.file + + coords = PatchCoordinates.from_dict(row=row._asdict()) + + annotations.update( + self._process_patch_coordinates(file_name, coords, self._tumor_mask_threshold) + ) + + return annotations + + def _process_patch_coordinates( + self, file: str, coords: PatchCoordinates, threshold: float + ) -> dict[str, int]: + annotations: dict[str, int] = {} + img_name = Path(file).stem + patch_w = int(coords.width) + patch_h = int(coords.height) + + mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif") + mask = tiff.imread(mask_path) + + for idx, (x, y) in enumerate(coords.x_y): + patch_region = mask[y : y + patch_h, x : x + patch_w] + tumor_fraction = np.mean(patch_region > 0) + label = 1 if tumor_fraction > threshold else 0 + key = f"{img_name}-{idx}" + annotations[key] = label + + del mask + return annotations + + @override + def prepare_data(self) -> None: + _validators.check_dataset_exists(self._root, False) + + # @override + # def validate(self) -> None: + # _validators.check_number_of_files( + # self._file_paths, self._expected_dataset_lengths[self._split], self._split + # ) + + @override + def load_target(self, index: int) -> torch.Tensor: + + metadata = self.load_metadata(index) + + slide_idx = metadata["slide_idx"] + patch_idx = metadata["patch_idx"] + + file_path = self._file_paths[slide_idx] + slide_name = Path(file_path).stem + key = f"{slide_name}-{patch_idx}" + label = self.annotations[key] + + return torch.tensor(label, dtype=torch.int64) diff --git a/src/eva/vision/data/datasets/tiger.py b/src/eva/vision/data/datasets/tiger.py new file mode 100644 index 000000000..b176c2f02 --- /dev/null +++ b/src/eva/vision/data/datasets/tiger.py @@ -0,0 +1,129 @@ +"""Abstract base class for TIGER datasets spanning different task types.""" + +import abc +import glob +import os +import random +from typing import Any, Callable, Dict, List, Literal, Tuple + +import torch +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional +from typing_extensions import override + +from eva.vision.data.datasets import _validators, vision, wsi +from eva.vision.data.wsi.patching import samplers + + +class TIGERBase( + wsi.MultiWsiDataset, + vision.VisionDataset[tv_tensors.Image, torch.Tensor], + abc.ABC, +): + """Abstract base class for TIGER datasets spanning different task types.""" + + _train_split_ratio: float = 0.7 + _val_split_ratio: float = 0.15 + + # target microns per pixel (mpp) for patches. + _target_mpp: float = 0.5 + + def __init__( + self, + root: str, + sampler: samplers.Sampler, + split: Literal["train", "val", "test"] | None = None, + width: int = 224, + height: int = 224, + backend: str = "openslide", + image_transforms: Callable | None = None, + coords_path: str | None = None, + seed: int = 42, + ) -> None: + """Initializes the dataset. + + Args: + root: Root directory of the dataset. 
+ sampler: The sampler to use for sampling patch coordinates. + split: Dataset split to use. If `None`, the entire dataset is used. + width: Patch width in pixels. + height: Patch height in pixels. + backend: WSI reading backend. + image_transforms: Transforms to apply to patches. + coords_path: Optional path to save patch coordinates. + seed: Random seed. + """ + self._root = root + self._split = split + self._width = width + self._height = height + self._seed = seed + + wsi.MultiWsiDataset.__init__( + self, + root=root, + file_paths=self._load_file_paths(split), + width=width, + height=height, + sampler=sampler, + target_mpp=self._target_mpp, + backend=backend, + image_transforms=image_transforms, + coords_path=coords_path, + ) + + @override + def prepare_data(self) -> None: + _validators.check_dataset_exists(self._root, False) + + @override + def __getitem__(self, index: int) -> Tuple[tv_tensors.Image, torch.Tensor, Dict[str, Any]]: + return vision.VisionDataset.__getitem__(self, index) + + @override + def load_data(self, index: int) -> tv_tensors.Image: + image_array = wsi.MultiWsiDataset.__getitem__(self, index) + return functional.to_image(image_array) + + @override + def load_metadata(self, index: int) -> Dict[str, Any]: + return wsi.MultiWsiDataset.load_metadata(self, index) + + @abc.abstractmethod + def annotations(self) -> Dict[str, Any]: + """Task-specific annotations (classification labels, regression targets, etc.).""" + raise NotImplementedError + + @abc.abstractmethod + def load_target(self, index: int): + """Task-specific target loading.""" + raise NotImplementedError + + def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: + """Loads the file paths of WSIs from wsibulk/images. + + Splits are assigned 70% train, 15% val, 15% test by filename sorting. 
+ """ + image_dir = os.path.join(self._root, "images") + all_paths = sorted(glob.glob(os.path.join(image_dir, "*.tif"))) + + if not all_paths: + raise FileNotFoundError(f"No .tif files found in {image_dir}") + + rng = random.Random(self._seed) # nosec B311 + rng.shuffle(all_paths) + + n_total = len(all_paths) + n_train = int(n_total * self._train_split_ratio) + n_val = int(n_total * self._val_split_ratio) + + if split == "train": + selected_paths = all_paths[:n_train] + elif split == "val": + selected_paths = all_paths[n_train : n_train + n_val] + elif split == "test": + selected_paths = all_paths[n_train + n_val :] + elif split is None: + selected_paths = all_paths + + return [os.path.relpath(path, self._root) for path in selected_paths] diff --git a/src/eva/vision/data/datasets/wsi.py b/src/eva/vision/data/datasets/wsi.py index 4c1c789a3..8e31d5644 100644 --- a/src/eva/vision/data/datasets/wsi.py +++ b/src/eva/vision/data/datasets/wsi.py @@ -179,7 +179,11 @@ def load_metadata(self, index: int) -> Dict[str, Any]: """Loads the metadata for the patch at the specified index.""" dataset_index, sample_index = self._get_dataset_idx(index), self._get_sample_idx(index) patch_metadata = self.datasets[dataset_index].load_metadata(sample_index) - return {"wsi_id": self.filename(index).split(".")[0]} | patch_metadata + return { + "wsi_id": self.filename(index).split(".")[0], + "slide_idx": dataset_index, + "patch_idx": sample_index, + } | patch_metadata def _load_datasets(self) -> list[WsiDataset]: logger.info(f"Initializing dataset with {len(self._file_paths)} WSIs ...") diff --git a/src/eva/vision/data/wsi/patching/coordinates.py b/src/eva/vision/data/wsi/patching/coordinates.py index 0152115f5..28d809567 100644 --- a/src/eva/vision/data/wsi/patching/coordinates.py +++ b/src/eva/vision/data/wsi/patching/coordinates.py @@ -1,5 +1,6 @@ """A module for handling coordinates of patches from a whole-slide image.""" +import ast import dataclasses import functools from typing import Any, Dict, List, Tuple @@ -83,6 +84,26 @@ def to_dict(self, include_keys: List[str] | None = None) -> Dict[str, Any]: coord_dict = {key: coord_dict[key] for key in include_keys} return coord_dict + @classmethod + def from_dict(cls, row: Dict[str, Any]) -> "PatchCoordinates": + """Create PatchCoordinates directly from a dictionary row. + + Args: + row: A dict with keys {file, x_y, width, height, level_idx}. + `x_y` should be a stringified list of (x, y) tuples. 
+ """ + # Parsing the x_y string into a proper format + x_y = row["x_y"] + x_y = ast.literal_eval(x_y) + + return cls( + x_y=x_y, + width=int(row["width"]), + height=int(row["height"]), + level_idx=int(row["level_idx"]), + mask=None, + ) + @functools.lru_cache(LRU_CACHE_SIZE) def get_cached_coords( From b447b0da284bb002762753355181e5dd321c718e Mon Sep 17 00:00:00 2001 From: Jklubienski Date: Wed, 1 Oct 2025 14:28:07 +0100 Subject: [PATCH 5/7] Updated wsibulk task based on secondary feedback --- docs/datasets/tiger.md | 12 ++++------- .../data/datasets/classification/__init__.py | 1 - .../datasets/classification/tiger_wsibulk.py | 14 ++++++------- src/eva/vision/data/datasets/tiger.py | 21 +++++++++---------- .../vision/data/wsi/patching/coordinates.py | 20 ------------------ tests/eva/vision/test_vision_cli.py | 2 ++ 6 files changed, 22 insertions(+), 48 deletions(-) diff --git a/docs/datasets/tiger.md b/docs/datasets/tiger.md index e7ab08022..ed0d26f21 100644 --- a/docs/datasets/tiger.md +++ b/docs/datasets/tiger.md @@ -39,7 +39,7 @@ training/ | | |___masks/ * Binary masks in TIF format | | |___xmls/ * Not used in eva | |__images/ * Whole-Slide Images - | │ ├── 103S.tiff + | │ ├── 103S.tif │ | └── ... | |__tissue-masks/ * Not used in eva | @@ -47,10 +47,10 @@ training/ | |_wsitils/ * Used for regression task | |__images/ * Whole-slide images - | │ ├── 104S.tiff + | │ ├── 104S.tif │ | └── ... | |__tissue-masks/ * Not used in eva - | |__tiger-tils-scores-wsitils.csv * Target variable file + | |__tiger-til-scores-wsitils.csv * Target variable file ## Download and preprocessing @@ -65,8 +65,4 @@ With the AWS CLI installed, you can download the official training set (no AWS a These instructions can also be found on the official challenge page [here](https://tiger.grand-challenge.org/Data/) -We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio. - - - - +We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio. 
\ No newline at end of file diff --git a/src/eva/vision/data/datasets/classification/__init__.py b/src/eva/vision/data/datasets/classification/__init__.py index 8d824c4fa..e2bb3b894 100644 --- a/src/eva/vision/data/datasets/classification/__init__.py +++ b/src/eva/vision/data/datasets/classification/__init__.py @@ -26,6 +26,5 @@ "WsiClassificationDataset", "PANDA", "PANDASmall", - "Camelyon16", "TIGERWsiBulk", ] diff --git a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py index c38a25e3d..8d4068565 100644 --- a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py +++ b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py @@ -83,13 +83,11 @@ def annotations(self) -> Dict[str, int]: df = pd.read_csv(csv_path) n_rows = len(df) - print(f"Annotating split '{split}' with {n_rows} images...") - for row in tqdm(df.itertuples(index=False), total=n_rows, desc=f"[{split}]"): file_name = row.file - coords = PatchCoordinates.from_dict(row=row._asdict()) + coords = PatchCoordinates(**row._asdict()) annotations.update( self._process_patch_coordinates(file_name, coords, self._tumor_mask_threshold) @@ -122,11 +120,11 @@ def _process_patch_coordinates( def prepare_data(self) -> None: _validators.check_dataset_exists(self._root, False) - # @override - # def validate(self) -> None: - # _validators.check_number_of_files( - # self._file_paths, self._expected_dataset_lengths[self._split], self._split - # ) + @override + def validate(self) -> None: + _validators.check_number_of_files( + self._file_paths, self._expected_dataset_lengths[self._split], self._split + ) @override def load_target(self, index: int) -> torch.Tensor: diff --git a/src/eva/vision/data/datasets/tiger.py b/src/eva/vision/data/datasets/tiger.py index b176c2f02..283b57f37 100644 --- a/src/eva/vision/data/datasets/tiger.py +++ b/src/eva/vision/data/datasets/tiger.py @@ -11,6 +11,7 @@ from torchvision.transforms.v2 import functional from typing_extensions import override +from eva.core.data import splitting from eva.vision.data.datasets import _validators, vision, wsi from eva.vision.data.wsi.patching import samplers @@ -24,9 +25,10 @@ class TIGERBase( _train_split_ratio: float = 0.7 _val_split_ratio: float = 0.15 + _test_split_ratio: float = 0.15 - # target microns per pixel (mpp) for patches. _target_mpp: float = 0.5 + '''Target microns per pixel (mpp) for patches.''' def __init__( self, @@ -102,7 +104,7 @@ def load_target(self, index: int): def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) -> List[str]: """Loads the file paths of WSIs from wsibulk/images. - Splits are assigned 70% train, 15% val, 15% test by filename sorting. + Splits are assigned 70% train, 15% val, 15% test but can be dynamically defined if required. 
""" image_dir = os.path.join(self._root, "images") all_paths = sorted(glob.glob(os.path.join(image_dir, "*.tif"))) @@ -110,19 +112,16 @@ def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) if not all_paths: raise FileNotFoundError(f"No .tif files found in {image_dir}") - rng = random.Random(self._seed) # nosec B311 - rng.shuffle(all_paths) - - n_total = len(all_paths) - n_train = int(n_total * self._train_split_ratio) - n_val = int(n_total * self._val_split_ratio) + train_indices, val_indices, test_indices = splitting.random_split( + all_paths, self._train_split_ratio, self._val_split_ratio, self._test_split_ratio, self._seed + ) if split == "train": - selected_paths = all_paths[:n_train] + selected_paths = all_paths[train_indices[0] : train_indices[1]] elif split == "val": - selected_paths = all_paths[n_train : n_train + n_val] + selected_paths = all_paths[val_indices[0] : val_indices[1]] elif split == "test": - selected_paths = all_paths[n_train + n_val :] + selected_paths = all_paths[test_indices[0] : test_indices[1]] elif split is None: selected_paths = all_paths diff --git a/src/eva/vision/data/wsi/patching/coordinates.py b/src/eva/vision/data/wsi/patching/coordinates.py index 28d809567..54dfcc8d3 100644 --- a/src/eva/vision/data/wsi/patching/coordinates.py +++ b/src/eva/vision/data/wsi/patching/coordinates.py @@ -84,26 +84,6 @@ def to_dict(self, include_keys: List[str] | None = None) -> Dict[str, Any]: coord_dict = {key: coord_dict[key] for key in include_keys} return coord_dict - @classmethod - def from_dict(cls, row: Dict[str, Any]) -> "PatchCoordinates": - """Create PatchCoordinates directly from a dictionary row. - - Args: - row: A dict with keys {file, x_y, width, height, level_idx}. - `x_y` should be a stringified list of (x, y) tuples. 
- """ - # Parsing the x_y string into a proper format - x_y = row["x_y"] - x_y = ast.literal_eval(x_y) - - return cls( - x_y=x_y, - width=int(row["width"]), - height=int(row["height"]), - level_idx=int(row["level_idx"]), - mask=None, - ) - @functools.lru_cache(LRU_CACHE_SIZE) def get_cached_coords( diff --git a/tests/eva/vision/test_vision_cli.py b/tests/eva/vision/test_vision_cli.py index 2c30a4b27..2eca802c7 100644 --- a/tests/eva/vision/test_vision_cli.py +++ b/tests/eva/vision/test_vision_cli.py @@ -41,6 +41,7 @@ "configs/vision/pathology/offline/classification/mhist.yaml", "configs/vision/pathology/offline/classification/panda.yaml", "configs/vision/pathology/offline/classification/patch_camelyon.yaml", + "configs/vision/pathology/offline/classification/tiger_wsibulk.yaml", "configs/vision/pathology/offline/classification/unitopatho.yaml", # segmentation "configs/vision/pathology/offline/segmentation/bcss.yaml", @@ -86,6 +87,7 @@ def test_fit_from_configuration(configuration_file: str, lib_path: str) -> None: "configs/vision/tests/offline/patch_camelyon.yaml", "configs/vision/tests/offline/panda.yaml", "configs/vision/tests/offline/consep.yaml", + "configs/vision/tests/offline/tiger_wsibulk.yaml", ], ) def test_predict_fit_from_configuration(configuration_file: str, lib_path: str) -> None: From 247e629c2cddfb14fd4398fe726e4ead1543bfdf Mon Sep 17 00:00:00 2001 From: Jklubienski Date: Fri, 10 Oct 2025 12:50:24 +0100 Subject: [PATCH 6/7] Small bugfixes and test support --- .../datasets/classification/tiger_wsibulk.py | 10 +++++++++- src/eva/vision/data/datasets/tiger.py | 16 ++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py index 8d4068565..e4f9fdb5b 100644 --- a/src/eva/vision/data/datasets/classification/tiger_wsibulk.py +++ b/src/eva/vision/data/datasets/classification/tiger_wsibulk.py @@ -1,5 +1,6 @@ """tiger_wsibulk dataset class.""" +import ast import functools import os from pathlib import Path @@ -87,7 +88,14 @@ def annotations(self) -> Dict[str, int]: file_name = row.file - coords = PatchCoordinates(**row._asdict()) + row_dict = row._asdict() + coords = PatchCoordinates( + x_y=ast.literal_eval(row_dict["x_y"]) if isinstance(row_dict["x_y"], str) else row_dict["x_y"], + width=int(row_dict["width"]), + height=int(row_dict["height"]), + level_idx=int(row_dict["level_idx"]), + mask=None, +) annotations.update( self._process_patch_coordinates(file_name, coords, self._tumor_mask_threshold) diff --git a/src/eva/vision/data/datasets/tiger.py b/src/eva/vision/data/datasets/tiger.py index 283b57f37..dd035f313 100644 --- a/src/eva/vision/data/datasets/tiger.py +++ b/src/eva/vision/data/datasets/tiger.py @@ -28,7 +28,7 @@ class TIGERBase( _test_split_ratio: float = 0.15 _target_mpp: float = 0.5 - '''Target microns per pixel (mpp) for patches.''' + """Target microns per pixel (mpp) for patches.""" def __init__( self, @@ -113,16 +113,20 @@ def _load_file_paths(self, split: Literal["train", "val", "test"] | None = None) raise FileNotFoundError(f"No .tif files found in {image_dir}") train_indices, val_indices, test_indices = splitting.random_split( - all_paths, self._train_split_ratio, self._val_split_ratio, self._test_split_ratio, self._seed + all_paths, + self._train_split_ratio, + self._val_split_ratio, + self._test_split_ratio, + self._seed, ) if split == "train": - selected_paths = all_paths[train_indices[0] : train_indices[1]] + 
+            selected_paths = [all_paths[i] for i in train_indices]
         elif split == "val":
-            selected_paths = all_paths[val_indices[0] : val_indices[1]]
+            selected_paths = [all_paths[i] for i in val_indices]
         elif split == "test":
-            selected_paths = all_paths[test_indices[0] : test_indices[1]]
-        elif split is None:
+            selected_paths = [all_paths[i] for i in test_indices]
+        else:
             selected_paths = all_paths
 
         return [os.path.relpath(path, self._root) for path in selected_paths]

From 9137aad27d6521bdcfce7ff779945bc5407d5451 Mon Sep 17 00:00:00 2001
From: Jklubienski
Date: Fri, 10 Oct 2025 12:52:35 +0100
Subject: [PATCH 7/7] Added unit tests

---
 .../annotations-tumor-bulk/masks/103S.tif     | Bin 0 -> 246784 bytes
 .../annotations-tumor-bulk/masks/111S.tif     | Bin 0 -> 246784 bytes
 .../annotations-tumor-bulk/masks/119S.tif     | Bin 0 -> 246784 bytes
 .../annotations-tumor-bulk/masks/124S.tif     | Bin 0 -> 246784 bytes
 .../annotations-tumor-bulk/masks/127B.tif     | Bin 0 -> 246784 bytes
 .../embeddings_dir/coords_test.csv            |   2 +
 .../embeddings_dir/coords_train.csv           |   4 +
 .../embeddings_dir/coords_val.csv             |   2 +
 .../datasets/tiger_wsibulk/images/103S.tif    | Bin 0 -> 246784 bytes
 .../datasets/tiger_wsibulk/images/111S.tif    | Bin 0 -> 246784 bytes
 .../datasets/tiger_wsibulk/images/119S.tif    | Bin 0 -> 246784 bytes
 .../datasets/tiger_wsibulk/images/124S.tif    | Bin 0 -> 246784 bytes
 .../datasets/tiger_wsibulk/images/127B.tif    | Bin 0 -> 246784 bytes
 .../classification/test_tiger_wsibulk.py      | 106 ++++++++++++++++++
 14 files changed, 114 insertions(+)
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif
 create mode 100644 tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif
 create mode 100644 tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/103S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: all ten .tif fixtures in this patch are byte-identical copies of a single 246784-byte placeholder image, blob 590a9455861b0c44cd2e6b45fcc9b40f775d5f1c]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/111S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/119S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/124S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/annotations-tumor-bulk/masks/127B.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv
new file mode 100644
index 000000000..f5cf97fb6
--- /dev/null
+++ b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_test.csv
@@ -0,0 +1,2 @@
+file,x_y,width,height,level_idx
+images/103S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1
diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv
new file mode 100644
index 000000000..0c985521f
--- /dev/null
+++ b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_train.csv
@@ -0,0 +1,4 @@
+file,x_y,width,height,level_idx
+images/127B.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1
+images/119S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1
+images/124S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1
diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv
new file mode 100644
index 000000000..78fc82441
--- /dev/null
+++ b/tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/coords_val.csv
@@ -0,0 +1,2 @@
+file,x_y,width,height,level_idx
+images/111S.tif,"[(96, 160), (160, 64), (64, 64), (96, 0), (0, 224), (64, 32), (96, 96), (192, 192), (224, 224), (160, 128), (160, 192), (0, 160), (224, 160), (96, 32), (128, 0), (64, 160), (224, 128), (64, 128), (224, 0), (192, 64), (192, 224), (224, 64), (192, 96), (0, 128), (96, 64), (32, 224), (224, 96), (64, 224), (96, 128), (160, 0), (32, 32), (128, 160), (96, 224), (128, 224), (64, 0), (0, 96), (128, 64), (96, 192), (32, 64), (192, 0), (224, 32), (0, 192), (128, 192), (32, 96), (192, 128), (160, 32), (64, 192), (64, 96), (128, 96), (0, 0), (160, 224), (192, 32), (160, 160), (32, 128), (192, 160), (160, 96), (32, 192), (224, 192), (0, 64), (128, 128), (128, 32), (0, 32), (32, 160), (32, 0)]",16,16,1
diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/103S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/111S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/119S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/124S.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif b/tests/eva/assets/vision/datasets/tiger_wsibulk/images/127B.tif
new file mode 100644
index 0000000000000000000000000000000000000000..590a9455861b0c44cd2e6b45fcc9b40f775d5f1c
GIT binary patch
literal 246784
[base85 payload omitted: identical to masks/103S.tif above]

literal 0
HcmV?d00001

diff --git a/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py b/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py
new file mode 100644
index 000000000..36351439e
--- /dev/null
+++ b/tests/eva/vision/data/datasets/classification/test_tiger_wsibulk.py
@@ -0,0 +1,106 @@
+"""Tiger WSIBULK dataset tests."""
+
+import os
+from typing import Any, Literal
+
+import pytest
+import torch
+import torchvision.transforms.v2 as torch_transforms
+from torchvision import tv_tensors
+
+from eva.vision.data import datasets
+from eva.vision.data import transforms as eva_transforms
+from eva.vision.data.wsi.patching import samplers
+
+TARGET_SIZE = 224
+DEFAULT_ARGS = {
+    "width": 16,
+    "height": 16,
+    "sampler": samplers.GridSampler(),
+    "backend": "openslide",
+    "image_transforms": torch_transforms.Compose([eva_transforms.ResizeAndCrop(size=TARGET_SIZE)]),
+    "embeddings_dir": "tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir",
+}
+
+
+def test_split_and_expected_shapes(root: str, monkeypatch):
+    """Test loading the dataset with different splits."""
+
+    train_dataset = datasets.TIGERWsiBulk(root=root, split="train", **DEFAULT_ARGS)
+    val_dataset = datasets.TIGERWsiBulk(root=root, split="val", **DEFAULT_ARGS)
+    test_dataset = datasets.TIGERWsiBulk(root=root, split="test", **DEFAULT_ARGS)
+
+    _setup_datasets(train_dataset, val_dataset, test_dataset, monkeypatch=monkeypatch)
+
+    assert len(train_dataset) == 192
+    assert len(val_dataset) == 64
+    assert len(test_dataset) == 64
+
+    _check_batch_shape(train_dataset[0])
+    _check_batch_shape(val_dataset[0])
+    _check_batch_shape(test_dataset[0])
+
+
+@pytest.mark.parametrize("split", ["train", "val", "test", None])
+def test_filenames(root: str, split: Literal["train", "val", "test"] | None, monkeypatch):
+    """Tests that the number of filenames matches the dataset size."""
+    dataset = datasets.TIGERWsiBulk(root=root, split=split, **DEFAULT_ARGS)
+    _setup_datasets(dataset, monkeypatch=monkeypatch)
+
+    filenames = set()
+    for i in range(len(dataset)):
+        filenames.add(dataset.filename(i))
+
+    assert len(filenames) == len(dataset.datasets)
+
+
+def _check_batch_shape(batch: Any):
+    assert isinstance(batch, tuple)
+    assert len(batch) == 3
+
+    image, target, metadata = batch
+    assert isinstance(image, tv_tensors.Image)
+    assert image.shape == (3, TARGET_SIZE, TARGET_SIZE)
+
+    assert isinstance(target, torch.Tensor)
+    assert isinstance(metadata, dict)
+    assert "wsi_id" in metadata
+    assert "x" in metadata
+    assert "y" in metadata
+    assert "width" in metadata
+    assert "height" in metadata
+    assert "level_idx" in metadata
+
+
+@pytest.fixture
+def root(assets_path: str) -> str:
+    """Fixture returning the root directory of the dataset."""
+    return os.path.join(assets_path, "vision/datasets/tiger_wsibulk")
+
+
+def _setup_datasets(*dataset_splits: datasets.TIGERWsiBulk, monkeypatch):
+
+    monkeypatch.setattr(
+        datasets.TIGERWsiBulk,
+        "_expected_dataset_lengths",
+        {"train": 3, "val": 1, "test": 1, None: 5},
+    )
+
+    split_to_file = {
+        "train": "coords_train.csv",
+        "val": "coords_val.csv",
+        "test": "coords_test.csv",
+    }
+
+    for dataset in dataset_splits:
+
+        split = dataset._split
+        if split is not None:
+            csv_file = split_to_file[split]
+            monkeypatch.setattr(
+                dataset,
+                "_coords_path",
+                f"tests/eva/assets/vision/datasets/tiger_wsibulk/embeddings_dir/{csv_file}",
+            )
+
+        dataset.setup()
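
Note on the coords parsing exercised by these fixtures: the `x_y` column in
coords_train.csv / coords_val.csv / coords_test.csv stores a stringified
Python list of (x, y) tuples, which is why PATCH 6/7 routes the value through
`ast.literal_eval` before constructing `PatchCoordinates`. The sketch below
shows that round trip in isolation. It is a minimal illustration, not the
library code: `PatchCoordinates` here is a simplified stand-in for the class
in eva.vision.data.wsi.patching.coordinates, and `coordinates_from_row` is a
hypothetical helper mirroring the logic inside the patched `annotations()`.

import ast
import dataclasses
from typing import Any, Dict, List, Tuple


@dataclasses.dataclass
class PatchCoordinates:
    """Simplified stand-in for the library's PatchCoordinates (illustration only)."""

    x_y: List[Tuple[int, int]]
    width: int
    height: int
    level_idx: int
    mask: Any = None


def coordinates_from_row(row_dict: Dict[str, Any]) -> PatchCoordinates:
    """Builds patch coordinates from one coords CSV row, as in annotations()."""
    raw_x_y = row_dict["x_y"]
    # ast.literal_eval parses the stringified list of (x, y) tuples without
    # executing arbitrary code; the isinstance guard (mirrored from the patch)
    # also accepts a value that has already been parsed upstream.
    x_y = ast.literal_eval(raw_x_y) if isinstance(raw_x_y, str) else raw_x_y
    return PatchCoordinates(
        x_y=x_y,
        width=int(row_dict["width"]),
        height=int(row_dict["height"]),
        level_idx=int(row_dict["level_idx"]),
        mask=None,
    )


if __name__ == "__main__":
    # One row in the shape of the coords_*.csv fixtures above (shortened x_y).
    row = {
        "file": "images/103S.tif",
        "x_y": "[(96, 160), (160, 64), (64, 64)]",
        "width": "16",
        "height": "16",
        "level_idx": "1",
    }
    coords = coordinates_from_row(row)
    assert coords.x_y == [(96, 160), (160, 64), (64, 64)]
    assert (coords.width, coords.height, coords.level_idx) == (16, 16, 1)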