Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions configs/vision/pathology/offline/classification/tiger_wsibulk.yaml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plz add this config to tests/eva/vision/test_vision_cli.py (at least to test_configuration_initialization, ideally also to test_predict_fit_from_configuration), so we can test for instantiation errors

Copy link
Collaborator

@nkaenzig nkaenzig Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding some feedback I got from claude / codex here:

• src/eva/vision/data/datasets/tiger.py:108: _load_file_paths only glob matches *.tif. The TIGER docs you just added (docs/datasets/tiger.md, see the sample tree) show the WSIs using the .tiff extension.

• src/eva/vision/data/datasets/classification/tiger_wsibulk.py:86: the dataset now emits progress via print every time annotations are built. In multi-worker loaders this will spam STDOUT and can even deadlock under some multiprocessing backends; please route this through the project logger (or make it optional).

Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
---
trainer:
class_path: eva.Trainer
init_args:
n_runs: &N_RUNS ${oc.env:N_RUNS, 20}
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/tiger_wsibulk}
max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 100}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: ${oc.env:SAVE_LAST, false}
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 20}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
- class_path: eva.callbacks.ClassificationEmbeddingsWriter
init_args:
output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/tiger_wsibulk}
dataloader_idx_map:
0: train
1: val
2: test
metadata_keys: ["wsi_id"]
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
overwrite: true
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
model:
class_path: eva.HeadModule
init_args:
head:
class_path: eva.vision.models.networks.ABMIL
init_args:
input_size: ${oc.env:IN_FEATURES, 384}
output_size: &NUM_CLASSES 1
projected_input_size: 128
criterion: torch.nn.BCEWithLogitsLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.001}
betas: [0.9, 0.999]
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.BinaryClassificationMetrics
data:
class_path: eva.DataModule
init_args:
datasets:
train:
class_path: eva.datasets.MultiEmbeddingsClassificationDataset
init_args: &DATASET_ARGS
root: *DATASET_EMBEDDINGS_ROOT
manifest_file: manifest.csv
split: train
embeddings_transforms:
class_path: eva.core.data.transforms.Pad2DTensor
init_args:
pad_size: &N_PATCHES ${oc.env:N_PATCHES, 200}
target_transforms:
class_path: eva.core.data.transforms.dtype.ArrayToFloatTensor
val:
class_path: eva.datasets.MultiEmbeddingsClassificationDataset
init_args:
<<: *DATASET_ARGS
split: val
test:
class_path: eva.datasets.MultiEmbeddingsClassificationDataset
init_args:
<<: *DATASET_ARGS
split: test
predict:
- class_path: eva.vision.datasets.TIGERWsiBulk
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/training/wsibulk}
sampler:
class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler
init_args:
max_samples: *N_PATCHES
embeddings_dir: *DATASET_EMBEDDINGS_ROOT
width: 224
height: 224
split: train
coords_path: ${data.init_args.datasets.train.init_args.root}/coords_${.split}.csv
image_transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
size: ${oc.env:RESIZE_DIM, 224}
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
- class_path: eva.vision.datasets.TIGERWsiBulk
init_args:
<<: *PREDICT_DATASET_ARGS
split: val
- class_path: eva.vision.datasets.TIGERWsiBulk
init_args:
<<: *PREDICT_DATASET_ARGS
split: test
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
test:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
predict:
batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
num_workers: *N_DATA_WORKERS
68 changes: 68 additions & 0 deletions docs/datasets/tiger.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# TIGER (Tumor Infiltrating Lymphocytes in breast cancER)

TIGER contains digital pathology images of Her2 positive (Her2+) and Triple Negative (TNBC) breast cancer whole-slide images, together with manual annotations. Training data comes from multiple sources. A subset of Her2+ and TNBC cases is provided by the Radboud University Medical Center (RUMC) (Nijmegen, Netherlands). A second subset of Her2+ and TNBC cases is provided by the Jules Bordet Institut (JB) (Bruxelles, Belgium). A third subset of TNBC cases only is derived from the TCGA-BRCA archive obtained from the Genomic Data Commons Data Portal.

It contains 3 different datasets and thus 3 different tasks to add to eva.

WSIBULK - WSI level classification task: Detecting tumour presence in patches of a given slide.
WSITILS - Regression task: predicting "TIL" score of a whole slide image.
WSIROIS - Cell level segmentation task: predicting boundaries of TIL cells.

However, only WSIBULK and WSITILS are currently implemented.

Source: https://tiger.grand-challenge.org/Data/


## Raw data

### Key stats

| | |
|---------------------------|----------------------------------------------------------|
| **Modality** | Vision (WSI) |
| **Tasks** | Binary Classification / Regression |
| **Cancer type** | Breast |
| **Data size** | 182 GB |
| **Image dimension** | ~20k x 20k x 3 |
| **Magnification (μm/px)** | 20x (0.5) - Level 0 |
| **Files format** | `.tif` |
| **Number of images** | 178 WSIs (96 for WSIBULK and 82 for WSITILS) |


### Organization

The data `tiger.zip` from [grand challenge](https://tiger.grand-challenge.org/) is organized as follows:

training/
|_wsibulk/ * Used for classification task
| |__annotations-tumor-bulk/ * Manual annotations of "tumor bulk" regions
| | |___masks/ * Binary masks in TIF format
| | |___xmls/ * Not used in eva
| |__images/ * Whole-Slide Images
| │ ├── 103S.tif
│ | └── ...
| |__tissue-masks/ * Not used in eva
|
|_wsirois/ * Not used in eva currently
|
|_wsitils/ * Used for regression task
| |__images/ * Whole-slide images
| │ ├── 104S.tif
│ | └── ...
| |__tissue-masks/ * Not used in eva
| |__tiger-til-scores-wsitils.csv * Target variable file


## Download and preprocessing

The `TIGER` dataset class doesn't download the data during runtime; the data must be downloaded manually as follows:

- Make sure that the latest version of the AWS CLI is installed on your system by following [these instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)

With the AWS CLI installed, you can download the official training set (no AWS account required) by running:

`aws s3 cp s3://tiger-training/ /path/to/destination/ --recursive --no-sign-request`

These instructions can also be found on the official challenge page [here](https://tiger.grand-challenge.org/Data/)

We then generate random stratified train / validation and test splits using a 0.7 / 0.15 / 0.15 ratio.
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
GleasonArvaniti,
PANDASmall,
PatchCamelyon,
TIGERWsiBulk,
UniToPatho,
WsiClassificationDataset,
)
Expand Down Expand Up @@ -49,4 +50,5 @@
"VisionDataset",
"MultiWsiDataset",
"WsiDataset",
"TIGERWsiBulk",
]
3 changes: 2 additions & 1 deletion src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from eva.vision.data.datasets.classification.mhist import MHIST
from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.tiger_wsibulk import TIGERWsiBulk
from eva.vision.data.datasets.classification.unitopatho import UniToPatho
from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset

Expand All @@ -25,5 +26,5 @@
"WsiClassificationDataset",
"PANDA",
"PANDASmall",
"Camelyon16",
"TIGERWsiBulk",
]
150 changes: 150 additions & 0 deletions src/eva/vision/data/datasets/classification/tiger_wsibulk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""tiger_wsibulk dataset class."""

import ast
import functools
import os
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import tifffile as tiff
import torch
from typing_extensions import override

from eva.core.utils.progress_bar import tqdm
from eva.vision.data.datasets import _validators, tiger
from eva.vision.data.wsi.patching import PatchCoordinates, samplers


class TIGERWsiBulk(tiger.TIGERBase):
    """Dataset class for the TIGER tumor detection task.

    Splits a slide-level WSI into multiple patch-level samples and
    dynamically assigns each patch a binary label based on its overlap
    with the slide's binary tumor-bulk mask.
    """

    _expected_dataset_lengths: Dict[str | None, int] = {
        "train": 65,
        "val": 13,
        "test": 15,
        None: 93,
    }
    """Represents the expected numbers of WSIs in the dataset for validation.
    Can be overridden for unit tests."""

    _tumor_mask_threshold: float = 0.5
    """Proportion of the patch that needs to be covered by the mask in order for it to
    be annotated as a "tumor" (1)."""

    _target_mpp: float = 0.5
    """Target resolution in microns per pixel (each pixel covers 0.5 microns).
    Set as a constant in this implementation to ensure no mismatches with the binary mask."""

    def __init__(
        self,
        root: str,
        sampler: samplers.Sampler,
        embeddings_dir: str,
        **kwargs,
    ) -> None:
        """Initializes dataset.

        Args:
            root: Root directory of the dataset.
            sampler: The sampler to use for sampling patch coordinates.
            embeddings_dir: Directory where the patch data is stored. The
                `coords_<split>.csv` files used to build the annotations
                are read from this directory.
            kwargs: Key-word arguments from the base class.
        """
        # Stored before the base initializer runs so that it is already
        # available should the base class access the annotations.
        self._embeddings_dir = embeddings_dir
        super().__init__(root=root, sampler=sampler, **kwargs)

    @functools.cached_property
    def annotations(self) -> Dict[str, int]:
        """Builds per-patch labels from the coords CSV files and mask .tif images.

        Only the CSV of the dataset's own split is read; when the split is
        not one of "train", "val" or "test" (e.g. `None`), all three are read.
        The result is computed once and cached on the instance.

        Returns:
            A dict: { "img_name-patch_index": label }
        """
        known_splits = ["train", "val", "test"]
        splits_to_load = [self._split] if self._split in known_splits else known_splits
        csv_folder = os.path.normpath(self._embeddings_dir)

        annotations: Dict[str, int] = {}
        for split in splits_to_load:
            df = pd.read_csv(os.path.join(csv_folder, f"coords_{split}.csv"))

            for row in tqdm(df.itertuples(index=False), total=len(df), desc=f"[{split}]"):
                # The coordinate list is stored as its string representation
                # when read back from CSV, so it has to be parsed first.
                x_y = row.x_y
                if isinstance(x_y, str):
                    x_y = ast.literal_eval(x_y)

                coords = PatchCoordinates(
                    x_y=x_y,
                    width=int(row.width),
                    height=int(row.height),
                    level_idx=int(row.level_idx),
                    mask=None,
                )
                annotations.update(
                    self._process_patch_coordinates(row.file, coords, self._tumor_mask_threshold)
                )

        return annotations

    def _process_patch_coordinates(
        self, file: str, coords: PatchCoordinates, threshold: float
    ) -> Dict[str, int]:
        """Labels every patch of one slide using the slide's tumor-bulk mask.

        Args:
            file: Path or file name of the slide; its stem selects the mask file.
            coords: The patch coordinates (and patch size) sampled for the slide.
            threshold: Fraction of non-zero mask pixels above which a patch
                is labelled as tumor (1).

        Returns:
            A dict: { "img_name-patch_index": label }
        """
        img_name = Path(file).stem
        patch_w = int(coords.width)
        patch_h = int(coords.height)

        mask_path = os.path.join(self._root, "annotations-tumor-bulk", "masks", f"{img_name}.tif")
        # NOTE(review): the mask is indexed directly with the patch coordinates,
        # which assumes both live in the same pixel grid - confirm for level_idx != 0.
        mask = tiff.imread(mask_path)

        annotations: Dict[str, int] = {}
        for idx, (x, y) in enumerate(coords.x_y):
            patch_region = mask[y : y + patch_h, x : x + patch_w]
            if patch_region.size == 0:
                # The patch does not intersect the mask at all. Taking the mean
                # of an empty slice would yield NaN (plus a RuntimeWarning), so
                # it is labelled explicitly as non-tumor instead.
                label = 0
            else:
                tumor_fraction = np.mean(patch_region > 0)
                label = 1 if tumor_fraction > threshold else 0
            annotations[f"{img_name}-{idx}"] = label

        return annotations

    @override
    def prepare_data(self) -> None:
        """Checks that the dataset exists under the root directory."""
        _validators.check_dataset_exists(self._root, False)

    @override
    def validate(self) -> None:
        """Checks that the number of slide files matches the expected count of the split."""
        _validators.check_number_of_files(
            self._file_paths, self._expected_dataset_lengths[self._split], self._split
        )

    @override
    def load_target(self, index: int) -> torch.Tensor:
        """Returns the label of the patch at the given dataset index.

        Args:
            index: The index of the sample (patch) in the dataset.

        Returns:
            The patch label as a scalar `int64` tensor.
        """
        metadata = self.load_metadata(index)
        slide_idx = metadata["slide_idx"]
        patch_idx = metadata["patch_idx"]

        # Annotation keys are built as "<slide stem>-<patch index>".
        slide_name = Path(self._file_paths[slide_idx]).stem
        label = self.annotations[f"{slide_name}-{patch_idx}"]

        return torch.tensor(label, dtype=torch.int64)
Loading
Loading