Skip to content

Commit 941c9cd

Browse files
authored
Merge pull request #783 from sevmag/new_dataset_snowstorm
add snowstorm_dataset and IceCubeHostedDataset class
2 parents 0734941 + 78843af commit 941c9cd

File tree

5 files changed

+336
-1
lines changed

5 files changed

+336
-1
lines changed

src/graphnet/data/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@
99
from .pre_configured import I3ToParquetConverter
1010
from .pre_configured import I3ToSQLiteConverter
1111
from .datamodule import GraphNeTDataModule
12-
from .curated_datamodule import CuratedDataset, ERDAHostedDataset
12+
from .curated_datamodule import (
13+
CuratedDataset,
14+
ERDAHostedDataset,
15+
IceCubeHostedDataset,
16+
)

src/graphnet/data/constants.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,27 @@ class FEATURES:
2929
"sensor_pos_z",
3030
"t",
3131
]
32+
# Pulse-level input features available in the SnowStorm SQLite files
# (per-DOM position, charge/time, and per-DOM quality/geometry flags).
SNOWSTORM = [
    "dom_x",
    "dom_y",
    "dom_z",
    "charge",
    "dom_time",
    "width",
    "pmt_area",
    "rde",
    "is_bright_dom",
    "is_bad_dom",
    "is_saturated_dom",
    "is_errata_dom",
    "event_time",
    "hlc",
    "awtd",
    "string",
    "pmt_number",
    "dom_number",
    "dom_type",
]
3253
KAGGLE = ["x", "y", "z", "time", "charge", "auxiliary"]
3354
LIQUIDO = ["sipm_x", "sipm_y", "sipm_z", "t"]
3455

@@ -84,6 +105,57 @@ class TRUTH:
84105
"primary_hadron_1_energy",
85106
"total_energy",
86107
]
108+
# Event-level truth variables available in the SnowStorm SQLite files:
# primary-particle kinematics, filter/processing-level booleans, and
# MC/GN labelling quantities.
SNOWSTORM = [
    "energy",
    "position_x",
    "position_y",
    "position_z",
    "azimuth",
    "zenith",
    "pid",
    "event_time",
    "interaction_type",
    "elasticity",
    "RunID",
    "SubrunID",
    "EventID",
    "SubEventID",
    "dbang_decay_length",
    "track_length",
    "stopped_muon",
    "energy_track",
    "energy_cascade",
    "inelasticity",
    "DeepCoreFilter_13",
    "CascadeFilter_13",
    "MuonFilter_13",
    "OnlineL2Filter_17",
    "L3_oscNext_bool",
    "L4_oscNext_bool",
    "L5_oscNext_bool",
    "L6_oscNext_bool",
    "L7_oscNext_bool",
    "Homogenized_QTot",
    "MCLabelClassification",
    "MCLabelCoincidentMuons",
    "MCLabelBgMuonMCPE",
    "MCLabelBgMuonMCPECharge",
    "GNLabelTrackEnergyDeposited",
    "GNLabelTrackEnergyOnEntrance",
    "GNLabelTrackEnergyOnEntrancePrimary",
    "GNLabelTrackEnergyDepositedPrimary",
    "GNLabelEnergyPrimary",
    "GNLabelCascadeEnergyDepositedPrimary",
    "GNLabelCascadeEnergyDeposited",
    "GNLabelEnergyDepositedTotal",
    "GNLabelEnergyDepositedPrimary",
    "GNLabelHighestEInIceParticleIsChild",
    "GNLabelHighestEInIceParticleDistance",
    "GNLabelHighestEInIceParticleEFraction",
    "GNLabelHighestEInIceParticleEOnEntrance",
    "GNLabelHighestEDaughterDistance",
    "GNLabelHighestEDaughterEFraction",
]
87159
KAGGLE = ["zenith", "azimuth"]
88160
LIQUIDO = [
89161
"vertex_x",

src/graphnet/data/curated_datamodule.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Dict, Any, Optional, List, Tuple, Union
99
from abc import abstractmethod
1010
import os
11+
from glob import glob
1112

1213
from .datamodule import GraphNeTDataModule
1314
from graphnet.models.graphs import GraphDefinition
@@ -280,3 +281,75 @@ def prepare_data(self) -> None:
280281
os.system(f"wget -O {file_path} {self._mirror}/{file_hash}")
281282
os.system(f"tar -xf {file_path} -C {self.dataset_dir}")
282283
os.system(f"rm {file_path}")
284+
285+
286+
class IceCubeHostedDataset(CuratedDataset):
    """A base class for dataset/datamodule hosted on the IceCube cluster.

    Inheriting subclasses will need to do:
    - fill out the `_zipped_files` attribute, which
      should be a list of paths to files that are compressed using `tar` with
      extension ".tar.gz" and are stored on the IceCube Cluster in "/data/".
    - implement the `_get_dir_name` method, which should return the
      directory name where the files resulting from the unzipping of a
      compressed file should end up.
    """

    # HTTPS mirror of the IceCube data warehouse; requires IceCube credentials.
    _mirror = "https://convey.icecube.wisc.edu"

    def prepare_data(self) -> None:
        """Download and unpack any archives not already present locally."""
        assert hasattr(self, "_zipped_files") and (
            len(self._zipped_files) > 0
        ), "Subclasses must define a non-empty `_zipped_files` list."

        # Check which files still need to be downloaded
        files_to_dl = self._resolve_downloads()
        if not files_to_dl:
            return

        # Download files. `wget --ask-password` prompts on the terminal, so
        # this method requires an interactive session.
        # NOTE(review): paths are interpolated into shell commands; they come
        # from class attributes rather than user input, but
        # `subprocess.run([...])` with a list would be safer than os.system.
        USER = input("Username: ")
        source_file_paths = " ".join(
            [f"{self._mirror}{f}" for f in files_to_dl]
        )
        os.system(
            f"wget -P {self.dataset_dir} --user={USER} "
            + f"--ask-password {source_file_paths}"
        )

        # Unzip each archive into a scratch directory, then move only the
        # SQLite database files into their final location.
        for file in glob(os.path.join(self.dataset_dir, "*.tar.gz")):
            tmp_dir = os.path.join(self.dataset_dir, "tmp")
            # exist_ok=True: tolerate leftovers from an interrupted run
            # (plain os.mkdir would raise FileExistsError).
            os.makedirs(tmp_dir, exist_ok=True)
            os.system(f"tar -xzf {file} -C {tmp_dir}")
            unzip_dir = self._get_dir_name(file)
            os.makedirs(unzip_dir, exist_ok=True)
            for db_file in glob(
                os.path.join(tmp_dir, "**/*.db"), recursive=True
            ):
                os.system(f"mv {db_file} {unzip_dir}")

            # Clean up the archive and the scratch directory.
            os.system(f"rm {file}")
            os.system(f"rm -r {tmp_dir}")

    @abstractmethod
    def _get_dir_name(self, source_file_path: str) -> str:
        """Get directory name from source file path.

        E.g. if `source_file_path` is "/data/set/file.tar.gz",
        return os.path.join(self.dataset_dir, source_file_path.split("/")[-2])
        to have 'set' as the directory name where all files resulting from the
        unzipping of `source_file_path` end up. If no substructure is desired,
        just return `self.dataset_dir`
        """
        raise NotImplementedError

    def _resolve_downloads(self) -> List[str]:
        """Resolve which files still need to be downloaded."""
        if not os.path.exists(self.dataset_dir):
            return self._zipped_files
        # A source archive is considered already downloaded once its target
        # extraction directory exists locally.
        return [
            src
            for src in self._zipped_files
            if not os.path.exists(self._get_dir_name(src))
        ]

src/graphnet/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22

33
from .test_dataset import TestDataset
44
from .prometheus_datasets import TRIDENTSmall, BaikalGVDSmall, PONESmall
5+
from .snowstorm_dataset import SnowStormDataset
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
"""Snowstorm dataset module hosted on the IceCube Collaboration servers."""
2+
3+
import pandas as pd
4+
import re
5+
import os
6+
from typing import Dict, Any, Optional, List, Tuple, Union
7+
from glob import glob
8+
from sklearn.model_selection import train_test_split
9+
10+
from graphnet.data.constants import FEATURES, TRUTH
11+
from graphnet.data.curated_datamodule import IceCubeHostedDataset
12+
from graphnet.data.utilities import query_database
13+
from graphnet.models.graphs import GraphDefinition
14+
15+
# RunIDs of the SnowStorm simulation sets hosted on the cluster:
# three blocks of nine consecutive runs each.
AVAILABLE_RUN_IDS = [
    run_id
    for first in (22010, 22042, 22078)
    for run_id in range(first, first + 9)
]
20+
21+
22+
class SnowStormDataset(IceCubeHostedDataset):
    """IceCube SnowStorm simulation dataset.

    More information can be found at
    https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations
    This is an IceCube Collaboration simulation dataset.
    Requires a username and password.
    """

    _experiment = "IceCube SnowStorm dataset"
    _creator = "Aske Rosted"
    _citation = "arXiv:1909.01530"
    _available_backends = ["sqlite"]

    _pulsemaps = ["SRTInIcePulses"]
    _truth_table = "truth"
    _pulse_truth = None
    _features = FEATURES.SNOWSTORM
    _event_truth = TRUTH.SNOWSTORM
    # Location of the "<RunID>.tar.gz" archives on the IceCube cluster.
    _data_root_dir = "/data/ana/graphnet/Snowstorm_l2"

    def __init__(
        self,
        run_ids: List[int],
        graph_definition: GraphDefinition,
        download_dir: str,
        truth: Optional[List[str]] = None,
        features: Optional[List[str]] = None,
        train_dataloader_kwargs: Optional[Dict[str, Any]] = None,
        validation_dataloader_kwargs: Optional[Dict[str, Any]] = None,
        test_dataloader_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Construct SnowStormDataset.

        Args:
            run_ids: List of RunIDs to include.
            graph_definition: Method that defines the data representation.
            download_dir: Directory to download dataset to.
            truth (Optional): List of event-level truth to include. Will
                include all available information if not given.
            features (Optional): List of input features from pulsemap to use.
                If not given, all available features will be used.
            train_dataloader_kwargs (Optional): Arguments for the training
                DataLoader. Default None.
            validation_dataloader_kwargs (Optional): Arguments for the
                validation DataLoader. Default None.
            test_dataloader_kwargs (Optional): Arguments for the test
                DataLoader. Default None.
        """
        assert all(
            [i in AVAILABLE_RUN_IDS for i in run_ids]
        ), f"RunIDs must be in {AVAILABLE_RUN_IDS}. You provided {run_ids}"
        self._run_ids = run_ids
        # One compressed archive per requested RunID on the cluster.
        self._zipped_files = [
            os.path.join(self._data_root_dir, f"{s}.tar.gz") for s in run_ids
        ]

        super().__init__(
            graph_definition=graph_definition,
            download_dir=download_dir,
            truth=truth,
            features=features,
            backend="sqlite",
            train_dataloader_kwargs=train_dataloader_kwargs,
            validation_dataloader_kwargs=validation_dataloader_kwargs,
            test_dataloader_kwargs=test_dataloader_kwargs,
        )

    def _prepare_args(
        self, backend: str, features: List[str], truth: List[str]
    ) -> Tuple[Dict[str, Any], Union[List[int], None], Union[List[int], None]]:
        """Prepare arguments for dataset.

        Returns the dataset constructor kwargs plus per-database train/val
        and test event selections (90/10 split, fixed seed).
        """
        assert backend == "sqlite"

        # Collect every extracted database file for the requested RunIDs.
        dataset_paths = []
        for rid in self._run_ids:
            dataset_paths += glob(
                os.path.join(self.dataset_dir, str(rid), "**/*.db"),
                recursive=True,
            )

        # Extract the RunID from each path, which is laid out as
        # "<dataset_dir>/<RunID>/...".
        pattern = rf"{re.escape(self.dataset_dir)}/(\d+)/.*"

        # get event numbers from all datasets
        event_no = []
        # Fix: the original initialized this dict twice in a row; once is
        # enough (the second, un-annotated assignment was redundant).
        event_counts: Dict[str, int] = {}
        for path in dataset_paths:

            # Extract the ID
            match = re.search(pattern, path)
            assert match
            run_id = match.group(1)

            query_df = query_database(
                database=path,
                query=f"SELECT event_no FROM {self._truth_table}",
            )
            query_df["path"] = path
            event_no.append(query_df)

            # save event count for description
            event_counts[run_id] = (
                event_counts.get(run_id, 0) + query_df.shape[0]
            )

        event_no = pd.concat(event_no, axis=0)

        # split the non-unique event numbers into train/val and test
        train_val, test = train_test_split(
            event_no,
            test_size=0.10,
            random_state=42,
            shuffle=True,
        )

        train_val = train_val.groupby("path")
        test = test.groupby("path")

        # Parse into the right format for CuratedDataset: one selection list
        # per database path, in the same order as `dataset_paths`.
        # NOTE(review): `get_group` raises KeyError if a database contributed
        # no events to one of the splits — assumed not to happen in practice;
        # confirm for very small databases.
        train_val_selection = []
        test_selection = []
        for path in dataset_paths:
            train_val_selection.append(
                train_val["event_no"].get_group(path).tolist()
            )
            test_selection.append(test["event_no"].get_group(path).tolist())

        dataset_args = {
            "truth_table": self._truth_table,
            "pulsemaps": self._pulsemaps,
            "path": dataset_paths,
            "graph_definition": self._graph_definition,
            "features": features,
            "truth": truth,
        }

        self._create_comment(event_counts)

        return dataset_args, train_val_selection, test_selection

    @classmethod
    def _create_comment(cls, event_counts: Optional[Dict[str, int]] = None) -> None:
        """Compose the dataset comment listing the event count per RunID.

        Stores the result on the class as `_comments`.
        """
        # Fix: avoid a mutable default argument ({}); None is the sentinel
        # for "no counts available". Calling with no argument behaves as
        # before.
        if event_counts is None:
            event_counts = {}
        fixed_string = (
            " Simulation produced by the IceCube Collaboration, "
            + "https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations"  # noqa: E501
        )
        tot = 0
        runid_string = ""
        for k, v in event_counts.items():
            runid_string += f"RunID {k} contains {v:10d} events\n"
            tot += v
        cls._comments = (
            f"Contains ~{tot/1e6:.1f} million events:\n"
            + runid_string
            + fixed_string
        )

    def _get_dir_name(self, source_file_path: str) -> str:
        """Return the extraction directory for `source_file_path`.

        Archives are named "<RunID>.tar.gz", so each RunID gets its own
        subdirectory of `dataset_dir`.
        """
        file_name = os.path.basename(source_file_path).split(".")[0]
        return str(os.path.join(self.dataset_dir, file_name))

0 commit comments

Comments
 (0)