From 649dd15e65e26af1710e20ee3755db20bdf103cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 7 Aug 2025 22:32:54 +0200 Subject: [PATCH 1/6] Update _base.py --- skpro/distributions/base/_base.py | 93 ++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/skpro/distributions/base/_base.py b/skpro/distributions/base/_base.py index 16afaeddb..bb21cbced 100644 --- a/skpro/distributions/base/_base.py +++ b/skpro/distributions/base/_base.py @@ -236,15 +236,49 @@ def _loc(self, rowidx=None, colidx=None): colidx = pd.Index([colidx]) if rowidx is not None: - row_iloc = self.index.get_indexer_for(rowidx) + row_iloc = self._get_indexer_like_pandas(self.index, rowidx) else: row_iloc = None if colidx is not None: - col_iloc = self.columns.get_indexer_for(colidx) + col_iloc = self._get_indexer_like_pandas(self.columns, colidx) else: col_iloc = None return self._iloc(rowidx=row_iloc, colidx=col_iloc) + def _get_indexer_like_pandas(self, index, keys): + """ + A unified helper that mimics pandas' get_indexer_for but supports: + + - scalar key (e.g., "a", ("a", 1)) + - tuple key (partial or full) + - list of keys (partial or full) + - works for both Index and MultiIndex + + Returns: + np.ndarray of positions (integers) + """ + if is_scalar_notnone(keys) or isinstance(keys, tuple): + keys = [keys] + + if isinstance(index, pd.MultiIndex): + # Use get_locs for each key (full or partial) + ilocs = [] + for key in keys: + if isinstance(key, slice): + ilocs.append(index.slice_indexer(key.start, key.stop, key.step)) + else: + iloc = index.get_locs([key]) + if isinstance(iloc, slice): + iloc = np.arange(len(index))[iloc] + ilocs.append(iloc) + return np.concatenate(ilocs) if ilocs else np.array([], dtype=int) + # if not isinstance(index, pd.MultiIndex): + # Regular Index + if isinstance(keys, slice): + return np.arange(len(index))[index.slice_indexer(keys.start, keys.stop, keys.step)] + return index.get_indexer(keys) + + def _at(self, rowidx=None, colidx=None): if rowidx is not None: row_iloc = self.index.get_indexer_for([rowidx])[0] @@ -772,6 +806,61 @@ def _log_pdf(self, x): raise NotImplementedError(self._method_error_msg("log_pdf", "error")) + def pdfj(self, x): + r"""Probability density function. + + Let :math:`X` be a random variables with the distribution of ``self``, + taking values in ``(N, n)`` ``DataFrame``-s + Let :math:`x\in \mathbb{R}^{N\times n}`. + By :math:`p_{X_{ij}}`, denote the marginal pdf of :math:`X` at the + :math:`(i,j)`-th entry. + + The output of this method, for input ``x`` representing :math:`x`, + is a ``DataFrame`` with same columns and indices as ``self``, + and entries :math:`p_{X_{ij}}(x_{ij})`. + + If ``self`` has a mixed or discrete distribution, this returns + the weighted continuous part of `self`'s distribution instead of the pdf, + i.e., the marginal pdf integrate to the weight of the continuous part. + + Parameters + ---------- + x : ``pandas.DataFrame`` or 2D ``np.ndarray`` + representing :math:`x`, as above + + Returns + ------- + ``pd.DataFrame`` with same columns and index as ``self`` + containing :math:`p_{X_{ij}}(x_{ij})`, as above + """ + distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False) + if distr_type == "discrete": + return self._coerce_to_self_index_df(0, flatten=False) + + return self._boilerplate("_pdf", x=x) + + def _pdf(self, x): + """Probability density function. + + Private method, to be implemented by subclasses. + """ + self_has_logpdf = self._has_implementation_of("log_pdf") + self_has_logpdf = self_has_logpdf or self._has_implementation_of("_log_pdf") + if self_has_logpdf: + approx_method = ( + "by exponentiating the output returned by the log_pdf method, " + "this may be numerically unstable" + ) + warn(self._method_error_msg("pdf", fill_in=approx_method)) + + x = self._coerce_to_self_index_df(x, flatten=False) + res = self.log_pdf(x=x) + if isinstance(res, pd.DataFrame): + res = res.values + return np.exp(res) + + raise NotImplementedError(self._method_error_msg("pdf", "error")) + def pmf(self, x): r"""Probability mass function. From f062c5a60183cd107219ca95ce3c96f81dcf4380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 7 Aug 2025 22:33:11 +0200 Subject: [PATCH 2/6] Update test_all_distrs.py --- skpro/distributions/tests/test_all_distrs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 1fed1c5c9..23d3e191e 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -50,10 +50,12 @@ def _has_capability(distr, method): METHODS_SCALAR = ["mean", "var", "energy"] METHODS_SCALAR_POS = ["var", "energy"] # result always non-negative? -METHODS_X = ["energy", "pdf", "log_pdf", "pmf", "log_pmf", "cdf"] -METHODS_X_POS = ["energy", "pdf", "pmf", "cdf", "surv", "haz"] # result non-negative? +METHODS_X = ["energy", "pdf", "log_pdf", "pmf", "log_pmf", "cdf", "pdfj"] +METHODS_X_POS = [ + "energy", "pdf", "pmf", "cdf", "surv", "haz", "pdfj" +] # result non-negative? METHODS_P = ["ppf"] -METHODS_ROWWISE = ["energy"] # results in one column +METHODS_ROWWISE = ["energy", "pdfj"] # results in one column class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester): From f407d2db6593321eab4d71208a4be34a1f5ebea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 7 Aug 2025 22:33:30 +0200 Subject: [PATCH 3/6] revert --- skpro/distributions/base/_base.py | 38 ++----------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/skpro/distributions/base/_base.py b/skpro/distributions/base/_base.py index bb21cbced..48ad5981f 100644 --- a/skpro/distributions/base/_base.py +++ b/skpro/distributions/base/_base.py @@ -236,49 +236,15 @@ def _loc(self, rowidx=None, colidx=None): colidx = pd.Index([colidx]) if rowidx is not None: - row_iloc = self._get_indexer_like_pandas(self.index, rowidx) + row_iloc = self.index.get_indexer_for(rowidx) else: row_iloc = None if colidx is not None: - col_iloc = self._get_indexer_like_pandas(self.columns, colidx) + col_iloc = self.columns.get_indexer_for(colidx) else: col_iloc = None return self._iloc(rowidx=row_iloc, colidx=col_iloc) - def _get_indexer_like_pandas(self, index, keys): - """ - A unified helper that mimics pandas' get_indexer_for but supports: - - - scalar key (e.g., "a", ("a", 1)) - - tuple key (partial or full) - - list of keys (partial or full) - - works for both Index and MultiIndex - - Returns: - np.ndarray of positions (integers) - """ - if is_scalar_notnone(keys) or isinstance(keys, tuple): - keys = [keys] - - if isinstance(index, pd.MultiIndex): - # Use get_locs for each key (full or partial) - ilocs = [] - for key in keys: - if isinstance(key, slice): - ilocs.append(index.slice_indexer(key.start, key.stop, key.step)) - else: - iloc = index.get_locs([key]) - if isinstance(iloc, slice): - iloc = np.arange(len(index))[iloc] - ilocs.append(iloc) - return np.concatenate(ilocs) if ilocs else np.array([], dtype=int) - # if not isinstance(index, pd.MultiIndex): - # Regular Index - if isinstance(keys, slice): - return np.arange(len(index))[index.slice_indexer(keys.start, keys.stop, keys.step)] - return index.get_indexer(keys) - - def _at(self, rowidx=None, colidx=None): if rowidx is not None: row_iloc = self.index.get_indexer_for([rowidx])[0] From 4630d831da7574535bb5b53343cc2d14bb050d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sat, 1 Nov 2025 15:37:01 +0100 Subject: [PATCH 4/6] Update _base.py --- skpro/distributions/base/_base.py | 75 ++++++++++++++++--------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/skpro/distributions/base/_base.py b/skpro/distributions/base/_base.py index eac60c764..791dade9f 100644 --- a/skpro/distributions/base/_base.py +++ b/skpro/distributions/base/_base.py @@ -699,7 +699,7 @@ def _boilerplate(self, method, columns=None, **kwargs): res = res[()] return res - def pdf(self, x): + def pdf(self, x, axis=None): r"""Probability density function. Let :math:`X` be a random variables with the distribution of ``self``, @@ -713,18 +713,52 @@ def pdf(self, x): and entries :math:`p_{X_{ij}}(x_{ij})`. If ``self`` has a mixed or discrete distribution, this returns - the weighted continuous part of `self`'s distribution instead of the pdf, + the weighted continuous part of ``self``'s distribution instead of the pdf, i.e., the marginal pdf integrate to the weight of the continuous part. + Joint pdfs can be obtained by specifying the ``axis`` argument: + + * ``axis=0`` : joint pdf along rows. + Result is a single-row ``DataFrame`` corresponding to + :math:`p_{X_{\cdot j}}(x_{\cdot j})`, where :math:`X_{\cdot j}` is the + random variable corresponding to the :math:`j`-th column of :math:`X`, + :math:`x_{\cdot j}` is the :math:`j`-th column of :math:`x`, + and :math:`p_{X_{\cdot j}}` is the joint pdf of :math:`X_{\cdot j}`. + * ``axis=1`` : joint pdf along columns. + Result is a single-column ``DataFrame`` corresponding to + :math:`p_{X_{i \cdot}}(x_{i \cdot})`, where :math:`X_{i \cdot}` is the + random variable corresponding to the :math:`i`-th row of :math:`X`, + :math:`x_{i \cdot}` is the :math:`i`-th row of :math:`x`, + * ``axis=(0, 1)`` : joint pdf along rows and columns. + Result is a single scalar value, corresponding to + :math:`p_{X}(x)`, where :math:`p_{X}` is the joint pdf of :math:`X`. + Parameters ---------- x : ``pandas.DataFrame`` or 2D ``np.ndarray`` representing :math:`x`, as above + axis : None or tuple of int, default=None + Axes or axis along which the pdf is joint: + + * None : marginal pdfs are returned (default). + Result has same shape as ``self`` and same index and columns. + * 0 : joint pdf along rows, result has one row and same columns as ``self``. + * 1 : joint pdf along columns, + result has one column and same index as ``self``. + * (0, 1) : joint pdf along rows and columns, + result is a single scalar, a numpy float. Returns ------- - ``pd.DataFrame`` with same columns and index as ``self`` - containing :math:`p_{X_{ij}}(x_{ij})`, as above + ``pd.DataFrame`` + with same columns and index as ``self`` at default (``axis=None``), + containing :math:`p_{X_{ij}}(x_{ij})`, as above. + + * if ``axis=0``, single-row ``DataFrame`` with joint pdfs along rows, + columns same as ``self``, row index is ``[0]`` + * if ``axis=1``, single-column ``DataFrame`` with joint pdfs along columns + index same as ``self``, column name is ``'pdf'`` + * if ``axis=(0, 1)``, single scalar value, a numpy float """ distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False) if distr_type == "discrete": @@ -833,39 +867,6 @@ def _log_pdf(self, x): raise NotImplementedError(self._method_error_msg("log_pdf", "error")) - def pdfj(self, x): - r"""Probability density function. - - Let :math:`X` be a random variables with the distribution of ``self``, - taking values in ``(N, n)`` ``DataFrame``-s - Let :math:`x\in \mathbb{R}^{N\times n}`. - By :math:`p_{X_{i}}`, denote the marginal pdf of :math:`X` at the - :math:`i)`-th row. - - The output of this method, for input ``x`` representing :math:`x`, - is a ``DataFrame`` with same indices as ``self``, a single column ``'pdf'``, - and entries :math:`p_{X_{i}}(x_{i})`. - - If ``self`` has a mixed or discrete distribution, this returns - the weighted continuous part of `self`'s distribution instead of the pdf, - i.e., the marginal pdf integrated to the weight of the continuous part. - - Parameters - ---------- - x : ``pandas.DataFrame`` or 2D ``np.ndarray`` - representing :math:`x`, as above - - Returns - ------- - ``pd.DataFrame`` with same index as ``self`` and single column ``'pdf'``, - containing :math:`p_{X_{i}}(x_{i})`, as above - """ - distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False) - if distr_type == "discrete": - return self._coerce_to_self_index_df(0, flatten=False) - - return self._boilerplate("_jpdf", x=x) - @staticmethod def _approx_derivative(x, fun, h=1e-7): """Approximate the derivative of the log PDF using finite differences. From 778359afe074270bd67e15f0eacfb514e0e235fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sat, 1 Nov 2025 16:10:00 +0100 Subject: [PATCH 5/6] Update _base.py --- skpro/distributions/base/_base.py | 54 ++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/skpro/distributions/base/_base.py b/skpro/distributions/base/_base.py index 791dade9f..8e0ee17af 100644 --- a/skpro/distributions/base/_base.py +++ b/skpro/distributions/base/_base.py @@ -26,6 +26,9 @@ class BaseDistribution(BaseObject): # ------------- "distr:measuretype": "mixed", # distribution type, mixed, continuous, discrete "distr:paramtype": "general", + "property:multivariate": False, # whether distribution is multivariate + "property:indep_axes": (0, 1), # axes along which distr is independent + # # parameterization type - parametric, nonparametric, composite # # default parameter settings for MC estimates @@ -679,6 +682,8 @@ def _boilerplate(self, method, columns=None, **kwargs): x_inner = x.values # else, coerce to a numpy array if needed # then, broadcast it to the shape of self + if k == "axis": + x_inner = _coerce_to_tuple(x) else: x_inner = self._coerce_to_self_index_np(x, flatten=False) kwargs_inner[k] = x_inner @@ -729,7 +734,7 @@ def pdf(self, x, axis=None): :math:`p_{X_{i \cdot}}(x_{i \cdot})`, where :math:`X_{i \cdot}` is the random variable corresponding to the :math:`i`-th row of :math:`X`, :math:`x_{i \cdot}` is the :math:`i`-th row of :math:`x`, - * ``axis=(0, 1)`` : joint pdf along rows and columns. + * ``axis=(0, 1)`` or ``axis=="all"`` : joint pdf along rows and columns. Result is a single scalar value, corresponding to :math:`p_{X}(x)`, where :math:`p_{X}` is the joint pdf of :math:`X`. @@ -737,7 +742,7 @@ def pdf(self, x, axis=None): ---------- x : ``pandas.DataFrame`` or 2D ``np.ndarray`` representing :math:`x`, as above - axis : None or tuple of int, default=None + axis : None, ``"all"``, or tuple of int, default=None Axes or axis along which the pdf is joint: * None : marginal pdfs are returned (default). @@ -764,9 +769,38 @@ def pdf(self, x, axis=None): if distr_type == "discrete": return self._coerce_to_self_index_df(0, flatten=False) - return self._boilerplate("_pdf", x=x) + # handle joint / marginalization + indep_axes = self.get_tag("property:indep_axes", (0, 1)) + if axis is not None: + if axis == "all": + axis = (0, 1) + axis = _coerce_to_tuple(axis) + + axes_to_pass = tuple([ax for ax in axis if ax not in indep_axes]) + axes_to_handle_here = [ax for ax in axis if ax in indep_axes] + + axs = {"axis": axes_to_pass} if len(axes_to_pass) > 0 else {} + else: + axs = {} + axes_to_handle_here = [] + + pdf_val = self._boilerplate("_pdf", x=x, **axs) + + # handle marginalization over independent axes + for ax in axes_to_handle_here: + pdf_val = pdf_val.prod(axis=ax) + if isinstance(pdf_val, pd.Series): + if ax == 0: + pdf_val = pdf_val.to_frame().T + pdf_val.index = pd.Index([0]) + else: + pdf_val = pdf_val.to_frame(name="pdf") + if len(axis) == 2: + pdf_val = pdf_val.values[0, 0] + + return pdf_val - def _pdf(self, x): + def _pdf(self, x, axis=None): """Probability density function. Private method, to be implemented by subclasses. @@ -2030,3 +2064,15 @@ def _coerce_to_pd_index_or_none(x): if isinstance(x, pd.Index): return x return pd.Index(x) + + +def _coerce_to_tuple(x): + """Coerce to tuple.""" + if x is None: + return () + if isinstance(x, tuple): + return x + # if iterable but not string, coerce to tuple + if hasattr(x, "__iter__") and not isinstance(x, str): + return tuple(x) + return (x,) # else, make single-element tuple From 44382432fbc618f13575ea0d1fb348279457c6ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sun, 2 Nov 2025 12:46:26 +0100 Subject: [PATCH 6/6] Update test_all_distrs.py --- skpro/distributions/tests/test_all_distrs.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 23d3e191e..1fed1c5c9 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -50,12 +50,10 @@ def _has_capability(distr, method): METHODS_SCALAR = ["mean", "var", "energy"] METHODS_SCALAR_POS = ["var", "energy"] # result always non-negative? -METHODS_X = ["energy", "pdf", "log_pdf", "pmf", "log_pmf", "cdf", "pdfj"] -METHODS_X_POS = [ - "energy", "pdf", "pmf", "cdf", "surv", "haz", "pdfj" -] # result non-negative? +METHODS_X = ["energy", "pdf", "log_pdf", "pmf", "log_pmf", "cdf"] +METHODS_X_POS = ["energy", "pdf", "pmf", "cdf", "surv", "haz"] # result non-negative? METHODS_P = ["ppf"] -METHODS_ROWWISE = ["energy", "pdfj"] # results in one column +METHODS_ROWWISE = ["energy"] # results in one column class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester):