From 81bef2e28c2f6297565667b277ecd021887d61ec Mon Sep 17 00:00:00 2001 From: RakshithaKowlikar Date: Wed, 26 Nov 2025 19:44:59 +0530 Subject: [PATCH 1/2] Add JohnsonQPDRegressor for quantile to probability conversion --- docs/source/api_reference/regression.rst | 1 + skpro/regression/compose/__init__.py | 2 + skpro/regression/compose/_johnson_quantile.py | 318 ++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 skpro/regression/compose/_johnson_quantile.py diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index 80e5fbd3d..aa1239c41 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -22,6 +22,7 @@ Composition :toctree: auto_generated/ :template: class.rst + JohnsonQPDRegressor Pipeline TransformedTargetRegressor diff --git a/skpro/regression/compose/__init__.py b/skpro/regression/compose/__init__.py index ae972a3bb..6c7c78742 100644 --- a/skpro/regression/compose/__init__.py +++ b/skpro/regression/compose/__init__.py @@ -1,9 +1,11 @@ """Composition and pipelines for probabilistic supervised regression.""" +from skpro.regression.compose._johnson_quantile import JohnsonQPDRegressor from skpro.regression.compose._pipeline import Pipeline from skpro.regression.compose._ttr import TransformedTargetRegressor __all__ = [ "Pipeline", + "JohnsonQPDRegressor", "TransformedTargetRegressor", ] diff --git a/skpro/regression/compose/_johnson_quantile.py b/skpro/regression/compose/_johnson_quantile.py new file mode 100644 index 000000000..43051dad5 --- /dev/null +++ b/skpro/regression/compose/_johnson_quantile.py @@ -0,0 +1,318 @@ +"""Johnson QPD wrapper to convert quantile predictions to probabilistic predictions.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["RakshithaKowlikar"] +__all__ = ["JohnsonQPDRegressor"] + +import numpy as np +import pandas as pd +from sklearn import clone + +from skpro.distributions import QPD_Johnson +from skpro.regression.base import BaseProbaRegressor +from skpro.utils.sklearn import prep_skl_df + + +class JohnsonQPDRegressor(BaseProbaRegressor): + """Wrapper converting quantile predictions to probabilistic via Johnson QPD. + + The ``JohnsonQPDRegressor`` takes any quantile regressor and converts its + quantile predictions into full probability distributions using the Johnson + Quantile-Parameterized Distribution. + + The wrapper fits three copies of the base quantile regressor at different + quantile levels (alpha, 0.5, 1-alpha), then uses these predictions to + parameterize a Johnson QPD for each prediction. + + ``fit(X, y)`` - changes state by fitting three clones of ``estimator`` + with quantile parameters set to ``alpha``, 0.5, and ``1-alpha``. + + ``predict(X)`` - result is median predictions from the median quantile regressor. + + ``predict_proba(X)`` - first obtains quantile predictions from all three + regressors, then returns a ``QPD_Johnson`` distribution parameterized + by these quantiles. + + ``predict_interval(X)``, ``predict_quantiles(X)`` - as ``predict_proba``, + using the base class default to obtain predictions from the + ``QPD_Johnson`` distribution. 
+ + Parameters + ---------- + estimator : sklearn regressor + quantile regressor to fit at different quantile levels, + must have a ``quantile`` parameter that can be set via ``set_params`` + alpha : float, default=0.1 + quantile level for symmetric percentile triplet, + must be in the range (0, 0.5), + the wrapper will fit quantiles at ``alpha``, 0.5, and ``1-alpha`` + lower : float, optional, default=None + lower bound for distribution support, if None, unbounded below + upper : float, optional, default=None + upper bound for distribution support, if None, unbounded above + base_dist : str, default='normal' + base distribution type for Johnson QPD, + one of 'normal', 'logistic', 'cauchy' + + Attributes + ---------- + estimator_low_ : sklearn regressor + clone of ``estimator``, fitted to lower quantile (alpha) + estimator_median_ : sklearn regressor + clone of ``estimator``, fitted to median quantile (0.5) + estimator_high_ : sklearn regressor + clone of ``estimator``, fitted to upper quantile (1-alpha) + + Examples + -------- + >>> from skpro.regression.compose import JohnsonQPDRegressor + >>> from sklearn.linear_model import QuantileRegressor + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> + >>> X, y = load_diabetes(return_X_y=True, as_frame=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> + >>> # Create wrapper with quantile regressor + >>> base_model = QuantileRegressor(alpha=0.0) + >>> proba_model = JohnsonQPDRegressor(base_model, alpha=0.1) + >>> + >>> # Fit and predict + >>> proba_model.fit(X_train, y_train) + JohnsonQPDRegressor(...) + >>> y_pred_proba = proba_model.predict_proba(X_test) + >>> y_pred_interval = proba_model.predict_interval( + ... X_test, coverage=0.9 + ... ) + """ + + _tags = { + "capability:missing": False, + "capability:survival": False, + "capability:quantile": True, + } + + def __init__( + self, estimator, alpha=0.1, lower=None, upper=None, base_dist="normal" + ): + self.estimator = estimator + self.alpha = alpha + self.lower = lower + self.upper = upper + self.base_dist = base_dist + super().__init__() + + def _fit(self, X, y, C=None): + """Fit regressor to training data. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to + y : pd.DataFrame, must be same length as X + labels to fit regressor to + C : pd.DataFrame, optional (default=None) + censoring information for survival analysis, + should have same column name as y, same length as X and y + should have entries 0 and 1 (float or int) + 0 = uncensored, 1 = (right) censored + if None, all observations are assumed to be uncensored + Can be passed to any probabilistic regressor, + but is ignored if capability:survival tag is False. + + Returns + ------- + self : reference to self + """ + # coerce X to pandas DataFrame with string column names + X = prep_skl_df(X, copy_df=True) + + # validate alpha range + if not 0 < self.alpha < 0.5: + raise ValueError( + f"alpha must be in the range (0, 0.5), but found alpha={self.alpha}" + ) + + # convert y to 1D array for sklearn compatibility + if isinstance(y, pd.DataFrame): + # check for multivariate targets + if y.shape[1] > 1: + raise ValueError( + "JohnsonQPDRegressor supports only univariate targets, " + f"but y has {y.shape[1]} columns." 
+ ) + y_array = y.values.ravel() + elif isinstance(y, pd.Series): + y_array = y.values + else: + y_array = y.ravel() if hasattr(y, "ravel") else y + + # clone estimator for three quantile levels + self.estimator_low_ = clone(self.estimator) + self.estimator_median_ = clone(self.estimator) + self.estimator_high_ = clone(self.estimator) + + # set quantile parameter for each estimator + self.estimator_low_.set_params(quantile=self.alpha) + self.estimator_median_.set_params(quantile=0.5) + self.estimator_high_.set_params(quantile=1 - self.alpha) + + # fit all three estimators + self.estimator_low_.fit(X, y_array) + self.estimator_median_.fit(X, y_array) + self.estimator_high_.fit(X, y_array) + + return self + + def _predict(self, X): + """Predict labels for data from features. + + State required: + Requires state to be "fitted" = self.is_fitted=True + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in fit + data to predict labels for + + Returns + ------- + y : pandas DataFrame, same length as `X`, same columns as `y` in `fit` + labels predicted for `X`, median (0.5 quantile) predictions + """ + # coerce X to pandas DataFrame with string column names + X = prep_skl_df(X, copy_df=True) + + y_pred = self.estimator_median_.predict(X) + + # ensure return is a DataFrame with correct columns + if isinstance(y_pred, pd.DataFrame): + y_pred_df = y_pred.copy() + y_pred_df.columns = self._y_metadata["feature_names"] + else: + y_pred_arr = np.asarray(y_pred) + if y_pred_arr.ndim == 1: + y_pred_arr = y_pred_arr.reshape(-1, 1) + y_pred_df = pd.DataFrame( + y_pred_arr, index=X.index, columns=self._y_metadata["feature_names"] + ) + + return y_pred_df + + def _predict_proba(self, X): + """Predict distribution over labels for data from features. + + State required: + Requires state to be "fitted". 
+ + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in fit + data to predict labels for + + Returns + ------- + y_pred : skpro BaseDistribution, same length as X + labels predicted for X + """ + # coerce X to pandas DataFrame with string column names + X = prep_skl_df(X, copy_df=True) + + # Get quantile predictions + qv_low = self.estimator_low_.predict(X) + qv_median = self.estimator_median_.predict(X) + qv_high = self.estimator_high_.predict(X) + + # convert predictions to arrays + if isinstance(qv_low, pd.DataFrame): + qv_low = qv_low.values.flatten() + elif isinstance(qv_low, pd.Series): + qv_low = qv_low.values + + if isinstance(qv_median, pd.DataFrame): + qv_median = qv_median.values.flatten() + elif isinstance(qv_median, pd.Series): + qv_median = qv_median.values + + if isinstance(qv_high, pd.DataFrame): + qv_high = qv_high.values.flatten() + elif isinstance(qv_high, pd.Series): + qv_high = qv_high.values + + # enforce quantile ordering and minimum spread + # if quantile estimates are degenerate or misordered, + # use symmetric spread around median with minimum width + q_range = np.abs(qv_high - qv_low) + min_spread = np.maximum(q_range * 0.01, 1e-3) + spread = np.maximum(min_spread, q_range / 2.0) + + qv_low_fixed = qv_median - spread + qv_high_fixed = qv_median + spread + + # apply bounds if specified + if self.lower is not None: + qv_low_fixed = np.maximum(qv_low_fixed, self.lower) + qv_median = np.maximum(qv_median, self.lower) + qv_high_fixed = np.maximum(qv_high_fixed, self.lower) + + if self.upper is not None: + qv_low_fixed = np.minimum(qv_low_fixed, self.upper) + qv_median = np.minimum(qv_median, self.upper) + qv_high_fixed = np.minimum(qv_high_fixed, self.upper) + + # construct QPD_Johnson distribution + dist = QPD_Johnson( + alpha=self.alpha, + qv_low=qv_low_fixed, + qv_median=qv_median, + qv_high=qv_high_fixed, + lower=self.lower, + upper=self.upper, + base_dist=self.base_dist, + index=X.index, + columns=self._y_metadata["feature_names"], + ) + + return dist + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return "default" set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
+ `create_test_instance` uses the first (or only) dictionary in `params` + """ + from sklearn.linear_model import QuantileRegressor + + params1 = { + "estimator": QuantileRegressor(alpha=0.0), + "alpha": 0.1, + } + + params2 = { + "estimator": QuantileRegressor(alpha=0.0), + "alpha": 0.25, + "lower": 0.0, + } + + return [params1, params2] From 2c66bb856ec2e96f9158c0e24e586a2af67d6e2d Mon Sep 17 00:00:00 2001 From: RakshithaKowlikar Date: Thu, 27 Nov 2025 18:24:44 +0530 Subject: [PATCH 2/2] fix: add numerical stability fixes for QPD and robust quantile handling for JohnsonQPDRegressor --- skpro/distributions/qpd.py | 58 +++++++++++++- skpro/regression/compose/_johnson_quantile.py | 76 ++++++++++++++----- 2 files changed, 111 insertions(+), 23 deletions(-) diff --git a/skpro/distributions/qpd.py b/skpro/distributions/qpd.py index 360019779..cfd4ef33d 100644 --- a/skpro/distributions/qpd.py +++ b/skpro/distributions/qpd.py @@ -168,6 +168,42 @@ def get_test_params(cls, parameter_set="default"): } return [params1, params2, params3, params4] + def sample(self, n_samples=None): + """Sample from the distribution. + + Parameters + ---------- + n_samples : int, optional (default=None) + number of samples to generate + + Returns + ------- + samples : pd.DataFrame + samples from the distribution + """ + if n_samples is not None: + return super().sample(n_samples=n_samples) + + # handle scalar case + if self.index is None or self.columns is None: + return super().sample(n_samples=n_samples) + + # array case: sample via inverse transform + n_rows = len(self.index) + n_cols = len(self.columns) + + random_state = getattr(self, "_random_state", None) + if random_state is None: + random_state = np.random.RandomState() + + uniform_samples = pd.DataFrame( + random_state.uniform(size=(n_rows, n_cols)), + index=self.index, + columns=self.columns, + ) + + return self.ppf(uniform_samples) + class QPD_S(BaseDistribution): """Johnson Quantile-Parameterized Distributions with semi-bounded mode. 
@@ -776,7 +812,9 @@ def tfun(x): theta = np.where(LH2B > 0, qll, qhl) theta = np.where(LH2B == 0, qml, theta) if mode == "U": - theta = np.where(LH2B > 0, BL / HL, HB / HL) + theta_raw = np.where(LH2B > 0, BL / HL, HB / HL) + theta_raw = np.where(LH2B == 0, 0.25, theta_raw) + theta = np.clip(theta_raw, 1e-6, 0.5 - 1e-6) if mode in ["B", "S"]: in_arccosh = HL / (2 * HBL) @@ -785,15 +823,27 @@ def tfun(x): delta_unn = np.sinh(delta_unn) delta = delta_unn / c elif mode == "U": - delta = 1.0 / np.arccosh(1 / (2.0 * theta)) - delta = np.where(LH2B == 0, 1, delta) + with np.errstate(divide="ignore", invalid="ignore", over="ignore"): + arg = 1.0 / (2.0 * theta) + arg = np.where(np.isfinite(arg), arg, 1e8) + arg = np.maximum(arg, 1.0 + 1e-8) + ac = np.arccosh(arg) + + # prevent division by zero + ac = np.where(ac == 0.0, 1e-8, ac) + delta = 1.0 / ac + delta = np.where(LH2B == 0, 1.0, delta) if mode == "B": kappa = HL / np.sinh(2 * delta * c) elif mode == "S": kappa = HBL / (delta * c) elif mode == "U": - kappa = HL / np.sinh(2.0 / delta) + # numerically stable kappa for unbounded mode + with np.errstate(divide="ignore", invalid="ignore", over="ignore"): + s = np.sinh(2.0 / delta) + s = np.where(s == 0.0, 1e-8, s) + kappa = HL / s kappa = np.where(LH2B == 0, HB, kappa) params = { diff --git a/skpro/regression/compose/_johnson_quantile.py b/skpro/regression/compose/_johnson_quantile.py index 43051dad5..43bb8f635 100644 --- a/skpro/regression/compose/_johnson_quantile.py +++ b/skpro/regression/compose/_johnson_quantile.py @@ -89,7 +89,6 @@ class JohnsonQPDRegressor(BaseProbaRegressor): _tags = { "capability:missing": False, "capability:survival": False, - "capability:quantile": True, } def __init__( @@ -248,28 +247,35 @@ def _predict_proba(self, X): elif isinstance(qv_high, pd.Series): qv_high = qv_high.values - # enforce quantile ordering and minimum spread - # if quantile estimates are degenerate or misordered, - # use symmetric spread around median with minimum width - q_range = np.abs(qv_high - qv_low) - min_spread = np.maximum(q_range * 0.01, 1e-3) - spread = np.maximum(min_spread, q_range / 2.0) + # ensure quantiles are valid floats + qv_low = np.asarray(qv_low, dtype=float) + qv_median = np.asarray(qv_median, dtype=float) + qv_high = np.asarray(qv_high, dtype=float) + qv_median = np.nan_to_num(qv_median, nan=0.0) - qv_low_fixed = qv_median - spread - qv_high_fixed = qv_median + spread + # enforce minimum spread to prevent degenerate distributions + pred_half = np.abs(qv_high - qv_low) / 2.0 + data_scale = np.maximum(np.abs(qv_median), 1.0) + min_half = np.maximum(data_scale * 0.05, 1e-3) + half_spread = np.maximum(pred_half, min_half) - # apply bounds if specified + # apply symmetric spread around median + qv_low_fixed = qv_median - half_spread + qv_high_fixed = qv_median + half_spread + + # user-specified bounds if self.lower is not None: qv_low_fixed = np.maximum(qv_low_fixed, self.lower) - qv_median = np.maximum(qv_median, self.lower) - qv_high_fixed = np.maximum(qv_high_fixed, self.lower) - + qv_median = np.maximum(qv_median, self.lower + 1e-6) if self.upper is not None: - qv_low_fixed = np.minimum(qv_low_fixed, self.upper) - qv_median = np.minimum(qv_median, self.upper) qv_high_fixed = np.minimum(qv_high_fixed, self.upper) + qv_median = np.minimum(qv_median, self.upper - 1e-6) + + # enforce strict ordering for QPD stability + eps = 1e-6 + qv_low_fixed = np.minimum(qv_low_fixed, qv_median - eps) + qv_high_fixed = np.maximum(qv_high_fixed, qv_median + eps) - # construct QPD_Johnson 
distribution dist = QPD_Johnson( alpha=self.alpha, qv_low=qv_low_fixed, @@ -284,6 +290,39 @@ def _predict_proba(self, X): return dist + def _predict_var(self, X): + """Predict variance. + + Uses quantile spread to estimate variance assuming normality. + + Parameters + ---------- + X : pd.DataFrame + features to predict variance for + + Returns + ------- + pd.DataFrame + variance predictions, same shape as y + """ + from scipy.stats import norm + + X = prep_skl_df(X, copy_df=True) + + qv_low = np.asarray(self.estimator_low_.predict(X), dtype=float).flatten() + qv_high = np.asarray(self.estimator_high_.predict(X), dtype=float).flatten() + + z_low = norm.ppf(self.alpha) + z_high = norm.ppf(1 - self.alpha) + z_spread = z_high - z_low + + sigma = np.abs(qv_high - qv_low) / z_spread + var = (sigma**2).reshape(-1, 1) + + return pd.DataFrame( + var, index=X.index, columns=self._y_metadata["feature_names"] + ) + @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. @@ -305,14 +344,13 @@ def get_test_params(cls, parameter_set="default"): from sklearn.linear_model import QuantileRegressor params1 = { - "estimator": QuantileRegressor(alpha=0.0), + "estimator": QuantileRegressor(alpha=0.0, solver="highs"), "alpha": 0.1, } params2 = { - "estimator": QuantileRegressor(alpha=0.0), + "estimator": QuantileRegressor(alpha=0.0, solver="highs"), "alpha": 0.25, - "lower": 0.0, } return [params1, params2]
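
Reviewer notes (illustrative sketches, not part of the patches):

The end-to-end usage below mirrors the class docstring example and only relies on the API this PR adds (fit, predict, predict_proba, predict_interval); the dataset, variable names, random_state, and the solver="highs" choice (taken from the updated get_test_params) are arbitrary.

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import QuantileRegressor
    from sklearn.model_selection import train_test_split

    from skpro.regression.compose import JohnsonQPDRegressor

    X, y = load_diabetes(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # base quantile regressor; the wrapper clones it at alpha, 0.5 and 1 - alpha
    base = QuantileRegressor(alpha=0.0, solver="highs")
    reg = JohnsonQPDRegressor(base, alpha=0.1, base_dist="normal")

    reg.fit(X_train, y_train)

    y_median = reg.predict(X_test)        # point prediction = 0.5 quantile
    y_dist = reg.predict_proba(X_test)    # QPD_Johnson distribution object
    y_interval = reg.predict_interval(X_test, coverage=0.9)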
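
The quantile post-processing that the second commit adds to _predict_proba can be read in isolation as below; fix_quantile_triplet is a hypothetical helper name, and the arithmetic is a restatement of the patched code (minimum spread, optional bounds, strict ordering).

    import numpy as np


    def fix_quantile_triplet(qv_low, qv_median, qv_high, lower=None, upper=None):
        """Illustrative restatement of the triplet clean-up in _predict_proba."""
        qv_low = np.asarray(qv_low, dtype=float)
        qv_median = np.nan_to_num(np.asarray(qv_median, dtype=float), nan=0.0)
        qv_high = np.asarray(qv_high, dtype=float)

        # minimum half-width: 5% of the data scale, but at least 1e-3
        pred_half = np.abs(qv_high - qv_low) / 2.0
        data_scale = np.maximum(np.abs(qv_median), 1.0)
        half_spread = np.maximum(pred_half, np.maximum(data_scale * 0.05, 1e-3))

        # symmetric spread around the median
        lo = qv_median - half_spread
        hi = qv_median + half_spread

        # optional user-specified bounds
        if lower is not None:
            lo = np.maximum(lo, lower)
            qv_median = np.maximum(qv_median, lower + 1e-6)
        if upper is not None:
            hi = np.minimum(hi, upper)
            qv_median = np.minimum(qv_median, upper - 1e-6)

        # strict ordering qv_low < qv_median < qv_high for QPD stability
        eps = 1e-6
        lo = np.minimum(lo, qv_median - eps)
        hi = np.maximum(hi, qv_median + eps)
        return lo, qv_median, hi


    # a degenerate triplet (all quantiles equal) is widened to a usable one
    print(fix_quantile_triplet([10.0], [10.0], [10.0]))
    # -> (array([9.5]), array([10.]), array([10.5]))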
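
The new _predict_var derives a variance from the two outer quantile predictions under a normality assumption: sigma = |q_high - q_low| / (z_{1-alpha} - z_alpha). A minimal numeric check with made-up quantile values:

    from scipy.stats import norm

    alpha = 0.1
    q_low, q_high = 8.0, 12.0   # hypothetical 10% / 90% quantile predictions

    z_spread = norm.ppf(1 - alpha) - norm.ppf(alpha)   # approx 2.563 for alpha = 0.1
    sigma = abs(q_high - q_low) / z_spread             # approx 1.56
    var = sigma ** 2                                   # approx 2.44
    print(sigma, var)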
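
The sample method added to qpd.py is plain inverse-transform sampling: draw uniform values on (0, 1) elementwise and push them through the distribution's ppf. A standalone illustration of the same idea, with a scipy normal standing in for the QPD:

    import numpy as np
    from scipy.stats import norm

    rng = np.random.RandomState(0)

    u = rng.uniform(size=5)    # uniform draws on (0, 1)
    samples = norm.ppf(u)      # inverse CDF maps them to N(0, 1) samples
    print(samples)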