Commit b276598

Uses sample_weights in distributed learning and adds testing

1 parent 9bda122 · commit b276598
4 files changed: 128 additions & 20 deletions

mlforecast/distributed/forecast.py
Lines changed: 34 additions & 5 deletions
@@ -142,6 +142,7 @@ def _preprocess_partition(
     keep_last_n: Optional[int] = None,
     window_info: Optional[WindowInfo] = None,
     fit_ts_only: bool = False,
+    weight_col: str | None = None,
 ) -> List[List[Any]]:
     ts = copy.deepcopy(base_ts)
     if fit_ts_only:
@@ -152,6 +153,7 @@ def _preprocess_partition(
         target_col=target_col,
         static_features=static_features,
         keep_last_n=keep_last_n,
+        weight_col=weight_col,
     )
     core_tfms = ts._get_core_lag_tfms()
     if core_tfms:
@@ -195,6 +197,7 @@ def _preprocess_partition(
         static_features=static_features,
         dropna=dropna,
         keep_last_n=keep_last_n,
+        weight_col=weight_col,
     )
     return [
         [
@@ -220,6 +223,7 @@ def _preprocess_partitions(
     keep_last_n: Optional[int] = None,
     window_info: Optional[WindowInfo] = None,
     fit_ts_only: bool = False,
+    weight_col: str | None = None,
 ) -> List[Any]:
     if self.num_partitions:
         partition = dict(by=id_col, num=self.num_partitions, algo="coarse")
@@ -247,6 +251,7 @@ def _preprocess_partitions(
             "keep_last_n": keep_last_n,
             "window_info": window_info,
             "fit_ts_only": fit_ts_only,
+            "weight_col": weight_col,
         },
         schema="ts:binary,train:binary,valid:binary",
         engine=self.engine,
@@ -266,13 +271,15 @@ def _preprocess(
     dropna: bool = True,
     keep_last_n: Optional[int] = None,
     window_info: Optional[WindowInfo] = None,
+    weight_col: str | None = None,
 ) -> fugue.AnyDataFrame:
     self._base_ts.id_col = id_col
     self._base_ts.time_col = time_col
     self._base_ts.target_col = target_col
     self._base_ts.static_features = static_features
     self._base_ts.dropna = dropna
     self._base_ts.keep_last_n = keep_last_n
+    self._base_ts.weight_col = weight_col
     self._partition_results = self._preprocess_partitions(
         data=data,
         id_col=id_col,
@@ -282,6 +289,7 @@ def _preprocess(
         static_features=static_features,
         dropna=dropna,
         keep_last_n=keep_last_n,
+        weight_col=weight_col,
     )
     base_schema = fa.get_schema(data)
     features_schema = {
@@ -341,6 +349,7 @@ def _fit(
     dropna: bool = True,
     keep_last_n: Optional[int] = None,
     window_info: Optional[WindowInfo] = None,
+    weight_col: str | None = None,
 ) -> "DistributedMLForecast":
     prep = self._preprocess(
         data,
@@ -351,28 +360,41 @@ def _fit(
         dropna=dropna,
         keep_last_n=keep_last_n,
         window_info=window_info,
+        weight_col=weight_col,
     )
+    exclude_cols = {id_col, time_col, target_col}
+    if weight_col is not None:
+        exclude_cols.add(weight_col)
     features = [
         x
         for x in fa.get_column_names(prep)
-        if x not in {id_col, time_col, target_col}
+        if x not in exclude_cols
     ]
     self.models_ = {}
     if SPARK_INSTALLED and isinstance(data, SparkDataFrame):
         featurizer = VectorAssembler(
             inputCols=features, outputCol="features", handleInvalid="keep"
         )
-        train_data = featurizer.transform(prep)[target_col, "features"]
+        select_cols = [target_col, "features"]
+        if weight_col is not None:
+            select_cols.append(weight_col)
+        train_data = featurizer.transform(prep).select(*select_cols)
         for name, model in self.models.items():
-            trained_model = model._pre_fit(target_col).fit(train_data)
+            trained_model = model._pre_fit(target_col, weight_col).fit(train_data)
            self.models_[name] = model.extract_local_model(trained_model)
     elif DASK_INSTALLED and isinstance(data, dd.DataFrame):
         X, y = prep[features], prep[target_col]
+        if weights := weight_col:
+            weights = prep[weight_col]
         for name, model in self.models.items():
-            trained_model = clone(model).fit(X, y)
+            trained_model = clone(model).fit(X, y, sample_weight=weights)
             self.models_[name] = trained_model.model_
     elif RAY_INSTALLED and isinstance(data, RayDataset):
         # Need to materialize
+        if weight_col is not None:
+            raise NotImplementedError(
+                "Only spark and dask engines currently support sample weights."
+            )
         prep_selected = prep.select_columns(cols=features + [target_col]).materialize()
         X = RayDMatrix(
             prep_selected,
@@ -396,6 +418,7 @@ def fit(
     static_features: Optional[List[str]] = None,
     dropna: bool = True,
     keep_last_n: Optional[int] = None,
+    weight_col: str | None = None,
 ) -> "DistributedMLForecast":
     """Apply the feature engineering and train the models.
@@ -409,6 +432,7 @@ def fit(
         dropna (bool): Drop rows with missing values produced by the transformations. Defaults to True.
         keep_last_n (int, optional): Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it.
             Defaults to None.
+        weight_col (str, optional): Column that contains the sample weights. Defaults to None.

     Returns:
         (DistributedMLForecast): Forecast object with series values and trained models.
@@ -421,6 +445,7 @@ def fit(
         static_features=static_features,
         dropna=dropna,
         keep_last_n=keep_last_n,
+        weight_col=weight_col,
     )

     @staticmethod
@@ -548,6 +573,7 @@ def cross_validation(
     before_predict_callback: Optional[Callable] = None,
     after_predict_callback: Optional[Callable] = None,
     input_size: Optional[int] = None,
+    weight_col: str | None = None,
 ) -> fugue.AnyDataFrame:
     """Perform time series cross validation.
     Creates `n_windows` splits where each window has `h` test periods,
@@ -577,6 +603,7 @@ def cross_validation(
             The series identifier is on the index. Defaults to None.
         input_size (int, optional): Maximum training samples per serie in each window. If None, will use an expanding window.
             Defaults to None.
+        weight_col (str, optional): Column that contains the sample weights. Defaults to None.

     Returns:
         (dask, spark or ray DataFrame): Predictions for each window with the series id, timestamp, target value and predictions from each model.
@@ -595,6 +622,7 @@ def cross_validation(
         dropna=dropna,
         keep_last_n=keep_last_n,
         window_info=window_info,
+        weight_col=weight_col,
     )
     self.cv_models_.append(self.models_)
     partition_results = self._partition_results
@@ -608,6 +636,7 @@ def cross_validation(
         dropna=dropna,
         keep_last_n=keep_last_n,
         window_info=window_info,
+        weight_col=weight_col,
     )
     schema = self._get_predict_schema() + Schema(
         ("cutoff", "datetime"), (self._base_ts.target_col, "double")
@@ -846,4 +875,4 @@ def combine_core_lag_tfms(by_partition):
     fcst = MLForecast(models=self.models_, freq=ts.freq)
     fcst.ts = ts
     fcst.models_ = self.models_
-    return fcst
\ No newline at end of file
+    return fcst
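Taken together, the forecast.py changes thread weight_col from the public fit and cross_validation APIs down to each engine's training step: the weight column is excluded from the feature matrix, forwarded to Spark models through _pre_fit, passed as sample_weight on dask, and rejected with a NotImplementedError on ray. A minimal usage sketch for the dask engine (assuming a dask scheduler/client is available; the "weight" column name and the constant weights are illustrative):

import dask.dataframe as dd

from mlforecast.distributed import DistributedMLForecast
from mlforecast.distributed.models.dask.lgb import DaskLGBMForecast
from mlforecast.utils import generate_daily_series

# Build a small panel with an illustrative per-row weight column.
series = generate_daily_series(10, min_length=100, max_length=100)
series["weight"] = 1.0  # replace with real sample weights
partitioned = dd.from_pandas(series.set_index("unique_id"), npartitions=2)
partitioned = partitioned.map_partitions(lambda df: df.reset_index())
partitioned["unique_id"] = partitioned["unique_id"].astype(str)  # categoricals aren't supported

fcst = DistributedMLForecast(
    models=[DaskLGBMForecast(verbosity=-1)],
    freq="D",
    lags=[1, 7],
)
# weight_col is dropped from the features and reaches the model as
# sample_weight in the dask branch of _fit above.
fcst.fit(partitioned, static_features=[], dropna=False, weight_col="weight")
preds = fcst.predict(7).compute()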

mlforecast/distributed/models/spark/lgb.py
Lines changed: 4 additions & 2 deletions
@@ -23,10 +23,12 @@


 class SparkLGBMForecast(LightGBMRegressor):
-    def _pre_fit(self, target_col):
+    def _pre_fit(self, target_col, weight_col=None):
+        if weight_col is not None and hasattr(self, "setWeightCol"):
+            return self.setLabelCol(target_col).setWeightCol(weight_col)
         return self.setLabelCol(target_col)

     def extract_local_model(self, trained_model):
         model_str = trained_model.getNativeModel()
         local_model = lgb.Booster(model_str=model_str)
-        return local_model
\ No newline at end of file
+        return local_model
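One consequence of the hasattr guard above: on SynapseML builds whose LightGBMRegressor predates setWeightCol, a requested weight column is silently ignored rather than raising. A self-contained sketch of that fallback, with _StubRegressor as a hypothetical stand-in for the Spark estimator:

class _StubRegressor:
    # Deliberately lacks setWeightCol, like an older SynapseML release.
    def setLabelCol(self, col):
        self.label_col = col
        return self

def _pre_fit(model, target_col, weight_col=None):
    # Same guard as SparkLGBMForecast._pre_fit above.
    if weight_col is not None and hasattr(model, "setWeightCol"):
        return model.setLabelCol(target_col).setWeightCol(weight_col)
    return model.setLabelCol(target_col)

configured = _pre_fit(_StubRegressor(), "y", weight_col="weight")
assert configured.label_col == "y"  # weight column dropped, no error raised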

mlforecast/distributed/models/spark/xgb.py
Lines changed: 4 additions & 2 deletions
@@ -15,12 +15,14 @@


 class SparkXGBForecast(SparkXGBRegressor):
-    def _pre_fit(self, target_col):
+    def _pre_fit(self, target_col, weight_col=None):
         self.setParams(label_col=target_col)
+        if weight_col is not None:
+            self.setParams(weight_col=weight_col)
         return self

     def extract_local_model(self, trained_model):
         model_str = trained_model.get_booster().save_raw("ubj")
         local_model = xgb.XGBRegressor()
         local_model.load_model(model_str)
-        return local_model
\ No newline at end of file
+        return local_model
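For reference, xgboost's PySpark estimator accepts the weight column as a constructor parameter, which is why forwarding it through setParams works. A hedged equivalence sketch (assuming an xgboost release that ships the xgboost.spark module, i.e. 1.7+):

from xgboost.spark import SparkXGBRegressor

# End state equivalent to SparkXGBForecast()._pre_fit("y", weight_col="weight"):
model = SparkXGBRegressor(label_col="y", weight_col="weight")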

tests/test_distributed_forecast.py
Lines changed: 86 additions & 11 deletions
@@ -2,8 +2,10 @@
 import warnings

 import dask.dataframe as dd
+import numpy as np
 import pandas as pd
 import pytest
+from sklearn.base import BaseEstimator

 from mlforecast.distributed import DistributedMLForecast
 from mlforecast.distributed.models.dask.lgb import DaskLGBMForecast
@@ -12,21 +14,64 @@

 warnings.simplefilter("ignore", FutureWarning)

-@pytest.mark.skipif(sys.platform == "win32", reason="Distributed tests are not supported on Windows")
-@pytest.mark.skipif(sys.version_info <= (3, 9), reason="Distributed tests are not supported on Python < 3.10")
-def test_dask_distributed_forecast():
+
+def _reset_index_partition(partition: pd.DataFrame) -> pd.DataFrame:
+    return partition.reset_index()
+
+
+def _make_partitioned_series(df: pd.DataFrame, npartitions: int = 4) -> dd.DataFrame:
+    partitioned = dd.from_pandas(df.set_index("unique_id"), npartitions=npartitions)
+    partitioned = partitioned.map_partitions(_reset_index_partition)
+    partitioned["unique_id"] = partitioned["unique_id"].astype(str)
+    return partitioned
+
+
+@pytest.fixture(scope="module")
+def partitioned_series():
     series = generate_daily_series(
         100, equal_ends=True, min_length=500, max_length=1_000
     )
-    npartitions = 4
-    partitioned_series = dd.from_pandas(
-        series.set_index("unique_id"), npartitions=npartitions
-    )  # make sure we split by the id_col
-    partitioned_series = partitioned_series.map_partitions(lambda df: df.reset_index())
-    partitioned_series["unique_id"] = partitioned_series["unique_id"].astype(
-        str
-    )  # can't handle categoricals atm
+    return _make_partitioned_series(series)
+
+
+@pytest.fixture
+def small_ordered_series():
+    series = generate_daily_series(5, min_length=60, max_length=60)
+    return series.sort_values(["unique_id", "ds"]).reset_index(drop=True)
+
+
+class _RecordingLocalModel:
+    def __init__(self, sample_weight):
+        if sample_weight is None:
+            self.sample_weight_ = None
+            self.weight_mean_ = 0.0
+        else:
+            self.sample_weight_ = np.asarray(sample_weight, dtype=float)
+            self.weight_mean_ = float(self.sample_weight_.mean())
+
+    def predict(self, X):
+        length = X.shape[0] if hasattr(X, "shape") else len(X)
+        return np.full(length, self.weight_mean_, dtype=float)

+
+class _RecordingDaskRegressor(BaseEstimator):
+    def fit(self, X, y, sample_weight=None):  # noqa: ARG002, D401, N803
+        if sample_weight is None:
+            weights = None
+        else:
+            if hasattr(sample_weight, "compute"):
+                sample_weight = sample_weight.compute()
+            weights = (
+                sample_weight.to_numpy()
+                if hasattr(sample_weight, "to_numpy")
+                else np.asarray(sample_weight, dtype=float)
+            )
+        self.model_ = _RecordingLocalModel(weights)
+        return self
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Distributed tests are not supported on Windows")
+@pytest.mark.skipif(sys.version_info <= (3, 9), reason="Distributed tests are not supported on Python < 3.10")
+def test_dask_distributed_forecast(partitioned_series):
     # test existing features provide the same result
     fcst = DistributedMLForecast(
         models=[DaskLGBMForecast(verbosity=-1, random_state=0)],
@@ -49,3 +94,33 @@ def test_dask_distributed_forecast():
     fcst.preprocess(partitioned_series, static_features=[], dropna=False)
     preds2 = fcst.predict(10).compute()
     pd.testing.assert_frame_equal(preds1, preds2)
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="Distributed tests are not supported on Windows")
+@pytest.mark.skipif(sys.version_info <= (3, 9), reason="Distributed tests are not supported on Python < 3.10")
+def test_dask_distributed_weight_col_affects_predictions(small_ordered_series):
+    def _fit_and_forecast(weights):
+        weighted = small_ordered_series.copy()
+        weighted["weight"] = weights
+        partitioned = _make_partitioned_series(weighted, npartitions=2)
+        fcst = DistributedMLForecast(
+            models={"stub": _RecordingDaskRegressor()},
+            freq="D",
+            lags=[1],
+            date_features=["dayofweek"],
+        )
+        fcst.fit(
+            partitioned,
+            static_features=[],
+            dropna=False,
+            weight_col="weight",
+        )
+        return fcst.predict(5).compute()
+
+    uniform_weights = np.ones(len(small_ordered_series))
+    skewed_weights = np.arange(len(small_ordered_series), dtype=float)
+
+    preds_uniform = _fit_and_forecast(uniform_weights)
+    preds_skewed = _fit_and_forecast(skewed_weights)
+
+    assert not np.allclose(preds_uniform["stub"], preds_skewed["stub"])
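Why the final assertion is meaningful: the stub regressor records the weights it receives and predicts a constant equal to their mean, so the two runs can only produce different forecasts if the weight column actually reached fit's sample_weight. A quick standalone check of that arithmetic, reusing _RecordingLocalModel from the test above:

import numpy as np

# Uniform weights -> mean 1.0; weights 0..5 -> mean 2.5.
uniform = _RecordingLocalModel(np.ones(4))
skewed = _RecordingLocalModel(np.arange(6, dtype=float))
assert uniform.predict(np.zeros((3, 2))).tolist() == [1.0, 1.0, 1.0]
assert skewed.predict(np.zeros((2, 2))).tolist() == [2.5, 2.5]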
