Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions cpp/include/cuml/fil/forest_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,9 +297,12 @@ struct forest_model {
infer_kind predict_type = infer_kind::default_kind,
std::optional<index_type> specified_chunk_size = std::nullopt)
{
// TODO(wphicks): Make sure buffer lands on same device as model
auto out_buffer = raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type};
auto in_buffer = raft_proto::buffer{input, num_rows * num_features(), in_mem_type};
int current_device_id;
raft_proto::cuda_check(cudaGetDevice(&current_device_id));
Comment thread
hcho3 marked this conversation as resolved.
auto out_buffer =
raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id};
auto in_buffer =
raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id};
predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
}

Expand Down
48 changes: 34 additions & 14 deletions python/cuml/cuml/fil/fil.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ from cuml.fil.postprocessing cimport element_op, row_op
from cuml.fil.tree_layout cimport tree_layout as fil_tree_layout
from cuml.internals.treelite cimport *

from cuda.bindings import runtime


cdef extern from "cuml/fil/forest_model.hpp" namespace "ML::fil" nogil:
cdef cppclass forest_model:
Expand Down Expand Up @@ -154,7 +156,7 @@ cdef class ForestInference_impl():
align_bytes=0,
use_double_precision=None,
mem_type=None,
device_id=0
device_id=None,
):
# Store reference to RAFT handle to control lifetime, since raft_proto
# handle keeps a pointer to it
Expand Down Expand Up @@ -197,6 +199,14 @@ cdef class ForestInference_impl():
else:
raise RuntimeError(f"Unrecognized tree layout {layout}")

# Use assertion here, since device_id being None would indicate
# a bug, not a user error. The outer ForestInference object
# should set an integer device_id before passing it to
# ForestInference_impl.
assert device_id is not None, (
"device_id should be set before building ForestInference_impl"
)

self.model = import_from_treelite_handle(
tl_handle,
tree_layout,
Expand Down Expand Up @@ -457,9 +467,10 @@ class ForestInference(Base, CMajorInputTagMixin):
only for models trained in double precision and when exact
conformance between results from FIL and the original training
framework is of paramount importance.
device_id : int, default=0
device_id : int or None, default=None
For GPU execution, the device on which to load and execute this
model. For CPU execution, this value is currently ignored.
model. If set to None, use the currently active device.
For CPU execution, this value is currently ignored.
"""

def _reload_model(self):
Expand Down Expand Up @@ -553,7 +564,7 @@ class ForestInference(Base, CMajorInputTagMixin):
try:
return self._device_id_
except AttributeError:
self._device_id_ = 0
self._device_id_ = None
return self._device_id_

@device_id.setter
Expand All @@ -562,14 +573,13 @@ class ForestInference(Base, CMajorInputTagMixin):
old_value = self.device_id
except AttributeError:
old_value = None
if value is not None:
self._device_id_ = value
if (
self.treelite_model is not None
and self.device_id != old_value
and hasattr(self, '_gpu_forest')
):
self._load_to_fil(device_id=self.device_id)
self._device_id_ = value
if (
self.treelite_model is not None
and self.device_id != old_value
and hasattr(self, '_gpu_forest')
):
self._load_to_fil(device_id=self.device_id)

@property
def treelite_model(self):
Expand Down Expand Up @@ -616,7 +626,7 @@ class ForestInference(Base, CMajorInputTagMixin):
default_chunk_size=None,
align_bytes=None,
precision='single',
device_id=0,
device_id=None,
):
super().__init__(
handle=handle, verbose=verbose, output_type=output_type
Expand All @@ -633,12 +643,22 @@ class ForestInference(Base, CMajorInputTagMixin):
self.treelite_model = treelite_model
self._load_to_fil(device_id=self.device_id)

def _load_to_fil(self, mem_type=None, device_id=0):
def _load_to_fil(self, mem_type=None, device_id=None):
if mem_type is None:
mem_type = GlobalSettings().fil_memory_type
else:
mem_type = MemoryType.from_str(mem_type)

if device_id is None:
# If no device ID is explicitly given, use the currently
# active device
status, current_device_id = runtime.cudaGetDevice()
if status != runtime.cudaError_t.cudaSuccess:
_, name = runtime.cudaGetErrorName(status)
_, msg = runtime.cudaGetErrorString(status)
raise RuntimeError(f"Failed to run cudaGetDevice(). {name}: {msg}")
device_id = current_device_id

if mem_type.is_device_accessible:
self.device_id = device_id

Expand Down
99 changes: 99 additions & 0 deletions python/cuml/cuml/tests/test_fil.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
#

import os
from contextlib import nullcontext
from math import ceil

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand All @@ -37,6 +39,9 @@
from sklearn.model_selection import train_test_split # noqa: E402

from cuml import ForestInference # noqa: E402
from cuml.ensemble import ( # noqa: E402
RandomForestClassifier as cumlRandomForestClassifier,
)
from cuml.fil import get_fil_device_type, set_fil_device_type # noqa: E402
from cuml.internals.device_type import DeviceType # noqa: E402
from cuml.internals.global_settings import GlobalSettings # noqa: E402
Expand Down Expand Up @@ -899,6 +904,100 @@ def test_missing_categorical(category_list):
np.testing.assert_equal(fil_preds.flatten(), gtil_preds.flatten())


@pytest.mark.parametrize("device_id", [None, 0, 1, 2])
@pytest.mark.parametrize("model_kind", ["sklearn", "xgboost", "cuml"])
def test_device_selection(device_id, model_kind, tmp_path):
    """FIL loads and runs on the requested GPU (or the active one when None).

    Verifies, for each supported model source, that:
    * the model can be loaded with an explicit or default ``device_id``,
    * loading and inference never clobber the caller's device context,
    * the resolved device is reflected in ``fm.device_id``,
    * inference with input on a *different* device raises a clear error.
    """
    current_device = cp.cuda.runtime.getDevice()

    if device_id is not None and device_id >= cp.cuda.runtime.getDeviceCount():
        pytest.skip(
            reason="device_id larger than the number of available GPU devices"
        )

    # Context manager selecting the target device; a no-op when device_id is
    # None (model should then land on the currently active device).
    # NOTE: test `is not None`, not truthiness — device 0 is a valid choice.
    def target_device_ctx():
        if device_id is not None:
            return cp.cuda.Device(device_id)
        return nullcontext()

    n_rows = 1000
    n_columns = 30
    n_classes = 3
    n_estimators = 10

    X, y = simulate_data(
        n_rows,
        n_columns,
        n_classes,
        random_state=0,
        classification=True,
    )

    # 1. Model can be loaded with device_id set
    if model_kind == "sklearn":
        skl_model = RandomForestClassifier(
            max_depth=3, random_state=0, n_estimators=n_estimators
        )
        skl_model.fit(X, y)
        fm = ForestInference.load_from_sklearn(
            skl_model,
            precision="native",
            is_classifier=True,
            device_id=device_id,
        )
    elif model_kind == "xgboost":
        xgb_model = xgb.XGBClassifier(
            max_depth=3, random_state=0, n_estimators=n_estimators
        )
        xgb_model.fit(X, y)
        model_path = os.path.join(tmp_path, "xgb_class.ubj")
        xgb_model.save_model(model_path)
        fm = ForestInference.load(
            model_path,
            model_type="xgboost_ubj",
            precision="native",
            is_classifier=True,
            device_id=device_id,
        )
    elif model_kind == "cuml":
        with target_device_ctx():
            # TODO(hcho3): Remove n_streams=1 argument once the bug
            # https://github.com/rapidsai/cuml/issues/5983 is resolved
            cuml_model = cumlRandomForestClassifier(
                max_depth=3,
                random_state=0,
                n_estimators=n_estimators,
                n_streams=1,
            )
            cuml_model.fit(cp.array(X), cp.array(y))
            fm = cuml_model.convert_to_fil_model()
    else:
        raise NotImplementedError(f"Unknown model_kind: {model_kind}")

    # 2. The section above didn't corrupt current device context
    assert cp.cuda.runtime.getDevice() == current_device

    # 3. Device selection is correctly saved to device_id property.
    # When device_id is None, the model falls back to the device that was
    # active at load time, not unconditionally to device 0.
    expected_device = device_id if device_id is not None else current_device
    assert fm.device_id == expected_device

    # 4. Inference can run on an input with the selected device
    with target_device_ctx():
        _ = fm.predict_proba(cp.array(X))

    # 5. The section above didn't corrupt current device context
    assert cp.cuda.runtime.getDevice() == current_device

    # 6. Attempting to run inference with an input from a different device
    # is an error (device 0 is guaranteed to differ from the model's device
    # here because we only enter this branch when device_id != 0)
    if device_id is not None and device_id != 0:
        with cp.cuda.Device(0), pytest.raises(
            RuntimeError, match=r".*I/O data on different device than model.*"
        ):
            _ = fm.predict_proba(cp.array(X))

    # 7. The section above didn't corrupt current device context
    assert cp.cuda.runtime.getDevice() == current_device

def test_wide_data():
n_rows = 50
n_features = 100000
Expand Down