diff --git a/cpp/include/cuml/fil/forest_model.hpp b/cpp/include/cuml/fil/forest_model.hpp index c3a848aab4..7921ed6846 100644 --- a/cpp/include/cuml/fil/forest_model.hpp +++ b/cpp/include/cuml/fil/forest_model.hpp @@ -297,9 +297,12 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - // TODO(wphicks): Make sure buffer lands on same device as model - auto out_buffer = raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type}; - auto in_buffer = raft_proto::buffer{input, num_rows * num_features(), in_mem_type}; + int current_device_id; + raft_proto::cuda_check(cudaGetDevice(&current_device_id)); + auto out_buffer = + raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id}; + auto in_buffer = + raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id}; + predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size); } diff --git a/python/cuml/cuml/fil/fil.pyx b/python/cuml/cuml/fil/fil.pyx index 2b857cf1c9..2b1753d77c 100644 --- a/python/cuml/cuml/fil/fil.pyx +++ b/python/cuml/cuml/fil/fil.pyx @@ -48,6 +48,8 @@ from cuml.fil.postprocessing cimport element_op, row_op from cuml.fil.tree_layout cimport tree_layout as fil_tree_layout from cuml.internals.treelite cimport * +from cuda.bindings import runtime + cdef extern from "cuml/fil/forest_model.hpp" namespace "ML::fil" nogil: cdef cppclass forest_model: @@ -154,7 +156,7 @@ cdef class ForestInference_impl(): align_bytes=0, use_double_precision=None, mem_type=None, - device_id=0 + device_id=None, ): # Store reference to RAFT handle to control lifetime, since raft_proto # handle keeps a pointer to it @@ -197,6 +199,14 @@ cdef class ForestInference_impl(): else: raise RuntimeError(f"Unrecognized tree layout {layout}") + # Use assertion here, since device_id being None would indicate + # a bug, not a user error. 
The outer ForestInference object + # should set an integer device_id before passing it to + # ForestInference_impl. + assert device_id is not None, ( + "device_id should be set before building ForestInference_impl" + ) + self.model = import_from_treelite_handle( tl_handle, tree_layout, @@ -457,9 +467,10 @@ class ForestInference(Base, CMajorInputTagMixin): only for models trained and double precision and when exact conformance between results from FIL and the original training framework is of paramount importance. - device_id : int, default=0 + device_id : int or None, default=None For GPU execution, the device on which to load and execute this - model. For CPU execution, this value is currently ignored. + model. If set to None, use the currently active device. + For CPU execution, this value is currently ignored. """ def _reload_model(self): @@ -553,7 +564,7 @@ class ForestInference(Base, CMajorInputTagMixin): try: return self._device_id_ except AttributeError: - self._device_id_ = 0 + self._device_id_ = None return self._device_id_ @device_id.setter @@ -562,14 +573,13 @@ class ForestInference(Base, CMajorInputTagMixin): old_value = self.device_id except AttributeError: old_value = None - if value is not None: - self._device_id_ = value - if ( - self.treelite_model is not None - and self.device_id != old_value - and hasattr(self, '_gpu_forest') - ): - self._load_to_fil(device_id=self.device_id) + self._device_id_ = value + if ( + self.treelite_model is not None + and self.device_id != old_value + and hasattr(self, '_gpu_forest') + ): + self._load_to_fil(device_id=self.device_id) @property def treelite_model(self): @@ -616,7 +626,7 @@ class ForestInference(Base, CMajorInputTagMixin): default_chunk_size=None, align_bytes=None, precision='single', - device_id=0, + device_id=None, ): super().__init__( handle=handle, verbose=verbose, output_type=output_type @@ -633,12 +643,22 @@ class ForestInference(Base, CMajorInputTagMixin): self.treelite_model = treelite_model 
self._load_to_fil(device_id=self.device_id) - def _load_to_fil(self, mem_type=None, device_id=0): + def _load_to_fil(self, mem_type=None, device_id=None): if mem_type is None: mem_type = GlobalSettings().fil_memory_type else: mem_type = MemoryType.from_str(mem_type) + if device_id is None: + # If no device ID is explicitly given, use the currently + # active device + status, current_device_id = runtime.cudaGetDevice() + if status != runtime.cudaError_t.cudaSuccess: + _, name = runtime.cudaGetErrorName(status) + _, msg = runtime.cudaGetErrorString(status) + raise RuntimeError(f"Failed to run cudaGetDevice(). {name}: {msg}") + device_id = current_device_id + if mem_type.is_device_accessible: self.device_id = device_id diff --git a/python/cuml/cuml/tests/test_fil.py b/python/cuml/cuml/tests/test_fil.py index bfac6a973c..9c3c06ad15 100644 --- a/python/cuml/cuml/tests/test_fil.py +++ b/python/cuml/cuml/tests/test_fil.py @@ -14,8 +14,10 @@ # import os +from contextlib import nullcontext from math import ceil +import cupy as cp import numpy as np import pandas as pd import pytest @@ -37,6 +39,9 @@ from sklearn.model_selection import train_test_split # noqa: E402 from cuml import ForestInference # noqa: E402 +from cuml.ensemble import ( # noqa: E402 + RandomForestClassifier as cumlRandomForestClassifier, +) from cuml.fil import get_fil_device_type, set_fil_device_type # noqa: E402 from cuml.internals.device_type import DeviceType # noqa: E402 from cuml.internals.global_settings import GlobalSettings # noqa: E402 @@ -899,6 +904,100 @@ def test_missing_categorical(category_list): np.testing.assert_equal(fil_preds.flatten(), gtil_preds.flatten()) +@pytest.mark.parametrize("device_id", [None, 0, 1, 2]) +@pytest.mark.parametrize("model_kind", ["sklearn", "xgboost", "cuml"]) +def test_device_selection(device_id, model_kind, tmp_path): + current_device = cp.cuda.runtime.getDevice() + + if device_id is not None and device_id >= cp.cuda.runtime.getDeviceCount(): + pytest.skip( + 
reason="device_id larger than the number of available GPU devices" + ) + + n_rows = 1000 + n_columns = 30 + n_classes = 3 + n_estimators = 10 + + X, y = simulate_data( + n_rows, + n_columns, + n_classes, + random_state=0, + classification=True, + ) + + # 1. Model can be loaded with device_id set + if model_kind == "sklearn": + skl_model = RandomForestClassifier( + max_depth=3, random_state=0, n_estimators=n_estimators + ) + skl_model.fit(X, y) + fm = ForestInference.load_from_sklearn( + skl_model, + precision="native", + is_classifier=True, + device_id=device_id, + ) + elif model_kind == "xgboost": + xgb_model = xgb.XGBClassifier( + max_depth=3, random_state=0, n_estimators=n_estimators + ) + xgb_model.fit(X, y) + model_path = os.path.join(tmp_path, "xgb_class.ubj") + xgb_model.save_model(model_path) + fm = ForestInference.load( + model_path, + model_type="xgboost_ubj", + precision="native", + is_classifier=True, + device_id=device_id, + ) + elif model_kind == "cuml": + device_context = ( + cp.cuda.Device(device_id) if device_id else nullcontext() + ) + + with device_context: + # TODO(hcho3): Remove n_streams=1 argument once the bug + # https://github.com/rapidsai/cuml/issues/5983 is resolved + cuml_model = cumlRandomForestClassifier( + max_depth=3, + random_state=0, + n_estimators=n_estimators, + n_streams=1, + ) + cuml_model.fit(cp.array(X), cp.array(y)) + fm = cuml_model.convert_to_fil_model() + else: + raise NotImplementedError() + + # 2. The section above didn't corrupt current device context + assert cp.cuda.runtime.getDevice() == current_device + + # 3. Device selection is correctly saved to device_id property + assert fm.device_id == (device_id if device_id else 0) + + # 4. Inference can run on an input with the selected device + device_context = cp.cuda.Device(device_id) if device_id else nullcontext() + with device_context: + _ = fm.predict_proba(cp.array(X)) + + # 5. 
The section above didn't corrupt current device context + assert cp.cuda.runtime.getDevice() == current_device + + # 6. Attempting to run inference with an input from a different device + # is an error + if device_id is not None and device_id != 0: + with cp.cuda.Device(0), pytest.raises( + RuntimeError, match=r".*I/O data on different device than model.*" + ): + _ = fm.predict_proba(cp.array(X)) + + # 7. The section above didn't corrupt current device context + assert cp.cuda.runtime.getDevice() == current_device + + def test_wide_data(): n_rows = 50 n_features = 100000