def min_signed_type(n, min_size=8):
    """Return the smallest signed integer dtype able to represent ``n``.

    Candidate types are tried from narrowest to widest (int8, int16,
    int32, int64); the first one that is at least ``min_size`` bits wide
    and whose value range contains ``n`` is returned.

    Parameters
    ----------
    n : int
        The integer value that must be representable.
    min_size : int, default=8
        Minimum width, in bits, of the returned dtype. The default of 8
        admits every candidate, so the result depends on ``n`` alone.

    Returns
    -------
    numpy.dtype
        The narrowest qualifying signed integer dtype.

    Raises
    ------
    Exception
        If no candidate qualifies, ``np.int64(n)`` is evaluated and any
        exception NumPy raises for an unrepresentable ``n`` propagates
        to the caller.
    """
    for int_dtype in (np.int8, np.int16, np.int32, np.int64):
        dtype = np.dtype(int_dtype)
        limits = np.iinfo(int_dtype)
        # itemsize is in bytes; compare widths in bits.
        if dtype.itemsize * 8 >= min_size and limits.min <= n <= limits.max:
            return dtype
    # resort to using `int64` and let numpy raise appropriate exception:
    return np.int64(n).dtype