Skip to content

Commit 4a1e5f2

Browse files
committed
Remove dask to sparse workarounds
This fixes a failure in `test_to_sparse_dask_array` with dask main. It seems that the underlying issues addressed by the previously implemented workarounds have been fixed in cupy / dask, so the workarounds can now be removed from cuml. xref rapidsai/dask-upstream-testing#37
1 parent 50152d4 commit 4a1e5f2

1 file changed

Lines changed: 6 additions & 56 deletions

File tree

python/cuml/cuml/dask/common/dask_arr_utils.py

Lines changed: 6 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
1+
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -77,21 +77,7 @@ def _conv_array_to_sparse(arr):
7777
def to_sparse_dask_array(cudf_or_array, client=None):
7878
"""
7979
Converts an array or cuDF to a sparse Dask array backed by sparse CuPy.
80-
CSR matrices. Unfortunately, due to current limitations in Dask, there is
81-
no direct path to convert a cupyx.scipy.sparse.spmatrix into a CuPy backed
82-
dask.Array without copying to host.
83-
84-
85-
NOTE: Until https://github.com/cupy/cupy/issues/2655 and
86-
https://github.com/dask/dask/issues/5604 are implemented, compute()
87-
will not be able to be called on a Dask.array that is backed with
88-
sparse CuPy arrays because they lack the necessary functionality
89-
to be stacked into a single array. The array returned from this
90-
utility will, however, still be able to be passed into functions
91-
that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
92-
Naive Bayes).
93-
94-
Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387
80+
Csr matrices.
9581
9682
Parameters
9783
----------
@@ -105,51 +91,15 @@ def to_sparse_dask_array(cudf_or_array, client=None):
10591
-------
10692
dask_array : dask.Array backed by cupyx.scipy.sparse.csr_matrix
10793
"""
108-
client = default_client() if client is None else client
109-
110-
# Makes sure the MatDescriptor workaround for CuPy sparse arrays
111-
# is loaded (since Dask lazy-loaded serialization in cuML is only
112-
# executed when object from the cuML package needs serialization.
113-
# This can go away once the MatDescriptor pickling bug is fixed
114-
# in CuPy.
115-
# Ref: https://github.com/cupy/cupy/issues/3061
116-
from cuml.comm import serialize # NOQA
117-
94+
ret = cudf_or_array
11895
shape = cudf_or_array.shape
119-
12096
meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))
12197

122-
ret = cudf_or_array
123-
124-
# If we have a Dask array, convert it to a Dask DataFrame
125-
if isinstance(ret, dask.array.Array):
126-
# At the time of developing this, using map_blocks will not work
127-
# to convert a Dask.Array to CuPy sparse arrays underneath.
128-
129-
def _conv_np_to_df(x):
130-
cupy_ary = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
131-
return cudf.DataFrame(cupy_ary)
132-
133-
parts = client.sync(_extract_partitions, ret)
134-
futures = [
135-
client.submit(_conv_np_to_df, part, workers=[w], pure=False)
136-
for w, part in parts
137-
]
138-
139-
ret = df_to_dask_cudf(futures)
140-
141-
# If we have a Dask Dataframe, use `map_partitions` to convert it
142-
# to a Sparse Cupy-backed Dask Array. This will also convert the dense
143-
# Dask array above to a Sparse Cupy-backed Dask Array, since we cannot
144-
# use map_blocks on the array, but we can use `map_partitions` on the
145-
# Dataframe.
14698
if isinstance(ret, dask.dataframe.DataFrame):
147-
ret = ret.map_partitions(
148-
_conv_df_to_sparse, meta=dask.array.from_array(meta)
149-
)
99+
ret = ret.to_dask_array()
150100

151-
# This will also handle the input of dask.array.Array
152-
return ret
101+
if isinstance(cudf_or_array, dask.array.Array):
102+
return cudf_or_array.map_blocks(_conv_array_to_sparse, meta=meta)
153103

154104
else:
155105

0 commit comments

Comments
 (0)