Skip to content

Commit 059ee7f

Browse files
authored
Remove exposure on column construction and unwrap buffers on pylibcudf conversion (#20980)
This PR removes the ability to construct columns and buffers that are already exposed, which is not actually ever possible in the current cudf data model. This change allows us to simplify the various column constructors and standardize the validation process. Relatedly, this PR ensures that conversion from cudf ColumnBase to pylibcudf Column unwraps Buffers so that you do not expose the pylibcudf representation to cudf's Buffer semantics. That change should allow us to fully decouple the internal representation of pylibcudf Columns inside cudf from how they are exposed to public APIs, which also ensures that we do not break CoW and spilling functionality by making too many Buffer copies that we shouldn't. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: #20980
1 parent 0011cb2 commit 059ee7f

24 files changed

Lines changed: 288 additions & 351 deletions

python/cudf/cudf/core/buffer/buffer.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -91,19 +91,18 @@ def __init__(
9191
ptr: int,
9292
size: int,
9393
owner: object,
94-
exposed: bool,
9594
):
9695
if size < 0:
9796
raise ValueError("size cannot be negative")
9897

9998
self._ptr = ptr
10099
self._size = size
101100
self._owner = owner
102-
self._exposed = exposed
101+
self._exposed = False
103102
self._slices = weakref.WeakSet()
104103

105104
@classmethod
106-
def from_device_memory(cls, data: Any, exposed: bool) -> Self:
105+
def from_device_memory(cls, data: Any) -> Self:
107106
"""Create from an object providing a `__cuda_array_interface__`.
108107
109108
No data is being copied.
@@ -112,10 +111,6 @@ def from_device_memory(cls, data: Any, exposed: bool) -> Self:
112111
----------
113112
data : device-buffer-like
114113
An object implementing the CUDA Array Interface.
115-
exposed : bool
116-
Mark the buffer as permanently exposed. This is used by
117-
copy-on-write to determine when a deep copy is required
118-
and by SpillableBuffer to mark the buffer unspillable.
119114
120115
Returns
121116
-------
@@ -135,7 +130,7 @@ def from_device_memory(cls, data: Any, exposed: bool) -> Self:
135130
size = data.size
136131
else:
137132
ptr, size = get_ptr_and_size(data.__cuda_array_interface__)
138-
return cls(ptr=ptr, size=size, owner=data, exposed=exposed)
133+
return cls(ptr=ptr, size=size, owner=data)
139134

140135
@classmethod
141136
def from_host_memory(cls, data: Any) -> Self:
@@ -166,7 +161,7 @@ def from_host_memory(cls, data: Any) -> Self:
166161
# Copy to device memory
167162
buf = rmm.DeviceBuffer(ptr=ptr, size=size)
168163
# Create from device memory
169-
return cls.from_device_memory(buf, exposed=False)
164+
return cls.from_device_memory(buf)
170165

171166
@property
172167
def size(self) -> int:
@@ -398,7 +393,6 @@ def copy(self, deep: bool = True) -> Self:
398393
ptr=self._owner.ptr + self._offset,
399394
size=self.size,
400395
),
401-
exposed=False,
402396
)
403397
return self.__class__(owner=owner, offset=0, size=owner.size)
404398

@@ -477,7 +471,7 @@ def deserialize(cls, header: dict, frames: list) -> Self:
477471
header["owner-type-serialized-name"]
478472
]
479473
if hasattr(frame, "__cuda_array_interface__"):
480-
owner = owner_type.from_device_memory(frame, exposed=False)
474+
owner = owner_type.from_device_memory(frame)
481475
else:
482476
owner = owner_type.from_host_memory(frame)
483477
return cls(

python/cudf/cudf/core/buffer/spillable_buffer.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def _finalize_init(self, ptr_desc: dict[str, Any]) -> None:
217217
self._manager.add(self)
218218

219219
@classmethod
220-
def from_device_memory(cls, data: Any, exposed: bool) -> Self:
220+
def from_device_memory(cls, data: Any) -> Self:
221221
"""Create a spillabe buffer from device memory.
222222
223223
No data is being copied.
@@ -226,15 +226,13 @@ def from_device_memory(cls, data: Any, exposed: bool) -> Self:
226226
----------
227227
data : device-buffer-like
228228
An object implementing the CUDA Array Interface.
229-
exposed : bool
230-
Mark the buffer as permanently exposed (unspillable).
231229
232230
Returns
233231
-------
234232
SpillableBufferOwner
235233
Buffer representing the same device memory as `data`
236234
"""
237-
ret = super().from_device_memory(data, exposed=exposed)
235+
ret = super().from_device_memory(data)
238236
ret._finalize_init(ptr_desc={"type": "gpu"})
239237
return ret
240238

@@ -268,7 +266,7 @@ def from_host_memory(cls, data: Any) -> Self:
268266
data = data.cast("B") # Make sure itemsize==1
269267

270268
# Create an already spilled buffer
271-
ret = cls(ptr=0, size=data.nbytes, owner=None, exposed=False)
269+
ret = cls(ptr=0, size=data.nbytes, owner=None)
272270
ret._finalize_init(ptr_desc={"type": "cpu", "memoryview": data})
273271
return ret
274272

@@ -575,7 +573,6 @@ def serialize(self) -> tuple[dict, list]:
575573
Buffer(
576574
owner=BufferOwner.from_device_memory(
577575
SpillableBufferCAIWrapper(self),
578-
exposed=False,
579576
)
580577
)
581578
]

python/cudf/cudf/core/buffer/utils.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ def get_buffer_owner(data: Any) -> BufferOwner | None:
4343

4444
def as_buffer(
4545
data: Any,
46-
*,
47-
exposed: bool = False,
4846
) -> Buffer:
4947
"""Factory function to wrap `data` in a Buffer object.
5048
@@ -68,10 +66,6 @@ def as_buffer(
6866
----------
6967
data : buffer-like or array-like
7068
A buffer-like or array-like object.
71-
exposed : bool, optional
72-
Mark the buffer as permanently exposed. This is used by
73-
copy-on-write to determine when a deep copy is required and
74-
by SpillableBuffer to mark the buffer unspillable.
7569
7670
Return
7771
------
@@ -94,16 +88,12 @@ def as_buffer(
9488

9589
# Handle host memory,
9690
if not hasattr(data, "__cuda_array_interface__"):
97-
if exposed:
98-
raise ValueError("cannot created exposed host memory")
9991
return buffer_class(owner=owner_class.from_host_memory(data))
10092

10193
# Check if `data` is owned by a known class
10294
owner = get_buffer_owner(data)
10395
if owner is None: # `data` is new device memory
104-
return buffer_class(
105-
owner=owner_class.from_device_memory(data, exposed=exposed)
106-
)
96+
return buffer_class(owner=owner_class.from_device_memory(data))
10797

10898
# At this point, we know that `data` is owned by a known class, which
10999
# should be the same class as specified by the current config (see above)

python/cudf/cudf/core/column/categorical.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,12 @@ def base_data(self) -> None:
103103
return None
104104

105105
def _get_children_from_pylibcudf_column(
106-
self, plc_column: plc.Column, dtype: DtypeObj, exposed: bool
106+
self, plc_column: plc.Column, dtype: DtypeObj
107107
) -> tuple[ColumnBase]:
108108
"""
109109
This column considers the plc_column (i.e. codes) as children
110110
"""
111-
return (
112-
type(self).from_pylibcudf(plc_column, data_ptr_exposed=exposed),
113-
)
111+
return (type(self).from_pylibcudf(plc_column),)
114112

115113
def __contains__(self, item: ScalarLike) -> bool:
116114
try:
@@ -137,8 +135,8 @@ def codes(self) -> NumericalColumn:
137135
def ordered(self) -> bool | None:
138136
return self.dtype.ordered
139137

140-
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column:
141-
return self.children[0].to_pylibcudf(mode)
138+
def to_pylibcudf(self) -> plc.Column:
139+
return self.children[0].to_pylibcudf()
142140

143141
def __setitem__(self, key: Any, value: Any) -> None:
144142
if is_scalar(value) and _is_null_host_scalar(value):
@@ -680,7 +678,6 @@ def _with_type_metadata(self: Self, dtype: DtypeObj) -> Self:
680678
return type(self)(
681679
plc_column=self.plc_column,
682680
dtype=dtype,
683-
exposed=False,
684681
)
685682

686683
return self

0 commit comments

Comments
 (0)