Skip to content

Commit 5e50001

Browse files
committed
Remove workaround
1 parent 90aad85 commit 5e50001

7 files changed

Lines changed: 57 additions & 89 deletions

File tree

python/cudf/cudf/core/column/column.py

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -85,11 +85,11 @@
8585
dtype_to_pylibcudf_type,
8686
find_common_type,
8787
get_dtype_of_same_kind,
88+
is_arrow_null_dtype,
8889
is_column_like,
8990
is_mixed_with_object_dtype,
9091
is_pandas_nullable_extension_dtype,
9192
is_pandas_nullable_numpy_dtype,
92-
maybe_normalize_arrow_null,
9393
min_signed_type,
9494
np_dtypes_to_pandas_dtypes,
9595
pyarrow_dtype_to_cudf_dtype,
@@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column:
378378
"Normalize to np.dtype('O') before calling "
379379
"ColumnBase.create."
380380
)
381-
if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null(
382-
dtype.pyarrow_dtype
383-
):
381+
if is_arrow_null_dtype(dtype) and col.null_count() != col.size():
384382
raise ValueError(
385-
f"dtype {dtype} is a pandas nullable string dtype with all nulls. "
386-
"Normalize to an empty string column with the same pandas StringDtype "
387-
"before calling ColumnBase.create."
383+
f"dtype {dtype} can only be used with all-null columns."
388384
)
389385

390386
dtype_kind = dtype.kind
@@ -961,15 +957,11 @@ def create(
961957
like copy-on-write. When validation is disabled, the caller is responsible for
962958
ensuring that col and its children are already normalized and wrapped.
963959
"""
964-
# For pandas nullable null types (ArrowDtype wrapping pa.null()),
965-
# normalize the column data and dtype before construction.
966-
col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype)
967-
968960
# Dispatch to the appropriate subclass based on dtype
969961
target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype)
970962
self = target_cls.__new__(target_cls)
971963
self.plc_column = _wrap_and_validate(col, dtype) if validate else col
972-
self._dtype = dtype if old_dtype is None else old_dtype
964+
self._dtype = dtype
973965
self._distinct_count = {}
974966
self._has_nulls = {}
975967
# The set of exposed buffers associated with this column. These buffers must be
@@ -1419,6 +1411,8 @@ def dropna(self) -> Self:
14191411
return self.copy()
14201412

14211413
def to_arrow(self) -> pa.Array:
1414+
if is_arrow_null_dtype(self.dtype):
1415+
return pa.nulls(len(self))
14221416
with self.access(mode="read", scope="internal"):
14231417
return _handle_nulls(
14241418
self.plc_column.to_arrow(
@@ -3323,6 +3317,12 @@ def as_column(
33233317
elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
33243318
if isinstance(arbitrary, pa.NullArray) and dtype is None:
33253319
dtype = np.dtype("object")
3320+
elif is_arrow_null_dtype(dtype):
3321+
if arbitrary.null_count != len(arbitrary):
3322+
raise ValueError(
3323+
f"dtype {dtype} can only be used with all-null data."
3324+
)
3325+
arbitrary = pa.nulls(len(arbitrary))
33263326
column = ColumnBase.from_arrow(arbitrary)
33273327
if nan_as_null is not False:
33283328
column = column.nans_to_nulls()
@@ -3536,6 +3536,11 @@ def as_column(
35363536
elif length < 0:
35373537
raise ValueError(f"{length=} must be >=0.")
35383538

3539+
if is_arrow_null_dtype(dtype):
3540+
if is_na_like(arbitrary):
3541+
return column_empty(length, dtype=dtype)
3542+
pa.scalar(arbitrary, type=dtype.pyarrow_dtype)
3543+
35393544
pa_type = None
35403545
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
35413546
return as_column(
@@ -3768,6 +3773,13 @@ def as_column(
37683773

37693774
from_pandas = nan_as_null is None or nan_as_null
37703775
if dtype is not None:
3776+
if is_arrow_null_dtype(dtype):
3777+
arbitrary = pa.array(
3778+
arbitrary,
3779+
type=dtype.pyarrow_dtype,
3780+
from_pandas=True,
3781+
)
3782+
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
37713783
try:
37723784
arbitrary = pa.array(
37733785
arbitrary,

python/cudf/cudf/core/dtype/validators.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ def is_dtype_obj_string(obj: DtypeObj) -> bool:
3535
and (
3636
pa.types.is_string(obj.pyarrow_dtype)
3737
or pa.types.is_large_string(obj.pyarrow_dtype)
38+
or pa.types.is_null(obj.pyarrow_dtype)
3839
)
3940
)
4041
)

python/cudf/cudf/core/indexed_frame.py

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6574,12 +6574,16 @@ def convert_dtypes(
65746574
if dtype_backend == "pyarrow":
65756575
cols = []
65766576
for col in self._columns:
6577-
arrow_dtype = pd.ArrowDtype(
6578-
pa.null()
6579-
if col.null_count == len(col)
6580-
else cudf_dtype_to_pa_type(col.dtype)
6581-
)
6582-
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
6577+
if len(col) == 0 and is_dtype_obj_string(col.dtype):
6578+
cols.append(col)
6579+
continue
6580+
if len(col) != 0 and col.null_count == len(col):
6581+
cols.append(as_column(col, dtype=pd.ArrowDtype(pa.null())))
6582+
else:
6583+
arrow_dtype = pd.ArrowDtype(
6584+
cudf_dtype_to_pa_type(col.dtype)
6585+
)
6586+
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
65836587
return self._from_data_like_self(
65846588
self._data._from_columns_like_self(cols, verify=False)
65856589
)

python/cudf/cudf/pandas/scripts/conftest-patch.py

Lines changed: 0 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -723,7 +723,6 @@ def pytest_unconfigure(config):
723723
"tests/base/test_constructors.py::TestConstruction::test_constructor_datetime_outofbound[Series-object-string]": "Failed: DID NOT RAISE <class 'pandas.errors.OutOfBoundsDatetime'>",
724724
"tests/base/test_conversion.py::test_array[index-arr3-_left]": "TODO: Add a reason for failure",
725725
"tests/base/test_conversion.py::test_array[index-arr4-_sparse_values]": "TODO: Add a reason for failure",
726-
"tests/base/test_conversion.py::test_array[series-arr3-_left]": "TODO: Add a reason for failure",
727726
"tests/base/test_conversion.py::test_array[series-arr4-_sparse_values]": "TODO: Add a reason for failure",
728727
"tests/base/test_conversion.py::test_to_numpy[array-False-arr1-expected1-False]": "AssertionError: numpy array are different",
729728
"tests/base/test_conversion.py::test_to_numpy[array-True-arr1-expected1-False]": "AssertionError: numpy array are different",
@@ -1640,28 +1639,6 @@ def pytest_unconfigure(config):
16401639
"tests/extension/test_interval.py::TestIntervalArray::test_grouping_grouper": "AssertionError: ndarray Expected type <class 'numpy.ndarray'>, found <class 'pandas.arrays.ArrowStringArray'> instead",
16411640
"tests/extension/test_interval.py::TestIntervalArray::test_in_numeric_groupby": "TODO: Add a reason for failure",
16421641
"tests/extension/test_interval.py::TestIntervalArray::test_is_extension_array_dtype": "TODO: Add a reason for failure",
1643-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-c]": "TODO: Add a reason for failure",
1644-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-python]": "TODO: Add a reason for failure",
1645-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-c]": "TODO: Add a reason for failure",
1646-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-python]": "TODO: Add a reason for failure",
1647-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-c]": "TODO: Add a reason for failure",
1648-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-python]": "TODO: Add a reason for failure",
1649-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-c]": "TODO: Add a reason for failure",
1650-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-python]": "TODO: Add a reason for failure",
1651-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-c]": "TODO: Add a reason for failure",
1652-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-python]": "TODO: Add a reason for failure",
1653-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-c]": "TODO: Add a reason for failure",
1654-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-python]": "TODO: Add a reason for failure",
1655-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-c]": "TODO: Add a reason for failure",
1656-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-python]": "TODO: Add a reason for failure",
1657-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-c]": "TODO: Add a reason for failure",
1658-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-python]": "TODO: Add a reason for failure",
1659-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-c]": "TODO: Add a reason for failure",
1660-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-python]": "TODO: Add a reason for failure",
1661-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-c]": "TODO: Add a reason for failure",
1662-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-python]": "TODO: Add a reason for failure",
1663-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-c]": "TODO: Add a reason for failure",
1664-
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-python]": "TODO: Add a reason for failure",
16651642
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-False]": "TODO: Add a reason for failure",
16661643
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-True]": "TODO: Add a reason for failure",
16671644
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumsum-False]": "TODO: Add a reason for failure",
@@ -1908,10 +1885,6 @@ def pytest_unconfigure(config):
19081885
"tests/extension/test_numpy.py::TestNumpyExtensionArray::test_value_counts_with_normalize[object]": "TODO: Add a reason for failure",
19091886
"tests/extension/test_period.py::Test2DCompat::test_copy_order[2D]": "TODO: Add a reason for failure",
19101887
"tests/extension/test_period.py::Test2DCompat::test_copy_order[D]": "TODO: Add a reason for failure",
1911-
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-c]": "TODO: Add a reason for failure",
1912-
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-python]": "TODO: Add a reason for failure",
1913-
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-c]": "TODO: Add a reason for failure",
1914-
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-python]": "TODO: Add a reason for failure",
19151888
"tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[2D-False]": "TODO: Add a reason for failure",
19161889
"tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[D-False]": "TODO: Add a reason for failure",
19171890
"tests/extension/test_period.py::TestPeriodArray::test_astype_str[2D]": "AssertionError: Attributes of Series are different",
@@ -1940,22 +1913,6 @@ def pytest_unconfigure(config):
19401913
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-absolute]": "TODO: Add a reason for failure",
19411914
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-negative]": "TODO: Add a reason for failure",
19421915
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-positive]": "TODO: Add a reason for failure",
1943-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-c]": "TODO: Add a reason for failure",
1944-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-python]": "TODO: Add a reason for failure",
1945-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-c]": "TODO: Add a reason for failure",
1946-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-python]": "TODO: Add a reason for failure",
1947-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-c]": "TODO: Add a reason for failure",
1948-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-python]": "TODO: Add a reason for failure",
1949-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-c]": "TODO: Add a reason for failure",
1950-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-python]": "TODO: Add a reason for failure",
1951-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-c]": "TODO: Add a reason for failure",
1952-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-python]": "TODO: Add a reason for failure",
1953-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-c]": "TODO: Add a reason for failure",
1954-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-python]": "TODO: Add a reason for failure",
1955-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-c]": "TODO: Add a reason for failure",
1956-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-python]": "TODO: Add a reason for failure",
1957-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-c]": "TODO: Add a reason for failure",
1958-
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-python]": "TODO: Add a reason for failure",
19591916
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-False-__add__]": "AssertionError: Attributes of Series are different",
19601917
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-True-__add__]": "AssertionError: Attributes of Series are different",
19611918
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[pyarrow]-False-__radd__]": "TODO: Add a reason for failure",
@@ -2190,7 +2147,6 @@ def pytest_unconfigure(config):
21902147
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_avoid_block_splitting": "TODO: Add a reason for failure",
21912148
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
21922149
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_backend_no_conversion": "TODO: Add a reason for failure",
2193-
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_dtype_empty_object": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different",
21942150
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_engine_lines_false": "TODO: Add a reason for failure",
21952151
"tests/frame/methods/test_copy.py::TestCopy::test_copy_consolidates": "TODO: Add a reason for failure",
21962152
"tests/frame/methods/test_count.py::TestDataFrameCount::test_count": "TODO: Add a reason for failure",
@@ -4256,7 +4212,6 @@ def pytest_unconfigure(config):
42564212
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure",
42574213
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks4]": "TODO: Add a reason for failure",
42584214
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_generic_errors": "TODO: Add a reason for failure",
4259-
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_left_right_dont_share_data": "TODO: Add a reason for failure",
42604215
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks0]": "TODO: Add a reason for failure",
42614216
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks2]": "TODO: Add a reason for failure",
42624217
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure",
@@ -6745,7 +6700,6 @@ def pytest_unconfigure(config):
67456700
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data11-maindtype11-Int8-expected_other11]": "AssertionError: Attributes of Series are different",
67466701
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data2-maindtype2-expected_default2-expected_other2]": "AssertionError: Attributes of Series are different",
67476702
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data6-maindtype6-Int64-expected_other6]": "AssertionError: Attributes of Series are different",
6748-
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_null": "AssertionError: Attributes of Series are different",
67496703
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
67506704
"tests/series/methods/test_diff.py::TestSeriesDiff::test_diff_bool": "AssertionError: Attributes of Series are different",
67516705
"tests/series/methods/test_drop.py::test_drop_exception_raised[drop_labels1-0-KeyError-not found in axis]": "Failed: DID NOT RAISE <class 'KeyError'>",

python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -45,3 +45,13 @@ def test_convert_dtypes():
4545
with pytest.raises(NotImplementedError):
4646
# category and datetime64[ns] are not nullable
4747
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)
48+
49+
50+
def test_convert_dtypes_pyarrow_null():
51+
pytest.importorskip("pyarrow")
52+
data = {"a": [None, None]}
53+
54+
expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
55+
result = cudf.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
56+
57+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/tests/series/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -43,3 +43,13 @@ def test_convert_integer_false_convert_floating_true():
4343
.to_pandas(nullable=True)
4444
)
4545
assert_eq(result, expected)
46+
47+
48+
def test_convert_dtypes_pyarrow_null():
49+
pytest.importorskip("pyarrow")
50+
data = [None, None]
51+
52+
expected = pd.Series(data).convert_dtypes(dtype_backend="pyarrow")
53+
result = cudf.Series(data).convert_dtypes(dtype_backend="pyarrow")
54+
55+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/utils/dtypes.py

Lines changed: 0 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -601,29 +601,6 @@ def is_arrow_null_dtype(dtype: DtypeObj) -> bool:
601601
)
602602

603603

604-
def maybe_normalize_arrow_null(
605-
col: plc.Column, dtype: DtypeObj
606-
) -> tuple[plc.Column, DtypeObj, DtypeObj | None]:
607-
"""Normalize ArrowDtype(pa.null()) columns for internal construction.
608-
609-
For pandas nullable null types (ArrowDtype wrapping pa.null()),
610-
the column data is normalized and the dtype is replaced with
611-
``np.dtype("object")`` for internal dispatch. The original dtype
612-
is returned as ``old_dtype`` so it can be stored on the column.
613-
614-
Returns
615-
-------
616-
tuple of (col, dtype, old_dtype)
617-
``old_dtype`` is the original dtype if normalization occurred,
618-
otherwise ``None``.
619-
"""
620-
from cudf.core.column.column import _normalize_types_column
621-
622-
if is_arrow_null_dtype(dtype):
623-
return _normalize_types_column(col), np.dtype("object"), dtype
624-
return col, dtype, None
625-
626-
627604
SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = {
628605
np.dtype("int8"): plc.types.TypeId.INT8,
629606
np.dtype("int16"): plc.types.TypeId.INT16,

0 commit comments

Comments
 (0)