diff --git a/CHANGELOG.md b/CHANGELOG.md index 4863076f7e2..911ab370a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ - PR #2446 Add __array_function__ for index - PR #2437 ORC reader: Add 'use_np_dtypes' option - PR #2382 Add CategoricalAccessor add, remove, rename, and ordering methods +- PR #2442 Add __setitem__ - PR #2449 Java column vector: added support for getting byte count of strings in a ColumnVector - PR #2358 Add the function to convert column of floating points with `nan`s into `bitmask` - PR #2489 Add drop argument to set_index @@ -150,6 +151,7 @@ - PR #2517 Fix device memory leak in to_dlpack tensor deleter - PR #2511 Added import of orc, refactored exception handlers to not squash fatal exceptions + # cuDF 0.8.0 (27 June 2019) ## New Features diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index a47433d1905..4068523416d 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -78,29 +78,24 @@ void copy_range(gdf_column *out_column, gdf_column const &in_column, // before copying to ensure the strings referred to by the new indices // are included in the destination column - gdf_column * input_cols[2] = {out_column, - const_cast(&in_column)}; - // make temporary columns which will have synced categories // TODO: these copies seem excessively expensive, but // sync_column_categories doesn't copy the valid mask gdf_column temp_out = cudf::copy(*out_column); gdf_column temp_in = cudf::copy(in_column); - gdf_column * temp_cols[2] = {&temp_out, &temp_in}; + + gdf_column * input_cols[2] = {&temp_out, + const_cast(&in_column)}; + gdf_column * temp_cols[2] = {out_column, &temp_in}; // sync categories CUDF_EXPECTS(GDF_SUCCESS == sync_column_categories(input_cols, temp_cols, 2), "Failed to synchronize NVCategory"); - detail::copy_range(&temp_out, + detail::copy_range(out_column, detail::column_range_factory{temp_in, in_begin}, out_begin, out_end); - - std::swap(out_column->data, temp_out.data); - std::swap(out_column->valid, temp_out.valid); - std::swap(out_column->null_count, temp_out.null_count); - std::swap(out_column->dtype_info.category, temp_out.dtype_info.category); gdf_column_free(&temp_out); gdf_column_free(&temp_in); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index c46fcffaf2f..c7607ca49f8 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -199,6 +199,7 @@ table scatter(table const& source, gdf_index_type const scatter_map[], } detail::scatter(&source, scatter_map, &output); + nvcategory_gather_table(output, output); return output; diff --git a/python/cudf/cudf/bindings/copying.pxd b/python/cudf/cudf/bindings/copying.pxd index 1635f7a4cd3..8600110e0e5 100644 --- a/python/cudf/cudf/bindings/copying.pxd +++ b/python/cudf/cudf/bindings/copying.pxd @@ -19,3 +19,15 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef gdf_column copy( const gdf_column &input ) except + + + cudf_table scatter( + const cudf_table source, + const gdf_index_type* scatter_map, + const cudf_table target) + + cdef void copy_range( + gdf_column *out_column, + const gdf_column in_column, + gdf_index_type out_begin, + gdf_index_type out_end, + gdf_index_type in_begin) diff --git a/python/cudf/cudf/bindings/copying.pyx b/python/cudf/cudf/bindings/copying.pyx index 5bb94156688..d41c4c4b16b 100644 --- a/python/cudf/cudf/bindings/copying.pyx +++ b/python/cudf/cudf/bindings/copying.pyx @@ -5,11 +5,16 @@ # cython: embedsignature = True # cython: language_level = 3 +from cudf.dataframe import columnops +from cudf.dataframe.buffer import Buffer from cudf.bindings.cudf_cpp cimport * from cudf.bindings.cudf_cpp import * -from cudf.utils.cudautils import modulo +from cudf.bindings.copying cimport * +import cudf.utils.utils as utils +from cudf.bindings.utils cimport columns_from_table, table_from_columns from librmm_cffi import librmm as rmm +import numba import numpy as np import pandas as pd import pyarrow as pa @@ -27,7 +32,6 @@ pandas_version = tuple(map(int, pd.__version__.split('.', 2)[:2])) def clone_columns_with_size(in_cols, row_size): - from cudf.dataframe import columnops out_cols = [] for col in in_cols: o_col = columnops.column_empty_like(col, @@ -39,26 +43,48 @@ def clone_columns_with_size(in_cols, row_size): return out_cols -def apply_gather(in_cols, maps, out_cols=None): - """ - Call cudf::gather. +def _normalize_maps(maps, size): + maps = columnops.as_column(maps).astype("int32") + maps = maps.binary_operator("mod", maps.normalize_binop_value(size)) + maps = maps.data.mem + return maps - * in_cols input column array - * maps RMM device array with gdf_index_type (np.int32 compatible dtype) - * out_cols the destination column array to output - * returns out_cols +def apply_gather(source, maps, dest=None): """ + Gathers elements from source into dest (if given) using the gathermap maps. + If dest is not given, it is allocated inside the function and returned. + + Parameters + ---------- + source : Column or list of Columns + maps : DeviceNDArray + dest : Column or list of Columns (optional) + + Returns + ------- + Column or list of Columns, or None if dest is given + """ + if isinstance(source, (list, tuple)): + if dest is not None: + assert(isinstance(dest, (list, tuple))) + in_cols = source + out_cols = dest + else: + in_cols = [source] + out_cols = None if dest is None else [dest] + + for i, in_col in enumerate(in_cols): + in_cols[i] = columnops.as_column(in_cols[i]) + if dest is not None: + out_cols[i] = columnops.as_column(out_cols[i]) + if in_cols[0].dtype == np.dtype("object"): in_size = in_cols[0].data.size() else: in_size = in_cols[0].data.size - from cudf.dataframe import columnops - maps = columnops.as_column(maps).astype("int32") - maps = maps.data.mem - # TODO: replace with libcudf pymod when available - maps = modulo(maps, in_size) + maps = _normalize_maps(maps, in_size) col_count=len(in_cols) gather_count = len(maps) @@ -108,42 +134,59 @@ def apply_gather(in_cols, maps, out_cols=None): free_table(c_in_table, c_in_cols) - return out_cols + if dest is not None: + return + if isinstance(source, (list, tuple)): + return out_cols + else: + return out_cols[0] -def apply_gather_column(in_col, maps, out_col=None): - """ - Call cudf::gather. - * in_cols input column - * maps device array - * out_cols the destination column to output +def apply_scatter(source, maps, target): + cdef cudf_table* c_source_table + cdef cudf_table* c_target_table + cdef cudf_table c_result_table + cdef uintptr_t c_maps_ptr + cdef gdf_index_type* c_maps - * returns out_col - """ + source_cols = source + target_cols = target - in_cols = [in_col] - out_cols = None if out_col is None else [out_col] + if not isinstance(target_cols, (list, tuple)): + target_cols = [target_cols] - out_cols = apply_gather(in_cols, maps, out_cols) + if not isinstance(source_cols, (list, tuple)): + source_cols = [source_cols] * len(target_cols) - return out_cols[0] + for i in range(len(target_cols)): + target_cols[i] = columnops.as_column(target_cols[i]) + source_cols[i] = columnops.as_column(source_cols[i]) + assert source_cols[i].dtype == target_cols[i].dtype + c_source_table = table_from_columns(source_cols) + c_target_table = table_from_columns(target_cols) -def apply_gather_array(dev_array, maps, out_col=None): - """ - Call cudf::gather. + maps = _normalize_maps(maps, len(target_cols[0])) - * dev_array input device array - * maps device array - * out_cols the destination column to output + c_maps_ptr = get_ctype_ptr(maps) + c_maps = c_maps_ptr - * returns out_col - """ - from cudf.dataframe import columnops + with nogil: + c_result_table = scatter( + c_source_table[0], + c_maps, + c_target_table[0]) - in_col = columnops.as_column(dev_array) - return apply_gather_column(in_col, maps, out_col) + result_cols = columns_from_table(&c_result_table) + + del c_source_table + del c_target_table + + if isinstance(target, (list, tuple)): + return result_cols + else: + return result_cols[0] def copy_column(input_col): @@ -163,3 +206,53 @@ def copy_column(input_col): free(output) return Column.from_mem_views(data, mask, output.null_count) + + +def apply_copy_range(out_col, in_col, int out_begin, int out_end, + int in_begin): + from cudf.dataframe.column import Column + + if abs(out_end - out_begin) <= 1: + return out_col + + if out_begin < 0: + out_begin = len(out_col) + out_begin + if out_end < 0: + out_end = len(out_col) + out_end + + if out_begin > out_end: + return out_col + + if out_col.null_count == 0 and in_col.has_null_mask: + mask = utils.make_mask(len(out_col)) + cudautils.fill_value(mask, 0xff) + out_col._mask = Buffer(mask) + out_col._null_count = 0 + + if in_col.null_count == 0 and out_col.has_null_mask: + mask = utils.make_mask(len(in_col)) + cudautils.fill_value(mask, 0xff) + in_col._mask = Buffer(mask) + in_col._null_count = 0 + + cdef gdf_column* c_out_col = column_view_from_column(out_col) + cdef gdf_column* c_in_col = column_view_from_column(in_col) + + with nogil: + copy_range(c_out_col, + c_in_col[0], + out_begin, + out_end, + in_begin) + + out_col._update_null_count(c_out_col.null_count) + + if out_col.dtype == np.dtype("object") and len(out_col) > 0: + update_nvstrings_col( + out_col, + c_out_col.dtype_info.category) + + free(c_in_col) + free(c_out_col) + + return out_col diff --git a/python/cudf/cudf/bindings/cudf_cpp.pxd b/python/cudf/cudf/bindings/cudf_cpp.pxd index 47d7f24f054..150d92839d4 100644 --- a/python/cudf/cudf/bindings/cudf_cpp.pxd +++ b/python/cudf/cudf/bindings/cudf_cpp.pxd @@ -27,7 +27,7 @@ cpdef gdf_time_unit np_dtype_to_gdf_time_unit(dtype) cpdef gdf_time_unit_to_np_dtype(gdf_time_unit time_unit) cdef np_dtype_from_gdf_column(gdf_column* col) -cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*) + cdef get_scalar_value(gdf_scalar scalar, dtype) @@ -391,3 +391,5 @@ cdef extern from "cudf/legacy/table.hpp" namespace "cudf" nogil: # const gdf_column* const* begin() const except + # gdf_column const* const* end() const # gdf_column const* get_column(gdf_index_type index) const except + + +cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*) except? GDF_invalid diff --git a/python/cudf/cudf/bindings/cudf_cpp.pyx b/python/cudf/cudf/bindings/cudf_cpp.pyx index ded32b6d7ee..3ff9a1ec8c6 100644 --- a/python/cudf/cudf/bindings/cudf_cpp.pyx +++ b/python/cudf/cudf/bindings/cudf_cpp.pyx @@ -161,7 +161,7 @@ cdef np_dtype_from_gdf_column(gdf_column* col): raise TypeError('cannot convert gdf_dtype `%s` to numpy dtype' % (dtype)) -cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None): +cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None) except? GDF_invalid: """Util to convert a column's or np.scalar's dtype to gdf dtype. Parameters @@ -171,7 +171,8 @@ cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None): dtype : numpy.dtype; optional The dtype to convert to a gdf_dtype. Defaults to *col.dtype*. """ - dtype = col.dtype if dtype is None else np.dtype(dtype) + dtype = col.dtype if dtype is None else pd.api.types.pandas_dtype(dtype) + # if dtype is pd.CategoricalDtype, use the codes' gdf_dtype if is_categorical_dtype(dtype): if col is None: @@ -307,7 +308,7 @@ cdef gdf_column* column_view_from_column(col, col_name=None) except? NULL: else: data_ptr = 0 - if col._mask is not None and col.null_count > 0: + if col._mask is not None: valid_ptr = get_column_valid_ptr(col) else: valid_ptr = 0 diff --git a/python/cudf/cudf/bindings/replace.pyx b/python/cudf/cudf/bindings/replace.pyx index 7aac8c8c8fe..66ae5917bdd 100644 --- a/python/cudf/cudf/bindings/replace.pyx +++ b/python/cudf/cudf/bindings/replace.pyx @@ -11,7 +11,7 @@ from cudf.bindings.cudf_cpp cimport * from cudf.bindings.cudf_cpp import * from cudf.bindings.replace cimport * from cudf.dataframe.column import Column -from cudf.utils.utils import is_single_value +from cudf.utils.utils import is_scalar from libc.stdlib cimport calloc, malloc, free @@ -79,7 +79,7 @@ cpdef apply_replace_nulls(inp, replacement): Call replace_nulls """ - if is_single_value(replacement): + if is_scalar(replacement): return apply_replace_nulls_scalar(inp, replacement) else: return apply_replace_nulls_column(inp, replacement) diff --git a/python/cudf/cudf/bindings/typecast.pyx b/python/cudf/cudf/bindings/typecast.pyx index 9264eca1cde..9bdd568bdda 100644 --- a/python/cudf/cudf/bindings/typecast.pyx +++ b/python/cudf/cudf/bindings/typecast.pyx @@ -12,6 +12,7 @@ from cudf.dataframe.column import Column from libc.stdlib cimport free import numpy as np +import pandas as pd def apply_cast(incol, dtype=np.float64): @@ -19,20 +20,19 @@ def apply_cast(incol, dtype=np.float64): Return a Column with values in `incol` casted to `dtype`. Currently supports numeric and datetime dtypes. """ - check_gdf_compatibility(incol) + dtype = np.dtype(np.float64 if dtype is None else dtype) - cdef gdf_column* c_incol = column_view_from_column(incol) + if pd.api.types.is_dtype_equal(incol.dtype, dtype): + return incol - dtype = np.dtype(np.float64 if dtype is None else dtype) + cdef gdf_column* c_incol = column_view_from_column(incol) cdef gdf_dtype c_out_dtype = gdf_dtype_from_value(incol, dtype) cdef uintptr_t c_category - cdef gdf_dtype_extra_info c_out_info = gdf_dtype_extra_info( time_unit=np_dtype_to_gdf_time_unit(dtype), category=c_category ) - cdef gdf_column result with nogil: diff --git a/python/cudf/cudf/dataframe/buffer.py b/python/cudf/cudf/dataframe/buffer.py index 32cb1efc221..999b473100b 100644 --- a/python/cudf/cudf/dataframe/buffer.py +++ b/python/cudf/cudf/dataframe/buffer.py @@ -163,6 +163,11 @@ def as_contiguous(self): def is_contiguous(self): return self.mem.is_c_contiguous() + def astype(self, dtype): + from cudf.dataframe import columnops + + return columnops.as_column(self).astype(dtype).data + class BufferSentryError(ValueError): pass diff --git a/python/cudf/cudf/dataframe/categorical.py b/python/cudf/cudf/dataframe/categorical.py index 9efcada3f62..c302f47e1ba 100644 --- a/python/cudf/cudf/dataframe/categorical.py +++ b/python/cudf/cudf/dataframe/categorical.py @@ -64,7 +64,8 @@ def as_unordered(self, inplace=False): return Series(data=self._parent.replace(ordered=False)) def add_categories(self, new_categories, **kwargs): - data = None if kwargs["inplace"] else self._parent + inplace = kwargs.get("inplace", False) + data = None if inplace else self._parent new_categories = columnops.as_column(new_categories) new_categories = self._parent._categories.append(new_categories) if not self._categories_equal(new_categories, **kwargs): diff --git a/python/cudf/cudf/dataframe/column.py b/python/cudf/cudf/dataframe/column.py index 90d4e37dbe5..fd32869ef02 100644 --- a/python/cudf/cudf/dataframe/column.py +++ b/python/cudf/cudf/dataframe/column.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -from numba.cuda.cudadrv.devicearray import DeviceNDArray import nvstrings from librmm_cffi import librmm as rmm @@ -435,12 +434,14 @@ def element_indexing(self, index): ------ ``IndexError`` if out-of-bound """ - index = int(index) + index = np.int32(index) if index < 0: index = len(self) + index if index > len(self) - 1: raise IndexError val = self.data[index] # this can raise IndexError + if isinstance(val, nvstrings.nvstrings): + val = val.to_host()[0] valid = ( cudautils.mask_get.py_func(self.nullmask, index) if self.has_null_mask @@ -449,6 +450,8 @@ def element_indexing(self, index): return val if valid else None def __getitem__(self, arg): + from cudf.dataframe import columnops + if isinstance(arg, Number): arg = int(arg) return self.element_indexing(arg) @@ -461,8 +464,12 @@ def __getitem__(self, arg): # slicing data subdata = self.data[arg] # slicing mask + if self.dtype == "object": + data_size = self.data.size() + else: + data_size = self.data.size bytemask = cudautils.expand_mask_bits( - self.data.size, self.mask.to_gpu_array() + data_size, self.mask.to_gpu_array() ) submask = Buffer(cudautils.compact_mask_bytes(bytemask[arg])) col = self.replace(data=subdata, mask=submask) @@ -470,40 +477,80 @@ def __getitem__(self, arg): else: newbuffer = self.data[arg] return self.replace(data=newbuffer) - elif isinstance(arg, (list, np.ndarray)): - arg = np.array(arg) - arg = rmm.to_device(arg) - - if isinstance(arg, DeviceNDArray): - return self.take(arg) else: + arg = columnops.as_column(arg) + if len(arg) == 0: + arg = columnops.as_column([], dtype="int32") + if pd.api.types.is_integer_dtype(arg.dtype): + return self.take(arg.data.mem) + if pd.api.types.is_bool_dtype(arg.dtype): + return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) - def masked_assign(self, value, mask): - """Assign a scalar value to a series using a boolean mask - df[df < 0] = 0 - - Parameters - ---------- - value : scalar - scalar value for assignment - mask : cudf Series - Boolean Series + def __setitem__(self, key, value): + """ + Set the value of self[key] to value. - Returns - ------- - cudf Series - cudf series with new value set to where mask is True + If value and self are of different types, + value is coerced to self.dtype """ + import cudf.bindings.copying as cpp_copying + from cudf.dataframe import columnops - # need to invert to properly use gpu_fill_mask - mask_invert = mask._column._invert() - out = cudautils.fill_mask( - data=self.data.to_gpu_array(), - mask=mask_invert.as_mask(), - value=value, - ) - return self.replace(data=Buffer(out), mask=None, null_count=0) + if isinstance(key, slice): + key_start, key_stop, key_stride = key.indices(len(self)) + if key_stride != 1: + raise NotImplementedError("Stride not supported in slice") + nelem = abs(key_stop - key_start) + else: + key = columnops.as_column(key) + if pd.api.types.is_bool_dtype(key.dtype): + if not len(key) == len(self): + raise ValueError( + "Boolean mask must be of same length as column" + ) + key = columnops.as_column(cudautils.arange(len(self)))[key] + nelem = len(key) + + if utils.is_scalar(value): + if is_categorical_dtype(self.dtype): + from cudf.dataframe.categorical import CategoricalColumn + from cudf.dataframe.buffer import Buffer + from cudf.utils.cudautils import fill_value + + data = rmm.device_array(nelem, dtype="int8") + fill_value(data, self._encode(value)) + value = CategoricalColumn( + data=Buffer(data), + categories=self._categories, + ordered=False, + ) + elif value is None: + value = columnops.column_empty(nelem, self.dtype, masked=True) + else: + to_dtype = pd.api.types.pandas_dtype(self.dtype) + value = utils.scalar_broadcast_to(value, nelem, to_dtype) + + value = columnops.as_column(value).astype(self.dtype) + + if len(value) != nelem: + msg = ( + f"Size mismatch: cannot set value " + f"of size {len(value)} to indexing result of size " + f"{nelem}" + ) + raise ValueError(msg) + + if isinstance(key, slice): + out = cpp_copying.apply_copy_range( + self, value, key_start, key_stop, 0 + ) + else: + out = cpp_copying.apply_scatter(value, key, self) + + self._data = out.data + self._mask = out.mask + self._update_null_count() def fillna(self, value): """Fill null values with ``value``. @@ -610,14 +657,17 @@ def take(self, indices, ignore_index=False): """Return Column by taking values from the corresponding *indices*. """ import cudf.bindings.copying as cpp_copying + from cudf.dataframe.columnops import column_empty_like indices = Buffer(indices).to_gpu_array() # Handle zero size if indices.size == 0: - return self.copy() + return column_empty_like(self, newsize=0) # Returns a new column - return cpp_copying.apply_gather_column(self, indices) + result = cpp_copying.apply_gather(self, indices) + result.name = self.name + return result def as_mask(self): """Convert booleans to bitmask diff --git a/python/cudf/cudf/dataframe/columnops.py b/python/cudf/cudf/dataframe/columnops.py index 2aac87a2e49..c61a1ea8208 100755 --- a/python/cudf/cudf/dataframe/columnops.py +++ b/python/cudf/cudf/dataframe/columnops.py @@ -272,7 +272,7 @@ def column_select_by_position(column, positions): import cudf.bindings.copying as cpp_copying pos_ary = positions.data.to_gpu_array() - selected_values = cpp_copying.apply_gather_column(column, pos_ary) + selected_values = cpp_copying.apply_gather(column, pos_ary) selected_index = Buffer(pos_ary) return ( diff --git a/python/cudf/cudf/dataframe/dataframe.py b/python/cudf/cudf/dataframe/dataframe.py index 09a27a51f38..94a59e63ab0 100644 --- a/python/cudf/cudf/dataframe/dataframe.py +++ b/python/cudf/cudf/dataframe/dataframe.py @@ -24,7 +24,6 @@ import cudf.bindings.join as cpp_join from cudf import formatting from cudf._sort import get_sorted_inds -from cudf.bindings import copying as cpp_copying from cudf.bindings.nvtx import nvtx_range_pop, nvtx_range_push from cudf.bindings.stream_compaction import ( apply_drop_duplicates as cpp_drop_duplicates, @@ -313,7 +312,7 @@ def __getitem__(self, arg): self.columns, cudf.dataframe.multiindex.MultiIndex ) and isinstance(arg, tuple): return self.columns._get_column_major(self, arg) - if utils.is_single_value(arg) or isinstance(arg, tuple): + if utils.is_scalar(arg) or isinstance(arg, tuple): s = self._cols[arg] s.name = arg s.index = self.index @@ -371,9 +370,7 @@ def __setitem__(self, name, col): if isinstance(name, DataFrame): for col_name in self._cols: mask = name[col_name] - self._cols[col_name] = self._cols[col_name].masked_assign( - value=col, mask=mask - ) + self._cols[col_name][mask] = col elif name in self._cols: self._cols[name] = self._prepare_series_for_add(col) @@ -818,68 +815,138 @@ def iteritems(self): @property def loc(self): """ - Returns a label-based indexer for row-slicing and column selection. + Selecting rows and columns by label or boolean mask. Examples -------- - >>> df = DataFrame([('a', list(range(20))), - ... ('b', list(range(20))), - ... ('c', list(range(20)))]) - Get the row by index label from 'a' and 'b' columns + DataFrame with string index. + + >>> print(df) + a b + a 0 5 + b 1 6 + c 2 7 + d 3 8 + e 4 9 + + Select a single row by label. - >>> df.loc[0, ['a', 'b']] + >>> print(df.loc['a']) a 0 - b 0 + b 5 + Name: a, dtype: int64 + + Select multiple rows and a single column. - Get rows from index 2 to index 5 from 'a' and 'b' columns. + >>> print(df.loc[['a', 'c', 'e'], 'b']) + a 5 + c 7 + e 9 + Name: b, dtype: int64 - >>> df.loc[2:5, ['a', 'b']] + Selection by boolean mask. + >>> print(df.loc[df.a > 2]) a b - 2 2 2 - 3 3 3 - 4 4 4 - 5 5 5 + d 3 8 + e 4 9 - Get the every 3rd rows from index 2 to 10 from 'a' and 'b' + Setting values using loc. + >>> df.loc[['a', 'c', 'e'], 'a'] = 0 + >>> print(df) + a b + a 0 5 + b 1 6 + c 0 7 + d 3 8 + e 0 9 - >>> df.loc[2:10:3, ['a', 'b']] - a b - 2 2 2 - 5 5 5 - 8 8 8 + See also + -------- + DataFrame.iloc """ return _DataFrameLocIndexer(self) @property def iloc(self): """ - Returns a integer-location based indexer for selection by position. + Selecting rows and column by position. Examples -------- >>> df = DataFrame([('a', list(range(20))), ... ('b', list(range(20))), ... ('c', list(range(20)))]) - >>> df.iloc[1] # get the row from index 1st + + Select a single row using an integer index. + + >>> print(df.iloc[1]) a 1 b 1 c 1 - >>> df.iloc[[0, 2, 9, 18]] # get the rows from indices 0,2,9 and 18. + + Select multiple rows using a list of integers. + + >>> print(df.iloc[[0, 2, 9, 18]]) a b c 0 0 0 0 2 2 2 2 9 9 9 9 18 18 18 18 - >>> df.iloc[3:10:2] # get the rows using slice indices + + Select rows using a slice. + + >>> print(df.iloc[3:10:2]) a b c 3 3 3 3 5 5 5 5 7 7 7 7 9 9 9 9 + + Select both rows and columns. + + >>> print(df.iloc[[1, 3, 5, 7], 2]) + 1 1 + 3 3 + 5 5 + 7 7 + Name: c, dtype: int64 + + Setting values in a column using iloc. + + >>> df.iloc[:4] = 0 + >>> print(df) + a b c + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 0 0 0 + 4 4 4 4 + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + [10 more rows] + + See also + -------- + DataFrame.loc """ return _DataFrameIlocIndexer(self) + def iat(self): + """ + Alias for ``DataFrame.iloc``; provided for compatibility with Pandas. + """ + return self.iloc + + def at(self): + """ + Alias for ``DataFrame.loc``; provided for compatibility with Pandas. + """ + return self.loc + @property def columns(self): """Returns a tuple of columns @@ -1106,24 +1173,13 @@ def reset_index(self, drop=False): def take(self, positions, ignore_index=False): out = DataFrame() - if self._cols: - positions = columnops.as_column(positions).astype("int32").data.mem - cols = [s._column for s in self._cols.values()] - result_cols = cpp_copying.apply_gather(cols, positions) for i, col_name in enumerate(self._cols): - out[col_name] = result_cols[i] - - if isinstance(self.columns, cudf.MultiIndex): - out.columns = self.columns - + out[col_name] = self[col_name][positions] if ignore_index: out.index = RangeIndex(len(out)) - elif len(out) == 0: - out = out.set_index(self.index.take(positions)) else: - out.index = self.index.take(positions) - + out._index = self.index.take(positions) return out def _take_columns(self, positions): @@ -1418,7 +1474,7 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): ] in_index = self.index if isinstance(in_index, cudf.dataframe.multiindex.MultiIndex): - in_index = RangeIndex(len(in_index)) + in_index = RangeIndex(len(in_index), name=in_index.name) out_cols, new_index = cpp_drop_duplicates( [in_index.as_column()], in_cols, subset_cols, keep ) diff --git a/python/cudf/cudf/dataframe/datetime.py b/python/cudf/cudf/dataframe/datetime.py index bb7fd6f8557..ee623c5f02b 100644 --- a/python/cudf/cudf/dataframe/datetime.py +++ b/python/cudf/cudf/dataframe/datetime.py @@ -17,7 +17,7 @@ from cudf.dataframe import columnops from cudf.dataframe.buffer import Buffer from cudf.utils import utils -from cudf.utils.utils import is_single_value +from cudf.utils.utils import is_scalar # nanoseconds per time_unit _numpy_to_pandas_conversion = { @@ -224,7 +224,7 @@ def default_na_value(self): ) def fillna(self, fill_value, inplace=False): - if is_single_value(fill_value): + if is_scalar(fill_value): fill_value = np.datetime64(fill_value, self.time_unit) else: fill_value = columnops.as_column(fill_value, nan_as_null=False) @@ -236,7 +236,7 @@ def fillna(self, fill_value, inplace=False): def sort_by_values(self, ascending=True, na_position="last"): col_inds = get_sorted_inds(self, ascending, na_position) - col_keys = cpp_copying.apply_gather_column(self, col_inds.data.mem) + col_keys = cpp_copying.apply_gather(self, col_inds) col_inds.name = self.name return col_keys, col_inds @@ -274,7 +274,7 @@ def unique(self, method="sort"): raise NotImplementedError(msg) segs, sortedvals = self._unique_segments() # gather result - out_col = cpp_copying.apply_gather_array(sortedvals, segs) + out_col = cpp_copying.apply_gather(sortedvals, segs) return out_col @property diff --git a/python/cudf/cudf/dataframe/index.py b/python/cudf/cudf/dataframe/index.py index d58e26aa78c..acb87861adf 100644 --- a/python/cudf/cudf/dataframe/index.py +++ b/python/cudf/cudf/dataframe/index.py @@ -7,13 +7,11 @@ import numpy as np import pandas as pd -from numba.cuda.cudadrv.devicearray import DeviceNDArray import nvstrings from librmm_cffi import librmm as rmm import cudf -import cudf.bindings.copying as cpp_copying from cudf.comm.serialize import register_distributed_serializer from cudf.dataframe import columnops from cudf.dataframe.buffer import Buffer @@ -22,7 +20,6 @@ from cudf.dataframe.datetime import DatetimeColumn from cudf.dataframe.numerical import NumericalColumn from cudf.dataframe.string import StringColumn -from cudf.indexing import _IndexLocIndexer from cudf.utils import cudautils, ioutils, utils from cudf.utils.dtypes import is_categorical_dtype @@ -68,18 +65,12 @@ def take(self, indices): --- indices: An array-like that maps to values contained in this Index. """ - # Gather - indices = columnops.as_column(indices) - index = cpp_copying.apply_gather_array( - self.gpu_values, indices.data.mem - ) - col = self.as_column().replace(data=index.data) - new_index = col - new_index.name = self.name - return new_index + return self[indices] def argsort(self, ascending=True): - return self.as_column().argsort(ascending=ascending) + indices = self.as_column().argsort(ascending=ascending) + indices.name = self.name + return indices @property def values(self): @@ -301,9 +292,6 @@ def to_series(self): return Series(self._values) @property - def loc(self): - return _IndexLocIndexer(self) - @property def is_unique(self): raise (NotImplementedError) @@ -435,10 +423,12 @@ def __getitem__(self, index): index = np.array(index) index = rmm.to_device(index) - if isinstance(index, (DeviceNDArray)): - return self.take(index) else: - raise ValueError(index) + if pd.api.types.is_scalar(index): + index = utils.min_signed_type(index)(index) + index = columnops.as_column(index).data.mem + + return as_index(self.as_column()[index], name=self.name) def __eq__(self, other): return super(type(self), self).__eq__(other) @@ -494,7 +484,9 @@ def as_column(self): vals = cudautils.arange(self._start, self._stop, dtype=self.dtype) else: vals = rmm.device_array(0, dtype=self.dtype) - return NumericalColumn(data=Buffer(vals), dtype=vals.dtype) + return NumericalColumn( + data=Buffer(vals), dtype=vals.dtype, name=self.name + ) def to_gpu_array(self): return self.as_column().to_gpu_array() @@ -607,16 +599,19 @@ def __repr__(self): ) def __getitem__(self, index): - res = self._values[index] + res = self.as_column()[index] if not isinstance(index, int): - return as_index(res) + res = as_index(res) + return res else: return res def as_column(self): """Convert the index as a Series. """ - return self._values + col = self._values + col.name = self.name + return col @property def dtype(self): @@ -811,7 +806,6 @@ def __init__(self, values, name=None): self._values = columnops.build_column( nvstrings.to_device(values), dtype="object" ) - assert self._values.null_count == 0 self.name = name def to_pandas(self): @@ -819,7 +813,7 @@ def to_pandas(self): return result def take(self, indices): - return self._values.element_indexing(indices) + return self._values[indices] def __repr__(self): return ( diff --git a/python/cudf/cudf/dataframe/numerical.py b/python/cudf/cudf/dataframe/numerical.py index 23e0fc6fc7a..25a4a546263 100644 --- a/python/cudf/cudf/dataframe/numerical.py +++ b/python/cudf/cudf/dataframe/numerical.py @@ -154,7 +154,7 @@ def as_numerical_column(self, dtype, **kwargs): def sort_by_values(self, ascending=True, na_position="last"): sort_inds = get_sorted_inds(self, ascending, na_position) - col_keys = cpp_copying.apply_gather_column(self, sort_inds.data.mem) + col_keys = cpp_copying.apply_gather(self, sort_inds.data.mem) col_inds = self.replace( data=sort_inds.data, mask=sort_inds.mask, @@ -200,7 +200,7 @@ def unique(self, method="sort"): raise NotImplementedError(msg) segs, sortedvals = self._unique_segments() # gather result - out_col = cpp_copying.apply_gather_array(sortedvals, segs) + out_col = cpp_copying.apply_gather(sortedvals, segs) return out_col def all(self): diff --git a/python/cudf/cudf/dataframe/series.py b/python/cudf/cudf/dataframe/series.py index e5f55b4a0ce..a98884c575c 100644 --- a/python/cudf/cudf/dataframe/series.py +++ b/python/cudf/cudf/dataframe/series.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from numba.cuda.cudadrv.devicearray import DeviceNDArray from pandas.api.types import is_dict_like, is_scalar from librmm_cffi import librmm as rmm @@ -20,7 +19,6 @@ ) from cudf.comm.serialize import register_distributed_serializer from cudf.dataframe import columnops -from cudf.dataframe.buffer import Buffer from cudf.dataframe.column import Column from cudf.dataframe.datetime import DatetimeColumn from cudf.dataframe.index import Index, RangeIndex, as_index @@ -361,66 +359,39 @@ def empty(self): return not len(self) def __getitem__(self, arg): - if isinstance( - arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray) - ): - if len(arg) == 0: - arg = Series(np.array([], dtype="int32")) - else: - arg = Series(arg) - if isinstance(arg, Series): - if issubclass(arg.dtype.type, np.integer): - maps = columnops.as_column(arg).data.mem - index = self.index.take(maps) - selvals = self._column.take(maps) - elif arg.dtype in [np.bool, np.bool_]: - selvals = self._column.apply_boolean_mask(arg) - index = self.index.as_column().apply_boolean_mask(arg) - else: - raise NotImplementedError(arg.dtype) - return self._copy_construct(data=selvals, index=index) - elif isinstance(arg, slice): - index = self.index[arg] # slice index - col = self._column[arg] # slice column - return self._copy_construct(data=col, index=index) - elif isinstance(arg, Number): - # The following triggers a IndexError if out-of-bound - return self._column.element_indexing(arg) + data = self._column[arg] + index = self.index.take(arg) + if utils.is_scalar(data) or data is None: + return data + return self._copy_construct(data=data, index=index) + + def __setitem__(self, key, value): + # coerce value into a scalar or column + if utils.is_scalar(value): + value = utils.to_cudf_compatible_scalar(value) else: - raise NotImplementedError(type(arg)) + value = columnops.as_column(value) + + if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype( + value.dtype + ): + # normalize types if necessary: + if not pd.api.types.is_integer(key): + to_dtype = np.result_type(value.dtype, self._column.dtype) + value = value.astype(to_dtype) + self._column = self._column.astype(to_dtype) + + self._column[key] = value def take(self, indices, ignore_index=False): """Return Series by taking values from the corresponding *indices*. """ - from cudf import Series - - if isinstance(indices, Series): - indices = indices.to_gpu_array() - else: - indices = Buffer(indices).to_gpu_array() - # Handle zero size - if indices.size == 0: - return self._copy_construct( - data=self.data[:0], index=self.index[:0] - ) - - if self.dtype == np.dtype("object"): - return self[indices] - - col = cpp_copying.apply_gather_array(self.data.to_gpu_array(), indices) - - if self._column.mask: - mask = self._get_mask_as_series().take(indices).as_mask() - mask = Buffer(mask) - else: - mask = None + result = self[indices] if ignore_index: - index = RangeIndex(indices.size) + index = RangeIndex(len(result)) + return result._copy_construct(index=index) else: - index = self.index.take(indices) - - col = self._column.replace(data=col.data, mask=mask) - return self._copy_construct(data=col, index=index) + return result def _get_mask_as_series(self): mask = Series(cudautils.ones(len(self), dtype=np.bool)) @@ -1075,26 +1046,6 @@ def has_null_mask(self): """A boolean indicating whether a null-mask is needed""" return self._column.has_null_mask - def masked_assign(self, value, mask): - """Assign a scalar value to a series using a boolean mask - df[df < 0] = 0 - - Parameters - ---------- - value : scalar - scalar value for assignment - mask : cudf Series - Boolean Series - - Returns - ------- - cudf Series - cudf series with new value set to where mask is True - """ - - data = self._column.masked_assign(value, mask) - return self._copy_construct(data=data) - def drop_duplicates(self, keep="first", inplace=False): """ Return Series with duplicate values removed @@ -1357,42 +1308,19 @@ def index(self, _index): @property def loc(self): + """ + Select values by label. + + See DataFrame.loc + """ return _SeriesLocIndexer(self) @property def iloc(self): """ - For integer-location based selection. - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(list(range(20))) - - Get the value from 1st index - - >>> sr.iloc[1] - 1 - - Get the values from 0,2,9 and 18th index + Select values by position. - >>> sr.iloc[0,2,9,18] - 0 0 - 2 2 - 9 9 - 18 18 - - Get the values using slice indices - - >>> sr.iloc[3:10:2] - 3 3 - 5 5 - 7 7 - 9 9 - - Returns - ------- - Series containing the elements corresponding to the indices + See DataFrame.iloc """ return _SeriesIlocIndexer(self) @@ -1601,8 +1529,8 @@ def reverse(self): rinds = cudautils.arange_reversed( self._column.data.size, dtype=np.int32 ) - col = cpp_copying.apply_gather_column(self._column, rinds) - index = cpp_copying.apply_gather_array(self.index.gpu_values, rinds) + col = cpp_copying.apply_gather(self._column, rinds) + index = cpp_copying.apply_gather(self.index.as_column(), rinds) return self._copy_construct(data=col, index=index) def one_hot_encoding(self, cats, dtype="float64"): diff --git a/python/cudf/cudf/dataframe/string.py b/python/cudf/cudf/dataframe/string.py index a1112c58a7d..04d4fc683e8 100644 --- a/python/cudf/cudf/dataframe/string.py +++ b/python/cudf/cudf/dataframe/string.py @@ -1,12 +1,10 @@ # Copyright (c) 2019, NVIDIA CORPORATION. import warnings -from numbers import Number import numpy as np import pandas as pd import pyarrow as pa -from numba.cuda.cudadrv.devicearray import DeviceNDArray import nvstrings from librmm_cffi import librmm as rmm @@ -17,7 +15,7 @@ from cudf.comm.serialize import register_distributed_serializer from cudf.dataframe import column, columnops from cudf.dataframe.buffer import Buffer -from cudf.utils import cudautils, utils +from cudf.utils import utils _str_to_numeric_typecast_functions = { np.dtype("int32"): nvstrings.nvstrings.stoi, @@ -512,45 +510,6 @@ def indices(self): self._indices = Buffer(out_dev_arr) return self._indices - def element_indexing(self, arg): - from cudf.dataframe.numerical import NumericalColumn - - if isinstance(arg, Number): - arg = int(arg) - if arg < 0: - arg = len(self) + arg - if arg > (len(self) - 1): - raise IndexError - out = self._data[arg].to_host()[0] - return out - elif isinstance(arg, slice): - out = self._data[arg] - elif isinstance(arg, list): - out = self._data[arg] - elif isinstance(arg, np.ndarray): - gpu_arr = rmm.to_device(arg) - return self.element_indexing(gpu_arr) - elif isinstance(arg, DeviceNDArray): - # NVStrings gather call expects an array of int32s - import cudf.bindings.typecast as typecast - - arg = typecast.apply_cast(columnops.as_column(arg), dtype=np.int32) - arg = cudautils.modulo(arg.data.mem, len(self)) - if len(arg) > 0: - gpu_ptr = get_ctype_ptr(arg) - out = self._data.gather(gpu_ptr, len(arg)) - else: - out = self._data.gather([]) - elif isinstance(arg, NumericalColumn): - return self.element_indexing(arg.data.mem) - else: - raise NotImplementedError(type(arg)) - - return columnops.as_column(out) - - def __getitem__(self, arg): - return self.element_indexing(arg) - def as_numerical_column(self, dtype, **kwargs): mem_dtype = np.dtype(dtype) @@ -794,9 +753,6 @@ def normalize_binop_value(self, other): def default_na_value(self): return None - def take(self, indices): - return self.element_indexing(indices) - def binary_operator(self, binop, rhs, reflect=False): lhs = self if reflect: diff --git a/python/cudf/cudf/groupby/groupby.py b/python/cudf/cudf/groupby/groupby.py index 5f557060904..3440f2dbd8e 100644 --- a/python/cudf/cudf/groupby/groupby.py +++ b/python/cudf/cudf/groupby/groupby.py @@ -7,7 +7,7 @@ from cudf import MultiIndex from cudf.bindings.groupby import apply_groupby as cpp_apply_groupby from cudf.bindings.nvtx import nvtx_range_pop -from cudf.utils.utils import is_single_value +from cudf.utils.utils import is_scalar def columns_from_dataframe(df): @@ -74,7 +74,7 @@ def _apply_aggregation(self, agg): return result def __getitem__(self, arg): - if is_single_value(arg): + if is_scalar(arg): return self.__getattr__(arg) else: arg = list(arg) @@ -159,7 +159,7 @@ def key_from_by(self, by): """ Get (key_name, key_column) pair from a single *by* argument """ - if is_single_value(by): + if is_scalar(by): key_name = by key_column = self.obj[by]._column else: diff --git a/python/cudf/cudf/indexing.py b/python/cudf/cudf/indexing.py index 6f1294f00c0..559517140a4 100755 --- a/python/cudf/cudf/indexing.py +++ b/python/cudf/cudf/indexing.py @@ -3,7 +3,44 @@ from numba.cuda.cudadrv.devicearray import DeviceNDArray import cudf -from cudf.utils.utils import is_single_value +from cudf.utils.cudautils import arange +from cudf.utils.dtypes import is_categorical_dtype +from cudf.utils.utils import is_scalar + + +def indices_from_labels(obj, labels): + from cudf.dataframe import columnops + + labels = columnops.as_column(labels) + + if is_categorical_dtype(obj.index): + labels = labels.astype("category") + labels._data = labels.data.astype(obj.index._values.data.dtype) + else: + labels = labels.astype(obj.index.dtype) + + lhs = cudf.DataFrame({}, index=labels) + rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index) + return lhs.join(rhs)["_"] + + +class _SeriesIlocIndexer(object): + """ + For integer-location based selection. + """ + + def __init__(self, sr): + self._sr = sr + + def __getitem__(self, arg): + if isinstance(arg, tuple): + arg = list(arg) + return self._sr[arg] + + def __setitem__(self, key, value): + if isinstance(key, tuple): + key = list(key) + self._sr[key] = value class _SeriesLocIndexer(object): @@ -15,8 +52,16 @@ def __init__(self, sr): self._sr = sr def __getitem__(self, arg): + arg = self._loc_to_iloc(arg) + return self._sr.iloc[arg] + + def __setitem__(self, key, value): + key = self._loc_to_iloc(key) + self._sr.iloc[key] = value + + def _loc_to_iloc(self, arg): from cudf.dataframe.series import Series - from cudf.dataframe.index import Index, RangeIndex + from cudf.dataframe.index import Index if isinstance( arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray) @@ -27,27 +72,17 @@ def __getitem__(self, arg): arg = Series(arg) if isinstance(arg, Series): if arg.dtype in [np.bool, np.bool_]: - return self._sr.iloc[arg] - # To do this efficiently we need a solution to - # https://github.com/rapidsai/cudf/issues/1087 - out = Series( - [], - dtype=self._sr.dtype, - index=self._sr.index.__class__(start=0) - if isinstance(self._sr.index, RangeIndex) - else self._sr.index.__class__([]), - ) - for s in arg: - out = out.append(self._sr.loc[s:s], ignore_index=False) - return out - elif is_single_value(arg): + return arg + else: + return indices_from_labels(self._sr, arg) + elif is_scalar(arg): found_index = self._sr.index.find_label_range(arg, None)[0] - return self._sr.iloc[found_index] + return found_index elif isinstance(arg, slice): start_index, stop_index = self._sr.index.find_label_range( arg.start, arg.stop ) - return self._sr.iloc[start_index : stop_index : arg.step] + return slice(start_index, stop_index, arg.step) else: raise NotImplementedError( ".loc not implemented for label type {}".format( @@ -56,20 +91,6 @@ def __getitem__(self, arg): ) -class _SeriesIlocIndexer(object): - """ - For integer-location based selection. - """ - - def __init__(self, sr): - self._sr = sr - - def __getitem__(self, arg): - if isinstance(arg, tuple): - arg = list(arg) - return self._sr[arg] - - class _DataFrameIndexer(object): def __getitem__(self, arg): from cudf import MultiIndex @@ -88,6 +109,11 @@ def __getitem__(self, arg): arg = (arg, slice(None)) return self._getitem_tuple_arg(arg) + def __setitem__(self, key, value): + if not isinstance(key, tuple): + key = (key, slice(None)) + return self._setitem_tuple_arg(key, value) + def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -100,7 +126,7 @@ def _can_downcast_to_series(self, df, arg): nrows, ncols = df.shape if nrows == 1: if type(arg[0]) is slice: - if not is_single_value(arg[1]): + if not is_scalar(arg[1]): return False dtypes = df.dtypes.values.tolist() all_numeric = all( @@ -111,7 +137,7 @@ def _can_downcast_to_series(self, df, arg): return True if ncols == 1: if type(arg[1]) is slice: - if not is_single_value(arg[0]): + if not is_scalar(arg[0]): return False return True return False @@ -124,7 +150,7 @@ def _downcast_to_series(self, df, arg): nrows, ncols = df.shape # determine the axis along which the Series is taken: if nrows == 1 and ncols == 1: - if not is_single_value(arg[0]): + if not is_scalar(arg[0]): axis = 1 else: axis = 0 @@ -198,8 +224,22 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) return df + def _setitem_tuple_arg(self, key, value): + if isinstance(self._df.index, cudf.MultiIndex) or isinstance( + self._df.columns, cudf.MultiIndex + ): + raise NotImplementedError( + "Setting values using df.loc[] not supported on " + "DataFrames with a MultiIndex" + ) + + columns = self._get_column_selection(key[1]) + + for col in columns: + self._df[col].loc[key[0]] = value + def _get_column_selection(self, arg): - if is_single_value(arg): + if is_scalar(arg): return [arg] elif isinstance(arg, slice): @@ -313,13 +353,19 @@ def _getitem_tuple_arg(self, arg): df._index = RangeIndex(start, stop) return df + def _setitem_tuple_arg(self, key, value): + columns = self._get_column_selection(key[1]) + + for col in columns: + self._df[col].iloc[key[0]] = value + def _getitem_scalar(self, arg): col = self._df.columns[arg[1]] return self._df[col].iloc[arg[0]] def _get_column_selection(self, arg): cols = self._df.columns - if is_single_value(arg): + if is_scalar(arg): return [cols[arg]] else: return cols[arg] @@ -331,13 +377,3 @@ def _normalize_dtypes(df): for name, col in df._cols.items(): df[name] = col.astype(normalized_dtype) return df - - -class _IndexLocIndexer(object): - def __init__(self, idx): - self.idx = idx - - def __getitem__(self, arg): - from cudf.dataframe.index import as_index - - return as_index(self.idx.to_series().loc[arg]) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 84410de5af7..50977e4a340 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -14,7 +14,7 @@ def test_gather_single_col(): device_gather_map = rmm.to_device(gather_map) - out = cpp_copying.apply_gather_column(col, device_gather_map) + out = cpp_copying.apply_gather(col, device_gather_map) np.testing.assert_array_equal(out.to_array(), gather_map) @@ -39,10 +39,10 @@ def test_gather_cols(): def test_gather_string_col(): col = columnops.as_column(["a", "b", "c", "d"]) gather_map = columnops.as_column([0, 2, 3], dtype="int32").data.mem - result = cpp_copying.apply_gather([col], gather_map) - assert result[0].data.to_host() == ["a", "c", "d"] + result = cpp_copying.apply_gather(col, gather_map) + assert result.data.to_host() == ["a", "c", "d"] col = columnops.as_column(["a", "b", None, "d"]) gather_map = columnops.as_column([0, 2, 3], dtype="int32").data.mem - result = cpp_copying.apply_gather([col], gather_map) - assert result[0].data.to_host() == ["a", None, "d"] + result = cpp_copying.apply_gather(col, gather_map) + assert result.data.to_host() == ["a", None, "d"] diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index 1e7f559034c..8e5ad237d27 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -11,7 +11,7 @@ from cudf import DataFrame -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) +@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) def test_df_apply_rows(nelem): def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i, (x, y, z) in enumerate(zip(in1, in2, in3)): @@ -43,7 +43,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2, expect_out2) -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) +@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) @pytest.mark.parametrize("chunksize", [1, 2, 3, 4, 23]) def test_df_apply_chunks(nelem, chunksize): def kernel(in1, in2, in3, out1, out2, extra1, extra2): @@ -77,7 +77,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2, expect_out2) -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 1000]) +@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) def test_df_apply_custom_chunks(nelem): def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i, (x, y, z) in enumerate(zip(in1, in2, in3)): @@ -116,7 +116,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2, expect_out2) -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 1000]) +@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @pytest.mark.parametrize("tpb", [1, 8, 16, 64]) def test_df_apply_custom_chunks_tpb(nelem, tpb): def kernel(in1, in2, in3, out1, out2, extra1, extra2): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 301cc8d488a..8820040ced8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2,7 +2,6 @@ import array as arr import operator -from itertools import combinations import numpy as np import pandas as pd @@ -134,81 +133,6 @@ def test_series_append(): np.testing.assert_equal(series.to_array(), np.hstack([a6, a5])) -index_dtypes = [np.int64, np.int32, np.int16, np.int8] - - -@pytest.mark.parametrize( - "i1, i2, i3", - ( - [ - (slice(None, 12), slice(3, None), slice(None, None, 2)), - (range(12), range(3, 12), range(0, 9, 2)), - (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)), - (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))), - ( - pd.Series(range(12)), - pd.Series(range(3, 12)), - pd.Series(range(0, 9, 2)), - ), - (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))), - ( - [i in range(12) for i in range(20)], - [i in range(3, 12) for i in range(12)], - [i in range(0, 9, 2) for i in range(9)], - ), - ( - np.array([i in range(12) for i in range(20)], dtype=bool), - np.array([i in range(3, 12) for i in range(12)], dtype=bool), - np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool), - ), - ] - + [ - ( - np.arange(12, dtype=t), - np.arange(3, 12, dtype=t), - np.arange(0, 9, 2, dtype=t), - ) - for t in index_dtypes - ] - ), - ids=( - [ - "slice", - "range", - "numpy.array", - "list", - "pandas.Series", - "Series", - "list[bool]", - "numpy.array[bool]", - ] - + ["numpy.array[%s]" % t.__name__ for t in index_dtypes] - ), -) -def test_series_indexing(i1, i2, i3): - a1 = np.arange(20) - series = Series(a1) - # Indexing - sr1 = series[i1] - assert sr1.null_count == 0 - np.testing.assert_equal(sr1.to_array(), a1[:12]) - sr2 = sr1[i2] - assert sr2.null_count == 0 - np.testing.assert_equal(sr2.to_array(), a1[3:12]) - # Index with stride - sr3 = sr2[i3] - assert sr3.null_count == 0 - np.testing.assert_equal(sr3.to_array(), a1[3:12:2]) - - # Integer indexing - if isinstance(i1, range): - for i in i1: # Python int-s - assert series[i] == a1[i] - if isinstance(i1, np.ndarray) and i1.dtype in index_dtypes: - for i in i1: # numpy integers - assert series[i] == a1[i] - - def test_series_init_none(): # test for creating empty series @@ -276,44 +200,6 @@ def test_dataframe_basic(): np.testing.assert_equal(data, df_tup[(1, "foobar")].to_array()) -def test_dataframe_column_name_indexing(): - df = DataFrame() - data = np.asarray(range(10), dtype=np.int32) - df["a"] = data - df[1] = data - np.testing.assert_equal( - df["a"].to_array(), np.asarray(range(10), dtype=np.int32) - ) - np.testing.assert_equal( - df[1].to_array(), np.asarray(range(10), dtype=np.int32) - ) - - pdf = pd.DataFrame() - nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) - df = DataFrame.from_pandas(pdf) - - assert_eq(df[df.columns], df) - assert_eq(df[df.columns[:1]], df[["key1"]]) - - for i in range(1, len(pdf.columns) + 1): - for idx in combinations(pdf.columns, i): - assert pdf[list(idx)].equals(df[list(idx)].to_pandas()) - - # test for only numeric columns - df = pd.DataFrame() - for i in range(0, 10): - df[i] = range(nelem) - gdf = DataFrame.from_pandas(df) - assert_eq(gdf, df) - - assert_eq(gdf[gdf.columns], gdf) - assert_eq(gdf[gdf.columns[:3]], gdf[[0, 1, 2]]) - - def test_dataframe_drop_method(): df = DataFrame() data = np.asarray(range(10)) @@ -417,337 +303,6 @@ def test_index_astype(nelem): np.testing.assert_equal(df.index.to_array(), df["a"].to_array()) -def test_dataframe_slicing(): - df = DataFrame() - size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) - - # Row slice first 10 - first_10 = df[:10] - assert len(first_10) == 10 - assert tuple(first_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(first_10["a"].to_array(), ha[:10]) - np.testing.assert_equal(first_10["b"].to_array(), hb[:10]) - np.testing.assert_equal(first_10["c"].to_array(), hc[:10]) - np.testing.assert_equal(first_10["d"].to_array(), hd[:10]) - del first_10 - - # Row slice last 10 - last_10 = df[-10:] - assert len(last_10) == 10 - assert tuple(last_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(last_10["a"].to_array(), ha[-10:]) - np.testing.assert_equal(last_10["b"].to_array(), hb[-10:]) - np.testing.assert_equal(last_10["c"].to_array(), hc[-10:]) - np.testing.assert_equal(last_10["d"].to_array(), hd[-10:]) - del last_10 - - # Row slice [begin:end] - begin = 7 - end = 121 - subrange = df[begin:end] - assert len(subrange) == end - begin - assert tuple(subrange.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(subrange["a"].to_array(), ha[begin:end]) - np.testing.assert_equal(subrange["b"].to_array(), hb[begin:end]) - np.testing.assert_equal(subrange["c"].to_array(), hc[begin:end]) - np.testing.assert_equal(subrange["d"].to_array(), hd[begin:end]) - del subrange - - -@pytest.mark.parametrize("step", [1, 2, 5]) -@pytest.mark.parametrize("scalar", [0, 20, 100]) -def test_dataframe_loc(scalar, step): - size = 123 - pdf = pd.DataFrame( - { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), - } - ) - - df = DataFrame.from_pandas(pdf) - - # Scalar label - assert_eq(df.loc[scalar], pdf.loc[scalar]) - - # Full slice - assert_eq(df.loc[:, "c"], pdf.loc[:, "c"]) - - begin = 110 - end = 122 - - assert_eq( - df.loc[begin:end:step, ["c", "d", "a"]], - pdf.loc[begin:end:step, ["c", "d", "a"]], - ) - - assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]]) - - # Slicing on columns: - assert_eq( - df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"] - ) - - # Slicing of size 1: - assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"]) - - # TODO: Pandas changes the dtype here when it shouldn't - assert_eq( - df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False - ) - - # Make int64 index - offset = 50 - df2 = df[offset:] - pdf2 = pdf[offset:] - begin = 117 - end = 122 - assert_eq( - df2.loc[begin:end, ["c", "d", "a"]], - pdf2.loc[begin:end, ["c", "d", "a"]], - ) - - -@pytest.mark.xfail(raises=IndexError, reason="label scalar is out of bound") -def test_dataframe_loc_outbound(): - df = DataFrame() - size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - np.testing.assert_equal(df.loc[11].to_array(), pdf.loc[11]) - - -def test_series_loc_numerical(): - ps = pd.Series([1, 2, 3, 4, 5], index=[5, 6, 7, 8, 9]) - gs = Series.from_pandas(ps) - - assert_eq(ps.loc[5], gs.loc[5]) - assert_eq(ps.loc[6], gs.loc[6]) - assert_eq(ps.loc[6:8], gs.loc[6:8]) - assert_eq(ps.loc[:8], gs.loc[:8]) - assert_eq(ps.loc[6:], gs.loc[6:]) - assert_eq(ps.loc[::2], gs.loc[::2]) - assert_eq(ps.loc[[5, 8, 9]], gs.loc[[5, 8, 9]]) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -def test_series_loc_string(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] - ) - gs = Series.from_pandas(ps) - - assert_eq(ps.loc["one"], gs.loc["one"]) - assert_eq(ps.loc["five"], gs.loc["five"]) - assert_eq(ps.loc["two":"four"], gs.loc["two":"four"]) - assert_eq(ps.loc[:"four"], gs.loc[:"four"]) - assert_eq(ps.loc["two":], gs.loc["two":]) - assert_eq(ps.loc[::2], gs.loc[::2]) - assert_eq(ps.loc[["one", "four", "five"]], gs.loc[["one", "four", "five"]]) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -def test_series_loc_datetime(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") - ) - gs = Series.from_pandas(ps) - - # a few different ways of specifying a datetime label: - assert_eq(ps.loc["20010101"], gs.loc["20010101"]) - assert_eq(ps.loc["2001-01-01"], gs.loc["2001-01-01"]) - assert_eq( - ps.loc[pd.to_datetime("2001-01-01")], - gs.loc[pd.to_datetime("2001-01-01")], - ) - assert_eq( - ps.loc[np.datetime64("2001-01-01")], - gs.loc[np.datetime64("2001-01-01")], - ) - - assert_eq( - ps.loc["2001-01-02":"2001-01-05"], gs.loc["2001-01-02":"2001-01-05"] - ) - assert_eq(ps.loc["2001-01-02":], gs.loc["2001-01-02":]) - assert_eq(ps.loc[:"2001-01-04"], gs.loc[:"2001-01-04"]) - assert_eq(ps.loc[::2], gs.loc[::2]) - # - # assert_eq(ps.loc[['2001-01-01', '2001-01-04', '2001-01-05']], - # gs.loc[['2001-01-01', '2001-01-04', '2001-01-05']]) - # looks like a bug in Pandas doesn't let us check for the above, - # so instead: - assert_eq( - ps.loc[ - [ - pd.to_datetime("2001-01-01"), - pd.to_datetime("2001-01-04"), - pd.to_datetime("2001-01-05"), - ] - ], - gs.loc[ - [ - pd.to_datetime("2001-01-01"), - pd.to_datetime("2001-01-04"), - pd.to_datetime("2001-01-05"), - ] - ], - ) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -def test_series_loc_categorical(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) - ) - gs = Series.from_pandas(ps) - - assert_eq(ps.loc["a"], gs.loc["a"]) - assert_eq(ps.loc["e"], gs.loc["e"]) - assert_eq(ps.loc["b":"d"], gs.loc["b":"d"]) - assert_eq(ps.loc[:"d"], gs.loc[:"d"]) - assert_eq(ps.loc["b":], gs.loc["b":]) - assert_eq(ps.loc[::2], gs.loc[::2]) - - # order of categories changes, so we can only - # compare values: - assert_eq( - ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_array() - ) - - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) -def test_series_iloc(nelem): - - # create random series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) - - # gpu series - gs = Series(ps) - - # positive tests for indexing - np.testing.assert_allclose(gs.iloc[-1 * nelem], ps.iloc[-1 * nelem]) - np.testing.assert_allclose(gs.iloc[-1], ps.iloc[-1]) - np.testing.assert_allclose(gs.iloc[0], ps.iloc[0]) - np.testing.assert_allclose(gs.iloc[1], ps.iloc[1]) - np.testing.assert_allclose(gs.iloc[nelem - 1], ps.iloc[nelem - 1]) - - # positive tests for slice - np.testing.assert_allclose(gs.iloc[-1:1], ps.iloc[-1:1]) - np.testing.assert_allclose( - gs.iloc[nelem - 1 : -1], ps.iloc[nelem - 1 : -1] - ) - np.testing.assert_allclose(gs.iloc[0 : nelem - 1], ps.iloc[0 : nelem - 1]) - np.testing.assert_allclose(gs.iloc[0:nelem], ps.iloc[0:nelem]) - np.testing.assert_allclose(gs.iloc[1:1], ps.iloc[1:1]) - np.testing.assert_allclose(gs.iloc[1:2], ps.iloc[1:2]) - np.testing.assert_allclose( - gs.iloc[nelem - 1 : nelem + 1], ps.iloc[nelem - 1 : nelem + 1] - ) - np.testing.assert_allclose( - gs.iloc[nelem : nelem * 2], ps.iloc[nelem : nelem * 2] - ) - - -@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) -def test_dataframe_iloc(nelem): - gdf = DataFrame() - - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - assert_eq(gdf.iloc[-1:1], pdf.iloc[-1:1]) - assert_eq(gdf.iloc[nelem - 1 : -1], pdf.iloc[nelem - 1 : -1]) - assert_eq(gdf.iloc[0 : nelem - 1], pdf.iloc[0 : nelem - 1]) - assert_eq(gdf.iloc[0:nelem], pdf.iloc[0:nelem]) - assert_eq(gdf.iloc[1:1], pdf.iloc[1:1]) - assert_eq(gdf.iloc[1:2], pdf.iloc[1:2]) - assert_eq(gdf.iloc[nelem - 1 : nelem + 1], pdf.iloc[nelem - 1 : nelem + 1]) - assert_eq(gdf.iloc[nelem : nelem * 2], pdf.iloc[nelem : nelem * 2]) - assert_eq(gdf.iloc[-1 * nelem], pdf.iloc[-1 * nelem]) - assert_eq(gdf.iloc[-1], pdf.iloc[-1]) - assert_eq(gdf.iloc[0], pdf.iloc[0]) - assert_eq(gdf.iloc[1], pdf.iloc[1]) - assert_eq(gdf.iloc[nelem - 1], pdf.iloc[nelem - 1]) - - -def test_dataframe_iloc_tuple(): - gdf = DataFrame() - nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - got = gdf.iloc[1, [1]] - expect = pdf.iloc[1, [1]] - - assert_eq(got, expect, check_dtype=False) - - -@pytest.mark.xfail( - raises=IndexError, reason="positional indexers are out-of-bounds" -) -def test_dataframe_iloc_index_error(): - gdf = DataFrame() - nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - def assert_col(g, p): - np.testing.assert_equal(g["a"].to_array(), p["a"]) - np.testing.assert_equal(g["b"].to_array(), p["b"]) - - assert_col(gdf.iloc[nelem * 2], pdf.iloc[nelem * 2]) - - def test_dataframe_to_string(): with set_options(formatting={"nrows": 5, "ncols": 8}): # Test basic @@ -991,33 +546,6 @@ def test_dataframe_as_gpu_matrix_null_values(): np.testing.assert_array_equal(refvalues[k], mat[:, i]) -@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) -def test_dataframe_take(ntake): - np.random.seed(0) - df = DataFrame() - - nelem = 123 - df["ii"] = ii = np.random.randint(0, 20, nelem) - df["ff"] = ff = np.random.random(nelem) - - take_indices = np.random.randint(0, len(df), ntake) - - def check(**kwargs): - out = df.take(take_indices, **kwargs) - assert len(out) == ntake - assert out.ii.null_count == 0 - assert out.ff.null_count == 0 - np.testing.assert_array_equal(out.ii.to_array(), ii[take_indices]) - np.testing.assert_array_equal(out.ff.to_array(), ff[take_indices]) - if kwargs.get("ignore_index"): - np.testing.assert_array_equal(out.index, np.arange(ntake)) - else: - np.testing.assert_array_equal(out.index, take_indices) - - check() - check(ignore_index=True) - - def test_dataframe_append_empty(): pdf = pd.DataFrame( { @@ -1299,25 +827,6 @@ def test_nonmatching_index_setitem(nrows): ) -@pytest.mark.parametrize("nelem", [0, 1, 5, 20, 100]) -@pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) -@pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) -def test_dataframe_masked_slicing(nelem, slice_start, slice_end): - gdf = DataFrame() - gdf["a"] = list(range(nelem)) - gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) - - def do_slice(x): - return x[slice_start:slice_end] - - expect = do_slice(gdf.to_pandas()) - got = do_slice(gdf).to_pandas() - - pd.testing.assert_frame_equal(expect, got) - - def test_from_pandas(): df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) gdf = gd.DataFrame.from_pandas(df) @@ -1848,121 +1357,6 @@ def test_is_monotonic(gdf): assert not gdf.index.is_monotonic_decreasing -def test_dataframe_boolean_mask_with_None(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = DataFrame.from_pandas(pdf) - pdf_masked = pdf[[True, False, True, False]] - gdf_masked = gdf[[True, False, True, False]] - assert_eq(pdf_masked, gdf_masked) - - -@pytest.mark.parametrize("dtype", [int, float, str]) -def test_empty_boolean_mask(dtype): - gdf = gd.datasets.randomdata(nrows=0, dtypes={"a": dtype}) - pdf = gdf.to_pandas() - - expected = pdf[pdf.a == 1] - got = gdf[gdf.a == 1] - assert_eq(expected, got) - - expected = pdf.a[pdf.a == 1] - got = gdf.a[gdf.a == 1] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [1.0, 2.0, 3.0, 4.0], - ["one", "two", "three", "four"], - pd.Series(["a", "b", "c", "d"], dtype="category"), - pd.Series(pd.date_range("2010-01-01", "2010-01-04")), - ], -) -@pytest.mark.parametrize( - "mask", - [ - [True, True, True, True], - [False, False, False, False], - [True, False, True, False], - [True, False, False, True], - np.array([True, False, True, False]), - pd.Series([True, False, True, False]), - gd.Series([True, False, True, False]), - ], -) -@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -def test_series_apply_boolean_mask(data, mask, nulls): - psr = pd.Series(data) - - if len(data) > 0: - if nulls == "one": - p = np.random.randint(0, 4) - psr[p] = None - elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) - psr[p1] = None - psr[p2] = None - elif nulls == "all": - psr[:] = None - - gsr = gd.from_pandas(psr) - - # TODO: from_pandas(psr) has dtype "float64" - # when psr has dtype "object" and is all None - if psr.dtype == "object" and nulls == "all": - gsr = gd.Series([None, None, None, None], dtype="object") - - if isinstance(mask, gd.Series): - expect = psr[mask.to_pandas()] - else: - expect = psr[mask] - got = gsr[mask] - - assert_eq(expect, got) - - -def test_dataframe_apply_boolean_mask(): - pdf = pd.DataFrame( - { - "a": [0, 1, 2, 3], - "b": [0.1, 0.2, None, 0.3], - "c": ["a", None, "b", "c"], - } - ) - gdf = DataFrame.from_pandas(pdf) - assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) - - -""" -This test compares cudf and Pandas dataframe boolean indexing. -""" - - -@pytest.mark.parametrize( - "mask_fn", [lambda x: x, lambda x: np.array(x), lambda x: pd.Series(x)] -) -def test_dataframe_boolean_mask(pdf, gdf, mask_fn): - mask_base = [ - True, - False, - True, - False, - True, - False, - True, - False, - True, - False, - ] - mask = mask_fn(mask_base) - assert len(mask) == gdf.shape[0] - pdf_masked = pdf[mask] - gdf_masked = gdf[mask] - assert pdf_masked.to_string().split() == gdf_masked.to_string().split() - - def test_iter(pdf, gdf): assert list(pdf) == list(gdf) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py new file mode 100644 index 00000000000..a3eb8a5c428 --- /dev/null +++ b/python/cudf/cudf/tests/test_indexing.py @@ -0,0 +1,869 @@ +from itertools import combinations + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf import DataFrame, Series +from cudf.tests import utils +from cudf.tests.utils import assert_eq + +index_dtypes = [np.int64, np.int32, np.int16, np.int8] + + +@pytest.fixture +def pdf_gdf(): + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": ["c", "d", "e"]}, index=["one", "two", "three"] + ) + gdf = cudf.from_pandas(pdf) + return pdf, gdf + + +@pytest.fixture +def pdf_gdf_multi(): + pdf = pd.DataFrame(np.random.rand(7, 5)) + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + ], + ) + pdfIndex.names = ["alpha", "location", "weather", "sign"] + pdf.index = pdfIndex + gdf = cudf.from_pandas(pdf) + return pdf, gdf + + +@pytest.mark.parametrize( + "i1, i2, i3", + ( + [ + (slice(None, 12), slice(3, None), slice(None, None, 2)), + (range(12), range(3, 12), range(0, 9, 2)), + (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)), + (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))), + ( + pd.Series(range(12)), + pd.Series(range(3, 12)), + pd.Series(range(0, 9, 2)), + ), + (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))), + ( + [i in range(12) for i in range(20)], + [i in range(3, 12) for i in range(12)], + [i in range(0, 9, 2) for i in range(9)], + ), + ( + np.array([i in range(12) for i in range(20)], dtype=bool), + np.array([i in range(3, 12) for i in range(12)], dtype=bool), + np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool), + ), + ] + + [ + ( + np.arange(12, dtype=t), + np.arange(3, 12, dtype=t), + np.arange(0, 9, 2, dtype=t), + ) + for t in index_dtypes + ] + ), + ids=( + [ + "slice", + "range", + "numpy.array", + "list", + "pandas.Series", + "Series", + "list[bool]", + "numpy.array[bool]", + ] + + ["numpy.array[%s]" % t.__name__ for t in index_dtypes] + ), +) +def test_series_indexing(i1, i2, i3): + a1 = np.arange(20) + series = Series(a1) + # Indexing + sr1 = series[i1] + assert sr1.null_count == 0 + np.testing.assert_equal(sr1.to_array(), a1[:12]) + sr2 = sr1[i2] + assert sr2.null_count == 0 + np.testing.assert_equal(sr2.to_array(), a1[3:12]) + # Index with stride + sr3 = sr2[i3] + assert sr3.null_count == 0 + np.testing.assert_equal(sr3.to_array(), a1[3:12:2]) + + # Integer indexing + if isinstance(i1, range): + for i in i1: # Python int-s + assert series[i] == a1[i] + if isinstance(i1, np.ndarray) and i1.dtype in index_dtypes: + for i in i1: # numpy integers + assert series[i] == a1[i] + + +def test_dataframe_column_name_indexing(): + df = DataFrame() + data = np.asarray(range(10), dtype=np.int32) + df["a"] = data + df[1] = data + np.testing.assert_equal( + df["a"].to_array(), np.asarray(range(10), dtype=np.int32) + ) + np.testing.assert_equal( + df[1].to_array(), np.asarray(range(10), dtype=np.int32) + ) + + pdf = pd.DataFrame() + nelem = 10 + pdf["key1"] = np.random.randint(0, 5, nelem) + pdf["key2"] = np.random.randint(0, 3, nelem) + pdf[1] = np.arange(1, 1 + nelem) + pdf[2] = np.random.random(nelem) + df = DataFrame.from_pandas(pdf) + + assert_eq(df[df.columns], df) + assert_eq(df[df.columns[:1]], df[["key1"]]) + + for i in range(1, len(pdf.columns) + 1): + for idx in combinations(pdf.columns, i): + assert pdf[list(idx)].equals(df[list(idx)].to_pandas()) + + # test for only numeric columns + df = pd.DataFrame() + for i in range(0, 10): + df[i] = range(nelem) + gdf = DataFrame.from_pandas(df) + assert_eq(gdf, df) + + assert_eq(gdf[gdf.columns], gdf) + assert_eq(gdf[gdf.columns[:3]], gdf[[0, 1, 2]]) + + +def test_dataframe_slicing(): + df = DataFrame() + size = 123 + df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( + np.int32 + ) + df["b"] = hb = np.random.random(size).astype(np.float32) + df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( + np.int64 + ) + df["d"] = hd = np.random.random(size).astype(np.float64) + + # Row slice first 10 + first_10 = df[:10] + assert len(first_10) == 10 + assert tuple(first_10.columns) == ("a", "b", "c", "d") + np.testing.assert_equal(first_10["a"].to_array(), ha[:10]) + np.testing.assert_equal(first_10["b"].to_array(), hb[:10]) + np.testing.assert_equal(first_10["c"].to_array(), hc[:10]) + np.testing.assert_equal(first_10["d"].to_array(), hd[:10]) + del first_10 + + # Row slice last 10 + last_10 = df[-10:] + assert len(last_10) == 10 + assert tuple(last_10.columns) == ("a", "b", "c", "d") + np.testing.assert_equal(last_10["a"].to_array(), ha[-10:]) + np.testing.assert_equal(last_10["b"].to_array(), hb[-10:]) + np.testing.assert_equal(last_10["c"].to_array(), hc[-10:]) + np.testing.assert_equal(last_10["d"].to_array(), hd[-10:]) + del last_10 + + # Row slice [begin:end] + begin = 7 + end = 121 + subrange = df[begin:end] + assert len(subrange) == end - begin + assert tuple(subrange.columns) == ("a", "b", "c", "d") + np.testing.assert_equal(subrange["a"].to_array(), ha[begin:end]) + np.testing.assert_equal(subrange["b"].to_array(), hb[begin:end]) + np.testing.assert_equal(subrange["c"].to_array(), hc[begin:end]) + np.testing.assert_equal(subrange["d"].to_array(), hd[begin:end]) + del subrange + + +@pytest.mark.parametrize("step", [1, 2, 5]) +@pytest.mark.parametrize("scalar", [0, 20, 100]) +def test_dataframe_loc(scalar, step): + size = 123 + pdf = pd.DataFrame( + { + "a": np.random.randint(low=0, high=100, size=size), + "b": np.random.random(size).astype(np.float32), + "c": np.random.random(size).astype(np.float64), + "d": np.random.random(size).astype(np.float64), + } + ) + + df = DataFrame.from_pandas(pdf) + + # Scalar label + assert_eq(df.loc[scalar], pdf.loc[scalar]) + + # Full slice + assert_eq(df.loc[:, "c"], pdf.loc[:, "c"]) + + begin = 110 + end = 122 + + assert_eq( + df.loc[begin:end:step, ["c", "d", "a"]], + pdf.loc[begin:end:step, ["c", "d", "a"]], + ) + + assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]]) + + # Slicing on columns: + assert_eq( + df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"] + ) + + # Slicing of size 1: + assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"]) + + # TODO: Pandas changes the dtype here when it shouldn't + assert_eq( + df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False + ) + + # Make int64 index + offset = 50 + df2 = df[offset:] + pdf2 = pdf[offset:] + begin = 117 + end = 122 + assert_eq( + df2.loc[begin:end, ["c", "d", "a"]], + pdf2.loc[begin:end, ["c", "d", "a"]], + ) + + +@pytest.mark.xfail(raises=IndexError, reason="label scalar is out of bound") +def test_dataframe_loc_outbound(): + df = DataFrame() + size = 10 + df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( + np.int32 + ) + df["b"] = hb = np.random.random(size).astype(np.float32) + + pdf = pd.DataFrame() + pdf["a"] = ha + pdf["b"] = hb + + np.testing.assert_equal(df.loc[11].to_array(), pdf.loc[11]) + + +def test_series_loc_numerical(): + ps = pd.Series([1, 2, 3, 4, 5], index=[5, 6, 7, 8, 9]) + gs = Series.from_pandas(ps) + + assert_eq(ps.loc[5], gs.loc[5]) + assert_eq(ps.loc[6], gs.loc[6]) + assert_eq(ps.loc[6:8], gs.loc[6:8]) + assert_eq(ps.loc[:8], gs.loc[:8]) + assert_eq(ps.loc[6:], gs.loc[6:]) + assert_eq(ps.loc[::2], gs.loc[::2]) + assert_eq(ps.loc[[5, 8, 9]], gs.loc[[5, 8, 9]]) + assert_eq( + ps.loc[[True, False, True, False, True]], + gs.loc[[True, False, True, False, True]], + ) + + +def test_series_loc_string(): + ps = pd.Series( + [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] + ) + gs = Series.from_pandas(ps) + + assert_eq(ps.loc["one"], gs.loc["one"]) + assert_eq(ps.loc["five"], gs.loc["five"]) + assert_eq(ps.loc["two":"four"], gs.loc["two":"four"]) + assert_eq(ps.loc[:"four"], gs.loc[:"four"]) + assert_eq(ps.loc["two":], gs.loc["two":]) + assert_eq(ps.loc[::2], gs.loc[::2]) + assert_eq(ps.loc[["one", "four", "five"]], gs.loc[["one", "four", "five"]]) + assert_eq( + ps.loc[[True, False, True, False, True]], + gs.loc[[True, False, True, False, True]], + ) + + +def test_series_loc_datetime(): + ps = pd.Series( + [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") + ) + gs = Series.from_pandas(ps) + + # a few different ways of specifying a datetime label: + assert_eq(ps.loc["20010101"], gs.loc["20010101"]) + assert_eq(ps.loc["2001-01-01"], gs.loc["2001-01-01"]) + assert_eq( + ps.loc[pd.to_datetime("2001-01-01")], + gs.loc[pd.to_datetime("2001-01-01")], + ) + assert_eq( + ps.loc[np.datetime64("2001-01-01")], + gs.loc[np.datetime64("2001-01-01")], + ) + + assert_eq( + ps.loc["2001-01-02":"2001-01-05"], gs.loc["2001-01-02":"2001-01-05"] + ) + assert_eq(ps.loc["2001-01-02":], gs.loc["2001-01-02":]) + assert_eq(ps.loc[:"2001-01-04"], gs.loc[:"2001-01-04"]) + assert_eq(ps.loc[::2], gs.loc[::2]) + # + # assert_eq(ps.loc[['2001-01-01', '2001-01-04', '2001-01-05']], + # gs.loc[['2001-01-01', '2001-01-04', '2001-01-05']]) + # looks like a bug in Pandas doesn't let us check for the above, + # so instead: + assert_eq( + ps.loc[ + [ + pd.to_datetime("2001-01-01"), + pd.to_datetime("2001-01-04"), + pd.to_datetime("2001-01-05"), + ] + ], + gs.loc[ + [ + pd.to_datetime("2001-01-01"), + pd.to_datetime("2001-01-04"), + pd.to_datetime("2001-01-05"), + ] + ], + ) + assert_eq( + ps.loc[[True, False, True, False, True]], + gs.loc[[True, False, True, False, True]], + ) + + +def test_series_loc_categorical(): + ps = pd.Series( + [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) + ) + gs = Series.from_pandas(ps) + + assert_eq(ps.loc["a"], gs.loc["a"]) + assert_eq(ps.loc["e"], gs.loc["e"]) + assert_eq(ps.loc["b":"d"], gs.loc["b":"d"]) + assert_eq(ps.loc[:"d"], gs.loc[:"d"]) + assert_eq(ps.loc["b":], gs.loc["b":]) + assert_eq(ps.loc[::2], gs.loc[::2]) + + # order of categories changes, so we can only + # compare values: + assert_eq( + ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_array() + ) + + assert_eq( + ps.loc[[True, False, True, False, True]], + gs.loc[[True, False, True, False, True]], + ) + + +@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) +def test_series_iloc(nelem): + + # create random series + np.random.seed(12) + ps = pd.Series(np.random.sample(nelem)) + + # gpu series + gs = Series(ps) + + # positive tests for indexing + np.testing.assert_allclose(gs.iloc[-1 * nelem], ps.iloc[-1 * nelem]) + np.testing.assert_allclose(gs.iloc[-1], ps.iloc[-1]) + np.testing.assert_allclose(gs.iloc[0], ps.iloc[0]) + np.testing.assert_allclose(gs.iloc[1], ps.iloc[1]) + np.testing.assert_allclose(gs.iloc[nelem - 1], ps.iloc[nelem - 1]) + + # positive tests for slice + np.testing.assert_allclose(gs.iloc[-1:1], ps.iloc[-1:1]) + np.testing.assert_allclose( + gs.iloc[nelem - 1 : -1], ps.iloc[nelem - 1 : -1] + ) + np.testing.assert_allclose(gs.iloc[0 : nelem - 1], ps.iloc[0 : nelem - 1]) + np.testing.assert_allclose(gs.iloc[0:nelem], ps.iloc[0:nelem]) + np.testing.assert_allclose(gs.iloc[1:1], ps.iloc[1:1]) + np.testing.assert_allclose(gs.iloc[1:2], ps.iloc[1:2]) + np.testing.assert_allclose( + gs.iloc[nelem - 1 : nelem + 1], ps.iloc[nelem - 1 : nelem + 1] + ) + np.testing.assert_allclose( + gs.iloc[nelem : nelem * 2], ps.iloc[nelem : nelem * 2] + ) + + +@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) +def test_dataframe_iloc(nelem): + gdf = DataFrame() + + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) + gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + + pdf = pd.DataFrame() + pdf["a"] = ha + pdf["b"] = hb + + assert_eq(gdf.iloc[-1:1], pdf.iloc[-1:1]) + assert_eq(gdf.iloc[nelem - 1 : -1], pdf.iloc[nelem - 1 : -1]) + assert_eq(gdf.iloc[0 : nelem - 1], pdf.iloc[0 : nelem - 1]) + assert_eq(gdf.iloc[0:nelem], pdf.iloc[0:nelem]) + assert_eq(gdf.iloc[1:1], pdf.iloc[1:1]) + assert_eq(gdf.iloc[1:2], pdf.iloc[1:2]) + assert_eq(gdf.iloc[nelem - 1 : nelem + 1], pdf.iloc[nelem - 1 : nelem + 1]) + assert_eq(gdf.iloc[nelem : nelem * 2], pdf.iloc[nelem : nelem * 2]) + + assert_eq(gdf.iloc[-1 * nelem], pdf.iloc[-1 * nelem]) + assert_eq(gdf.iloc[-1], pdf.iloc[-1]) + assert_eq(gdf.iloc[0], pdf.iloc[0]) + assert_eq(gdf.iloc[1], pdf.iloc[1]) + assert_eq(gdf.iloc[nelem - 1], pdf.iloc[nelem - 1]) + + +@pytest.mark.xfail(raises=AssertionError, reason="Series.index are different") +def test_dataframe_iloc_tuple(): + gdf = DataFrame() + nelem = 123 + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) + gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + + pdf = pd.DataFrame() + pdf["a"] = ha + pdf["b"] = hb + + # We don't support passing the column names into the index quite yet + got = gdf.iloc[1, [1]] + expect = pdf.iloc[1, [1]] + + assert_eq(got, expect) + + +@pytest.mark.xfail( + raises=IndexError, reason="positional indexers are out-of-bounds" +) +def test_dataframe_iloc_index_error(): + gdf = DataFrame() + nelem = 123 + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) + gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + + pdf = pd.DataFrame() + pdf["a"] = ha + pdf["b"] = hb + + def assert_col(g, p): + np.testing.assert_equal(g["a"].to_array(), p["a"]) + np.testing.assert_equal(g["b"].to_array(), p["b"]) + + assert_col(gdf.iloc[nelem * 2], pdf.iloc[nelem * 2]) + + +@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) +def test_dataframe_take(ntake): + np.random.seed(0) + df = DataFrame() + + nelem = 123 + df["ii"] = ii = np.random.randint(0, 20, nelem) + df["ff"] = ff = np.random.random(nelem) + + take_indices = np.random.randint(0, len(df), ntake) + + def check(**kwargs): + out = df.take(take_indices, **kwargs) + assert len(out) == ntake + assert out.ii.null_count == 0 + assert out.ff.null_count == 0 + np.testing.assert_array_equal(out.ii.to_array(), ii[take_indices]) + np.testing.assert_array_equal(out.ff.to_array(), ff[take_indices]) + if kwargs.get("ignore_index"): + np.testing.assert_array_equal(out.index, np.arange(ntake)) + else: + np.testing.assert_array_equal(out.index, take_indices) + + check() + check(ignore_index=True) + + +@pytest.mark.parametrize("nelem", [0, 1, 5, 20, 100]) +@pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) +@pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) +def test_dataframe_masked_slicing(nelem, slice_start, slice_end): + gdf = DataFrame() + gdf["a"] = list(range(nelem)) + gdf["b"] = list(range(nelem, 2 * nelem)) + gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) + gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) + + def do_slice(x): + return x[slice_start:slice_end] + + expect = do_slice(gdf.to_pandas()) + got = do_slice(gdf).to_pandas() + + pd.testing.assert_frame_equal(expect, got) + + +def test_dataframe_boolean_mask_with_None(): + pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) + gdf = DataFrame.from_pandas(pdf) + pdf_masked = pdf[[True, False, True, False]] + gdf_masked = gdf[[True, False, True, False]] + assert_eq(pdf_masked, gdf_masked) + + +@pytest.mark.parametrize("dtype", [int, float, str]) +def test_empty_boolean_mask(dtype): + gdf = cudf.datasets.randomdata(nrows=0, dtypes={"a": dtype}) + pdf = gdf.to_pandas() + + expected = pdf[pdf.a == 1] + got = gdf[gdf.a == 1] + assert_eq(expected, got) + + expected = pdf.a[pdf.a == 1] + got = gdf.a[gdf.a == 1] + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + [1.0, 2.0, 3.0, 4.0], + ["one", "two", "three", "four"], + pd.Series(["a", "b", "c", "d"], dtype="category"), + pd.Series(pd.date_range("2010-01-01", "2010-01-04")), + ], +) +@pytest.mark.parametrize( + "mask", + [ + [True, True, True, True], + [False, False, False, False], + [True, False, True, False], + [True, False, False, True], + np.array([True, False, True, False]), + pd.Series([True, False, True, False]), + cudf.Series([True, False, True, False]), + ], +) +@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) +def test_series_apply_boolean_mask(data, mask, nulls): + psr = pd.Series(data) + + if len(data) > 0: + if nulls == "one": + p = np.random.randint(0, 4) + psr[p] = None + elif nulls == "some": + p1, p2 = np.random.randint(0, 4, (2,)) + psr[p1] = None + psr[p2] = None + elif nulls == "all": + psr[:] = None + + gsr = cudf.from_pandas(psr) + + # TODO: from_pandas(psr) has dtype "float64" + # when psr has dtype "object" and is all None + if psr.dtype == "object" and nulls == "all": + gsr = cudf.Series([None, None, None, None], dtype="object") + + if isinstance(mask, cudf.Series): + expect = psr[mask.to_pandas()] + else: + expect = psr[mask] + got = gsr[mask] + + assert_eq(expect, got) + + +def test_dataframe_apply_boolean_mask(): + pdf = pd.DataFrame( + { + "a": [0, 1, 2, 3], + "b": [0.1, 0.2, None, 0.3], + "c": ["a", None, "b", "c"], + } + ) + gdf = DataFrame.from_pandas(pdf) + assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) + + +""" +This test compares cudf and Pandas dataframe boolean indexing. +""" + + +@pytest.mark.parametrize( + "mask_fn", [lambda x: x, lambda x: np.array(x), lambda x: pd.Series(x)] +) +def test_dataframe_boolean_mask(mask_fn): + mask_base = [ + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + ] + pdf = pd.DataFrame({"x": range(10), "y": range(10)}) + gdf = cudf.from_pandas(pdf) + mask = mask_fn(mask_base) + assert len(mask) == gdf.shape[0] + pdf_masked = pdf[mask] + gdf_masked = gdf[mask] + assert pdf_masked.to_string().split() == gdf_masked.to_string().split() + + +@pytest.mark.parametrize( + "key, value", + [ + (0, 4), + (1, 4), + ([0, 1], 4), + ([0, 1], [4, 5]), + (slice(0, 2), [4, 5]), + (slice(1, None), [4, 5, 6, 7]), + ([], 1), + ([], []), + (slice(None, None), 1), + (slice(-1, -3), 7), + ], +) +@pytest.mark.parametrize("nulls", ["none", "some", "all"]) +def test_series_setitem_basics(key, value, nulls): + psr = pd.Series([1, 2, 3, 4, 5]) + if nulls == "some": + psr[[0, 4]] = None + elif nulls == "all": + psr[:] = None + gsr = cudf.from_pandas(psr) + psr[key] = value + gsr[key] = value + assert_eq(psr, gsr, check_dtype=False) + + +def test_series_setitem_null(): + gsr = cudf.Series([1, 2, 3, 4]) + gsr[0] = None + + expect = cudf.Series([None, 2, 3, 4]) + got = gsr + assert_eq(expect, got) + + gsr = cudf.Series([None, 2, 3, 4]) + gsr[0] = 1 + + expect = cudf.Series([1, 2, 3, 4]) + got = gsr + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "key, value", + [ + (0, 4), + (1, 4), + ([0, 1], 4), + ([0, 1], [4, 5]), + (slice(0, 2), [4, 5]), + (slice(1, None), [4, 5, 6, 7]), + ([], 1), + ([], []), + (slice(None, None), 1), + (slice(-1, -3), 7), + ], +) +@pytest.mark.parametrize("nulls", ["none", "some", "all"]) +def test_series_setitem_iloc(key, value, nulls): + psr = pd.Series([1, 2, 3, 4, 5]) + if nulls == "some": + psr[[0, 4]] = None + elif nulls == "all": + psr[:] = None + gsr = cudf.from_pandas(psr) + psr.iloc[key] = value + gsr.iloc[key] = value + assert_eq(psr, gsr, check_dtype=False) + + +@pytest.mark.parametrize( + "key, value", + [ + (0, 0.5), + ([0, 1], 0.5), + ([0, 1], [0.5, 2.5]), + (slice(0, 2), [0.5, 0.25]), + ], +) +def test_series_setitem_dtype(key, value): + psr = pd.Series([1, 2, 3], dtype="int32") + gsr = cudf.from_pandas(psr) + psr[key] = value + gsr[key] = value + assert_eq(psr, gsr) + + +def test_series_setitem_datetime(): + psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") + gsr = cudf.from_pandas(psr) + + psr[0] = "2005" + gsr[0] = "2005" + + assert_eq(psr, gsr) + + +def test_series_setitem_categorical(): + psr = pd.Series(["a", "b", "a", "c", "d"], dtype="category") + gsr = cudf.from_pandas(psr) + + psr[0] = "d" + gsr[0] = "d" + assert_eq(psr, gsr) + + psr = psr.cat.add_categories(["e"]) + gsr = gsr.cat.add_categories(["e"]) + psr[0] = "e" + gsr[0] = "e" + assert_eq(psr, gsr) + + psr[[0, 1]] = "b" + gsr[[0, 1]] = "b" + assert_eq(psr, gsr) + + psr[0:3] = "e" + gsr[0:3] = "e" + assert_eq(psr, gsr) + + +@pytest.mark.parametrize( + "key, value", + [ + (0, "d"), + (0, "g"), + ([0, 1], "g"), + ([0, 1], None), + (slice(None, 2), "g"), + (slice(None, 2), ["g", None]), + ], +) +def test_series_setitem_string(key, value): + psr = pd.Series(["a", "b", "c", "d", "e"]) + gsr = cudf.from_pandas(psr) + psr[key] = value + gsr[key] = value + assert_eq(psr, gsr) + + psr = pd.Series(["a", None, "c", "d", "e"]) + gsr = cudf.from_pandas(psr) + psr[key] = value + gsr[key] = value + assert_eq(psr, gsr) + + +@pytest.mark.parametrize( + "key, value", + [ + ("a", 4), + ("b", 4), + (["a", "b"], 4), + (["a", "b"], [4, 5]), + ([True, False, True], 4), + ([False, False, False], 4), + ([True, False, True], [4, 5]), + ], +) +def test_series_setitem_loc(key, value): + psr = pd.Series([1, 2, 3], ["a", "b", "c"]) + gsr = cudf.from_pandas(psr) + psr.loc[key] = value + gsr.loc[key] = value + assert_eq(psr, gsr) + + +@pytest.mark.parametrize( + "key, value", + [ + ((0, 0), 5), + ((slice(None), 0), 5), + ((slice(None), 0), range(3)), + ((slice(None, -1), 0), range(2)), + (([0, 1], 0), 5), + ], +) +def test_dataframe_setitem_iloc(key, value, pdf_gdf): + pdf, gdf = pdf_gdf + pdf.iloc[key] = value + gdf.iloc[key] = value + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "key, value", + [ + (("one", "a"), 5), + ((slice(None), "a"), 5), + ((slice(None), "a"), range(3)), + ((slice(None, "two"), "a"), range(2)), + ((["one", "two"], "a"), 5), + ], +) +def test_dataframe_setitem_loc(key, value, pdf_gdf): + pdf, gdf = pdf_gdf + pdf.loc[key] = value + gdf.loc[key] = value + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "key,value", + [ + ((0, 0), 5.0), + ((slice(None), 0), 5.0), + ((slice(None), 0), np.arange(7, dtype="float64")), + ], +) +def test_dataframe_setitem_iloc_multiindex(key, value, pdf_gdf_multi): + pdf, gdf = pdf_gdf_multi + + pdf.iloc[key] = value + gdf.iloc[key] = value + + assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index eba5e16c1f4..f47ffd4ca8d 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -75,6 +75,8 @@ def require_writeable_array(arr): def scalar_broadcast_to(scalar, shape, dtype): from cudf.utils.cudautils import fill_value + scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) + if not isinstance(shape, tuple): shape = (shape,) @@ -160,9 +162,10 @@ def cudf_dtype_from_pydata_dtype(dtype): return infer_dtype_from_object(dtype) -def is_single_value(val): +def is_scalar(val): return ( - isinstance(val, str) + val is None + or isinstance(val, str) or isinstance(val, numbers.Number) or np.isscalar(val) or isinstance(val, pd.Timestamp) @@ -170,6 +173,30 @@ def is_single_value(val): ) +def to_cudf_compatible_scalar(val, dtype=None): + """ + Converts the value `val` to a numpy/Pandas scalar, + optionally casting to `dtype`. + + If `val` is None, returns None. + """ + if val is None: + return val + + if not is_scalar(val): + raise ValueError( + f"Cannot convert value of type {type(val).__name__} " + " to cudf scalar" + ) + + val = pd.api.types.pandas_dtype(type(val)).type(val) + + if dtype is not None: + val = val.astype(dtype) + + return val + + def is_list_like(obj): """ This function checks if the given `obj`