Skip to content
Merged
Show file tree
Hide file tree
Changes from 87 commits
Commits
Show all changes
101 commits
Select commit Hold shift + click to select a range
6134d56
Use join for series loc indexing - fails on categorical loc test
shwina Jul 16, 2019
43976fe
black
shwina Jul 16, 2019
b786286
Enable conversion from string to categorical columns
shwina Jul 16, 2019
b5e0811
Merge branch 'as-categorical' into slice-loc-improve
shwina Jul 16, 2019
7b98c69
Add utility to convert from categorical to string columns
shwina Jul 17, 2019
0a3d823
Fix column.take() for empty input
shwina Jul 17, 2019
44632d4
Add test for empty loc
shwina Jul 17, 2019
0cec124
Handle categorical index in loc correctly
shwina Jul 17, 2019
020f9be
Restore else statement
shwina Jul 17, 2019
0d44b8c
Simplify dtype equality check
shwina Jul 17, 2019
29db7bc
Better naming
shwina Jul 17, 2019
14d3efe
changelog
shwina Jul 17, 2019
72c9e89
Fix checking for wrong dtype
shwina Jul 17, 2019
6a2d664
flake8
shwina Jul 17, 2019
88795f9
Merge branch 'opt-gather-scatter' of https://github.com/hummingtree/c…
shwina Jul 23, 2019
4ba36b5
Rename selvals to data for consistency
shwina Jul 23, 2019
f9f84bf
Replace multiple apply_gather* functions with a single one
shwina Jul 23, 2019
0c8ad34
Merge branch 'branch-0.9' into loc-perf
kkraus14 Jul 23, 2019
336ee22
Eliminate duplication of __getitem__ logic between Series and Column
shwina Jul 24, 2019
daa4ccb
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Jul 24, 2019
440dfe8
More work on localizing __getitem__() to Column.__getitem__()
shwina Jul 24, 2019
efced4c
Enable apply_gather to handle both Column and list of Column
shwina Jul 24, 2019
ec18e9a
Use base implementation of element_indexing() and take() in StringColumn
shwina Jul 24, 2019
5ee7f72
Remove check for non-zero null count in StringIndex
shwina Jul 24, 2019
f6687f4
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into lo…
shwina Jul 29, 2019
5fbe16c
Remove unused import
shwina Jul 29, 2019
04d589a
Restore import
shwina Jul 29, 2019
515abe6
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Jul 29, 2019
f1e2778
Move indexing/selection tests to new file test_indexing.py
shwina Jul 29, 2019
bb448ca
Merge branch 'opt-gather-scatter' of https://github.com/hummingtree/c…
shwina Jul 29, 2019
2ec445b
Add basic Series.__setitem__()
shwina Jul 29, 2019
8adea18
Merge remote-tracking branch 'origin/loc-perf' into add-setitem
shwina Jul 29, 2019
e3adf46
Default to label-based indexing in Series.__setitem__
shwina Jul 29, 2019
7014577
Add Python bindings for cudf::copy_range
shwina Jul 30, 2019
31c1d56
Remove check for zero null_count in column_view_from_column
shwina Jul 30, 2019
48a36cf
Add basic slice __setitem__
shwina Jul 30, 2019
ca72358
Handle negative indices and dtype in setitem
shwina Jul 30, 2019
986f399
fix dtype handling in setitem
shwina Jul 30, 2019
8fd5b4e
Merge branch 'opt-gather-scatter' of https://github.com/hummingtree/c…
shwina Jul 31, 2019
fa7438a
Merge branch 'simplify-getitem' into add-setitem
shwina Jul 31, 2019
bcaf234
Add series loc/iloc setitem
shwina Jul 31, 2019
8f608c3
Simplify Series.take()
shwina Jul 31, 2019
d0a33c3
Simplify DataFrame.take
shwina Jul 31, 2019
e1393d5
Merge branch 'loc-perf' into simplify-getitem
shwina Jul 31, 2019
549415e
Pull up SeriesIlocIndexer
shwina Jul 31, 2019
35286bd
Add loc setitem tests
shwina Jul 31, 2019
e519d5d
Improve dtype handling
shwina Aug 1, 2019
810e416
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 1, 2019
fc8474a
Remove string methods reintroduced after merge
shwina Aug 2, 2019
ee6923d
style?
shwina Aug 2, 2019
c751b7b
Merge branch 'simplify-getitem' into add-setitem
shwina Aug 2, 2019
671e5ce
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 2, 2019
c5e21aa
getitem/setitem fixes after branch-0.9 merge
shwina Aug 5, 2019
206660f
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 5, 2019
a92cbb2
flake8
shwina Aug 5, 2019
07dfc12
Cython flake8
shwina Aug 5, 2019
4915753
changelog
shwina Aug 5, 2019
d7cff85
Add a basic DataFrame.__setitem__()
shwina Aug 5, 2019
204a77f
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 5, 2019
6f39ca4
Fix unused variable
shwina Aug 5, 2019
5dee4d1
Fix handling of inplace param in add_categories
shwina Aug 5, 2019
3186e3f
Add broadcasting of value into categorical column in setitem
shwina Aug 5, 2019
dd693f2
Fix setitem test and a couple of setitem bugs
shwina Aug 5, 2019
c7a2ae8
Remove a circular import issue
shwina Aug 6, 2019
aab0216
Add index iloc
shwina Aug 6, 2019
8d240d7
Forbid df.loc[] with multiindex for now
shwina Aug 6, 2019
869e0e4
Add basic multiindex iloc setitem tests
shwina Aug 6, 2019
a2aac9a
Fix scalar_broadcast_to
shwina Aug 6, 2019
78f32fa
Copy scatter results to nvcategory (credit to @hummingtree)
shwina Aug 6, 2019
677ccc4
Rename is_single_value to is_scalar and improve scalar handling
shwina Aug 7, 2019
79729be
Add more setitem tests
shwina Aug 7, 2019
d57607f
Handle setitem with boolmask
shwina Aug 7, 2019
d4cbf5d
Handle setitem with None
shwina Aug 7, 2019
4b6fa22
Add at() and iat() as aliases for loc() and iloc()
shwina Aug 7, 2019
3f31906
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 7, 2019
dcf1efc
Fix nvcategory issue in copy_range
shwina Aug 7, 2019
98494eb
Fix issue relating to scatter/nvcategory (credit: @hummingtree)
shwina Aug 7, 2019
e305b79
Add tests for string setitem
shwina Aug 7, 2019
2c3d1f6
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 8, 2019
88d8011
Fixes to gdf_dtype_from_value utility
shwina Aug 8, 2019
946a3c1
Avoid typecast when casting to the same dtype
shwina Aug 8, 2019
ce227cf
Fix call to apply_gather
shwina Aug 8, 2019
0401ee7
Fix fillna() for datetime
shwina Aug 8, 2019
0cdefc2
Remove unused utility function
shwina Aug 8, 2019
d2e4c9e
Ensure dtype consistency before join in labels_from_indices
shwina Aug 8, 2019
f9285ca
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 8, 2019
428d909
flake8
shwina Aug 8, 2019
944e194
Remove repeated entry in CHANGELOG
shwina Aug 9, 2019
401fb1e
Addressing review comments
shwina Aug 9, 2019
27e3506
style
shwina Aug 9, 2019
be68e0f
Remove masked_assign function (replaced by more general setitem)
shwina Aug 9, 2019
d479f7c
Remove unused _as_categorical_column
shwina Aug 9, 2019
cbf8c6d
Update documentation for loc/iloc
shwina Aug 9, 2019
1396ce4
More doc updates
shwina Aug 9, 2019
f866965
Fix a typo
shwina Aug 9, 2019
d5919d8
Remove Index loc/iloc as they aren't supposed to exist
shwina Aug 9, 2019
5c90d71
Merge branch 'branch-0.9' into add-setitem
kkraus14 Aug 12, 2019
dd4f106
style
shwina Aug 12, 2019
a38899a
Merge branch 'add-setitem' of https://github.com/shwina/cudf into add…
shwina Aug 12, 2019
42b7080
Better check for int
shwina Aug 12, 2019
c026969
Merge branch 'branch-0.9' of https://github.com/rapidsai/cudf into ad…
shwina Aug 12, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
- PR #2446 Add __array_function__ for index
- PR #2437 ORC reader: Add 'use_np_dtypes' option
- PR #2382 Add CategoricalAccessor add, remove, rename, and ordering methods
- PR #2449 Java column vector: added support for getting byte count of strings in a ColumnVector
- PR #2442 Add __setitem__
- PR #2449 Java column vector: added support for getting byte count of strings in a ColumnVector
- PR #2489 Add drop argument to set_index
- PR #2491 Add Java bindings for ORC reader 'use_np_dtypes' option
Expand Down
15 changes: 5 additions & 10 deletions cpp/src/copying/copy_range.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,24 @@ void copy_range(gdf_column *out_column, gdf_column const &in_column,
// before copying to ensure the strings referred to by the new indices
// are included in the destination column

gdf_column * input_cols[2] = {out_column,
const_cast<gdf_column*>(&in_column)};

// make temporary columns which will have synced categories
// TODO: these copies seem excessively expensive, but
// sync_column_categories doesn't copy the valid mask
gdf_column temp_out = cudf::copy(*out_column);
gdf_column temp_in = cudf::copy(in_column);
gdf_column * temp_cols[2] = {&temp_out, &temp_in};

gdf_column * input_cols[2] = {&temp_out,
const_cast<gdf_column*>(&in_column)};
gdf_column * temp_cols[2] = {out_column, &temp_in};

// sync categories
CUDF_EXPECTS(GDF_SUCCESS ==
sync_column_categories(input_cols, temp_cols, 2),
"Failed to synchronize NVCategory");

detail::copy_range(&temp_out,
detail::copy_range(out_column,
detail::column_range_factory{temp_in, in_begin},
out_begin, out_end);

std::swap(out_column->data, temp_out.data);
std::swap(out_column->valid, temp_out.valid);
std::swap(out_column->null_count, temp_out.null_count);
std::swap(out_column->dtype_info.category, temp_out.dtype_info.category);

gdf_column_free(&temp_out);
gdf_column_free(&temp_in);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/copying/gather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ void gather(table const *source_table, gdf_index_type const gather_map[],
}
}


} // namespace detail

void gather(table const *source_table, gdf_index_type const gather_map[],
Expand Down
1 change: 1 addition & 0 deletions cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ table scatter(table const& source, gdf_index_type const scatter_map[],
}

detail::scatter(&source, scatter_map, &output);
nvcategory_gather_table(output, output);

return output;

Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/bindings/copying.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,15 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
cdef gdf_column copy(
const gdf_column &input
) except +

cudf_table scatter(
const cudf_table source,
const gdf_index_type* scatter_map,
const cudf_table target)

cdef void copy_range(
gdf_column *out_column,
const gdf_column in_column,
gdf_index_type out_begin,
gdf_index_type out_end,
gdf_index_type in_begin)
169 changes: 131 additions & 38 deletions python/cudf/cudf/bindings/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@
# cython: embedsignature = True
# cython: language_level = 3

from cudf.dataframe import columnops
from cudf.dataframe.buffer import Buffer
from cudf.bindings.cudf_cpp cimport *
from cudf.bindings.cudf_cpp import *
from cudf.utils.cudautils import modulo
from cudf.bindings.copying cimport *
import cudf.utils.utils as utils
from cudf.bindings.utils cimport columns_from_table, table_from_columns
from librmm_cffi import librmm as rmm

import numba
import numpy as np
import pandas as pd
import pyarrow as pa
Expand All @@ -27,7 +32,6 @@ pandas_version = tuple(map(int, pd.__version__.split('.', 2)[:2]))


def clone_columns_with_size(in_cols, row_size):
from cudf.dataframe import columnops
out_cols = []
for col in in_cols:
o_col = columnops.column_empty_like(col,
Expand All @@ -39,26 +43,48 @@ def clone_columns_with_size(in_cols, row_size):
return out_cols


def apply_gather(in_cols, maps, out_cols=None):
"""
Call cudf::gather.
def _normalize_maps(maps, size):
    """
    Convert an index map to an int32 column, take every entry modulo
    ``size``, and return the underlying ``data.mem`` of the result.

    Parameters
    ----------
    maps : anything accepted by columnops.as_column
    size : value used as the right-hand operand of the "mod" operation

    Returns
    -------
    The ``data.mem`` attribute of the resulting column
    """
    index_col = columnops.as_column(maps).astype("int32")
    # Let the column turn `size` into an operand it can use in a binop.
    divisor = index_col.normalize_binop_value(size)
    wrapped = index_col.binary_operator("mod", divisor)
    return wrapped.data.mem

* in_cols input column array
* maps RMM device array with gdf_index_type (np.int32 compatible dtype)
* out_cols the destination column array to output

* returns out_cols
def apply_gather(source, maps, dest=None):
"""
Gathers elements from source into dest (if given) using the gathermap maps.
If dest is not given, it is allocated inside the function and returned.

Parameters
----------
source : Column or list of Columns
maps : DeviceNDArray
dest : Column or list of Columns (optional)

Returns
-------
Column or list of Columns, or None if dest is given
"""
if isinstance(source, (list, tuple)):
if dest is not None:
assert(isinstance(dest, (list, tuple)))
in_cols = source
out_cols = dest
else:
in_cols = [source]
out_cols = None if dest is None else [dest]

for i, in_col in enumerate(in_cols):
in_cols[i] = columnops.as_column(in_cols[i])
if dest is not None:
out_cols[i] = columnops.as_column(out_cols[i])

if in_cols[0].dtype == np.dtype("object"):
in_size = in_cols[0].data.size()
else:
in_size = in_cols[0].data.size

from cudf.dataframe import columnops
maps = columnops.as_column(maps).astype("int32")
maps = maps.data.mem
# TODO: replace with libcudf pymod when available
maps = modulo(maps, in_size)
maps = _normalize_maps(maps, in_size)

col_count=len(in_cols)
gather_count = len(maps)
Expand Down Expand Up @@ -108,42 +134,59 @@ def apply_gather(in_cols, maps, out_cols=None):

free_table(c_in_table, c_in_cols)

return out_cols
if dest is not None:
return

if isinstance(source, (list, tuple)):
return out_cols
else:
return out_cols[0]

def apply_gather_column(in_col, maps, out_col=None):
"""
Call cudf::gather.

* in_cols input column
* maps device array
* out_cols the destination column to output
def apply_scatter(source, maps, target):
cdef cudf_table* c_source_table
cdef cudf_table* c_target_table
cdef cudf_table c_result_table
cdef uintptr_t c_maps_ptr
cdef gdf_index_type* c_maps

* returns out_col
"""
source_cols = source
target_cols = target

in_cols = [in_col]
out_cols = None if out_col is None else [out_col]
if not isinstance(target_cols, (list, tuple)):
target_cols = [target_cols]

out_cols = apply_gather(in_cols, maps, out_cols)
if not isinstance(source_cols, (list, tuple)):
source_cols = [source_cols] * len(target_cols)

return out_cols[0]
for i in range(len(target_cols)):
target_cols[i] = columnops.as_column(target_cols[i])
source_cols[i] = columnops.as_column(source_cols[i])
assert source_cols[i].dtype == target_cols[i].dtype

c_source_table = table_from_columns(source_cols)
c_target_table = table_from_columns(target_cols)

def apply_gather_array(dev_array, maps, out_col=None):
"""
Call cudf::gather.
maps = _normalize_maps(maps, len(target_cols[0]))

* dev_array input device array
* maps device array
* out_cols the destination column to output
c_maps_ptr = get_ctype_ptr(maps)
c_maps = <gdf_index_type*>c_maps_ptr

* returns out_col
"""
from cudf.dataframe import columnops
with nogil:
c_result_table = scatter(
c_source_table[0],
c_maps,
c_target_table[0])

in_col = columnops.as_column(dev_array)
return apply_gather_column(in_col, maps, out_col)
result_cols = columns_from_table(&c_result_table)

del c_source_table
del c_target_table

if isinstance(target, (list, tuple)):
return result_cols
else:
return result_cols[0]


def copy_column(input_col):
Expand All @@ -163,3 +206,53 @@ def copy_column(input_col):
free(output)

return Column.from_mem_views(data, mask, output.null_count)


def apply_copy_range(out_col, in_col, int out_begin, int out_end,
                     int in_begin):
    """
    Copy a range of values from in_col into out_col by calling
    copy_range, and return out_col.

    Parameters
    ----------
    out_col : column object
        Destination. It is modified in place (mask, null count and, for
        object dtype, its strings data) and is also the return value.
    in_col : column object
        Source. Its ``_mask`` and ``_null_count`` may be replaced here
        (see below), so the caller's object can be mutated as well.
    out_begin, out_end : int
        Bounds of the destination range. Negative values are offset by
        ``len(out_col)``.
    in_begin : int
        Forwarded unchanged to copy_range.

    Returns
    -------
    out_col
    """
    from cudf.dataframe.column import Column

    # Return without copying when the bounds differ by at most one. This
    # test runs on the raw arguments, before negative bounds are resolved.
    # NOTE(review): if out_end is exclusive this also skips one-element
    # ranges -- confirm that is intended.
    if abs(out_end - out_begin) <= 1:
        return out_col

    # Resolve negative bounds relative to the length of out_col.
    if out_begin < 0:
        out_begin = len(out_col) + out_begin
    if out_end < 0:
        out_end = len(out_col) + out_end

    # An inverted range is treated as nothing to do.
    if out_begin > out_end:
        return out_col

    # out_col reports no nulls but in_col carries a mask: give out_col a
    # fresh mask filled with 0xff and a null count of 0.
    # NOTE(review): `cudautils` is not among the imports visible in this
    # hunk (only `modulo` is imported from it) -- confirm it is in scope.
    if out_col.null_count == 0 and in_col.has_null_mask:
        mask = utils.make_mask(len(out_col))
        cudautils.fill_value(mask, 0xff)
        out_col._mask = Buffer(mask)
        out_col._null_count = 0

    # The mirror case: in_col reports no nulls but out_col has a mask, so
    # in_col is given a 0xff-filled mask. This mutates the caller's in_col.
    if in_col.null_count == 0 and out_col.has_null_mask:
        mask = utils.make_mask(len(in_col))
        cudautils.fill_value(mask, 0xff)
        in_col._mask = Buffer(mask)
        in_col._null_count = 0

    # Build the gdf_column views handed to copy_range; both pointers are
    # released with free() at the end of the function.
    cdef gdf_column* c_out_col = column_view_from_column(out_col)
    cdef gdf_column* c_in_col = column_view_from_column(in_col)

    # The copy_range call itself runs with the GIL released.
    with nogil:
        copy_range(c_out_col,
                   c_in_col[0],
                   out_begin,
                   out_end,
                   in_begin)

    # Propagate the null count recorded on the output view back to out_col.
    out_col._update_null_count(c_out_col.null_count)

    # For non-empty object-dtype columns, refresh out_col from the category
    # pointer stored on the output view.
    if out_col.dtype == np.dtype("object") and len(out_col) > 0:
        update_nvstrings_col(
            out_col,
            <uintptr_t>c_out_col.dtype_info.category)

    free(c_in_col)
    free(c_out_col)

    return out_col
4 changes: 3 additions & 1 deletion python/cudf/cudf/bindings/cudf_cpp.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cpdef gdf_time_unit np_dtype_to_gdf_time_unit(dtype)
cpdef gdf_time_unit_to_np_dtype(gdf_time_unit time_unit)

cdef np_dtype_from_gdf_column(gdf_column* col)
cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*)


cdef get_scalar_value(gdf_scalar scalar, dtype)

Expand Down Expand Up @@ -391,3 +391,5 @@ cdef extern from "cudf/legacy/table.hpp" namespace "cudf" nogil:
# const gdf_column* const* begin() const except +
# gdf_column const* const* end() const
# gdf_column const* get_column(gdf_index_type index) const except +

cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*) except? GDF_invalid
7 changes: 4 additions & 3 deletions python/cudf/cudf/bindings/cudf_cpp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ cdef np_dtype_from_gdf_column(gdf_column* col):
raise TypeError('cannot convert gdf_dtype `%s` to numpy dtype' % (dtype))


cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None):
cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None) except? GDF_invalid:
"""Util to convert a column's or np.scalar's dtype to gdf dtype.

Parameters
Expand All @@ -172,7 +172,8 @@ cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None):
dtype : numpy.dtype; optional
The dtype to convert to a gdf_dtype. Defaults to *col.dtype*.
"""
dtype = col.dtype if dtype is None else np.dtype(dtype)
dtype = col.dtype if dtype is None else pd.api.types.pandas_dtype(dtype)

# if dtype is pd.CategoricalDtype, use the codes' gdf_dtype
if is_categorical_dtype(dtype):
if col is None:
Expand Down Expand Up @@ -308,7 +309,7 @@ cdef gdf_column* column_view_from_column(col, col_name=None) except? NULL:
else:
data_ptr = 0

if col._mask is not None and col.null_count > 0:
if col._mask is not None:
valid_ptr = get_column_valid_ptr(col)
else:
valid_ptr = 0
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/bindings/replace.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ from cudf.bindings.cudf_cpp cimport *
from cudf.bindings.cudf_cpp import *
from cudf.bindings.replace cimport *
from cudf.dataframe.column import Column
from cudf.utils.utils import is_single_value
from cudf.utils.utils import is_scalar

from libc.stdlib cimport calloc, malloc, free

Expand Down Expand Up @@ -79,7 +79,7 @@ cpdef apply_replace_nulls(inp, replacement):
Call replace_nulls
"""

if is_single_value(replacement):
if is_scalar(replacement):
return apply_replace_nulls_scalar(inp, replacement)
else:
return apply_replace_nulls_column(inp, replacement)
9 changes: 5 additions & 4 deletions python/cudf/cudf/bindings/typecast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,28 @@ from cudf.dataframe.column import Column
from libc.stdlib cimport free

import numpy as np
import pandas as pd


def apply_cast(incol, dtype=np.float64):
"""
Return a Column with values in `incol` casted to `dtype`.
Currently supports numeric and datetime dtypes.
"""
dtype = np.dtype(np.float64 if dtype is None else dtype)

check_gdf_compatibility(incol)

cdef gdf_column* c_incol = column_view_from_column(incol)
if pd.api.types.is_dtype_equal(incol.dtype, dtype):
return incol

dtype = np.dtype(np.float64 if dtype is None else dtype)
cdef gdf_column* c_incol = column_view_from_column(incol)
cdef gdf_dtype c_out_dtype = gdf_dtype_from_value(incol, dtype)
cdef uintptr_t c_category

cdef gdf_dtype_extra_info c_out_info = gdf_dtype_extra_info(
time_unit=np_dtype_to_gdf_time_unit(dtype),
category=<void*>c_category
)

cdef gdf_column result

with nogil:
Expand Down
Loading