rapidsai · kkraus14 · Aug 12, 2019 · Jul 16, 2019 · Jul 16, 2019 · Jul 16, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,6 +32,8 @@
 - PR #2446 Add __array_function__ for index
 - PR #2437 ORC reader: Add 'use_np_dtypes' option
 - PR #2382 Add CategoricalAccessor add, remove, rename, and ordering methods
+- PR #2449 Java column vector: added support for getting byte count of strings in a ColumnVector
+- PR #2442 Add __setitem__
 - PR #2449 Java column vector: added support for getting byte count of strings in a ColumnVector 
 - PR #2489 Add drop argument to set_index
 - PR #2491 Add Java bindings for ORC reader 'use_np_dtypes' option

@@ -78,29 +78,24 @@ void copy_range(gdf_column *out_column, gdf_column const &in_column,
       // before copying to ensure the strings referred to by the new indices
       // are included in the destination column
 
-      gdf_column * input_cols[2] = {out_column,
-                                    const_cast<gdf_column*>(&in_column)};
-
       // make temporary columns which will have synced categories
       // TODO: these copies seem excessively expensive, but 
       // sync_column_categories doesn't copy the valid mask
       gdf_column temp_out = cudf::copy(*out_column);
       gdf_column temp_in  = cudf::copy(in_column);
-      gdf_column * temp_cols[2] = {&temp_out, &temp_in};
+
+      gdf_column * input_cols[2] = {&temp_out,
+                                    const_cast<gdf_column*>(&in_column)};
+      gdf_column * temp_cols[2] = {out_column, &temp_in};
 
       // sync categories
       CUDF_EXPECTS(GDF_SUCCESS ==
         sync_column_categories(input_cols, temp_cols, 2),
         "Failed to synchronize NVCategory");
 
-      detail::copy_range(&temp_out,
+      detail::copy_range(out_column,
                          detail::column_range_factory{temp_in, in_begin},
                          out_begin, out_end);
-
-      std::swap(out_column->data, temp_out.data);
-      std::swap(out_column->valid, temp_out.valid);
-      std::swap(out_column->null_count, temp_out.null_count);
-      std::swap(out_column->dtype_info.category, temp_out.dtype_info.category);
 
       gdf_column_free(&temp_out);
       gdf_column_free(&temp_in);

diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
@@ -354,6 +354,7 @@ void gather(table const *source_table, gdf_index_type const gather_map[],
   }
 }
 
+
 } // namespace detail
 
 void gather(table const *source_table, gdf_index_type const gather_map[],

@@ -199,6 +199,7 @@ table scatter(table const& source, gdf_index_type const scatter_map[],
   }
 
   detail::scatter(&source, scatter_map, &output);
+  nvcategory_gather_table(output, output);
 
   return output;
 

@@ -19,3 +19,15 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
     cdef gdf_column copy(
         const gdf_column &input
     ) except +
+
+    # Binding for cudf::scatter: takes a source table, a scatter map and a
+    # target table and returns a table by value. Declared `except +` (like
+    # `copy` above) so that a C++ exception raised inside libcudf is turned
+    # into a Python exception instead of escaping through the Cython call.
+    cudf_table scatter(
+        const cudf_table source,
+        const gdf_index_type* scatter_map,
+        const cudf_table target
+    ) except +
+
+    # Binding for cudf::copy_range: writes values taken from `in_column`
+    # (starting at `in_begin`) into the `out_begin`..`out_end` range of
+    # `out_column`. The C++ side validates with CUDF_EXPECTS, which
+    # presumably throws on failure (confirm); `except +` lets Cython convert
+    # such a C++ exception into a Python exception.
+    cdef void copy_range(
+        gdf_column *out_column,
+        const gdf_column in_column,
+        gdf_index_type out_begin,
+        gdf_index_type out_end,
+        gdf_index_type in_begin
+    ) except +
@@ -5,11 +5,16 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
+from cudf.dataframe import columnops
+from cudf.dataframe.buffer import Buffer
 from cudf.bindings.cudf_cpp cimport *
 from cudf.bindings.cudf_cpp import *
-from cudf.utils.cudautils import modulo
+from cudf.bindings.copying cimport *
+import cudf.utils.utils as utils
+from cudf.bindings.utils cimport columns_from_table, table_from_columns
 from librmm_cffi import librmm as rmm
 
+import numba
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -27,7 +32,6 @@ pandas_version = tuple(map(int, pd.__version__.split('.', 2)[:2]))
 
 
 def clone_columns_with_size(in_cols, row_size):
-    from cudf.dataframe import columnops
     out_cols = []
     for col in in_cols:
         o_col = columnops.column_empty_like(col,
@@ -39,26 +43,48 @@ def clone_columns_with_size(in_cols, row_size):
     return out_cols
 
 
-def apply_gather(in_cols, maps, out_cols=None):
-    """
-      Call cudf::gather.
+def _normalize_maps(maps, size):
+    """
+    Convert an index map to an int32 column, reduce every entry modulo
+    ``size`` and return the device memory backing the result.
+
+    Parameters
+    ----------
+    maps : anything accepted by columnops.as_column
+    size : value used as the right-hand operand of the "mod" operation
+
+    Returns
+    -------
+    The ``.data.mem`` of the reduced int32 column.
+    """
+    index_col = columnops.as_column(maps).astype("int32")
+    # presumably this wraps negative / out-of-range indices into [0, size)
+    # -- confirm against the semantics of the "mod" binary operator
+    modulus = index_col.normalize_binop_value(size)
+    wrapped = index_col.binary_operator("mod", modulus)
+    return wrapped.data.mem
 
-     * in_cols input column array
-     * maps RMM device array with gdf_index_type (np.int32 compatible dtype)
-     * out_cols the destination column array to output
 
-     * returns out_cols
+def apply_gather(source, maps, dest=None):
     """
+    Gathers elements from source into dest (if given) using the gathermap maps.
+    If dest is not given, it is allocated inside the function and returned.
+
+    Parameters
+    ----------
+    source : Column or list of Columns
+    maps : DeviceNDArray
+    dest : Column or list of Columns (optional)
+
+    Returns
+    -------
+    Column or list of Columns, or None if dest is given
+    """
+    if isinstance(source, (list, tuple)):
+        if dest is not None:
+            assert(isinstance(dest, (list, tuple)))
+        in_cols = source
+        out_cols = dest
+    else:
+        in_cols = [source]
+        out_cols = None if dest is None else [dest]
+
+    for i, in_col in enumerate(in_cols):
+        in_cols[i] = columnops.as_column(in_cols[i])
+        if dest is not None:
+            out_cols[i] = columnops.as_column(out_cols[i])
+
     if in_cols[0].dtype == np.dtype("object"):
         in_size = in_cols[0].data.size()
     else:
         in_size = in_cols[0].data.size
 
-    from cudf.dataframe import columnops
-    maps = columnops.as_column(maps).astype("int32")
-    maps = maps.data.mem
-    # TODO: replace with libcudf pymod when available
-    maps = modulo(maps, in_size)
+    maps = _normalize_maps(maps, in_size)
 
     col_count=len(in_cols)
     gather_count = len(maps)
@@ -108,42 +134,59 @@ def apply_gather(in_cols, maps, out_cols=None):
 
     free_table(c_in_table, c_in_cols)
 
-    return out_cols
+    if dest is not None:
+        return
 
+    if isinstance(source, (list, tuple)):
+        return out_cols
+    else:
+        return out_cols[0]
 
-def apply_gather_column(in_col, maps, out_col=None):
-    """
-      Call cudf::gather.
 
-     * in_cols input column
-     * maps device array
-     * out_cols the destination column to output
+def apply_scatter(source, maps, target):
+    """
+    Scatters source into target using the scatter map maps and returns the
+    columns of the table produced by libcudf's scatter.
+
+    Parameters
+    ----------
+    source : column-like or list/tuple of column-likes
+        A single (non-list) source is paired with every target column.
+    maps : scatter map; normalized with _normalize_maps against the length
+        of the first target column
+    target : column-like or list/tuple of column-likes
+
+    Each source column must have the same dtype as its target column
+    (checked with an assert). The inputs themselves are not modified.
+
+    Returns
+    -------
+    The result columns if target is a list or tuple, otherwise the single
+    result column.
+    """
+    cdef cudf_table* c_source_table
+    cdef cudf_table* c_target_table
+    cdef cudf_table c_result_table
+    cdef uintptr_t c_maps_ptr
+    cdef gdf_index_type* c_maps
 
-     * returns out_col
-    """
+    # Work on private lists: tuples are accepted here but do not support the
+    # item assignment done in the loop below, and the caller's own list
+    # should not be rewritten as a side effect.
 
-    in_cols = [in_col]
-    out_cols = None if out_col is None else [out_col]
+    if isinstance(target, (list, tuple)):
+        target_cols = list(target)
+    else:
+        target_cols = [target]
 
-    out_cols = apply_gather(in_cols, maps, out_cols)
+    if isinstance(source, (list, tuple)):
+        source_cols = list(source)
+    else:
+        source_cols = [source] * len(target_cols)
 
-    return out_cols[0]
+    for i in range(len(target_cols)):
+        target_cols[i] = columnops.as_column(target_cols[i])
+        source_cols[i] = columnops.as_column(source_cols[i])
+        assert source_cols[i].dtype == target_cols[i].dtype
 
+    c_source_table = table_from_columns(source_cols)
+    c_target_table = table_from_columns(target_cols)
 
-def apply_gather_array(dev_array, maps, out_col=None):
-    """
-      Call cudf::gather.
+    maps = _normalize_maps(maps, len(target_cols[0]))
 
-     * dev_array input device array
-     * maps device array
-     * out_cols the destination column to output
+    c_maps_ptr = get_ctype_ptr(maps)
+    c_maps = <gdf_index_type*>c_maps_ptr
 
-     * returns out_col
-    """
-    from cudf.dataframe import columnops
+    with nogil:
+        c_result_table = scatter(
+            c_source_table[0],
+            c_maps,
+            c_target_table[0])
 
-    in_col = columnops.as_column(dev_array)
-    return apply_gather_column(in_col, maps, out_col)
+    result_cols = columns_from_table(&c_result_table)
+
+    # release the table wrappers allocated by table_from_columns
+    del c_source_table
+    del c_target_table
+
+    if isinstance(target, (list, tuple)):
+        return result_cols
+    else:
+        return result_cols[0]
 
 
 def copy_column(input_col):
@@ -163,3 +206,53 @@ def copy_column(input_col):
     free(output)
 
     return Column.from_mem_views(data, mask, output.null_count)
+
+
+def apply_copy_range(out_col, in_col, int out_begin, int out_end,
+                     int in_begin):
+    """
+    Copy values from in_col into a range of out_col by calling libcudf's
+    copy_range, updating out_col in place.
+
+    Parameters
+    ----------
+    out_col : destination column; modified in place and returned
+    in_col : source column; may be given an all-valid mask (see below)
+    out_begin, out_end : int
+        Bounds of the destination range. Negative values are offset by
+        len(out_col).
+    in_begin : int
+        Start position in in_col, passed to copy_range unchanged.
+
+    Returns
+    -------
+    out_col
+    """
+    # Imported locally: cudautils is used below but no module-level import
+    # of it is visible in this file.
+    from cudf.utils import cudautils
+
+    # NOTE(review): this check runs on the raw (possibly negative) bounds and
+    # also skips a span of exactly one element -- confirm both are intended.
+    if abs(out_end - out_begin) <= 1:
+        return out_col
+
+    if out_begin < 0:
+        out_begin = len(out_col) + out_begin
+    if out_end < 0:
+        out_end = len(out_col) + out_end
+
+    if out_begin > out_end:
+        return out_col
+
+    # When one side carries a null mask and the other has no nulls, give the
+    # other side a mask with every byte set to 0xff so both columns have one.
+    if out_col.null_count == 0 and in_col.has_null_mask:
+        mask = utils.make_mask(len(out_col))
+        cudautils.fill_value(mask, 0xff)
+        out_col._mask = Buffer(mask)
+        out_col._null_count = 0
+
+    if in_col.null_count == 0 and out_col.has_null_mask:
+        mask = utils.make_mask(len(in_col))
+        cudautils.fill_value(mask, 0xff)
+        in_col._mask = Buffer(mask)
+        in_col._null_count = 0
+
+    cdef gdf_column* c_out_col = column_view_from_column(out_col)
+    cdef gdf_column* c_in_col = column_view_from_column(in_col)
+
+    with nogil:
+        copy_range(c_out_col,
+                   c_in_col[0],
+                   out_begin,
+                   out_end,
+                   in_begin)
+
+    # copy_range reports the new null count through the column view
+    out_col._update_null_count(c_out_col.null_count)
+
+    # string (object dtype) columns pick up the category pointer that
+    # copy_range left in the view
+    if out_col.dtype == np.dtype("object") and len(out_col) > 0:
+        update_nvstrings_col(
+            out_col,
+            <uintptr_t>c_out_col.dtype_info.category)
+
+    free(c_in_col)
+    free(c_out_col)
+
+    return out_col
@@ -27,7 +27,7 @@ cpdef gdf_time_unit np_dtype_to_gdf_time_unit(dtype)
 cpdef gdf_time_unit_to_np_dtype(gdf_time_unit time_unit)
 
 cdef np_dtype_from_gdf_column(gdf_column* col)
-cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*)
+
 
 cdef get_scalar_value(gdf_scalar scalar, dtype)
 
@@ -391,3 +391,5 @@ cdef extern from "cudf/legacy/table.hpp" namespace "cudf" nogil:
 #        const gdf_column* const* begin() const except +
 #        gdf_column const* const* end() const
 #        gdf_column const* get_column(gdf_index_type index) const except +
+
+cpdef gdf_dtype gdf_dtype_from_value(col, dtype=*) except? GDF_invalid
@@ -162,7 +162,7 @@ cdef np_dtype_from_gdf_column(gdf_column* col):
     raise TypeError('cannot convert gdf_dtype `%s` to numpy dtype' % (dtype))
 
 
-cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None):
+cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None) except? GDF_invalid:
     """Util to convert a column's or np.scalar's dtype to gdf dtype.
 
     Parameters
@@ -172,7 +172,8 @@ cpdef gdf_dtype gdf_dtype_from_value(col, dtype=None):
     dtype : numpy.dtype; optional
         The dtype to convert to a gdf_dtype.  Defaults to *col.dtype*.
     """
-    dtype = col.dtype if dtype is None else np.dtype(dtype)
+    dtype = col.dtype if dtype is None else pd.api.types.pandas_dtype(dtype)
+
     # if dtype is pd.CategoricalDtype, use the codes' gdf_dtype
     if is_categorical_dtype(dtype):
         if col is None:
@@ -308,7 +309,7 @@ cdef gdf_column* column_view_from_column(col, col_name=None) except? NULL:
         else:
             data_ptr = 0
 
-    if col._mask is not None and col.null_count > 0:
+    if col._mask is not None:
         valid_ptr = get_column_valid_ptr(col)
     else:
         valid_ptr = 0

@@ -11,7 +11,7 @@ from cudf.bindings.cudf_cpp cimport *
 from cudf.bindings.cudf_cpp import *
 from cudf.bindings.replace cimport *
 from cudf.dataframe.column import Column
-from cudf.utils.utils import is_single_value
+from cudf.utils.utils import is_scalar
 
 from libc.stdlib cimport calloc, malloc, free
 
@@ -79,7 +79,7 @@ cpdef apply_replace_nulls(inp, replacement):
     Call replace_nulls
     """
 
-    if is_single_value(replacement):
+    if is_scalar(replacement):
         return apply_replace_nulls_scalar(inp, replacement)
     else:
         return apply_replace_nulls_column(inp, replacement)
@@ -12,27 +12,28 @@ from cudf.dataframe.column import Column
 from libc.stdlib cimport free
 
 import numpy as np
+import pandas as pd
 
 
 def apply_cast(incol, dtype=np.float64):
     """
     Return a Column with values in `incol` casted to `dtype`.
     Currently supports numeric and datetime dtypes.
     """
+    dtype = np.dtype(np.float64 if dtype is None else dtype)
 
     check_gdf_compatibility(incol)
 
-    cdef gdf_column* c_incol = column_view_from_column(incol)
+    if pd.api.types.is_dtype_equal(incol.dtype, dtype):
+        return incol
 
-    dtype = np.dtype(np.float64 if dtype is None else dtype)
+    cdef gdf_column* c_incol = column_view_from_column(incol)
     cdef gdf_dtype c_out_dtype = gdf_dtype_from_value(incol, dtype)
     cdef uintptr_t c_category
-
     cdef gdf_dtype_extra_info c_out_info = gdf_dtype_extra_info(
         time_unit=np_dtype_to_gdf_time_unit(dtype),
         category=<void*>c_category
     )
-
     cdef gdf_column result
 
     with nogil:
-Original file line number
+Diff line change
@@ Expand Up @@
       }
     }
     } // namespace detail
     void gather(table const *source_table, gdf_index_type const gather_map[],
@@ Expand Down @@