Skip to content

Commit e9efb62

Browse files
huaxingao authored and HyukjinKwon committed
[SPARK-24187][R][SQL] Add array_join function to SparkR
## What changes were proposed in this pull request?

This PR adds the array_join function to SparkR.

## How was this patch tested?

Add unit test in test_sparkSQL.R

Author: Huaxin Gao <huaxing@us.ibm.com>

Closes #21313 from huaxingao/spark-24187.
1 parent 93df3cd commit e9efb62

4 files changed

Lines changed: 46 additions & 3 deletions

File tree

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ exportMethods("%<=>%",
201201
"approxCountDistinct",
202202
"approxQuantile",
203203
"array_contains",
204+
"array_join",
204205
"array_max",
205206
"array_min",
206207
"array_position",

R/pkg/R/functions.R

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,9 @@ NULL
221221
#' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
222222
#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
223223
#' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
224-
#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))}
224+
#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
225+
#' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
226+
#' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))}
225227
NULL
226228

227229
#' Window functions for Column operations
@@ -3006,6 +3008,27 @@ setMethod("array_contains",
30063008
column(jc)
30073009
})
30083010

3011+
#' @details
3012+
#' \code{array_join}: Concatenates the elements of column using the delimiter.
3013+
#' Null values are replaced with nullReplacement if set, otherwise they are ignored.
3014+
#'
3015+
#' @param delimiter a character string that is used to concatenate the elements of column.
3016+
#' @param nullReplacement an optional character string that is used to replace the Null values.
3017+
#' @rdname column_collection_functions
3018+
#' @aliases array_join array_join,Column-method
3019+
#' @note array_join since 2.4.0
3020+
setMethod("array_join",
3021+
signature(x = "Column", delimiter = "character"),
3022+
function(x, delimiter, nullReplacement = NULL) {
3023+
jc <- if (is.null(nullReplacement)) {
3024+
callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter)
3025+
} else {
3026+
callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter,
3027+
as.character(nullReplacement))
3028+
}
3029+
column(jc)
3030+
})
3031+
30093032
#' @details
30103033
#' \code{array_max}: Returns the maximum value of the array.
30113034
#'
@@ -3197,8 +3220,8 @@ setMethod("size",
31973220
#' (or starting from the end if start is negative) with the specified length.
31983221
#'
31993222
#' @rdname column_collection_functions
3200-
#' @param start an index indicating the first element occuring in the result.
3201-
#' @param length a number of consecutive elements choosen to the result.
3223+
#' @param start an index indicating the first element occurring in the result.
3224+
#' @param length a number of consecutive elements chosen to the result.
32023225
#' @aliases slice slice,Column-method
32033226
#' @note slice since 2.4.0
32043227
setMethod("slice",

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,10 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCoun
757757
#' @name NULL
758758
setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") })
759759

760+
#' @rdname column_collection_functions
761+
#' @name NULL
762+
setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
763+
760764
#' @rdname column_collection_functions
761765
#' @name NULL
762766
setGeneric("array_max", function(x) { standardGeneric("array_max") })

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,6 +1518,21 @@ test_that("column functions", {
15181518
result <- collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]]
15191519
expect_equal(result, c(TRUE, FALSE, NA))
15201520

1521+
# Test array_join()
1522+
df <- createDataFrame(list(list(list("Hello", "World!"))))
1523+
result <- collect(select(df, array_join(df[[1]], "#")))[[1]]
1524+
expect_equal(result, "Hello#World!")
1525+
df2 <- createDataFrame(list(list(list("Hello", NA, "World!"))))
1526+
result <- collect(select(df2, array_join(df2[[1]], "#", "Beautiful")))[[1]]
1527+
expect_equal(result, "Hello#Beautiful#World!")
1528+
result <- collect(select(df2, array_join(df2[[1]], "#")))[[1]]
1529+
expect_equal(result, "Hello#World!")
1530+
df3 <- createDataFrame(list(list(list("Hello", NULL, "World!"))))
1531+
result <- collect(select(df3, array_join(df3[[1]], "#", "Beautiful")))[[1]]
1532+
expect_equal(result, "Hello#Beautiful#World!")
1533+
result <- collect(select(df3, array_join(df3[[1]], "#")))[[1]]
1534+
expect_equal(result, "Hello#World!")
1535+
15211536
# Test array_sort() and sort_array()
15221537
df <- createDataFrame(list(list(list(2L, 1L, 3L, NA)), list(list(NA, 6L, 5L, NA, 4L))))
15231538

0 commit comments

Comments
 (0)