Skip to content

Commit d981bfc

Browse files
zero323 authored and shivaram committed
[SPARK-11086][SPARKR] Use dropFactors column-wise instead of nested loop when createDataFrame
Use `dropFactors` column-wise instead of nested loop when `createDataFrame` from a `data.frame` At this moment SparkR createDataFrame is using nested loop to convert factors to character when called on a local data.frame. It works but is incredibly slow especially with data.table (~ 2 orders of magnitude compared to PySpark / Pandas version on a DataFrame of size 1M rows x 2 columns). A simple improvement is to apply `dropFactor` column-wise and then reshape output list. It should at least partially address [SPARK-8277](https://issues.apache.org/jira/browse/SPARK-8277). Author: zero323 <[email protected]> Closes #9099 from zero323/SPARK-11086. (cherry picked from commit d7d9fa0) Signed-off-by: Shivaram Venkataraman <[email protected]>
1 parent 2f0f8bb commit d981bfc

2 files changed

Lines changed: 49 additions & 21 deletions

File tree

R/pkg/R/SQLContext.R

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,33 @@
1717

1818
# SQLcontext.R: SQLContext-driven functions
1919

20+
21+
# Map top level R type to SQL type
22+
getInternalType <- function(x) {
23+
# class of POSIXlt is c("POSIXlt" "POSIXt")
24+
switch(class(x)[[1]],
25+
integer = "integer",
26+
character = "string",
27+
logical = "boolean",
28+
double = "double",
29+
numeric = "double",
30+
raw = "binary",
31+
list = "array",
32+
struct = "struct",
33+
environment = "map",
34+
Date = "date",
35+
POSIXlt = "timestamp",
36+
POSIXct = "timestamp",
37+
stop(paste("Unsupported type for DataFrame:", class(x))))
38+
}
39+
2040
#' infer the SQL type
2141
infer_type <- function(x) {
2242
if (is.null(x)) {
2343
stop("can not infer type from NULL")
2444
}
2545

26-
# class of POSIXlt is c("POSIXlt" "POSIXt")
27-
type <- switch(class(x)[[1]],
28-
integer = "integer",
29-
character = "string",
30-
logical = "boolean",
31-
double = "double",
32-
numeric = "double",
33-
raw = "binary",
34-
list = "array",
35-
struct = "struct",
36-
environment = "map",
37-
Date = "date",
38-
POSIXlt = "timestamp",
39-
POSIXct = "timestamp",
40-
stop(paste("Unsupported type for DataFrame:", class(x))))
46+
type <- getInternalType(x)
4147

4248
if (type == "map") {
4349
stopifnot(length(x) > 0)
@@ -90,19 +96,25 @@ createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0
9096
if (is.null(schema)) {
9197
schema <- names(data)
9298
}
93-
n <- nrow(data)
94-
m <- ncol(data)
99+
95100
# get rid of factor type
96-
dropFactor <- function(x) {
101+
cleanCols <- function(x) {
97102
if (is.factor(x)) {
98103
as.character(x)
99104
} else {
100105
x
101106
}
102107
}
103-
data <- lapply(1:n, function(i) {
104-
lapply(1:m, function(j) { dropFactor(data[i,j]) })
105-
})
108+
109+
# drop factors and wrap lists
110+
data <- setNames(lapply(data, cleanCols), NULL)
111+
112+
# check if all columns have supported type
113+
lapply(data, getInternalType)
114+
115+
# convert to rows
116+
args <- list(FUN = list, SIMPLIFY = FALSE, USE.NAMES = FALSE)
117+
data <- do.call(mapply, append(args, data))
106118
}
107119
if (is.list(data)) {
108120
sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sqlContext)

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,14 @@ test_that("create DataFrame from list or data.frame", {
242242
expect_equal(count(df), 3)
243243
ldf2 <- collect(df)
244244
expect_equal(ldf$a, ldf2$a)
245+
246+
irisdf <- createDataFrame(sqlContext, iris)
247+
iris_collected <- collect(irisdf)
248+
expect_equivalent(iris_collected[,-5], iris[,-5])
249+
expect_equal(iris_collected$Species, as.character(iris$Species))
250+
251+
mtcarsdf <- createDataFrame(sqlContext, mtcars)
252+
expect_equivalent(collect(mtcarsdf), mtcars)
245253
})
246254

247255
test_that("create DataFrame with different data types", {
@@ -283,6 +291,14 @@ test_that("create DataFrame with complex types", {
283291
expect_equal(s$b, 3L)
284292
})
285293

294+
test_that("create DataFrame from a data.frame with complex types", {
295+
ldf <- data.frame(row.names=1:2)
296+
ldf$a_list <- list(list(1, 2), list(3, 4))
297+
sdf <- createDataFrame(sqlContext, ldf)
298+
299+
expect_equivalent(ldf, collect(sdf))
300+
})
301+
286302
# For test map type and struct type in DataFrame
287303
mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
288304
"{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",

0 commit comments

Comments
 (0)