Skip to content

Commit 1a8e046

Browse files
felixcheung authored and davies committed
[SPARK-11468] [SPARKR] add stddev/variance agg functions for Column
Checked names; none of them should conflict with anything in base.

shivaram davies rxin

Author: felixcheung <[email protected]>

Closes #9489 from felixcheung/rstddev.
1 parent fac53d8 commit 1a8e046

5 files changed

Lines changed: 297 additions & 30 deletions

File tree

R/pkg/NAMESPACE

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ exportMethods("%in%",
155155
"isNaN",
156156
"isNotNull",
157157
"isNull",
158+
"kurtosis",
158159
"lag",
159160
"last",
160161
"last_day",
@@ -207,12 +208,17 @@ exportMethods("%in%",
207208
"shiftLeft",
208209
"shiftRight",
209210
"shiftRightUnsigned",
211+
"sd",
210212
"sign",
211213
"signum",
212214
"sin",
213215
"sinh",
214216
"size",
217+
"skewness",
215218
"soundex",
219+
"stddev",
220+
"stddev_pop",
221+
"stddev_samp",
216222
"sqrt",
217223
"startsWith",
218224
"substr",
@@ -231,6 +237,10 @@ exportMethods("%in%",
231237
"unhex",
232238
"unix_timestamp",
233239
"upper",
240+
"var",
241+
"variance",
242+
"var_pop",
243+
"var_samp",
234244
"weekofyear",
235245
"when",
236246
"year")

R/pkg/R/functions.R

Lines changed: 174 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,22 @@ setMethod("isNaN",
520520
column(jc)
521521
})
522522

523+
#' kurtosis
524+
#'
525+
#' Aggregate function: returns the kurtosis of the values in a group.
526+
#'
527+
#' @rdname kurtosis
528+
#' @name kurtosis
529+
#' @family agg_funcs
530+
#' @export
531+
#' @examples \dontrun{kurtosis(df$c)}
532+
setMethod("kurtosis",
533+
signature(x = "Column"),
534+
function(x) {
535+
jc <- callJStatic("org.apache.spark.sql.functions", "kurtosis", x@jc)
536+
column(jc)
537+
})
538+
523539
#' last
524540
#'
525541
#' Aggregate function: returns the last value in a group.
@@ -861,6 +877,28 @@ setMethod("rtrim",
861877
column(jc)
862878
})
863879

880+
#' sd
881+
#'
882+
#' Aggregate function: alias for \link{stddev_samp}.
883+
#'
884+
#' @rdname sd
885+
#' @name sd
886+
#' @family agg_funcs
887+
#' @seealso \link{stddev_pop}, \link{stddev_samp}
888+
#' @export
889+
#' @examples
890+
#'\dontrun{
891+
#'stddev(df$c)
892+
#'select(df, stddev(df$age))
893+
#'agg(df, sd(df$age))
894+
#'}
895+
setMethod("sd",
896+
signature(x = "Column"),
897+
function(x, na.rm = FALSE) {
898+
# In R, sample standard deviation is calculated with the sd() function.
899+
stddev_samp(x)
900+
})
901+
864902
#' second
865903
#'
866904
#' Extracts the seconds as an integer from a given date/timestamp/string.
@@ -958,6 +996,22 @@ setMethod("size",
958996
column(jc)
959997
})
960998

999+
#' skewness
1000+
#'
1001+
#' Aggregate function: returns the skewness of the values in a group.
1002+
#'
1003+
#' @rdname skewness
1004+
#' @name skewness
1005+
#' @family agg_funcs
1006+
#' @export
1007+
#' @examples \dontrun{skewness(df$c)}
1008+
setMethod("skewness",
1009+
signature(x = "Column"),
1010+
function(x) {
1011+
jc <- callJStatic("org.apache.spark.sql.functions", "skewness", x@jc)
1012+
column(jc)
1013+
})
1014+
9611015
#' soundex
9621016
#'
9631017
#' Return the soundex code for the specified expression.
@@ -974,6 +1028,49 @@ setMethod("soundex",
9741028
column(jc)
9751029
})
9761030

1031+
#' @rdname sd
1032+
#' @name stddev
1033+
setMethod("stddev",
1034+
signature(x = "Column"),
1035+
function(x) {
1036+
jc <- callJStatic("org.apache.spark.sql.functions", "stddev", x@jc)
1037+
column(jc)
1038+
})
1039+
1040+
#' stddev_pop
1041+
#'
1042+
#' Aggregate function: returns the population standard deviation of the expression in a group.
1043+
#'
1044+
#' @rdname stddev_pop
1045+
#' @name stddev_pop
1046+
#' @family agg_funcs
1047+
#' @seealso \link{sd}, \link{stddev_samp}
1048+
#' @export
1049+
#' @examples \dontrun{stddev_pop(df$c)}
1050+
setMethod("stddev_pop",
1051+
signature(x = "Column"),
1052+
function(x) {
1053+
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc)
1054+
column(jc)
1055+
})
1056+
1057+
#' stddev_samp
1058+
#'
1059+
#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group.
1060+
#'
1061+
#' @rdname stddev_samp
1062+
#' @name stddev_samp
1063+
#' @family agg_funcs
1064+
#' @seealso \link{stddev_pop}, \link{sd}
1065+
#' @export
1066+
#' @examples \dontrun{stddev_samp(df$c)}
1067+
setMethod("stddev_samp",
1068+
signature(x = "Column"),
1069+
function(x) {
1070+
jc <- callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc)
1071+
column(jc)
1072+
})
1073+
9771074
#' sqrt
9781075
#'
9791076
#' Computes the square root of the specified float value.
@@ -1168,6 +1265,71 @@ setMethod("upper",
11681265
column(jc)
11691266
})
11701267

1268+
#' var
1269+
#'
1270+
#' Aggregate function: alias for \link{var_samp}.
1271+
#'
1272+
#' @rdname var
1273+
#' @name var
1274+
#' @family agg_funcs
1275+
#' @seealso \link{var_pop}, \link{var_samp}
1276+
#' @export
1277+
#' @examples
1278+
#'\dontrun{
1279+
#'variance(df$c)
1280+
#'select(df, var_pop(df$age))
1281+
#'agg(df, var(df$age))
1282+
#'}
1283+
setMethod("var",
1284+
signature(x = "Column"),
1285+
function(x, y = NULL, na.rm = FALSE, use) {
1286+
# In R, sample variance is calculated with the var() function.
1287+
var_samp(x)
1288+
})
1289+
1290+
#' @rdname var
1291+
#' @name variance
1292+
setMethod("variance",
1293+
signature(x = "Column"),
1294+
function(x) {
1295+
jc <- callJStatic("org.apache.spark.sql.functions", "variance", x@jc)
1296+
column(jc)
1297+
})
1298+
1299+
#' var_pop
1300+
#'
1301+
#' Aggregate function: returns the population variance of the values in a group.
1302+
#'
1303+
#' @rdname var_pop
1304+
#' @name var_pop
1305+
#' @family agg_funcs
1306+
#' @seealso \link{var}, \link{var_samp}
1307+
#' @export
1308+
#' @examples \dontrun{var_pop(df$c)}
1309+
setMethod("var_pop",
1310+
signature(x = "Column"),
1311+
function(x) {
1312+
jc <- callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc)
1313+
column(jc)
1314+
})
1315+
1316+
#' var_samp
1317+
#'
1318+
#' Aggregate function: returns the unbiased variance of the values in a group.
1319+
#'
1320+
#' @rdname var_samp
1321+
#' @name var_samp
1322+
#' @family agg_funcs
1323+
#' @seealso \link{var_pop}, \link{var}
1324+
#' @export
1325+
#' @examples \dontrun{var_samp(df$c)}
1326+
setMethod("var_samp",
1327+
signature(x = "Column"),
1328+
function(x) {
1329+
jc <- callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc)
1330+
column(jc)
1331+
})
1332+
11711333
#' weekofyear
11721334
#'
11731335
#' Extracts the week number as an integer from a given date/timestamp/string.
@@ -2020,10 +2182,10 @@ setMethod("ifelse",
20202182
#'
20212183
#' Window function: returns the cumulative distribution of values within a window partition,
20222184
#' i.e. the fraction of rows that are below the current row.
2023-
#'
2185+
#'
20242186
#' N = total number of rows in the partition
20252187
#' cumeDist(x) = number of values before (and including) x / N
2026-
#'
2188+
#'
20272189
#' This is equivalent to the CUME_DIST function in SQL.
20282190
#'
20292191
#' @rdname cumeDist
@@ -2039,13 +2201,13 @@ setMethod("cumeDist",
20392201
})
20402202

20412203
#' denseRank
2042-
#'
2204+
#'
20432205
#' Window function: returns the rank of rows within a window partition, without any gaps.
20442206
#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
20452207
#' sequence when there are ties. That is, if you were ranking a competition using denseRank
20462208
#' and had three people tie for second place, you would say that all three were in second
20472209
#' place and that the next person came in third.
2048-
#'
2210+
#'
20492211
#' This is equivalent to the DENSE_RANK function in SQL.
20502212
#'
20512213
#' @rdname denseRank
@@ -2065,7 +2227,7 @@ setMethod("denseRank",
20652227
#' Window function: returns the value that is `offset` rows before the current row, and
20662228
#' `defaultValue` if there are fewer than `offset` rows before the current row. For example,
20672229
#' an `offset` of one will return the previous row at any given point in the window partition.
2068-
#'
2230+
#'
20692231
#' This is equivalent to the LAG function in SQL.
20702232
#'
20712233
#' @rdname lag
@@ -2092,7 +2254,7 @@ setMethod("lag",
20922254
#' Window function: returns the value that is `offset` rows after the current row, and
20932255
#' `null` if there are fewer than `offset` rows after the current row. For example,
20942256
#' an `offset` of one will return the next row at any given point in the window partition.
2095-
#'
2257+
#'
20962258
#' This is equivalent to the LEAD function in SQL.
20972259
#'
20982260
#' @rdname lead
@@ -2119,7 +2281,7 @@ setMethod("lead",
21192281
#' Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window
21202282
#' partition. For example, if `n` is 4, the first quarter of the rows will get value 1, the second
21212283
#' quarter will get 2, the third quarter will get 3, and the last quarter will get 4.
2122-
#'
2284+
#'
21232285
#' This is equivalent to the NTILE function in SQL.
21242286
#'
21252287
#' @rdname ntile
@@ -2137,9 +2299,9 @@ setMethod("ntile",
21372299
#' percentRank
21382300
#'
21392301
#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
2140-
#'
2302+
#'
21412303
#' This is computed by:
2142-
#'
2304+
#'
21432305
#' (rank of row in its partition - 1) / (number of rows in the partition - 1)
21442306
#'
21452307
#' This is equivalent to the PERCENT_RANK function in SQL.
@@ -2159,12 +2321,12 @@ setMethod("percentRank",
21592321
#' rank
21602322
#'
21612323
#' Window function: returns the rank of rows within a window partition.
2162-
#'
2324+
#'
21632325
#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
21642326
#' sequence when there are ties. That is, if you were ranking a competition using denseRank
21652327
#' and had three people tie for second place, you would say that all three were in second
21662328
#' place and that the next person came in third.
2167-
#'
2329+
#'
21682330
#' This is equivalent to the RANK function in SQL.
21692331
#'
21702332
#' @rdname rank
@@ -2189,7 +2351,7 @@ setMethod("rank",
21892351
#' rowNumber
21902352
#'
21912353
#' Window function: returns a sequential number starting at 1 within a window partition.
2192-
#'
2354+
#'
21932355
#' This is equivalent to the ROW_NUMBER function in SQL.
21942356
#'
21952357
#' @rdname rowNumber

R/pkg/R/generics.R

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,10 @@ setGeneric("instr", function(y, x) { standardGeneric("instr") })
798798
#' @export
799799
setGeneric("isNaN", function(x) { standardGeneric("isNaN") })
800800

801+
#' @rdname kurtosis
802+
#' @export
803+
setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") })
804+
801805
#' @rdname lag
802806
#' @export
803807
setGeneric("lag", function(x, offset, defaultValue = NULL) { standardGeneric("lag") })
@@ -935,6 +939,10 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
935939
#' @export
936940
setGeneric("rtrim", function(x) { standardGeneric("rtrim") })
937941

942+
#' @rdname sd
943+
#' @export
944+
setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") })
945+
938946
#' @rdname second
939947
#' @export
940948
setGeneric("second", function(x) { standardGeneric("second") })
@@ -967,10 +975,26 @@ setGeneric("signum", function(x) { standardGeneric("signum") })
967975
#' @export
968976
setGeneric("size", function(x) { standardGeneric("size") })
969977

978+
#' @rdname skewness
979+
#' @export
980+
setGeneric("skewness", function(x) { standardGeneric("skewness") })
981+
970982
#' @rdname soundex
971983
#' @export
972984
setGeneric("soundex", function(x) { standardGeneric("soundex") })
973985

986+
#' @rdname sd
987+
#' @export
988+
setGeneric("stddev", function(x) { standardGeneric("stddev") })
989+
990+
#' @rdname stddev_pop
991+
#' @export
992+
setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") })
993+
994+
#' @rdname stddev_samp
995+
#' @export
996+
setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") })
997+
974998
#' @rdname substring_index
975999
#' @export
9761000
setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") })
@@ -1019,6 +1043,22 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta
10191043
#' @export
10201044
setGeneric("upper", function(x) { standardGeneric("upper") })
10211045

1046+
#' @rdname var
1047+
#' @export
1048+
setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") })
1049+
1050+
#' @rdname var
1051+
#' @export
1052+
setGeneric("variance", function(x) { standardGeneric("variance") })
1053+
1054+
#' @rdname var_pop
1055+
#' @export
1056+
setGeneric("var_pop", function(x) { standardGeneric("var_pop") })
1057+
1058+
#' @rdname var_samp
1059+
#' @export
1060+
setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
1061+
10221062
#' @rdname weekofyear
10231063
#' @export
10241064
setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })

0 commit comments

Comments
 (0)