@@ -1657,9 +1657,7 @@ setMethod("dapplyCollect",
16571657# '
16581658# ' @param cols grouping columns.
16591659# ' @param func a function to be applied to each group partition specified by grouping
1660- # ' column of the SparkDataFrame. The function \code{func} takes as argument
1661- # ' a key - grouping columns and a data frame - a local R data.frame.
1662- # ' The output of \code{func} is a local R data.frame.
1660+ # ' column of the SparkDataFrame. See Details.
16631661# ' @param schema the schema of the resulting SparkDataFrame after the function is applied.
16641662# ' The schema must match to output of \code{func}. It has to be defined for each
16651663# ' output column with preferred output column name and corresponding data type.
@@ -1669,29 +1667,43 @@ setMethod("dapplyCollect",
16691667# ' @aliases gapply,SparkDataFrame-method
16701668# ' @rdname gapply
16711669# ' @name gapply
1670+ # ' @details
1671+ # ' \code{func} is a function of two arguments. The first, usually named \code{key}
1672+ # ' (though this is not enforced), corresponds to the grouping key and will be an
1673+ # ' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
1674+ # ' to the grouping columns' values for the current group.
1675+ # '
1676+ # ' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
1677+ # ' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
1678+ # '
1679+ # ' The output of \code{func} must be a \code{data.frame} matching \code{schema} --
1680+ # ' in particular this means the names of the output \code{data.frame} are irrelevant.
1681+ # '
16721682# ' @seealso \link{gapplyCollect}
16731683# ' @examples
16741684# '
16751685# ' \dontrun{
1676- # ' Computes the arithmetic mean of the second column by grouping
1677- # ' on the first and third columns. Output the grouping values and the average.
1686+ # ' # Computes the arithmetic mean of the second column by grouping
1687+ # ' # on the first and third columns. Output the grouping values and the average.
16781688# '
16791689# ' df <- createDataFrame (
16801690# ' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
16811691# ' c("a", "b", "c", "d"))
16821692# '
1683- # ' Here our output contains three columns, the key which is a combination of two
1684- # ' columns with data types integer and string and the mean which is a double.
1693+ # ' # Here our output contains three columns, the key which is a combination of two
1694+ # ' # columns with data types integer and string and the mean which is a double.
16851695# ' schema <- structType(structField("a", "integer"), structField("c", "string"),
16861696# ' structField("avg", "double"))
16871697# ' result <- gapply(
16881698# ' df,
16891699# ' c("a", "c"),
16901700# ' function(key, x) {
1701+ # ' # key will either be list(1L, '1') (for the group where a=1L,c='1') or
1702+ # ' # list(3L, '3') (for the group where a=3L,c='3')
16911703# ' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
16921704# ' }, schema)
16931705# '
1694- # ' The schema also can be specified in a DDL-formatted string.
1706+ # ' # The schema also can be specified in a DDL-formatted string.
16951707# ' schema <- "a INT, c STRING, avg DOUBLE"
16961708# ' result <- gapply(
16971709# ' df,
@@ -1700,8 +1712,8 @@ setMethod("dapplyCollect",
17001712# ' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
17011713# ' }, schema)
17021714# '
1703- # ' We can also group the data and afterwards call gapply on GroupedData.
1704- # ' For Example :
1715+ # ' # We can also group the data and afterwards call gapply on GroupedData.
1716+ # ' # For example:
17051717# ' gdf <- group_by(df, "a", "c")
17061718# ' result <- gapply(
17071719# ' gdf,
@@ -1710,15 +1722,15 @@ setMethod("dapplyCollect",
17101722# ' }, schema)
17111723# ' collect(result)
17121724# '
1713- # ' Result
1714- # ' ------
1715- # ' a c avg
1716- # ' 3 3 3.0
1717- # ' 1 1 1.5
1725+ # ' # Result
1726+ # ' # ------
1727+ # ' # a c avg
1728+ # ' # 3 3 3.0
1729+ # ' # 1 1 1.5
17181730# '
1719- # ' Fits linear models on iris dataset by grouping on the 'Species' column and
1720- # ' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1721- # ' and 'Petal_Width' as training features.
1731+ # ' # Fits linear models on iris dataset by grouping on the 'Species' column and
1732+ # ' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1733+ # ' # and 'Petal_Width' as training features.
17221734# '
17231735# ' df <- createDataFrame (iris)
17241736# ' schema <- structType(structField("(Intercept)", "double"),
@@ -1734,12 +1746,12 @@ setMethod("dapplyCollect",
17341746# ' }, schema)
17351747# ' collect(df1)
17361748# '
1737- # ' Result
1738- # ' ---------
1739- # ' Model (Intercept) Sepal_Width Petal_Length Petal_Width
1740- # ' 1 0.699883 0.3303370 0.9455356 -0.1697527
1741- # ' 2 1.895540 0.3868576 0.9083370 -0.6792238
1742- # ' 3 2.351890 0.6548350 0.2375602 0.2521257
1749+ # ' # Result
1750+ # ' # ---------
1751+ # ' # Model (Intercept) Sepal_Width Petal_Length Petal_Width
1752+ # ' # 1 0.699883 0.3303370 0.9455356 -0.1697527
1753+ # ' # 2 1.895540 0.3868576 0.9083370 -0.6792238
1754+ # ' # 3 2.351890 0.6548350 0.2375602 0.2521257
17431755# '
17441756# '}
17451757# ' @note gapply(SparkDataFrame) since 2.0.0
@@ -1757,20 +1769,30 @@ setMethod("gapply",
17571769# '
17581770# ' @param cols grouping columns.
17591771# ' @param func a function to be applied to each group partition specified by grouping
1760- # ' column of the SparkDataFrame. The function \code{func} takes as argument
1761- # ' a key - grouping columns and a data frame - a local R data.frame.
1762- # ' The output of \code{func} is a local R data.frame.
1772+ # ' column of the SparkDataFrame. See Details.
17631773# ' @return A data.frame.
17641774# ' @family SparkDataFrame functions
17651775# ' @aliases gapplyCollect,SparkDataFrame-method
17661776# ' @rdname gapplyCollect
17671777# ' @name gapplyCollect
1778+ # ' @details
1779+ # ' \code{func} is a function of two arguments. The first, usually named \code{key}
1780+ # ' (though this is not enforced), corresponds to the grouping key and will be an
1781+ # ' unnamed \code{list} of \code{length(cols)} length-one objects corresponding
1782+ # ' to the grouping columns' values for the current group.
1783+ # '
1784+ # ' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the
1785+ # ' columns of the input not in \code{cols} for the rows corresponding to \code{key}.
1786+ # '
1787+ # ' The output of \code{func} must be a local \code{data.frame}. No \code{schema} is
1788+ # ' supplied here, so the names of the output \code{data.frame} are kept in the result.
1789+ # '
17681790# ' @seealso \link{gapply}
17691791# ' @examples
17701792# '
17711793# ' \dontrun{
1772- # ' Computes the arithmetic mean of the second column by grouping
1773- # ' on the first and third columns. Output the grouping values and the average.
1794+ # ' # Computes the arithmetic mean of the second column by grouping
1795+ # ' # on the first and third columns. Output the grouping values and the average.
17741796# '
17751797# ' df <- createDataFrame (
17761798# ' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
@@ -1785,8 +1807,8 @@ setMethod("gapply",
17851807# ' y
17861808# ' })
17871809# '
1788- # ' We can also group the data and afterwards call gapply on GroupedData.
1789- # ' For Example :
1810+ # ' # We can also group the data and afterwards call gapplyCollect on GroupedData.
1811+ # ' # For example:
17901812# ' gdf <- group_by(df, "a", "c")
17911813# ' result <- gapplyCollect(
17921814# ' gdf,
@@ -1796,15 +1818,15 @@ setMethod("gapply",
17961818# ' y
17971819# ' })
17981820# '
1799- # ' Result
1800- # ' ------
1801- # ' key_a key_c mean_b
1802- # ' 3 3 3.0
1803- # ' 1 1 1.5
1821+ # ' # Result
1822+ # ' # ------
1823+ # ' # key_a key_c mean_b
1824+ # ' # 3 3 3.0
1825+ # ' # 1 1 1.5
18041826# '
1805- # ' Fits linear models on iris dataset by grouping on the 'Species' column and
1806- # ' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1807- # ' and 'Petal_Width' as training features.
1827+ # ' # Fits linear models on iris dataset by grouping on the 'Species' column and
1828+ # ' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
1829+ # ' # and 'Petal_Width' as training features.
18081830# '
18091831# ' df <- createDataFrame (iris)
18101832# ' result <- gapplyCollect(
@@ -1816,12 +1838,12 @@ setMethod("gapply",
18161838# ' data.frame(t(coef(m)))
18171839# ' })
18181840# '
1819- # ' Result
1820- # '---------
1821- # ' Model X.Intercept. Sepal_Width Petal_Length Petal_Width
1822- # ' 1 0.699883 0.3303370 0.9455356 -0.1697527
1823- # ' 2 1.895540 0.3868576 0.9083370 -0.6792238
1824- # ' 3 2.351890 0.6548350 0.2375602 0.2521257
1841+ # ' # Result
1842+ # ' # ---------
1843+ # ' # Model X.Intercept. Sepal_Width Petal_Length Petal_Width
1844+ # ' # 1 0.699883 0.3303370 0.9455356 -0.1697527
1845+ # ' # 2 1.895540 0.3868576 0.9083370 -0.6792238
1846+ # ' # 3 2.351890 0.6548350 0.2375602 0.2521257
18251847# '
18261848# '}
18271849# ' @note gapplyCollect(SparkDataFrame) since 2.0.0
0 commit comments