From 2886c4f41e204020df6c8848e4fba47bc805e73e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 30 Nov 2022 11:08:33 -0700 Subject: [PATCH 01/26] delete old commented code --- src/fcast.c | 89 ----------------------------------------------------- 1 file changed, 89 deletions(-) diff --git a/src/fcast.c b/src/fcast.c index 46d7465e9c..961b689b6b 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -96,92 +96,3 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil UNPROTECT(1); return(ans); } - -// commenting all unused functions, but not deleting it, just in case - -// // internal functions that are not used anymore.. - -// // # nocov start -// // Note: all these functions below are internal functions and are designed specific to fcast. -// SEXP zero_init(R_len_t n) { -// SEXP ans; -// if (n < 0) error(_("Input argument 'n' to 'zero_init' must be >= 0")); -// ans = PROTECT(allocVector(INTSXP, n)); -// for (int i=0; i 0")); -// PROTECT(call = lang3(install("do.call"), install("CJ"), s)); -// r = eval(call, env); -// UNPROTECT(1); -// return(r); -// } - -// SEXP diff_int(SEXP x, R_len_t n) { -// SEXP ans; -// if (TYPEOF(x) != INTSXP) error(_("Argument 'x' to 'diff_int' must be an integer vector")); -// ans = PROTECT(allocVector(INTSXP, length(x))); -// for (int i=1; i= 1 -// for (int i=0; i Date: Wed, 30 Nov 2022 11:34:03 -0700 Subject: [PATCH 02/26] new test for no warning fails --- inst/tests/tests.Rraw | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3cbe67680f..5a6e1a74f3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3720,6 +3720,12 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L) test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")), c("x", "1.a", "1.b", "2.a", "2.b")) + + # test for #5512, only compute default fill if needed. + DT = data.table(chr=c("a","b","b"), int=1:3) + test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range") + test(1102.40, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) + } # test for freading commands From 26745f40647cd91abe5668d34a47ab20d170c8a6 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 30 Nov 2022 11:38:20 -0700 Subject: [PATCH 03/26] only compute default fill if missing cells present --- man/dcast.data.table.Rd | 2 +- src/fcast.c | 33 ++++++++++++++++++--------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index 2aa265a96c..8d35c199d5 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -22,7 +22,7 @@ \item{\dots}{Any other arguments that may be passed to the aggregating function.} \item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.} \item{subset}{Specified if casting should be done on a subset of the data. Ex: \code{subset = .(col1 <= 5)} or \code{subset = .(variable != "January")}.} - \item{fill}{Value with which to fill missing cells. If \code{fun.aggregate} is present, takes the value by applying the function on a 0-length vector.} + \item{fill}{Value with which to fill missing cells. If \code{fill=NULL} and missing cells are present, then \code{fun.aggregate} is used on a 0-length vector to obtain a fill value.} \item{drop}{\code{FALSE} will cast by including all missing combinations. \code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}; \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. See Examples.} diff --git a/src/fcast.c b/src/fcast.c index 961b689b6b..a2972bab34 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -20,52 +20,55 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; SEXPTYPE thistype = TYPEOF(thiscol); int nprotect = 0; - if (isNull(fill)) { - if (LOGICAL(is_agg)[0]) { - thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; - } else thisfill = VECTOR_ELT(fill_d, i); + int nfill = 0; + for (int j=0; j Date: Wed, 30 Nov 2022 12:02:38 -0700 Subject: [PATCH 04/26] any_NA_int helper --- src/fcast.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/fcast.c b/src/fcast.c index a2972bab34..d67ac1dd1f 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -3,6 +3,13 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); +bool any_NA_int(int N_data, int *idx){ + for (int data_i=0; data_i Date: Thu, 1 Dec 2022 12:55:07 -0700 Subject: [PATCH 05/26] bugfix #5512 --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 06c6013fe1..a76cc5beb9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -561,6 +561,8 @@ identical(DT1, DT2) # TRUE ``` +55. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512). Thanks to Toby Dylan Hocking for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 360ba9d01c23c8ded9738f8399b790cc110d9f53 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Dec 2022 09:32:10 -0700 Subject: [PATCH 06/26] Update src/fcast.c Co-authored-by: Xianying Tan --- src/fcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fcast.c b/src/fcast.c index d67ac1dd1f..4beed58cb6 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -28,7 +28,7 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; SEXPTYPE thistype = TYPEOF(thiscol); int nprotect = 0; - if(some_fill){ + if (some_fill) { if (isNull(fill)) { if (LOGICAL(is_agg)[0]) { thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; From 75102bfb8aa1ea828f7c69cbd9c737392e553e21 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 4 Dec 2022 09:40:42 -0700 Subject: [PATCH 07/26] Update src/fcast.c Co-authored-by: Xianying Tan --- src/fcast.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fcast.c b/src/fcast.c index 4beed58cb6..f4a60676df 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -3,9 +3,9 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -bool any_NA_int(int N_data, int *idx){ - for (int data_i=0; data_i Date: Mon, 5 Dec 2022 09:29:24 -0700 Subject: [PATCH 08/26] mention warning text --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a76cc5beb9..4bc74b00e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -561,7 +561,7 @@ identical(DT1, DT2) # TRUE ``` -55. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512). Thanks to Toby Dylan Hocking for the PR. +55. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512). Thanks to Toby Dylan Hocking for the PR. ## NOTES From 50553069b636b15cdcf15741e7e00de999b2a36a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 5 Dec 2022 09:52:56 -0700 Subject: [PATCH 09/26] const int args --- src/fcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fcast.c b/src/fcast.c index f4a60676df..4fb8d6ee90 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -3,7 +3,7 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -inline bool any_NA_int(int n, int *idx) { +inline bool any_NA_int(const int n, const int *idx) { for (int i=0; i Date: Mon, 5 Dec 2022 09:55:19 -0700 Subject: [PATCH 10/26] add back ithiscol --- src/fcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fcast.c b/src/fcast.c index 4fb8d6ee90..fc2376f25c 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -43,11 +43,12 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil case LGLSXP: { for (int j=0; j Date: Wed, 7 Dec 2022 12:20:45 -0700 Subject: [PATCH 11/26] get pointer before for loop --- src/fcast.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/fcast.c b/src/fcast.c index fc2376f25c..5453e18cc4 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -41,36 +41,44 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil switch (thistype) { case INTSXP: case LGLSXP: { + const int *ithiscol = INTEGER(thiscol); + const int *ithisfill = 0; + if (some_fill) ithisfill = INTEGER(thisfill); for (int j=0; j Date: Mon, 4 Mar 2024 11:06:11 -0700 Subject: [PATCH 12/26] add test case from Michael --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5a6e1a74f3..779d936b5c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3725,6 +3725,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(chr=c("a","b","b"), int=1:3) test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range") test(1102.40, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) + test(1102.41, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) } From 02f2c3aa6cbc99d4d70885c17537d6ccddf6ea13 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 8 Mar 2024 09:17:16 -0700 Subject: [PATCH 13/26] test min(dbl) and no warning when fill specified --- inst/tests/tests.Rraw | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2c38e7960c..dfb0e8728f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3731,8 +3731,10 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, c("x", "1.a", "1.b", "2.a", "2.b")) # test for #5512, only compute default fill if needed. - DT = data.table(chr=c("a","b","b"), int=1:3) - test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range") + DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) + test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range")#as.integer(min(integer()) is Inf) is NA + test(1102.391, dcast(DT, int ~ chr, min, value.var="int", fill=NA), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int")) + test(1102.392, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) test(1102.40, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) test(1102.41, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) From fc542ec9e62b4814d6a1cfe675007a1351b04570 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 10 Mar 2024 07:43:07 -0700 Subject: [PATCH 14/26] Revert "delete old commented code" This reverts commit 2886c4f41e204020df6c8848e4fba47bc805e73e. --- src/fcast.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/src/fcast.c b/src/fcast.c index ffeedc7adc..13f1a9da9b 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -109,3 +109,92 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil UNPROTECT(1); return(ans); } + +// commenting all unused functions, but not deleting it, just in case + +// // internal functions that are not used anymore.. + +// // # nocov start +// // Note: all these functions below are internal functions and are designed specific to fcast. +// SEXP zero_init(R_len_t n) { +// SEXP ans; +// if (n < 0) error(_("Input argument 'n' to 'zero_init' must be >= 0")); +// ans = PROTECT(allocVector(INTSXP, n)); +// for (int i=0; i 0")); +// PROTECT(call = lang3(install("do.call"), install("CJ"), s)); +// r = eval(call, env); +// UNPROTECT(1); +// return(r); +// } + +// SEXP diff_int(SEXP x, R_len_t n) { +// SEXP ans; +// if (TYPEOF(x) != INTSXP) error(_("Argument 'x' to 'diff_int' must be an integer vector")); +// ans = PROTECT(allocVector(INTSXP, length(x))); +// for (int i=1; i= 1 +// for (int i=0; i Date: Sun, 10 Mar 2024 08:12:04 -0700 Subject: [PATCH 15/26] use suggestions from Michael --- inst/tests/tests.Rraw | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dfb0e8728f..67f8ec249b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3732,11 +3732,12 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # test for #5512, only compute default fill if needed. DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) - test(1102.39, dcast(DT, int ~ chr, min, value.var="int"), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int"), warning="NAs introduced by coercion to integer range")#as.integer(min(integer()) is Inf) is NA - test(1102.391, dcast(DT, int ~ chr, min, value.var="int", fill=NA), data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int")) - test(1102.392, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) - test(1102.40, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) - test(1102.41, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) + test(1102.39, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed, and no warning emitted during computation of that default fill. + ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning="inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")#warning emitted during computation of default fill since as.integer(min(integer()) is Inf) is NA. + test(1102.41, dcast(DT, int ~ chr, min, value.var="int", fill=NA), ans) + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) + test(1102.43, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) } From eb95ab88a5e7de369297e930e555047cca0cff73 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Sun, 10 Mar 2024 08:12:31 -0700 Subject: [PATCH 16/26] rm inline any_NA_int since that causes install to fail --- src/fcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fcast.c b/src/fcast.c index 13f1a9da9b..f65973ec85 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -3,7 +3,7 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -inline bool any_NA_int(const int n, const int *idx) { +bool any_NA_int(const int n, const int *idx) { for (int i=0; i Date: Sun, 10 Mar 2024 08:14:19 -0700 Subject: [PATCH 17/26] clarify comment --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 67f8ec249b..d5522392b3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3732,7 +3732,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # test for #5512, only compute default fill if needed. DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) - test(1102.39, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed, and no warning emitted during computation of that default fill. + test(1102.39, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed, and no warning emitted. ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning="inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")#warning emitted during computation of default fill since as.integer(min(integer()) is Inf) is NA. test(1102.41, dcast(DT, int ~ chr, min, value.var="int", fill=NA), ans) From 3c7fb24483cbe05cf2db0638be8a71c8f5120299 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 11 Mar 2024 07:33:47 -0700 Subject: [PATCH 18/26] link 5390 --- NEWS.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7cd013dc18..dff623278e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,6 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) -## BUG FIXES - -X. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512). Thanks to Toby Dylan Hocking for the PR. - ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. @@ -32,6 +28,8 @@ X. `dcast(fill=NULL)` only computes default fill value if necessary, which elimi 3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix. +4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. From dcb51ed343285161c4ec49c562a98f82468e1444 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 12 Mar 2024 23:11:38 -0700 Subject: [PATCH 19/26] mymin test fails --- inst/tests/tests.Rraw | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d5522392b3..34091a5259 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3732,12 +3732,15 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # test for #5512, only compute default fill if needed. DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) - test(1102.39, dcast(DT, . ~ chr, min, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed, and no warning emitted. + mymin <- function(x){ + if(length(x)==0)stop("calling mymin on vector of length 0") + min(x) + } + test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed. ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") - test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning="inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")#warning emitted during computation of default fill since as.integer(min(integer()) is Inf) is NA. - test(1102.41, dcast(DT, int ~ chr, min, value.var="int", fill=NA), ans) - test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) - test(1102.43, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning="inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")#warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. + test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) # no warning, because no coercion. } From 83b0cf5eab10bbca3594bbd8483002e11545d524 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 13 Mar 2024 01:00:04 -0700 Subject: [PATCH 20/26] compute some_fill using anyNA in R then pass to C --- R/fcast.R | 22 +++++++++++++--------- inst/tests/tests.Rraw | 6 +++--- src/data.table.h | 2 +- src/fcast.c | 11 ++--------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/R/fcast.R b/R/fcast.R index bb59d8409b..10460f1489 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -152,7 +152,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } fun.call = m[["fun.aggregate"]] - fill.default = NULL if (is.null(fun.call)) { oo = forderv(dat, by=varnames, retGrp=TRUE) if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) { @@ -160,15 +159,15 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fun.call = quote(length) } } - if (!is.null(fun.call)) { + dat_for_default_fill = dat + run_agg_funs = !is.null(fun.call) + if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) - errmsg = gettext("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") - if (is.null(fill)) { - fill.default = suppressWarnings(dat[0L][, eval(fun.call)]) - # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stopf(errmsg)) - if (nrow(fill.default) != 1L) stopf(errmsg) + maybe_err = function(list.of.columns) { + if (any(sapply(list.of.columns, length) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + list.of.columns } - dat = dat[, eval(fun.call), by=c(varnames)] + dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] } order_ = function(x) { o = forderv(x, retGrp=TRUE, sort=TRUE) @@ -211,7 +210,12 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., } maplen = vapply_1i(mapunique, length) idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join. - ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call)) + fill.default = NULL + some_fill = anyNA(idx) + if (run_agg_funs && is.null(fill) && some_fill) { + fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] + } + ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) if (length(valnames) > 1L) allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 34091a5259..0e7c1eb7cc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3736,11 +3736,11 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, if(length(x)==0)stop("calling mymin on vector of length 0") min(x) } - test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key="."))#fill not used in output, so default fill not computed. + test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed. ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") - test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning="inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")#warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). - test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int")) # no warning, because no coercion. + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion. } diff --git a/src/data.table.h b/src/data.table.h index 0a6eb207a8..da82af7be2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -289,7 +289,7 @@ SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP issorted(SEXP, SEXP); SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP gsum(SEXP, SEXP); diff --git a/src/fcast.c b/src/fcast.c index f65973ec85..d049711bf5 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -3,15 +3,8 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -bool any_NA_int(const int n, const int *idx) { - for (int i=0; i Date: Wed, 13 Mar 2024 09:56:16 -0700 Subject: [PATCH 21/26] Update R/fcast.R Co-authored-by: Michael Chirico --- R/fcast.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fcast.R b/R/fcast.R index 10460f1489..30b57fcc51 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -164,7 +164,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) maybe_err = function(list.of.columns) { - if (any(sapply(list.of.columns, length) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") list.of.columns } dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] From ee93c5fc7b7fde45b37ebf95d06ddf7f58af1c5e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 13 Mar 2024 12:26:05 -0700 Subject: [PATCH 22/26] Update R/fcast.R Co-authored-by: Michael Chirico --- R/fcast.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/R/fcast.R b/R/fcast.R index 30b57fcc51..b29ad01a24 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -210,11 +210,8 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., } maplen = vapply_1i(mapunique, length) idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join. - fill.default = NULL some_fill = anyNA(idx) - if (run_agg_funs && is.null(fill) && some_fill) { - fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] - } + fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[0L][, maybe_err(eval(fun.call))] ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) if (length(valnames) > 1L) From 747c76cd8a65e46294dd42edf69294a60f3b9e94 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 13 Mar 2024 12:32:09 -0700 Subject: [PATCH 23/26] dat_for_default_fill is zero-row dt --- R/fcast.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fcast.R b/R/fcast.R index 10460f1489..4b7581d357 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -159,7 +159,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fun.call = quote(length) } } - dat_for_default_fill = dat + dat_for_default_fill = dat[0] run_agg_funs = !is.null(fun.call) if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) @@ -213,7 +213,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fill.default = NULL some_fill = anyNA(idx) if (run_agg_funs && is.null(fill) && some_fill) { - fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] + fill.default = dat_for_default_fill[, maybe_err(eval(fun.call))] } ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) From 4d6c0e192b4f72f37b83b9a9c72be5561672bca8 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 13 Mar 2024 12:35:33 -0700 Subject: [PATCH 24/26] !length instead of length==0 --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0e7c1eb7cc..831e1cfdd8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3733,7 +3733,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # test for #5512, only compute default fill if needed. DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) mymin <- function(x){ - if(length(x)==0)stop("calling mymin on vector of length 0") + if (!length(x)) stop("calling mymin on vector of length 0") min(x) } test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed. From 359c3c375fa2f748027725614c099a761c4af2c9 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 14 Mar 2024 07:31:44 -0700 Subject: [PATCH 25/26] new dcast tests with fill=character --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 831e1cfdd8..4c06fac21f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3741,6 +3741,8 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion. + test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", "NAs introduced by coercion")) + test(1102.44, dcast(DT, int ~ ., value.var="dbl", fill="ignored"), data.table(int=1:3, .=c(4,5,6), key="int")) } From 4b96d350ca978825e9bbaaf186edc4958062edfd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 14 Mar 2024 08:17:06 -0700 Subject: [PATCH 26/26] dat_for_default_fill is dat again, not 0-row, because that causes some test failure --- R/fcast.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/fcast.R b/R/fcast.R index ead50dd4b8..7c0766cfef 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -159,7 +159,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fun.call = quote(length) } } - dat_for_default_fill = dat[0] + dat_for_default_fill = dat run_agg_funs = !is.null(fun.call) if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) @@ -212,6 +212,9 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join. some_fill = anyNA(idx) fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))] + if (run_agg_funs && is.null(fill) && some_fill) { + fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] + } ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) if (length(valnames) > 1L)