Rdatatable · rikivillalba · Dec 21, 2024 · Dec 30, 2024 · Dec 30, 2024 · Jan 1, 2025
@@ -206,3 +206,5 @@ S3method(format_list_item, data.frame)
 
 export(fdroplevels, setdroplevels)
 S3method(droplevels, data.table)
+
+S3method(sort_by, data.table)
@@ -69,6 +69,9 @@ rowwiseDT(
 
 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`.
 
+7. Base R generic `sort_by()` (new in R 4.4.0) is now implemented for data.tables. It internally uses data.table's `forder()` instead of base R `order()` for efficiency; hence, like data.table's other sorting functions, it sorts in the C locale (suggested by @rikivillalba).
+
+
 ## BUG FIXES
 
 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.

@@ -2532,6 +2532,18 @@
   }
 }
 
+# sort_by() method for data.table; sort_by() is a base R generic (new in R 4.4.0).
+#   x:   the data.table whose rows are reordered.
+#   y:   what to sort by: a formula evaluated against x, a single vector, or a list of vectors.
+#   ...: further arguments, passed on to forder() after the sort keys.
+# Returns x with its rows reordered according to forder() (rather than base order()).
+sort_by.data.table <- function (x, y, ...)
+{
+  # NOTE(review): cedta() presumably reports whether the caller is data.table-aware; if not, defer to the next method
+  if (!cedta()) return(NextMethod()) # nocov
+  # a formula is turned into the list of variables it refers to, looked up in x
+  if (inherits(y, "formula"))
+    y <- .formula2varlist(y, x)
+  # wrap a bare vector so that y is always a list of sort keys
+  if (!is.list(y))
+    y <- list(y)
+  # use forder instead of base 'order'; names are dropped so the keys are passed positionally
+  o <- do.call(forder, c(unname(y), list(...)))
+  x[o, , drop = FALSE]
+}
+
 # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
 
 copy = function(x) {

@@ -20686,6 +20686,13 @@ test(2299.10, data.table(a=1),                                        output="a\
 test(2299.11, data.table(a=list(data.frame(b=1))),                    output="a\n1: <data.frame[1x1]>")
 test(2299.12, data.table(a=list(data.table(b=1))),                    output="a\n1: <data.table[1x1]>")
 
+# sort_by.data.table: method for the base R sort_by() generic, exercised with formula keys
+DT1 = data.table(a = c(1, 3, 2, NA, 3) , b = 4:0)
+DT2 = data.table(a =  c("c", "a", "B"))  # mixed-case strings: data.table sorts in the C locale, so "B" is expected before "a" (2300.03)
+test(2300.01, sort_by(DT1, ~ a + b), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))  # sort by a, ties broken by b; the NA row is expected last
+test(2300.02, sort_by(DT1, ~ I(a + b)), data.table(a = c(3,2,1,3,NA), b = c(0L,2L,4L,3L,1L)))  # single computed key a + b; the NA sum is expected last
+test(2300.03, sort_by(DT2, ~ a), data.table(a = c("B", "a", "c")))  # uppercase before lowercase, i.e. C-locale order
+
 if (test_bit64) {
   # Join to integer64 doesn't require integer32 representation, just integer64, #6625
   i64_val = .Machine$integer.max + 1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -206,3 +206,5 @@ S3method(format_list_item, data.frame)

		export(fdroplevels, setdroplevels)
		S3method(droplevels, data.table)

		S3method(sort_by, data.table)
MichaelChirico marked this conversation as resolved. Show resolved Hide resolved