Rdatatable · rikivillalba · Dec 21, 2024 · Dec 30, 2024 · Dec 30, 2024 · Jan 1, 2025
@@ -206,3 +206,6 @@ S3method(format_list_item, data.frame)
 
 export(fdroplevels, setdroplevels)
 S3method(droplevels, data.table)
+
+# sort_by() generic was added in R 4.4.0, so only register the method there; #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
+if (getRversion() >= "4.4.0") S3method(sort_by, data.table)
@@ -69,6 +69,8 @@ rowwiseDT(
 
 6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`.
 
+7. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
+
 ## BUG FIXES
 
 1. `fwrite()` respects `dec=','` for timestamp columns (`POSIXct` or `nanotime`) with sub-second accuracy, [#6446](https://github.com/Rdatatable/data.table/issues/6446). Thanks @kav2k for pointing out the inconsistency and @MichaelChirico for the PR.

@@ -2532,6 +2532,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
   }
 }
 
+sort_by.data.table <- function(x, y, ...)  # S3 method: return the rows of 'x' ordered by the keys in 'y' (a formula, a list of vectors, or a single vector)
+{
+  if (!cedta()) return(NextMethod()) # nocov
+  if (inherits(y, "formula"))  # formula interface, e.g. ~ a + b: turn the terms into a list of vectors evaluated against x
+    y <- .formula2varlist(y, x)
+  if (!is.list(y))  # a bare vector is treated as a single sort key
+    y <- list(y)
+  # use forder instead of base 'order'; '...' is forwarded to forder (e.g. na.last, decreasing)
+  o <- do.call(forder, c(unname(y), list(...)))  # unname() so keys are passed positionally rather than matched to forder's argument names
+  x[o, , drop = FALSE]  # row subset in sorted order
+}
+
 # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
 
 copy = function(x) {

@@ -20697,3 +20697,30 @@ if (test_bit64) {
   test(2300.3, DT1[DT2, on='id'], error="Incompatible join types")
   test(2300.4, DT2[DT1, on='id'], error="Incompatible join types")
 }
+
+# sort_by.data.table
+DT1 = data.table(a = c(1, 3, 2, NA, 3) , b = 4:0)
+DT2 = data.table(a =  c("c", "a", "B"))  # data.table uses C-locale and should sort_by if cedta()
+DT3 = data.table(a = c(1,2,3), b = list(c("a","b","",NA),c(1,3,2,0), c(TRUE,TRUE,FALSE,NA)))  # list column
+
+# sort_by.data.table: basics
+test(2301.01, sort_by(DT1, ~ a + b), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2301.02, sort_by(DT1, ~ I(a + b)), data.table(a = c(3,2,1,3,NA), b = c(0L,2L,4L,3L,1L)))
+test(2301.03, sort_by(DT2, ~ a), data.table(a = c("B", "a", "c")))
+
+# sort_by.data.table: list columns.
+# NOTE 1: .formula2varlist works well with list columns.
+# NOTE 2: each list element has 4 entries in a DT of 3 rows because forderv takes a list column as a DT.
+test(2301.04, sort_by(DT3, ~b), DT3[order(b)])   # result should be consistent with DT[order(...)].
+
+# sort_by.data.table: additional C-locale sorting
+test(2301.10, DT2[, sort_by(.SD, a)], data.table(a = c("B", "a", "c")))
+test(2301.11, DT2[, sort_by(.SD, ~ a)], data.table(a = c("B", "a", "c")))
+
+# sort_by.data.table: various working interfaces
+test(2301.20, sort_by(DT1, list(DT1$a, DT1$b)), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2301.21, sort_by(DT1, DT1[, .(a, b)]), data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2301.22, DT1[, sort_by(.SD, .(a, b))], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2301.23, DT1[, sort_by(.SD, ~ a + b)], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+test(2301.24, DT1[, sort_by(.SD, ~ .(a, b))], data.table(a = c(1,2,3,3,NA), b = c(4L,2L,0L,3L,1L)))
+
@@ -5,6 +5,7 @@
 \alias{fastorder}
 \alias{forder}
 \alias{forderv}
+\alias{sort_by}
 
 \title{Fast row reordering of a data.table by reference}
 \description{
@@ -32,6 +33,7 @@ setorderv(x, cols = colnames(x), order=1L, na.last=FALSE)
 # optimised to use data.table's internal fast order
 # x[order(., na.last=TRUE)]
 # x[order(., decreasing=TRUE)]
+# sort_by(x, ., na.last=TRUE, decreasing=FALSE)
 }
 \arguments{
 \item{x}{ A \code{data.table}. }
@@ -46,7 +48,7 @@ when \code{b} is of type \code{character} as well. }
 \code{order} must be either \code{1} or equal to that of \code{cols}. If
 \code{length(order) == 1}, it is recycled to \code{length(cols)}. }
 \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed.
-\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its
+\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and the related \code{sort_by(x, .)}, for which the
 default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept
 \code{TRUE}/\code{FALSE} with default \code{FALSE}. }
 }
@@ -71,8 +73,8 @@ sets the \code{sorted} attribute.
 
 \code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and
 \code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and
-is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}.
-Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation
+is \code{TRUE} for \code{x[order(.)]} and \code{sort_by(x, .)} to be consistent with \code{base::order}.
+Only \code{x[order(.)]} (and the related \code{sort_by(x, .)}) can have \code{na.last = NA}, as these are subset operations
 as opposed to \code{setorder} or \code{setorderv} which reorders the data.table
 by reference.
 
@@ -96,6 +98,11 @@ was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE}
 
 If \code{setorder} results in reordering of the rows of a keyed \code{data.table},
 then its key will be set to \code{NULL}.
+
+\code{sort_by(x, y, \dots)} is the \code{data.table} S3 method for the generic \code{sort_by}.
+It accepts the same formula or list interfaces as the data.frame method of \code{sort_by} but internally uses \code{data.table}'s fast ordering;
+hence it behaves the same as \code{x[order(.)]} and takes the same optional named arguments with the same defaults.
+
 }
 \value{
 The input is modified by reference, and returned (invisibly) so it can be used