tidyverse · DavisVaughan · Jun 13, 2022 · May 10, 2022 · May 10, 2022 · May 10, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -60,6 +60,7 @@ Suggests:
     RMySQL,
     RPostgreSQL,
     RSQLite,
+    stringi (>= 1.7.6),
     testthat (>= 3.1.1),
     tidyr, 
     withr

diff --git a/NAMESPACE b/NAMESPACE
@@ -270,6 +270,7 @@ export(distinct_prepare)
 export(do)
 export(do_)
 export(dplyr_col_modify)
+export(dplyr_locale)
 export(dplyr_reconstruct)
 export(dplyr_row_slice)
 export(ends_with)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,16 @@
 # dplyr (development version)
 
+* `arrange()` now uses a faster algorithm for sorting character vectors, which
+  is heavily inspired by data.table's `forder()`. Additionally, the default
+  locale for sorting character vectors is now the C locale, which is a breaking
+  change from the previous behavior that utilized the system locale. The new
+  `.locale` argument can be used to adjust this to, say, the American English
+  locale, which is an optional feature that requires the stringi package. This
+  change improves reproducibility across R sessions and operating systems. For a
+  fuller explanation, refer to this
+  [tidyup](https://github.com/tidyverse/tidyups/blob/main/003-dplyr-radix-ordering.md)
+  which outlines and justifies this change (#4962).
+
 * `tbl_sum()` is no longer reexported from tibble (#6284).
 
 * `slice_sample()` now gives a more informative error when `replace = FALSE` and

diff --git a/R/arrange.R b/R/arrange.R
@@ -10,10 +10,6 @@
 #' once per data frame, not once per group.
 #'
 #' @details
-#' ## Locales
-#' The sort order for character vectors will depend on the collating sequence
-#' of the locale in use: see [locales()].
-#'
 #' ## Missing values
 #' Unlike base sorting with `sort()`, `NA` are:
 #' * always sorted to the end for local data, even when wrapped with `desc()`.
@@ -42,6 +38,24 @@
 #'   variables. Use [desc()] to sort a variable in descending order.
 #' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to
 #'   grouped data frames only.
+#' @param .locale The locale to sort character vectors in.
+#'
+#'   - Defaults to [dplyr_locale()], which uses the `"C"` locale unless this is
+#'     explicitly overridden. See the help page for [dplyr_locale()] for the
+#'     exact details.
+#'
+#'   - If a single string from [stringi::stri_locale_list()] is supplied, then
+#'     this will be used as the locale to sort with. For example, `"en"` will
+#'     sort with the American English locale. This requires the stringi package.
+#'
+#'   - If `"C"` is supplied, then character vectors will always be sorted in the
+#'     C locale. This does not require stringi and is often much faster than
+#'     supplying a locale identifier.
+#'
+#'   The C locale is not the same as English locales, such as `"en"`,
+#'   particularly when it comes to data containing a mix of upper and lower case
+#'   letters. This is explained in more detail in the help page of
+#'   [dplyr_locale()] under the `Default locale` section.
 #' @family single table verbs
 #' @examples
 #' arrange(mtcars, cyl, disp)
@@ -68,32 +82,46 @@ arrange <- function(.data, ..., .by_group = FALSE) {
   UseMethod("arrange")
 }
 
+#' @rdname arrange
 #' @export
-arrange.data.frame <- function(.data, ..., .by_group = FALSE) {
+arrange.data.frame <- function(.data,
+                               ...,
+                               .by_group = FALSE,
+                               .locale = dplyr_locale()) {
   dots <- enquos(...)
 
   if (.by_group) {
     dots <- c(quos(!!!groups(.data)), dots)
   }
 
-  loc <- arrange_rows(.data, dots)
+  loc <- arrange_rows(.data, dots = dots, locale = .locale) # validates locale
   dplyr_row_slice(.data, loc)
 }
 
 # Helpers -----------------------------------------------------------------
 
-arrange_rows <- function(.data, dots, error_call = caller_env()) {
+arrange_rows <- function(data,
+                         dots,
+                         locale,
+                         error_call = caller_env()) {
   error_call <- dplyr_error_call(error_call)
 
+  chr_proxy_collate <- locale_to_chr_proxy_collate(
+    locale = locale,
+    error_call = error_call
+  )
+
   if (length(dots) == 0L) {
-    out <- seq_len(nrow(.data))
+    out <- seq_len(nrow(data))
     return(out)
   }
 
   directions <- map_chr(dots, function(quosure) {
     if(quo_is_call(quosure, "desc")) "desc" else "asc"
   })
 
+  na_values <- if_else(directions == "desc", "smallest", "largest")
+
   quosures <- map(dots, function(quosure) {
     if (quo_is_call(quosure, "desc", ns = c("", "dplyr"))) {
       expr <- quo_get_expr(quosure)
@@ -117,7 +145,7 @@ arrange_rows <- function(.data, dots, error_call = caller_env()) {
   #       revisit when we have something like mutate_one() to
   #       evaluate one quosure in the data mask
   data <- withCallingHandlers({
-    transmute(new_data_frame(.data), !!!quosures)
+    transmute(new_data_frame(data), !!!quosures)
   }, error = function(cnd) {
 
     if (inherits(cnd, "dplyr:::mutate_error")) {
@@ -144,24 +172,43 @@ arrange_rows <- function(.data, dots, error_call = caller_env()) {
 
   })
 
-  # we can't just use vec_compare_proxy(data) because we need to apply
-  # direction for each column, so we get a list of proxies instead
-  # and then mimic vctrs:::order_proxy
-  #
-  # should really be map2(quosures, directions, ...)
-  proxies <- map2(data, directions, function(column, direction) {
-    proxy <- vec_proxy_order(column)
-    desc <- identical(direction, "desc")
-    if (is.data.frame(proxy)) {
-      proxy <- order(vec_order(proxy,
-        direction = direction,
-        na_value = if(desc) "smallest" else "largest"
-      ))
-    } else if(desc) {
-      proxy <- desc(proxy)
+  vec_order_radix(
+    x = data,
+    direction = directions,
+    na_value = na_values,
+    chr_proxy_collate = chr_proxy_collate
+  )
+}
+
+locale_to_chr_proxy_collate <- function(locale,
+                                        ...,
+                                        has_stringi = has_minimum_stringi(),
+                                        error_call = caller_env()) {
+  check_dots_empty0(...)
+
+  if (identical(locale, "C")) {
+    return(NULL) # "C": return NULL before any stringi check is made
+  }
+
+  if (is_character(locale)) {
+    if (!is_string(locale)) {
+      abort("If `.locale` is a character vector, it must be a single string.", call = error_call)
+    }
+    if (!has_stringi) { # lazy default: has_minimum_stringi() first runs here
+      abort("stringi >=1.5.3 is required to arrange in a different locale.", call = error_call)
+    }
+    if (!locale %in% stringi::stri_locale_list()) {
+      abort("`.locale` must be one of the locales within `stringi::stri_locale_list()`.", call = error_call)
     }
-    proxy
-  })
 
-  exec("order", !!!unname(proxies), decreasing = FALSE, na.last = TRUE)
+    return(sort_key_generator(locale)) # a function, not a vector of keys
+  }
+
+  abort("`.locale` must be a string.", call = error_call) # non-character input
+}
+
+sort_key_generator <- function(locale) {
+  function(x) { # returned closure keeps `locale` in scope
+    stringi::stri_sort_key(x, locale = locale)
+  }
 }
diff --git a/R/colwise-arrange.R b/R/colwise-arrange.R
@@ -29,31 +29,45 @@
 #' arrange_all(df, desc)
 #' # ->
 #' arrange(df, across(everything(), desc))
-arrange_all <- function(.tbl, .funs = list(), ..., .by_group = FALSE) {
+arrange_all <- function(.tbl,
+                        .funs = list(),
+                        ...,
+                        .by_group = FALSE,
+                        .locale = dplyr_locale()) { # passed on to arrange()
   lifecycle::signal_stage("superseded", "arrange_all()")
   funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_all")
   if (!length(funs)) {
     funs <- syms(tbl_vars(.tbl))
   }
-  arrange(.tbl, !!!funs, .by_group = .by_group)
+  arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale)
 }
 #' @rdname arrange_all
 #' @export
-arrange_at <- function(.tbl, .vars, .funs = list(), ..., .by_group = FALSE) {
+arrange_at <- function(.tbl,
+                       .vars,
+                       .funs = list(),
+                       ...,
+                       .by_group = FALSE,
+                       .locale = dplyr_locale()) { # passed on to arrange()
   lifecycle::signal_stage("superseded", "arrange_at()")
   funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_at")
   if (!length(funs)) {
     funs <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE)
   }
-  arrange(.tbl, !!!funs, .by_group = .by_group)
+  arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale)
 }
 #' @rdname arrange_all
 #' @export
-arrange_if <- function(.tbl, .predicate, .funs = list(), ..., .by_group = FALSE) {
+arrange_if <- function(.tbl,
+                       .predicate,
+                       .funs = list(),
+                       ...,
+                       .by_group = FALSE,
+                       .locale = dplyr_locale()) { # passed on to arrange()
   lifecycle::signal_stage("superseded", "arrange_if()")
   funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_if")
   if (!length(funs)) {
     funs <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE)
   }
-  arrange(.tbl, !!!funs, .by_group = .by_group)
+  arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale)
 }
diff --git a/R/grouped-df.r b/R/grouped-df.r
@@ -298,7 +298,7 @@ vec_split_id_order <- function(x) {
   split_id <- vec_group_loc(x)
   split_id$loc <- new_list_of(split_id$loc, ptype = integer())
 
-  vec_slice(split_id, vec_order(split_id$key))
+  vec_slice(split_id, vec_order_base(split_id$key))
 }
 
 group_intersect <- function(x, new) {

diff --git a/R/locale.R b/R/locale.R
@@ -0,0 +1,96 @@
+#' Locale used by dplyr
+#'
+#' @description
+#' `dplyr_locale()` returns a single string representing the default locale used
+#' by dplyr when ordering character vectors. It is used as the default value of
+#' `.locale` in [arrange()].
+#'
+#' ## Default locale
+#'
+#' The default locale returned by `dplyr_locale()` is the C locale, identical
+#' to explicitly supplying `.locale = "C"`.
+#'
+#' The C locale is not exactly the same as English locales, such as `"en"`. The
+#' main difference is that the C locale groups the English alphabet by _case_,
+#' while most English locales group the alphabet by _letter_. For example,
+#' `c("a", "b", "C", "B", "c")` will sort as `c("B", "C", "a", "b", "c")` in the
+#' C locale, with all uppercase letters coming before lowercase letters, but
+#' will sort as `c("a", "b", "B", "c", "C")` in an English locale. This often
+#' makes little practical difference during data analysis, because both return
+#' identical results when case is consistent between observations.
+#'
+#' ## Global override
+#'
+#' To override the above default behavior, you can set the global option,
+#' `dplyr.locale`, to a stringi locale identifier from
+#' [stringi::stri_locale_list()] to globally alter the default locale. This
+#' requires stringi >=1.5.3.
+#'
+#' We generally recommend that you set the `.locale` argument of [arrange()]
+#' explicitly rather than overriding the global locale, if possible.
+#'
+#' Another alternative is to only change the global locale within a limited
+#' scope through the use of [rlang::local_options()] or [rlang::with_options()].
+#' This can be useful when a package that you don't control calls `arrange()`
+#' internally.
+#'
+#' ## Reproducibility
+#'
+#' The C locale has the benefit of being completely reproducible across all
+#' supported R versions and operating systems with no extra effort.
+#'
+#' If you set `.locale` to an option from [stringi::stri_locale_list()], then
+#' stringi must be installed by anyone who wants to run your code. If you
+#' utilize this in a package, then stringi should be placed in `Imports`.
+#' @export
+#' @keywords internal
+#' @examplesIf dplyr:::has_minimum_stringi()
+#' # Default locale is C
+#' dplyr_locale()
+#'
+#' df <- tibble(x = c("a", "b", "C", "B", "c"))
+#' df
+#'
+#' # The C locale groups the English alphabet by case, placing uppercase letters
+#' # before lowercase letters. This is the default.
+#' arrange(df, x)
+#'
+#' # The American English locale groups the alphabet by letter.
+#' # Explicitly override `.locale` with `"en"` for this ordering.
+#' arrange(df, x, .locale = "en")
+#'
+#' # Or temporarily override the `dplyr.locale` global option, which is useful
+#' # if `arrange()` is called from a function you don't control
+#' col_sorter <- function(df) {
+#'   arrange(df, x)
+#' }
+#'
+#' rlang::with_options(dplyr.locale = "en", {
+#'   col_sorter(df)
+#' })
+#'
+#' # This Danish letter is expected to sort after `z`
+#' df <- tibble(x = c("o", "p", "\u00F8", "z"))
+#' df
+#'
+#' # The American English locale sorts it right after `o`
+#' arrange(df, x, .locale = "en")
+#'
+#' # Using `"da"` for Danish ordering gives the expected result
+#' arrange(df, x, .locale = "da")
+dplyr_locale <- function() {
+  # A user-set `dplyr.locale` global option overrides the "C" default.
+  override <- peek_option("dplyr.locale")
+
+  if (is_null(override)) {
+    return("C")
+  }
+  if (!is_string(override)) {
+    abort("If set, the global option `dplyr.locale` must be a string.")
+  }
+  override
+}
+
+has_minimum_stringi <- function() {
+  is_installed(pkg = "stringi", version = "1.5.3") # stringi is optional
+}
diff --git a/R/order-by.R b/R/order-by.R
@@ -61,7 +61,7 @@ order_by <- function(order_by, call) {
 #' @keywords internal
 #' @export
 with_order <- function(order_by, fun, x, ...) {
-  ord <- vec_order(order_by)
+  ord <- vec_order_base(order_by)
   undo <- vec_match(seq_along(order_by), ord)
 
   out <- fun(vec_slice(x, ord), ...)