From f6a6123a051b218ac6f9f09c7297502c7925c7f7 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 11:18:31 -0400 Subject: [PATCH 1/9] Import unexported `vec_order_base/radix()` --- R/zzz.r | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/R/zzz.r b/R/zzz.r index d1e85b385b..4780deb58c 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -1,4 +1,6 @@ .onLoad <- function(libname, pkgname) { + ns_dplyr <- ns_env(pkgname) + op <- options() op.dplyr <- list( dplyr.show_progress = TRUE @@ -6,7 +8,7 @@ toset <- !(names(op.dplyr) %in% names(op)) if (any(toset)) options(op.dplyr[toset]) - .Call(dplyr_init_library, ns_env("dplyr"), ns_env("vctrs"), ns_env("rlang")) + .Call(dplyr_init_library, ns_dplyr, ns_env("vctrs"), ns_env("rlang")) has_dbplyr <- is_installed("dbplyr") if (!has_dbplyr || !exists("count.tbl_sql", ns_env("dbplyr"))) { @@ -14,6 +16,16 @@ s3_register("dplyr::tally", "tbl_sql") } + # TODO: For `arrange()`, `group_by()`, and `with_order()` until vctrs changes + # `vec_order()` to the new ordering algorithm and officially exports + # `vec_order_base()`, at which point we should use `vec_order()` and + # `vec_order_base()` directly and remove this. + env_bind( + .env = ns_dplyr, + vec_order_base = import_vctrs("vec_order_base"), + vec_order_radix = import_vctrs("vec_order_radix") + ) + run_on_load() invisible() @@ -34,3 +46,16 @@ .onDetach <- function(libpath) { setHook(packageEvent("plyr", "attach"), NULL, "replace") } + +import_vctrs <- function(name) { + import_from(name, "vctrs") +} +import_from <- function(name, package) { + ns <- getNamespace(package) + + if (!exists(name, mode = "function", envir = ns, inherits = FALSE)) { + abort(sprintf("No such '%s' function: `%s()`.", package, name)) + } + + get(name, mode = "function", envir = ns, inherits = FALSE) +} From 7693d0f972a315a7e80eacea1b6d5ef5538b96da Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 11:44:26 -0400 Subject: [PATCH 2/9] Use `vec_order_base()` in `with_order()` and `group_by()` internals This allows us to isolate changes in this PR to just `arrange()`, keeping other usage of `vec_order()` the same for now. --- R/grouped-df.r | 2 +- R/order-by.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/grouped-df.r b/R/grouped-df.r index 0aaa53c465..e80c9233ce 100644 --- a/R/grouped-df.r +++ b/R/grouped-df.r @@ -298,7 +298,7 @@ vec_split_id_order <- function(x) { split_id <- vec_group_loc(x) split_id$loc <- new_list_of(split_id$loc, ptype = integer()) - vec_slice(split_id, vec_order(split_id$key)) + vec_slice(split_id, vec_order_base(split_id$key)) } group_intersect <- function(x, new) { diff --git a/R/order-by.R b/R/order-by.R index 59fd8b9676..7902e000a1 100644 --- a/R/order-by.R +++ b/R/order-by.R @@ -61,7 +61,7 @@ order_by <- function(order_by, call) { #' @keywords internal #' @export with_order <- function(order_by, fun, x, ...) { - ord <- vec_order(order_by) + ord <- vec_order_base(order_by) undo <- vec_match(seq_along(order_by), ord) out <- fun(vec_slice(x, ord), ...) From 580774316dc31edbab00ef4e87a8271d3f8d7ed5 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 11:51:10 -0400 Subject: [PATCH 3/9] Add a `.locale` argument to `arrange()`, and implement `dplyr_locale()` --- DESCRIPTION | 1 + NAMESPACE | 1 + R/arrange.R | 95 ++++++++++++++++++++--------- R/locale.R | 101 +++++++++++++++++++++++++++++++ man/arrange.Rd | 24 ++++++-- man/dplyr_locale.Rd | 66 ++++++++++++++++++++ tests/testthat/_snaps/arrange.md | 32 ++++++++++ tests/testthat/_snaps/locale.md | 20 ++++++ tests/testthat/test-arrange.r | 59 ++++++++++++++++-- tests/testthat/test-locale.R | 20 ++++++ 10 files changed, 380 insertions(+), 39 deletions(-) create mode 100644 R/locale.R create mode 100644 man/dplyr_locale.Rd create mode 100644 tests/testthat/_snaps/locale.md create mode 100644 tests/testthat/test-locale.R diff --git a/DESCRIPTION b/DESCRIPTION index 3a6795b00d..b2ca1a6c2b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,6 +60,7 @@ Suggests: RMySQL, RPostgreSQL, RSQLite, + stringi (>= 1.7.6), testthat (>= 3.1.1), tidyr, withr diff --git a/NAMESPACE b/NAMESPACE index d1d4d398a2..489e47fb7f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -270,6 +270,7 @@ export(distinct_prepare) export(do) export(do_) export(dplyr_col_modify) +export(dplyr_locale) export(dplyr_reconstruct) export(dplyr_row_slice) export(ends_with) diff --git a/R/arrange.R b/R/arrange.R index 55055a3399..efc3130084 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -10,10 +10,6 @@ #' once per data frame, not once per group. #' #' @details -#' ## Locales -#' The sort order for character vectors will depend on the collating sequence -#' of the locale in use: see [locales()]. -#' #' ## Missing values #' Unlike base sorting with `sort()`, `NA` are: #' * always sorted to the end for local data, even when wrapped with `desc()`. @@ -42,6 +38,21 @@ #' variables. Use [desc()] to sort a variable in descending order. #' @param .by_group If `TRUE`, will sort first by grouping variable. Applies to #' grouped data frames only. +#' @param .locale The locale to sort character vectors in. +#' +#' - Defaults to [dplyr_locale()], which uses the American English locale +#' if the stringi package is installed and the global default locale +#' has not been altered. See the help page for [dplyr_locale()] for the +#' exact details. +#' +#' - If a single string is supplied, then this will be used as the sorting +#' locale. For example, `"fr"` will sort with the French locale. This +#' requires the stringi package. Use [stringi::stri_locale_list()] to +#' generate a list of possible locale identifiers. +#' +#' - If `"C"` is supplied, then character vectors will be sorted in the C +#' locale. This does not require stringi and is often much faster than +#' supplying a locale identifier. #' @family single table verbs #' @examples #' arrange(mtcars, cyl, disp) @@ -68,25 +79,34 @@ arrange <- function(.data, ..., .by_group = FALSE) { UseMethod("arrange") } +#' @rdname arrange #' @export -arrange.data.frame <- function(.data, ..., .by_group = FALSE) { +arrange.data.frame <- function(.data, + ..., + .by_group = FALSE, + .locale = dplyr_locale()) { dots <- enquos(...) if (.by_group) { dots <- c(quos(!!!groups(.data)), dots) } - loc <- arrange_rows(.data, dots) + chr_proxy_collate <- locale_to_chr_proxy_collate(.locale) + + loc <- arrange_rows(.data, dots, chr_proxy_collate) dplyr_row_slice(.data, loc) } # Helpers ----------------------------------------------------------------- -arrange_rows <- function(.data, dots, error_call = caller_env()) { +arrange_rows <- function(data, + dots, + chr_proxy_collate, + error_call = caller_env()) { error_call <- dplyr_error_call(error_call) if (length(dots) == 0L) { - out <- seq_len(nrow(.data)) + out <- seq_len(nrow(data)) return(out) } @@ -94,6 +114,8 @@ arrange_rows <- function(.data, dots, error_call = caller_env()) { if(quo_is_call(quosure, "desc")) "desc" else "asc" }) + na_values <- if_else(directions == "desc", "smallest", "largest") + quosures <- map(dots, function(quosure) { if (quo_is_call(quosure, "desc", ns = c("", "dplyr"))) { expr <- quo_get_expr(quosure) @@ -117,7 +139,7 @@ arrange_rows <- function(.data, dots, error_call = caller_env()) { # revisit when we have something like mutate_one() to # evaluate one quosure in the data mask data <- withCallingHandlers({ - transmute(new_data_frame(.data), !!!quosures) + transmute(new_data_frame(data), !!!quosures) }, error = function(cnd) { if (inherits(cnd, "dplyr:::mutate_error")) { @@ -144,24 +166,43 @@ arrange_rows <- function(.data, dots, error_call = caller_env()) { }) - # we can't just use vec_compare_proxy(data) because we need to apply - # direction for each column, so we get a list of proxies instead - # and then mimic vctrs:::order_proxy - # - # should really be map2(quosures, directions, ...) - proxies <- map2(data, directions, function(column, direction) { - proxy <- vec_proxy_order(column) - desc <- identical(direction, "desc") - if (is.data.frame(proxy)) { - proxy <- order(vec_order(proxy, - direction = direction, - na_value = if(desc) "smallest" else "largest" - )) - } else if(desc) { - proxy <- desc(proxy) + vec_order_radix( + x = data, + direction = directions, + na_value = na_values, + chr_proxy_collate = chr_proxy_collate + ) +} + +locale_to_chr_proxy_collate <- function(locale, + ..., + has_stringi = has_minimum_stringi(), + error_call = caller_env()) { + check_dots_empty0(...) + + if (identical(locale, "C")) { + return(NULL) + } + + if (is_character(locale)) { + if (!is_string(locale)) { + abort("If `.locale` is a character vector, it must be a single string.", call = error_call) + } + if (!has_stringi) { + abort("stringi >=1.5.3 is required to arrange in a different locale.", call = error_call) + } + if (!locale %in% stringi::stri_locale_list()) { + abort("`.locale` must be one of the locales within `stringi::stri_locale_list()`.", call = error_call) } - proxy - }) - exec("order", !!!unname(proxies), decreasing = FALSE, na.last = TRUE) + return(sort_key_generator(locale)) + } + + abort("`.locale` must be a string.", call = error_call) +} + +sort_key_generator <- function(locale) { + function(x) { + stringi::stri_sort_key(x, locale = locale) + } } diff --git a/R/locale.R b/R/locale.R new file mode 100644 index 0000000000..1d64450c9f --- /dev/null +++ b/R/locale.R @@ -0,0 +1,101 @@ +#' Locale used by dplyr +#' +#' @description +#' `dplyr_locale()` returns a single string representing the default locale used +#' by dplyr when ordering character vectors. It is used as the default value of +#' `.locale` in [arrange()]. +#' +#' ## Default locale +#' +#' - If stringi >=1.5.3 is installed, the default locale is set to American +#' English, represented by the locale identifier `"en"`. +#' +#' - If stringi is not installed or is older than 1.5.3, the default locale +#' falls back to the C locale, represented by `"C"`. When this occurs, a +#' warning will be thrown encouraging you to either install stringi or +#' replace usage of `dplyr_locale()` with `"C"` to explicitly force the C +#' locale. +#' +#' ## Global override +#' +#' To override the above default behavior, you can set the global option, +#' `dplyr.locale`, to either `"C"` or a stringi locale identifier from +#' [stringi::stri_locale_list()] to globally alter the default locale. +#' Setting this option to anything other than `"C"` requires stringi >=1.5.3. +#' +#' We generally recommend that you set the `.locale` argument of [arrange()] +#' explicitly rather than overriding the global locale, if possible. +#' +#' Another alternative is to only change the global locale within a limited +#' scope through the use of [rlang::local_options()] or [rlang::with_options()]. +#' This can be useful when a package that you don't control calls `arrange()` +#' internally. +#' @export +#' @examplesIf dplyr:::has_minimum_stringi() +#' # Default locale is American English +#' dplyr_locale() +#' +#' # This Danish letter is typically sorted after `z` +#' df <- tibble(x = x <- c("o", "p", "\u00F8", "z")) +#' df +#' +#' # The American English locale sorts it right after `o` +#' arrange(df, x) +#' +#' # Explicitly override `.locale` to `"da"` for Danish ordering +#' arrange(df, x, .locale = "da") +#' +#' # Or temporarily override the `dplyr.locale` global option, which is useful +#' # if `arrange()` is called from a function you don't control +#' col_sorter <- function(df) { +#' arrange(df, x) +#' } +#' +#' rlang::with_options(dplyr.locale = "da", { +#' col_sorter(df) +#' }) +dplyr_locale <- function() { + locale <- peek_option("dplyr.locale") + + if (is_string(locale)) { + return(locale) + } + if (!is_null(locale)) { + abort("If set, the global option `dplyr.locale` must be a string.") + } + + dplyr_locale_default() +} + +dplyr_locale_default <- function(has_stringi = has_minimum_stringi()) { + if (has_stringi) { + "en" + } else { + warn_locale_fallback() + "C" + } +} + +warn_locale_fallback <- function() { + header <- paste0( + "`dplyr_locale()` attempted to default to the American English locale (\"en\"), ", + "but the required package, stringi >=1.5.3, is not installed." + ) + + bullets <- c( + i = "Falling back to the C locale.", + i = paste0( + "Silence this warning by installing stringi or by ", + "explicitly replacing usage of `dplyr_locale()` with \"C\"." + ) + ) + + warn( + message = c(header, bullets), + class = "dplyr_warn_locale_fallback" + ) +} + +has_minimum_stringi <- function() { + is_installed("stringi", version = "1.5.3") +} diff --git a/man/arrange.Rd b/man/arrange.Rd index 132ec0bf9f..e2dbfbbcca 100644 --- a/man/arrange.Rd +++ b/man/arrange.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/arrange.R \name{arrange} \alias{arrange} +\alias{arrange.data.frame} \title{Arrange rows by column values} \usage{ arrange(.data, ..., .by_group = FALSE) + +\method{arrange}{data.frame}(.data, ..., .by_group = FALSE, .locale = dplyr_locale()) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -16,6 +19,21 @@ variables. Use \code{\link[=desc]{desc()}} to sort a variable in descending orde \item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to grouped data frames only.} + +\item{.locale}{The locale to sort character vectors in. +\itemize{ +\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the American English locale +if the stringi package is installed and the global default locale +has not been altered. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the +exact details. +\item If a single string is supplied, then this will be used as the sorting +locale. For example, \code{"fr"} will sort with the French locale. This +requires the stringi package. Use \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to +generate a list of possible locale identifiers. +\item If \code{"C"} is supplied, then character vectors will be sorted in the C +locale. This does not require stringi and is often much faster than +supplying a locale identifier. +}} } \value{ An object of the same type as \code{.data}. The output has the following @@ -37,12 +55,6 @@ in order to group by them, and functions of variables are evaluated once per data frame, not once per group. } \details{ -\subsection{Locales}{ - -The sort order for character vectors will depend on the collating sequence -of the locale in use: see \code{\link[=locales]{locales()}}. -} - \subsection{Missing values}{ Unlike base sorting with \code{sort()}, \code{NA} are: diff --git a/man/dplyr_locale.Rd b/man/dplyr_locale.Rd new file mode 100644 index 0000000000..cf322684cd --- /dev/null +++ b/man/dplyr_locale.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locale.R +\name{dplyr_locale} +\alias{dplyr_locale} +\title{Locale used by dplyr} +\usage{ +dplyr_locale() +} +\description{ +\code{dplyr_locale()} returns a single string representing the default locale used +by dplyr when ordering character vectors. It is used as the default value of +\code{.locale} in \code{\link[=arrange]{arrange()}}. +\subsection{Default locale}{ +\itemize{ +\item If stringi >=1.5.3 is installed, the default locale is set to American +English, represented by the locale identifier \code{"en"}. +\item If stringi is not installed or is older than 1.5.3, the default locale +falls back to the C locale, represented by \code{"C"}. When this occurs, a +warning will be thrown encouraging you to either install stringi or +replace usage of \code{dplyr_locale()} with \code{"C"} to explicitly force the C +locale. +} +} + +\subsection{Global override}{ + +To override the above default behavior, you can set the global option, +\code{dplyr.locale}, to either \code{"C"} or a stringi locale identifier from +\code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to globally alter the default locale. +Setting this option to anything other than \code{"C"} requires stringi >=1.5.3. + +We generally recommend that you set the \code{.locale} argument of \code{\link[=arrange]{arrange()}} +explicitly rather than overriding the global locale, if possible. + +Another alternative is to only change the global locale within a limited +scope through the use of \code{\link[rlang:local_options]{rlang::local_options()}} or \code{\link[rlang:local_options]{rlang::with_options()}}. +This can be useful when a package that you don't control calls \code{arrange()} +internally. +} +} +\examples{ +\dontshow{if (dplyr:::has_minimum_stringi()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Default locale is American English +dplyr_locale() + +# This Danish letter is typically sorted after `z` +df <- tibble(x = x <- c("o", "p", "\u00F8", "z")) +df + +# The American English locale sorts it right after `o` +arrange(df, x) + +# Explicitly override `.locale` to `"da"` for Danish ordering +arrange(df, x, .locale = "da") + +# Or temporarily override the `dplyr.locale` global option, which is useful +# if `arrange()` is called from a function you don't control +col_sorter <- function(df) { + arrange(df, x) +} + +rlang::with_options(dplyr.locale = "da", { + col_sorter(df) +}) +\dontshow{\}) # examplesIf} +} diff --git a/tests/testthat/_snaps/arrange.md b/tests/testthat/_snaps/arrange.md index 43e2ec42c4..6d23a5e50c 100644 --- a/tests/testthat/_snaps/arrange.md +++ b/tests/testthat/_snaps/arrange.md @@ -26,6 +26,38 @@ x Problem while computing `..1 = rep(x, 2)`. x `..1` must be size 1, not 2. +# arrange errors if stringi is not installed and a locale identifier is used + + Code + locale_to_chr_proxy_collate("fr", has_stringi = FALSE) + Condition + Error: + ! stringi >=1.5.3 is required to arrange in a different locale. + +# arrange validates `.locale` + + Code + arrange(df, .locale = 1) + Condition + Error in `arrange()`: + ! `.locale` must be a string. + +--- + + Code + arrange(df, .locale = c("en_US", "fr_BF")) + Condition + Error in `arrange()`: + ! If `.locale` is a character vector, it must be a single string. + +--- + + Code + arrange(df, .locale = "x") + Condition + Error in `arrange()`: + ! `.locale` must be one of the locales within `stringi::stri_locale_list()`. + # desc() inside arrange() checks the number of arguments (#5921) Code diff --git a/tests/testthat/_snaps/locale.md b/tests/testthat/_snaps/locale.md new file mode 100644 index 0000000000..99aeea8eb1 --- /dev/null +++ b/tests/testthat/_snaps/locale.md @@ -0,0 +1,20 @@ +# `dplyr_locale()` respects `dplyr.locale` + + Code + dplyr_locale() + Condition + Error in `dplyr_locale()`: + ! If set, the global option `dplyr.locale` must be a string. + +# `dplyr_locale()` falls back to the C locale with a warning if stringi is not available + + Code + dplyr_locale_default(has_stringi = FALSE) + Condition + Warning: + `dplyr_locale()` attempted to default to the American English locale ("en"), but the required package, stringi >=1.5.3, is not installed. + i Falling back to the C locale. + i Silence this warning by installing stringi or by explicitly replacing usage of `dplyr_locale()` with "C". + Output + [1] "C" + diff --git a/tests/testthat/test-arrange.r b/tests/testthat/test-arrange.r index d022904046..7b7fcb2aee 100644 --- a/tests/testthat/test-arrange.r +++ b/tests/testthat/test-arrange.r @@ -82,14 +82,61 @@ test_that("arrange handles S4 classes (#1105)", { expect_equal(arrange(df, y), df[3:1, ]) }) -test_that("arrange respects locale (#1280)", { - df2 <- tibble(words = c("casa", "\u00e1rbol", "zona", "\u00f3rgano")) +# locale -------------------------------------------------------------- - res <- df2 %>% arrange(words) - expect_equal(res$words, sort(df2$words)) +test_that("arrange defaults to the American English locale if stringi is installed", { + skip_if_not_installed("stringi", "1.5.3") - res <- df2 %>% arrange(desc(words)) - expect_equal(res$words, sort(df2$words, decreasing = TRUE)) + x <- c("A", "a", "b", "B") + df <- tibble(x = x) + + res <- arrange(df, x) + expect_identical(res$x, c("a", "A", "b", "B")) + + res <- arrange(df, desc(x)) + expect_identical(res$x, rev(c("a", "A", "b", "B"))) +}) + +test_that("locale can be set to C", { + x <- c("A", "a", "b", "B") + df <- tibble(x = x) + + res <- arrange(df, x, .locale = "C") + expect_identical(res$x, c("A", "B", "a", "b")) +}) + +test_that("non-English locales can be used", { + skip_if_not_installed("stringi", "1.5.3") + + # Danish `o` with `/` through it sorts after `z` + x <- c("o", "\u00F8", "p", "z") + df <- tibble(x = x) + + res <- arrange(df, x) + expect_identical(res$x, x) + + res <- arrange(df, x, .locale = "da") + expect_identical(res$x, x[c(1, 3, 4, 2)]) +}) + +test_that("arrange errors if stringi is not installed and a locale identifier is used", { + expect_snapshot(error = TRUE, { + locale_to_chr_proxy_collate("fr", has_stringi = FALSE) + }) +}) + +test_that("arrange validates `.locale`", { + df <- tibble() + + expect_snapshot(error = TRUE, { + arrange(df, .locale = 1) + }) + expect_snapshot(error = TRUE, { + arrange(df, .locale = c("en_US", "fr_BF")) + }) + expect_snapshot(error = TRUE, { + arrange(df, .locale = "x") + }) }) # data ---------------------------------------------------------------- diff --git a/tests/testthat/test-locale.R b/tests/testthat/test-locale.R new file mode 100644 index 0000000000..50271a6cd0 --- /dev/null +++ b/tests/testthat/test-locale.R @@ -0,0 +1,20 @@ +test_that("`dplyr_locale()` uses 'en' if stringi is available", { + skip_if_not_installed("stringi", "1.5.3") + + expect_identical(dplyr_locale(), "en") +}) + +test_that("`dplyr_locale()` respects `dplyr.locale`", { + local_options(dplyr.locale = "fr") + expect_identical(dplyr_locale(), "fr") + + local_options(dplyr.locale = 1) + expect_snapshot(error = TRUE, { + dplyr_locale() + }) +}) + +test_that("`dplyr_locale()` falls back to the C locale with a warning if stringi is not available", { + expect_snapshot(dplyr_locale_default(has_stringi = FALSE)) + expect_warning(dplyr_locale_default(has_stringi = FALSE), class = "dplyr_warn_locale_fallback") +}) From 6f7612e638276e59120d2b2696c623a2e0cbc6e0 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 11:52:21 -0400 Subject: [PATCH 4/9] Give scoped `arrange()` variants a `.locale` argument Even though these are superseded, this should help ease the transition a little, since without this argument it would be difficult to choose a different locale, and if stringi wasn't installed then you'd unconditionally get a warning you couldn't silence --- R/colwise-arrange.R | 26 +++++++++++++---- man/arrange_all.Rd | 41 +++++++++++++++++++++++++-- tests/testthat/test-colwise-arrange.R | 17 +++++++++++ 3 files changed, 75 insertions(+), 9 deletions(-) diff --git a/R/colwise-arrange.R b/R/colwise-arrange.R index 73f5752e45..467db0a094 100644 --- a/R/colwise-arrange.R +++ b/R/colwise-arrange.R @@ -29,31 +29,45 @@ #' arrange_all(df, desc) #' # -> #' arrange(df, across(everything(), desc)) -arrange_all <- function(.tbl, .funs = list(), ..., .by_group = FALSE) { +arrange_all <- function(.tbl, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale()) { lifecycle::signal_stage("superseded", "arrange_all()") funs <- manip_all(.tbl, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_all") if (!length(funs)) { funs <- syms(tbl_vars(.tbl)) } - arrange(.tbl, !!!funs, .by_group = .by_group) + arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } #' @rdname arrange_all #' @export -arrange_at <- function(.tbl, .vars, .funs = list(), ..., .by_group = FALSE) { +arrange_at <- function(.tbl, + .vars, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale()) { lifecycle::signal_stage("superseded", "arrange_at()") funs <- manip_at(.tbl, .vars, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_at") if (!length(funs)) { funs <- tbl_at_syms(.tbl, .vars, .include_group_vars = TRUE) } - arrange(.tbl, !!!funs, .by_group = .by_group) + arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } #' @rdname arrange_all #' @export -arrange_if <- function(.tbl, .predicate, .funs = list(), ..., .by_group = FALSE) { +arrange_if <- function(.tbl, + .predicate, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale()) { lifecycle::signal_stage("superseded", "arrange_if()") funs <- manip_if(.tbl, .predicate, .funs, enquo(.funs), caller_env(), .include_group_vars = TRUE, ..., .caller = "arrange_if") if (!length(funs)) { funs <- tbl_if_syms(.tbl, .predicate, .include_group_vars = TRUE) } - arrange(.tbl, !!!funs, .by_group = .by_group) + arrange(.tbl, !!!funs, .by_group = .by_group, .locale = .locale) } diff --git a/man/arrange_all.Rd b/man/arrange_all.Rd index 4e139aff4a..3dd8d111d9 100644 --- a/man/arrange_all.Rd +++ b/man/arrange_all.Rd @@ -6,11 +6,31 @@ \alias{arrange_if} \title{Arrange rows by a selection of variables} \usage{ -arrange_all(.tbl, .funs = list(), ..., .by_group = FALSE) +arrange_all( + .tbl, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale() +) -arrange_at(.tbl, .vars, .funs = list(), ..., .by_group = FALSE) +arrange_at( + .tbl, + .vars, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale() +) -arrange_if(.tbl, .predicate, .funs = list(), ..., .by_group = FALSE) +arrange_if( + .tbl, + .predicate, + .funs = list(), + ..., + .by_group = FALSE, + .locale = dplyr_locale() +) } \arguments{ \item{.tbl}{A \code{tbl} object.} @@ -23,6 +43,21 @@ arrange_if(.tbl, .predicate, .funs = list(), ..., .by_group = FALSE) \item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to grouped data frames only.} +\item{.locale}{The locale to sort character vectors in. +\itemize{ +\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the American English locale +if the stringi package is installed and the global default locale +has not been altered. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the +exact details. +\item If a single string is supplied, then this will be used as the sorting +locale. For example, \code{"fr"} will sort with the French locale. This +requires the stringi package. Use \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to +generate a list of possible locale identifiers. +\item If \code{"C"} is supplied, then character vectors will be sorted in the C +locale. This does not require stringi and is often much faster than +supplying a locale identifier. +}} + \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column positions, or \code{NULL}.} diff --git a/tests/testthat/test-colwise-arrange.R b/tests/testthat/test-colwise-arrange.R index 8beaa1b7b8..f698570412 100644 --- a/tests/testthat/test-colwise-arrange.R +++ b/tests/testthat/test-colwise-arrange.R @@ -65,3 +65,20 @@ test_that("scoped arrange respect .by_group (#3245)",{ arrange(d, cyl, mpg, disp) ) }) + +test_that("scoped `arrange()` respects `.locale`", { + df <- tibble(x = c("A", "a", "b", "B")) + + expect_identical( + arrange_all(df, .locale = "C"), + arrange(df, x, .locale = "C") + ) + expect_identical( + arrange_if(df, is.character, .locale = "C"), + arrange(df, x, .locale = "C") + ) + expect_identical( + arrange_at(df, vars(x), .locale = "C"), + arrange(df, x, .locale = "C") + ) +}) From 42d928b12798074f65e7150e3913e2a78a39fd7c Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 11:53:21 -0400 Subject: [PATCH 5/9] NEWS bullet --- NEWS.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4143c1ec24..bed32a1f78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ # dplyr (development version) +* `arrange()` now uses a faster algorithm for sorting character vectors, + which is heavily inspired by data.table's `forder()`. Additionally, the + default locale is now American English, which is a breaking change from + the previous behavior which utilized the system locale. The new `.locale` + argument can be used to adjust this. For a fuller explanation, refer to this + [tidyup](https://github.com/tidyverse/tidyups/blob/main/003-dplyr-radix-ordering.md) + which outlines and justifies this change (#4962). + * `tbl_sum()` is no longer reexported from tibble (#6284). * `slice_sample()` now gives a more informative error when `replace = FALSE` and From 5f78c31b1ea4b33da83bee4ab20568796217f397 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 10 May 2022 12:27:46 -0400 Subject: [PATCH 6/9] Mark `dplyr_locale()` documentation as internal With the intention being that it shouldn't show up on the pkgdown site reference page. We expect most people to get to this page through `arrange()`. --- R/locale.R | 1 + man/dplyr_locale.Rd | 1 + 2 files changed, 2 insertions(+) diff --git a/R/locale.R b/R/locale.R index 1d64450c9f..38577f3b91 100644 --- a/R/locale.R +++ b/R/locale.R @@ -31,6 +31,7 @@ #' This can be useful when a package that you don't control calls `arrange()` #' internally. #' @export +#' @keywords internal #' @examplesIf dplyr:::has_minimum_stringi() #' # Default locale is American English #' dplyr_locale() diff --git a/man/dplyr_locale.Rd b/man/dplyr_locale.Rd index cf322684cd..6590984caa 100644 --- a/man/dplyr_locale.Rd +++ b/man/dplyr_locale.Rd @@ -64,3 +64,4 @@ rlang::with_options(dplyr.locale = "da", { }) \dontshow{\}) # examplesIf} } +\keyword{internal} From 0221c59ab0530bc4bf1a5f752b305e11dbd61d0b Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Fri, 13 May 2022 10:40:38 -0400 Subject: [PATCH 7/9] Default `dplyr_locale()` to C This is reproducible across all R sessions and OSes, and is much faster, which makes it a good default. It will also make the default of `arrange()` continue to align with what `group_by() + summarize()` returns, since that will also unconditionally use the C locale. --- NEWS.md | 17 +++--- R/arrange.R | 21 ++++---- R/locale.R | 88 +++++++++++++++----------------- man/arrange.Rd | 23 +++++---- man/arrange_all.Rd | 23 +++++---- man/dplyr_locale.Rd | 62 +++++++++++++++------- tests/testthat/_snaps/arrange.md | 2 +- tests/testthat/_snaps/locale.md | 12 ----- tests/testthat/test-arrange.r | 40 +++++++++++---- tests/testthat/test-locale.R | 11 +--- 10 files changed, 165 insertions(+), 134 deletions(-) diff --git a/NEWS.md b/NEWS.md index bed32a1f78..bc4aa8531f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,15 @@ # dplyr (development version) -* `arrange()` now uses a faster algorithm for sorting character vectors, - which is heavily inspired by data.table's `forder()`. Additionally, the - default locale is now American English, which is a breaking change from - the previous behavior which utilized the system locale. The new `.locale` - argument can be used to adjust this. For a fuller explanation, refer to this - [tidyup](https://github.com/tidyverse/tidyups/blob/main/003-dplyr-radix-ordering.md) - which outlines and justifies this change (#4962). +* `arrange()` now uses a faster algorithm for sorting character vectors, which + is heavily inspired by data.table's `forder()`. Additionally, the default + locale for sorting character vectors is now the C locale, which is a breaking + change from the previous behavior that utilized the system locale. The new + `.locale` argument can be used to adjust this to, say, the American English + locale, which is an optional feature that requires the stringi package. This + change improves reproducibility across R sessions and operating systems. For a + fuller explanation, refer to this + [tidyup](https://github.com/tidyverse/tidyups/blob/main/003-dplyr-radix-ordering.md) + which outlines and justifies this change (#4962). * `tbl_sum()` is no longer reexported from tibble (#6284). diff --git a/R/arrange.R b/R/arrange.R index efc3130084..670b12652e 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -40,19 +40,22 @@ #' grouped data frames only. #' @param .locale The locale to sort character vectors in. #' -#' - Defaults to [dplyr_locale()], which uses the American English locale -#' if the stringi package is installed and the global default locale -#' has not been altered. See the help page for [dplyr_locale()] for the +#' - Defaults to [dplyr_locale()], which uses the `"C"` locale unless this is +#' explicitly overriden. See the help page for [dplyr_locale()] for the #' exact details. #' -#' - If a single string is supplied, then this will be used as the sorting -#' locale. For example, `"fr"` will sort with the French locale. This -#' requires the stringi package. Use [stringi::stri_locale_list()] to -#' generate a list of possible locale identifiers. +#' - If a single string from [stringi::stri_locale_list()] is supplied, then +#' this will be used as the locale to sort with. For example, `"en"` will +#' sort with the American English locale. This requires the stringi package. #' -#' - If `"C"` is supplied, then character vectors will be sorted in the C -#' locale. This does not require stringi and is often much faster than +#' - If `"C"` is supplied, then character vectors will always be sorted in the +#' C locale. This does not require stringi and is often much faster than #' supplying a locale identifier. +#' +#' The C locale is not the same as English locales, such as `"en"`, +#' particularly when it comes to case-sensitivity. This is explained in +#' more detail in the help page of [dplyr_locale()] under the `Default locale` +#' section. #' @family single table verbs #' @examples #' arrange(mtcars, cyl, disp) diff --git a/R/locale.R b/R/locale.R index 38577f3b91..9b76e7c0d7 100644 --- a/R/locale.R +++ b/R/locale.R @@ -7,21 +7,24 @@ #' #' ## Default locale #' -#' - If stringi >=1.5.3 is installed, the default locale is set to American -#' English, represented by the locale identifier `"en"`. +#' The default locale returned by `dplyr_locale()` is the C locale, identical +#' to explicitly supplying `.locale = "C"`. #' -#' - If stringi is not installed or is older than 1.5.3, the default locale -#' falls back to the C locale, represented by `"C"`. When this occurs, a -#' warning will be thrown encouraging you to either install stringi or -#' replace usage of `dplyr_locale()` with `"C"` to explicitly force the C -#' locale. +#' The C locale is not exactly the same as English locales, such as `"en"`. The +#' main difference is that the C locale groups the English alphabet by _case_, +#' while most English locales group the alphabet by _letter_. For example, +#' `c("a", "b", "C", "B", "c")` will sort as `c("B", "C", "a", "b", "c")` in the +#' C locale, with all uppercase letters coming before lowercase letters, but +#' will sort as `c("a", "b", "B", "c", "C")` in an English locale. This often +#' makes little practical difference during data analysis, because both return +#' identical results when case is consistent between observations. #' #' ## Global override #' #' To override the above default behavior, you can set the global option, -#' `dplyr.locale`, to either `"C"` or a stringi locale identifier from -#' [stringi::stri_locale_list()] to globally alter the default locale. -#' Setting this option to anything other than `"C"` requires stringi >=1.5.3. +#' `dplyr.locale`, to a stringi locale identifier from +#' [stringi::stri_locale_list()] to globally alter the default locale. This +#' requires stringi >=1.5.3. #' #' We generally recommend that you set the `.locale` argument of [arrange()] #' explicitly rather than overriding the global locale, if possible. @@ -30,21 +33,31 @@ #' scope through the use of [rlang::local_options()] or [rlang::with_options()]. #' This can be useful when a package that you don't control calls `arrange()` #' internally. +#' +#' ## Reproducibility +#' +#' The C locale has the benefit of being completely reproducible across all +#' supported R versions and operating systems with no extra effort. +#' +#' If you set `.locale` to an option from [stringi::stri_locale_list()], then +#' stringi must be installed by anyone who wants to run your code. If you +#' utilize this in a package, then stringi should be placed in `Imports`. #' @export #' @keywords internal #' @examplesIf dplyr:::has_minimum_stringi() -#' # Default locale is American English +#' # Default locale is C #' dplyr_locale() #' -#' # This Danish letter is typically sorted after `z` -#' df <- tibble(x = x <- c("o", "p", "\u00F8", "z")) +#' df <- tibble(x = c("a", "b", "C", "B", "c")) #' df #' -#' # The American English locale sorts it right after `o` +#' # The C locale groups the English alphabet by case, placing uppercase letters +#' # before lowercase letters. This is the default. #' arrange(df, x) #' -#' # Explicitly override `.locale` to `"da"` for Danish ordering -#' arrange(df, x, .locale = "da") +#' # The American English locale groups the alphabet by letter. +#' # Explicitly override `.locale` with `"en"` for this ordering. +#' arrange(df, x, .locale = "en") #' #' # Or temporarily override the `dplyr.locale` global option, which is useful #' # if `arrange()` is called from a function you don't control @@ -52,9 +65,19 @@ #' arrange(df, x) #' } #' -#' rlang::with_options(dplyr.locale = "da", { +#' rlang::with_options(dplyr.locale = "en", { #' col_sorter(df) #' }) +#' +#' # This Danish letter is expected to sort after `z` +#' df <- tibble(x = c("o", "p", "\u00F8", "z")) +#' df +#' +#' # The American English locale sorts it right after `o` +#' arrange(df, x, .locale = "en") +#' +#' # Using `"da"` for Danish ordering gives the expected result +#' arrange(df, x, .locale = "da") dplyr_locale <- function() { locale <- peek_option("dplyr.locale") @@ -65,36 +88,7 @@ dplyr_locale <- function() { abort("If set, the global option `dplyr.locale` must be a string.") } - dplyr_locale_default() -} - -dplyr_locale_default <- function(has_stringi = has_minimum_stringi()) { - if (has_stringi) { - "en" - } else { - warn_locale_fallback() - "C" - } -} - -warn_locale_fallback <- function() { - header <- paste0( - "`dplyr_locale()` attempted to default to the American English locale (\"en\"), ", - "but the required package, stringi >=1.5.3, is not installed." - ) - - bullets <- c( - i = "Falling back to the C locale.", - i = paste0( - "Silence this warning by installing stringi or by ", - "explicitly replacing usage of `dplyr_locale()` with \"C\"." - ) - ) - - warn( - message = c(header, bullets), - class = "dplyr_warn_locale_fallback" - ) + "C" } has_minimum_stringi <- function() { diff --git a/man/arrange.Rd b/man/arrange.Rd index e2dbfbbcca..87534d826f 100644 --- a/man/arrange.Rd +++ b/man/arrange.Rd @@ -22,18 +22,21 @@ grouped data frames only.} \item{.locale}{The locale to sort character vectors in. \itemize{ -\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the American English locale -if the stringi package is installed and the global default locale -has not been altered. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the +\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the \code{"C"} locale unless this is +explicitly overriden. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the exact details. -\item If a single string is supplied, then this will be used as the sorting -locale. For example, \code{"fr"} will sort with the French locale. This -requires the stringi package. Use \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to -generate a list of possible locale identifiers. -\item If \code{"C"} is supplied, then character vectors will be sorted in the C -locale. This does not require stringi and is often much faster than +\item If a single string from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} is supplied, then +this will be used as the locale to sort with. For example, \code{"en"} will +sort with the American English locale. This requires the stringi package. +\item If \code{"C"} is supplied, then character vectors will always be sorted in the +C locale. This does not require stringi and is often much faster than supplying a locale identifier. -}} +} + +The C locale is not the same as English locales, such as \code{"en"}, +particularly when it comes to case-sensitivity. This is explained in +more detail in the help page of \code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} +section.} } \value{ An object of the same type as \code{.data}. The output has the following diff --git a/man/arrange_all.Rd b/man/arrange_all.Rd index 3dd8d111d9..1d35075e05 100644 --- a/man/arrange_all.Rd +++ b/man/arrange_all.Rd @@ -45,18 +45,21 @@ grouped data frames only.} \item{.locale}{The locale to sort character vectors in. \itemize{ -\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the American English locale -if the stringi package is installed and the global default locale -has not been altered. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the +\item Defaults to \code{\link[=dplyr_locale]{dplyr_locale()}}, which uses the \code{"C"} locale unless this is +explicitly overriden. See the help page for \code{\link[=dplyr_locale]{dplyr_locale()}} for the exact details. -\item If a single string is supplied, then this will be used as the sorting -locale. For example, \code{"fr"} will sort with the French locale. This -requires the stringi package. Use \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to -generate a list of possible locale identifiers. -\item If \code{"C"} is supplied, then character vectors will be sorted in the C -locale. This does not require stringi and is often much faster than +\item If a single string from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} is supplied, then +this will be used as the locale to sort with. For example, \code{"en"} will +sort with the American English locale. This requires the stringi package. +\item If \code{"C"} is supplied, then character vectors will always be sorted in the +C locale. This does not require stringi and is often much faster than supplying a locale identifier. -}} +} + +The C locale is not the same as English locales, such as \code{"en"}, +particularly when it comes to case-sensitivity. This is explained in +more detail in the help page of \code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} +section.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column diff --git a/man/dplyr_locale.Rd b/man/dplyr_locale.Rd index 6590984caa..63b412c8f7 100644 --- a/man/dplyr_locale.Rd +++ b/man/dplyr_locale.Rd @@ -11,23 +11,26 @@ dplyr_locale() by dplyr when ordering character vectors. It is used as the default value of \code{.locale} in \code{\link[=arrange]{arrange()}}. \subsection{Default locale}{ -\itemize{ -\item If stringi >=1.5.3 is installed, the default locale is set to American -English, represented by the locale identifier \code{"en"}. -\item If stringi is not installed or is older than 1.5.3, the default locale -falls back to the C locale, represented by \code{"C"}. When this occurs, a -warning will be thrown encouraging you to either install stringi or -replace usage of \code{dplyr_locale()} with \code{"C"} to explicitly force the C -locale. -} + +The default locale returned by \code{dplyr_locale()} is the C locale, identical +to explicitly supplying \code{.locale = "C"}. + +The C locale is not exactly the same as English locales, such as \code{"en"}. The +main difference is that the C locale groups the English alphabet by \emph{case}, +while most English locales group the alphabet by \emph{letter}. For example, +\code{c("a", "b", "C", "B", "c")} will sort as \code{c("B", "C", "a", "b", "c")} in the +C locale, with all uppercase letters coming before lowercase letters, but +will sort as \code{c("a", "b", "B", "c", "C")} in an English locale. This often +makes little practical difference during data analysis, because both return +identical results when case is consistent between observations. } \subsection{Global override}{ To override the above default behavior, you can set the global option, -\code{dplyr.locale}, to either \code{"C"} or a stringi locale identifier from -\code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to globally alter the default locale. -Setting this option to anything other than \code{"C"} requires stringi >=1.5.3. +\code{dplyr.locale}, to a stringi locale identifier from +\code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} to globally alter the default locale. This +requires stringi >=1.5.3. We generally recommend that you set the \code{.locale} argument of \code{\link[=arrange]{arrange()}} explicitly rather than overriding the global locale, if possible. @@ -37,21 +40,32 @@ scope through the use of \code{\link[rlang:local_options]{rlang::local_options() This can be useful when a package that you don't control calls \code{arrange()} internally. } + +\subsection{Reproducibility}{ + +The C locale has the benefit of being completely reproducible across all +supported R versions and operating systems with no extra effort. + +If you set \code{.locale} to an option from \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}}, then +stringi must be installed by anyone who wants to run your code. If you +utilize this in a package, then stringi should be placed in \code{Imports}. +} } \examples{ \dontshow{if (dplyr:::has_minimum_stringi()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -# Default locale is American English +# Default locale is C dplyr_locale() -# This Danish letter is typically sorted after `z` -df <- tibble(x = x <- c("o", "p", "\u00F8", "z")) +df <- tibble(x = c("a", "b", "C", "B", "c")) df -# The American English locale sorts it right after `o` +# The C locale groups the English alphabet by case, placing uppercase letters +# before lowercase letters. This is the default. arrange(df, x) -# Explicitly override `.locale` to `"da"` for Danish ordering -arrange(df, x, .locale = "da") +# The American English locale groups the alphabet by letter. +# Explicitly override `.locale` with `"en"` for this ordering. +arrange(df, x, .locale = "en") # Or temporarily override the `dplyr.locale` global option, which is useful # if `arrange()` is called from a function you don't control @@ -59,9 +73,19 @@ col_sorter <- function(df) { arrange(df, x) } -rlang::with_options(dplyr.locale = "da", { +rlang::with_options(dplyr.locale = "en", { col_sorter(df) }) + +# This Danish letter is expected to sort after `z` +df <- tibble(x = c("o", "p", "\u00F8", "z")) +df + +# The American English locale sorts it right after `o` +arrange(df, x, .locale = "en") + +# Using `"da"` for Danish ordering gives the expected result +arrange(df, x, .locale = "da") \dontshow{\}) # examplesIf} } \keyword{internal} diff --git a/tests/testthat/_snaps/arrange.md b/tests/testthat/_snaps/arrange.md index 6d23a5e50c..ed1f501106 100644 --- a/tests/testthat/_snaps/arrange.md +++ b/tests/testthat/_snaps/arrange.md @@ -50,7 +50,7 @@ Error in `arrange()`: ! If `.locale` is a character vector, it must be a single string. ---- +# arrange validates that `.locale` must be one from stringi Code arrange(df, .locale = "x") diff --git a/tests/testthat/_snaps/locale.md b/tests/testthat/_snaps/locale.md index 99aeea8eb1..283d5e2c0f 100644 --- a/tests/testthat/_snaps/locale.md +++ b/tests/testthat/_snaps/locale.md @@ -6,15 +6,3 @@ Error in `dplyr_locale()`: ! If set, the global option `dplyr.locale` must be a string. -# `dplyr_locale()` falls back to the C locale with a warning if stringi is not available - - Code - dplyr_locale_default(has_stringi = FALSE) - Condition - Warning: - `dplyr_locale()` attempted to default to the American English locale ("en"), but the required package, stringi >=1.5.3, is not installed. - i Falling back to the C locale. - i Silence this warning by installing stringi or by explicitly replacing usage of `dplyr_locale()` with "C". - Output - [1] "C" - diff --git a/tests/testthat/test-arrange.r b/tests/testthat/test-arrange.r index 7b7fcb2aee..4ff67a7e6b 100644 --- a/tests/testthat/test-arrange.r +++ b/tests/testthat/test-arrange.r @@ -84,41 +84,54 @@ test_that("arrange handles S4 classes (#1105)", { # locale -------------------------------------------------------------- -test_that("arrange defaults to the American English locale if stringi is installed", { - skip_if_not_installed("stringi", "1.5.3") - +test_that("arrange defaults to the C locale", { x <- c("A", "a", "b", "B") df <- tibble(x = x) res <- arrange(df, x) - expect_identical(res$x, c("a", "A", "b", "B")) + expect_identical(res$x, c("A", "B", "a", "b")) res <- arrange(df, desc(x)) - expect_identical(res$x, rev(c("a", "A", "b", "B"))) + expect_identical(res$x, rev(c("A", "B", "a", "b"))) }) -test_that("locale can be set to C", { +test_that("locale can be set to an English locale", { + skip_if_not_installed("stringi", "1.5.3") + x <- c("A", "a", "b", "B") df <- tibble(x = x) - res <- arrange(df, x, .locale = "C") - expect_identical(res$x, c("A", "B", "a", "b")) + res <- arrange(df, x, .locale = "en") + expect_identical(res$x, c("a", "A", "b", "B")) }) test_that("non-English locales can be used", { skip_if_not_installed("stringi", "1.5.3") - # Danish `o` with `/` through it sorts after `z` + # Danish `o` with `/` through it sorts after `z` in Danish locale x <- c("o", "\u00F8", "p", "z") df <- tibble(x = x) - res <- arrange(df, x) + # American English locale puts it right after `o` + res <- arrange(df, x, .locale = "en") expect_identical(res$x, x) res <- arrange(df, x, .locale = "da") expect_identical(res$x, x[c(1, 3, 4, 2)]) }) +test_that("arrange respects the `dplyr.locale` global option", { + skip_if_not_installed("stringi", "1.5.3") + + local_options(dplyr.locale = "da") + + x <- c("o", "\u00F8", "p", "z") + df <- tibble(x = x) + + res <- arrange(df, x) + expect_identical(res$x, x[c(1, 3, 4, 2)]) +}) + test_that("arrange errors if stringi is not installed and a locale identifier is used", { expect_snapshot(error = TRUE, { locale_to_chr_proxy_collate("fr", has_stringi = FALSE) @@ -134,6 +147,13 @@ test_that("arrange validates `.locale`", { expect_snapshot(error = TRUE, { arrange(df, .locale = c("en_US", "fr_BF")) }) +}) + +test_that("arrange validates that `.locale` must be one from stringi", { + skip_if_not_installed("stringi", "1.5.3") + + df <- tibble() + expect_snapshot(error = TRUE, { arrange(df, .locale = "x") }) diff --git a/tests/testthat/test-locale.R b/tests/testthat/test-locale.R index 50271a6cd0..a226a03e1f 100644 --- a/tests/testthat/test-locale.R +++ b/tests/testthat/test-locale.R @@ -1,7 +1,5 @@ -test_that("`dplyr_locale()` uses 'en' if stringi is available", { - skip_if_not_installed("stringi", "1.5.3") - - expect_identical(dplyr_locale(), "en") +test_that("`dplyr_locale()` uses the C locale by default", { + expect_identical(dplyr_locale(), "C") }) test_that("`dplyr_locale()` respects `dplyr.locale`", { @@ -13,8 +11,3 @@ test_that("`dplyr_locale()` respects `dplyr.locale`", { dplyr_locale() }) }) - -test_that("`dplyr_locale()` falls back to the C locale with a warning if stringi is not available", { - expect_snapshot(dplyr_locale_default(has_stringi = FALSE)) - expect_warning(dplyr_locale_default(has_stringi = FALSE), class = "dplyr_warn_locale_fallback") -}) From ea54714bb14ec14e65aeae6f79860d68108a8034 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 13 Jun 2022 12:30:18 -0400 Subject: [PATCH 8/9] Avoid saying "case-sensitivity" because that is a little confusing --- R/arrange.R | 6 +++--- man/arrange.Rd | 6 +++--- man/arrange_all.Rd | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/arrange.R b/R/arrange.R index 670b12652e..9730f6c5f1 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -53,9 +53,9 @@ #' supplying a locale identifier. #' #' The C locale is not the same as English locales, such as `"en"`, -#' particularly when it comes to case-sensitivity. This is explained in -#' more detail in the help page of [dplyr_locale()] under the `Default locale` -#' section. +#' particularly when it comes to data containing a mix of upper and lower case +#' letters. This is explained in more detail in the help page of +#' [dplyr_locale()] under the `Default locale` section. #' @family single table verbs #' @examples #' arrange(mtcars, cyl, disp) diff --git a/man/arrange.Rd b/man/arrange.Rd index 87534d826f..86748e2619 100644 --- a/man/arrange.Rd +++ b/man/arrange.Rd @@ -34,9 +34,9 @@ supplying a locale identifier. } The C locale is not the same as English locales, such as \code{"en"}, -particularly when it comes to case-sensitivity. This is explained in -more detail in the help page of \code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} -section.} +particularly when it comes to data containing a mix of upper and lower case +letters. This is explained in more detail in the help page of +\code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} section.} } \value{ An object of the same type as \code{.data}. The output has the following diff --git a/man/arrange_all.Rd b/man/arrange_all.Rd index 1d35075e05..6f893bec85 100644 --- a/man/arrange_all.Rd +++ b/man/arrange_all.Rd @@ -57,9 +57,9 @@ supplying a locale identifier. } The C locale is not the same as English locales, such as \code{"en"}, -particularly when it comes to case-sensitivity. This is explained in -more detail in the help page of \code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} -section.} +particularly when it comes to data containing a mix of upper and lower case +letters. This is explained in more detail in the help page of +\code{\link[=dplyr_locale]{dplyr_locale()}} under the \verb{Default locale} section.} \item{.vars}{A list of columns generated by \code{\link[=vars]{vars()}}, a character vector of column names, a numeric vector of column From d9d78d213fbc9e08cdf1efef6aa34b63e8e7cd26 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 13 Jun 2022 12:41:59 -0400 Subject: [PATCH 9/9] Move conversion of `locale` to `chr_proxy_collate` into `arrange_rows()` This is a technical detail of `vec_order_radix()` so it is better to keep the conversion close to where we actually call it. Should make calling `arrange_rows()` on its own a little easier too. --- R/arrange.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/arrange.R b/R/arrange.R index 9730f6c5f1..9b5c51f991 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -94,9 +94,7 @@ arrange.data.frame <- function(.data, dots <- c(quos(!!!groups(.data)), dots) } - chr_proxy_collate <- locale_to_chr_proxy_collate(.locale) - - loc <- arrange_rows(.data, dots, chr_proxy_collate) + loc <- arrange_rows(.data, dots = dots, locale = .locale) dplyr_row_slice(.data, loc) } @@ -104,10 +102,15 @@ arrange.data.frame <- function(.data, arrange_rows <- function(data, dots, - chr_proxy_collate, + locale, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) + chr_proxy_collate <- locale_to_chr_proxy_collate( + locale = locale, + error_call = error_call + ) + if (length(dots) == 0L) { out <- seq_len(nrow(data)) return(out)