Label group names correctly and check for balanced data in `pivot_lon…

…ger()` (#230) * Label group names correctly and check for balanced data. Closes #228, closes #229 * Update tests
tidyverse · Mar 23, 2021 · e56d8a6 · e56d8a6
1 parent f8ef87a
commit e56d8a6
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 24 deletions.
diff --git a/R/step-call-pivot_longer.R b/R/step-call-pivot_longer.R
@@ -90,21 +90,40 @@ pivot_longer.dtplyr_step <- function(data,
 
   if (uses_dot_value) {
     if (!is.null(names_sep)) {
-      .value <- str_separate(measure_vars, into = names_to, sep = names_sep)$.value
+      names_to_setup <- str_separate(measure_vars, into = names_to, sep = names_sep)
     } else if (!is.null(names_pattern)) {
-      .value <- str_extract(measure_vars, into = names_to, names_pattern)$.value
+      names_to_setup <- str_extract(measure_vars, into = names_to, names_pattern)
     } else {
       abort("If you use '.value' in `names_to` you must also supply
             `names_sep' or `names_pattern")
     }
 
+    .value <- names_to_setup$.value
+
     v_fct <- factor(.value, levels = unique(.value))
     measure_vars <- split(measure_vars, v_fct)
     values_to <- names(measure_vars)
     names(measure_vars) <- NULL
 
     if (multiple_names_to) {
       variable_name <- names_to[!names_to == ".value"]
+
+      .value_ids <- split(names_to_setup[[variable_name]], v_fct)
+      .value_id <- .value_ids[[1]]
+
+      # Make sure data is "balanced"
+      # https://github.com/Rdatatable/data.table/issues/2575
+      # The list passed to measure.vars also needs the same number of column names per element
+      equal_ids <- vapply(
+        .value_ids[-1],
+        function(.x) isTRUE(all.equal(.value_id, .x)),
+        logical(1)
+      )
+      if (all(equal_ids)) {
+        .value_id <- vctrs::vec_rep_each(.value_id, length(pull(data)))
+      } else {
+        abort("`data.table::melt()` doesn't currently support melting of unbalanced datasets.")
+      }
     }
   } else if (multiple_names_to) {
     if (is.null(names_sep) && is.null(names_pattern)) {
@@ -152,7 +171,9 @@ pivot_longer.dtplyr_step <- function(data,
     out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name)))
   }
 
-  if (multiple_names_to && !uses_dot_value) {
+  if (multiple_names_to && uses_dot_value) {
+    out <- mutate(out, !!variable_name := !!.value_id)
+  } else if (multiple_names_to && !uses_dot_value) {
     if (!is.null(names_sep)) {
       into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep)
     } else {

diff --git a/tests/testthat/_snaps/step-call-pivot_longer.md b/tests/testthat/_snaps/step-call-pivot_longer.md
@@ -5,7 +5,15 @@
     Output
       melt(DT, measure.vars = list(c("x1", "x2", "x3", "x4"), c("y1", 
       "y2", "y3", "y4")), variable.name = "set", value.name = c("x", 
-      "y"), variable.factor = FALSE)
+      "y"), variable.factor = FALSE)[, `:=`(set = c("1", "1", "2", 
+      "2", "3", "3", "4", "4"))]
+
+# errors on unbalanced datasets
+
+    Code
+      pivot_longer(dt, everything(), names_to = c(".value", "id"), names_sep = "_")
+    Error <rlang_error>
+      `data.table::melt()` doesn't currently support melting of unbalanced datasets.
 
 # informative errors on unsupported features
 

diff --git a/tests/testthat/test-step-call-pivot_longer.R b/tests/testthat/test-step-call-pivot_longer.R
@@ -44,26 +44,6 @@ test_that("can drop missing values", {
   expect_equal(out$value, c(1, 2))
 })
 
-test_that("can handle missing combinations", {
-  tbl <- tribble(
-    ~id, ~x_1, ~x_2, ~y_2,
-    "A",    1,    2,  "a",
-    "B",    3,    4,  "b",
-  )
-  dt <- lazy_dt(tbl, "DT")
-  step <- pivot_longer(dt, -id, names_to = c(".value", "n"), names_sep = "_")
-  out <- collect(step)
-
-  expect_equal(
-    show_query(step),
-    expr(melt(DT, measure.vars = !!list(c("x_1", "x_2"), "y_2"), variable.name = "n",
-              value.name = !!c("x", "y"), variable.factor = FALSE))
-  )
-  expect_equal(step$vars, c("id", "n", "x", "y"))
-  expect_equal(out$x, c(1, 3, 2, 4))
-  expect_equal(out$y, c("a", "b", NA, NA))
-})
-
 test_that("can pivot to multiple measure cols", {
   dt <- lazy_dt(head(anscombe, 2), "DT")
   step <- pivot_longer(
@@ -105,6 +85,15 @@ test_that(".value can be at any position in `names_to`", {
   expect_identical(value_first, value_second)
 })
 
+test_that("errors on unbalanced datasets", {
+  tbl <- tibble(x_1 = 1, x_2 = 1, y_3 = 1, y_4 = 1)
+  dt <- lazy_dt(tbl, "DT")
+
+  expect_snapshot(error = TRUE,
+                  pivot_longer(dt, everything(), names_to = c(".value", "id"), names_sep = "_")
+  )
+})
+
 test_that("can use names_prefix", {
   tbl <- tibble(x_x = 1:2, x_y = 3:4)
   dt <- lazy_dt(tbl, "DT")
@@ -161,3 +150,4 @@ test_that("informative errors on unsupported features", {
   })
 
 })
+