Rdatatable · mattdowle · May 28, 2019 · May 22, 2019 · May 22, 2019 · May 22, 2019
@@ -39,7 +39,8 @@ Authors@R: c(
   person("Michael","Quinn",        role="ctb"),
   person("@javrucebo","",          role="ctb"),
   person("@marc-outins","",        role="ctb"),
-  person("Roy","Storey",           role="ctb"))
+  person("Roy","Storey",           role="ctb"),
+  person("Manish","Saraswat",      role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
 Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml

@@ -122,6 +122,8 @@
 
 16. `rbind` and `rbindlist` of an item containing an ordered factor with levels containing an `NA` (as opposed to an NA integer) could segfault, [#3601](https://github.com/Rdatatable/data.table/issues/3601). This was a a regression in v1.12.2. Thanks to Damian Betebenner for reporting.
 
+17. `setkey` now checks for existing index before doing `forder` [#3582](https://github.com/Rdatatable/data.table/pull/3582/). Executes `forder` only if the index does not exist. Thanks to @MichaelChirico for reporting and @saraswatmks for the PR.
+
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.

@@ -86,12 +86,19 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
     if (!typeof(.xi) %chin% c("integer","logical","character","double")) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
   }
   if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
-  if (verbose) {
-    tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
-    # suppress needed for tests 644 and 645 in verbose mode
-    cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
+
+  newkey = paste0(cols, collapse="__")
+  if (!any(indices(x) == newkey)) {
+    if (verbose) {
+      tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
+      # suppress needed for tests 644 and 645 in verbose mode
+      cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
+    } else {
+      o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
+    }
   } else {
-    o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
+    if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="")
+    o = getindex(x, newkey)
   }
   if (!physical) {
     if (is.null(attr(x,"index",exact=TRUE))) setattr(x, "index", integer())
@@ -160,7 +167,7 @@ setreordervec = function(x, order) .Call(Creorder, x, order)
 
 is.sorted = function(x, by=seq_along(x)) {
   if (is.list(x)) {
-    warning("Use 'if (length(o<-forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.")
+    warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.")
     # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally
     # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in.
     # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by)
@@ -235,7 +242,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE)
       }
       if (is.name(v)) {
         ix = chmatch(as.character(v), xcols, nomatch=0L)
-        if (ix != 0L) ans <- point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies
+        if (ix != 0L) ans = point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies
         else {
           v = as.call(list(as.name("list"), v))
           ans = point(ans, i, eval(v, x, parent.frame()), 1L)
@@ -258,7 +265,7 @@ forder = function(x, ..., na.last=TRUE, decreasing=FALSE)
 fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...)
 {
   containsNAs = FALSE
-  if (typeof(x)=="double" && !decreasing && !(containsNAs<-anyNA(x))) {
+  if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) {
       if (internal) stop("Internal code should not be being called on type double")
       return(.Call(Cfsort, x, verbose))
   }

@@ -5934,7 +5934,7 @@ test(1419.18, key(thisDT), "x1")
 ## testing secondary index retainment on assign (#2372)
 
 allIndicesValid <- function(DT){
-  ## checks that the order of all indices is correct
+  ## checks that the order of all indices are correct
   for(idx in seq_along(indices(DT))){
     index <- attr(attr(DT, "index"), paste0("__", indices(DT)[idx], collapse = ""))
     if(!length(index)) index <- seq_len(nrow(DT))
@@ -6031,6 +6031,33 @@ thisDT <- copy(DT)[, c("aaa", "b") := 2]
 test(1419.58, indices(thisDT), c("a", "ab"))
 test(1419.59, allIndicesValid(thisDT), TRUE)
 
+## setkey on same col as existing index, #2889
+DT <- data.table(a =c(4,1,3,9,2,1,8,7,6,5),
+                 aaa = c(1,1,2,2,2,1,1,2,2,2))
+setindex(DT, a)
+test(1419.60, allIndicesValid(DT), TRUE)
+setindex(DT, NULL)
+setkey(DT, a)
+test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9))
+setkey(DT, NULL)
+setindex(DT, a)
+test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"),
+              output="setkey on columns [a] using existing index 'a'")  # checks also that the prior index a is dropped (because y is keyed with no index)
+
+# setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals)
+DT = data.table(a =   c(3,3,4,4,5,6,1,1,7,2),
+                aaa = c(1,1,2,2,2,1,1,2,2,2),
+                bbb = c(1,1,2,0,1,1,1,0,1,1))
+setkey(DT, a)
+test(1419.63, DT$a, c(1,1,2,3,3,4,4,5,6,7))
+setkey(DT, NULL)
+test(1419.64, setkey(DT, a, verbose=TRUE), output="forder took")
+setkey(DT, NULL)
+setindex(DT, aaa, a)
+setindex(DT, aaa)   # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. ensure no partial matching
+test(1419.65, allIndicesValid(DT), TRUE)
+test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"),
+              output="setkey on columns [aaa] using existing index 'aaa'")  # checks that all indexes are dropped (aaa_a too)
 
 # setnames updates secondary key
 DT = data.table(a=1:5,b=10:6)