Easier way to update urltools.R #35

Merged (1 commit, Jul 6, 2023)
6 changes: 6 additions & 0 deletions R/utils.R
@@ -16,3 +16,9 @@ with_pandoc_available <- function(code) {
}
force(code)
}


update_urltools <- function() {
lines <- readLines("https://raw.githubusercontent.com/wch/r-source/trunk/src/library/tools/R/urltools.R")
writeLines(lines, "inst/tools/urltools.R")
}
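
For reference, a minimal sketch of a variant that pins a specific r-source ref instead of `trunk`, in case a reproducible snapshot is wanted; the helper name and `ref` argument are assumptions, not part of this PR:

```r
## Hypothetical variant: same idea as update_urltools(), but parameterised on
## the r-source ref (branch, tag, or commit SHA) so the copy can be pinned.
update_urltools_at <- function(ref = "trunk") {
  url <- sprintf(
    "https://raw.githubusercontent.com/wch/r-source/%s/src/library/tools/R/urltools.R",
    ref
  )
  writeLines(readLines(url), "inst/tools/urltools.R")
}
```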
8 changes: 1 addition & 7 deletions inst/tools/README.md
@@ -1,7 +1 @@
The file `urltools.R` in this directory is copied from SVN with the following command:

```
svn cat -r 80050 https://svn.r-project.org/R/trunk/src/library/tools/R/urltools.R > urltools.R
```

When you want to update it, run the above command with the latest SVN revision number.
Re-generate `urltools.R` with `update_urltools()`.
115 changes: 81 additions & 34 deletions inst/tools/urltools.R
@@ -1,7 +1,7 @@
# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2021 The R Core Team
# Copyright (C) 2015-2023 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -16,6 +16,9 @@
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/

## See RFC 3986 <https://www.rfc-editor.org/rfc/rfc3986> and
## <https://url.spec.whatwg.org/>.

get_IANA_URI_scheme_db <-
function()
{
@@ -25,13 +28,13 @@ function()
db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
stringsAsFactors = FALSE, encoding = "UTF-8")
names(db) <- chartr(".", "_", names(db))
db$URI_Scheme <- sub(" .*", "", db$URI_Scheme)
db
}

parse_URI_reference <-
function(x)
{
## See RFC_3986 <https://tools.ietf.org/html/rfc3986>.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
if(length(x)) {
y <- do.call(rbind, regmatches(x, regexec(re, x)))
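
As an aside, a minimal sketch of what this regex (essentially the RFC 3986 Appendix B pattern) extracts from a single URI; the example URL is made up:

```r
## Illustrative only: split one URI into its components with the same regex
## used by parse_URI_reference() above.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
x <- "https://cran.r-project.org/web/packages?arg=1#top"
m <- regmatches(x, regexec(re, x))[[1L]]
m[c(3L, 5L, 6L, 8L, 10L)]  # scheme, authority, path, query, fragment
#> [1] "https" "cran.r-project.org" "/web/packages" "arg=1" "top"
```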
@@ -204,6 +207,11 @@ function(meta)
"([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "<([A-Za-z][A-Za-z0-9.+-]*:[^>]+)>"
## scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 2L))
}

url_db(urls, rep.int("DESCRIPTION", length(urls)))
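
For illustration, a rough sketch of what the new angle-bracket pattern catches on top of the plain http/ftp pattern; the DESCRIPTION text below is invented:

```r
## Illustrative only: the second pattern also picks up <scheme:...> style
## references (e.g. DOI or arXiv ids) that the http/ftp pattern misses.
v <- "Implements Doe (2020) <doi:10.1000/demo>; see also <arXiv:2001.00001>."
pattern <- "<([A-Za-z][A-Za-z0-9.+-]*:[^>]+)>"
m <- gregexpr(pattern, v)
regmatches(v, m)[[1L]]
#> [1] "<doi:10.1000/demo>" "<arXiv:2001.00001>"
```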
@@ -289,7 +297,7 @@ function(dir, installed = FALSE)

url_db_from_package_sources <-
function(dir, add = FALSE) {
meta <- .read_description(file.path(dir, "DESCRIPTION"))
meta <- .get_package_metadata(dir, FALSE)
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(Rd_db(dir = dir)),
url_db_from_package_citation(dir, meta),
@@ -353,9 +361,9 @@ function()
}

## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## and <https://www.rfc-editor.org/rfc/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## and <https://www.rfc-editor.org/rfc/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
@@ -463,13 +471,11 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
}
## Look for redirected URLs
## According to
## <https://tools.ietf.org/html/rfc7230#section-3.1.2> the first
## <https://www.rfc-editor.org/rfc/rfc7230#section-3.1.2> the first
## line of a response is the status-line, with "a possibly empty
## textual phrase describing the status code", so only look for
## a 301 status code in the first line.
if(grepl(" 301 ", h[1L], useBytes = TRUE) &&
!startsWith(u, "https://doi.org/") &&
!startsWith(u, "http://dx.doi.org/")) {
if(grepl(" 301 ", h[1L], useBytes = TRUE)) {
## Get the new location from the last consecutive 301
## obtained.
h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)]))
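
A minimal sketch of the status-line check in isolation, assuming a placeholder URL that answers with a permanent redirect:

```r
## Illustrative only: curlGetHeaders() returns the raw header lines of all
## responses; only the first status-line is inspected for a 301.
h <- curlGetHeaders("http://example.org/moved")   # placeholder URL
h[1L]                                    # e.g. "HTTP/1.1 301 Moved Permanently\r\n"
grepl(" 301 ", h[1L], useBytes = TRUE)   # TRUE only if the first response is a 301
```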
@@ -485,7 +491,7 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
## Ouch. According to RFC 7231, the location is a URI
## reference, and may be relative in which case it needs
## resolving against the effective request URI.
## <https://tools.ietf.org/html/rfc7231#section-7.1.2>.
## <https://www.rfc-editor.org/rfc/rfc7231#section-7.1.2>.
## Not quite straightforward, hence do not report such
## 301s.
## (Alternatively, could try reporting the 301 but no
@@ -522,7 +528,7 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
.check_http_B <- function(u) {
ul <- tolower(u)
cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
!grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]_]+(html|pdf|rds)$",
!grepl("^https?://cran.r-project.org/web/packages/([.[:alnum:]_]+(html|pdf|rds))?$",
ul)) ||
(grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
ul)) ||
@@ -568,9 +574,18 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)

## Invalid URI schemes.
schemes <- parts[, 1L]
ind <- is.na(match(schemes,
ind <- is.na(match(tolower(schemes),
c("",
IANA_URI_scheme_db$URI_Scheme,
"arxiv",
## Also allow 'isbn' and 'issn', which in fact
## are registered URN namespaces but not
## registered URI schemes, see
## <https://www.iana.org/assignments/urn-formal/isbn>
## <https://www.iana.org/assignments/urn-formal/issn>
## <https://doi.org/10.17487/rfc3986>
## <https://doi.org/10.17487/rfc8141>.
"isbn", "issn",
## Also allow 'javascript' scheme, see
## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
## (but apparently never registered with IANA).
@@ -584,6 +599,9 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
bad <- rbind(bad,
.gather(urls[ind], parents[ind], m = msg))
}

## Could check urn URIs at least for appropriate namespaces using
## <https://www.iana.org/assignments/urn-namespaces/urn-namespaces-1.csv>

## ftp.
pos <- which(schemes == "ftp")
@@ -606,8 +624,27 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)

## http/https.
pos <- which(schemes == "http" | schemes == "https")
if(length(pos)) {
if(length(pos) && remote) {
urlspos <- urls[pos]
## Check DOI URLs via the DOI handle API, as we nowadays do for
## checking DOIs.
myparts <- parts[pos, , drop = FALSE]
ind <- (((myparts[, 2L] == "doi.org") |
(myparts[, 2L] == "dx.doi.org")) &
startsWith(myparts[, 3L], "/10.") &
!nzchar(myparts[, 4L]) &
!nzchar(myparts[, 5L]))
if(any(ind))
urlspos[ind] <- paste0("https://doi.org/api/handles",
myparts[ind, 3L])
## Could also use regexps, e.g.
## pat <- "^https?://(dx[.])?doi.org/10[.]([^?#]+)$"
## ind <- grep(pat, urlspos)
## if(length(ind))
## urlspos[ind] <-
## paste0("https://doi.org/api/handles/10.",
## sub(pat, "\\2", urlspos[ind]))
## but using the parts is considerably faster ...
headers <- .fetch_headers(urlspos)
results <- do.call(rbind, Map(.check_http, urlspos, headers))
status <- as.numeric(results[, 1L])
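
For illustration, the same DOI rewrite done with a regexp instead of the pre-parsed parts (the DOI is a placeholder):

```r
## Illustrative only: a doi.org (or dx.doi.org) URL is checked against the
## DOI handle API rather than fetched directly.
u <- "https://doi.org/10.1000/demo"
sub("^https?://(dx[.])?doi[.]org/", "https://doi.org/api/handles/", u)
#> [1] "https://doi.org/api/handles/10.1000/demo"
```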
@@ -694,15 +731,36 @@ function(x, ...)
y
}

.fetch_headers_via_base <- function(urls, verbose = FALSE, ids = urls)
.fetch_headers_via_base <-
function(urls, verbose = FALSE, ids = urls)
Map(function(u, verbose, i) {
if(verbose) message(sprintf("processing %s", i))
tryCatch(curlGetHeaders(u), error = identity)
},
urls, verbose, ids)

.fetch_headers_via_curl <- function(urls, verbose = FALSE, pool = NULL) {
.fetch_headers_via_curl <-
function(urls, verbose = FALSE, pool = NULL) {
out <- .curl_multi_run_worker(urls, TRUE, verbose, pool)
ind <- !vapply(out, inherits, NA, "error")
if(any(ind))
out[ind] <- lapply(out[ind],
function(x) {
y <- strsplit(rawToChar(x$headers),
"(?<=\r\n)",
perl = TRUE)[[1L]]
attr(y, "status") <- x$status_code
y
})
out
}
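
A small sketch of the lookbehind split used above: it keeps the trailing `\r\n` on each header line, so the result has the same shape as `curlGetHeaders()` output (the header text is invented):

```r
## Illustrative only: split a raw header block into lines while keeping the
## "\r\n" terminators, as .fetch_headers_via_curl() does.
raw_headers <- "HTTP/1.1 301 Moved Permanently\r\nLocation: https://example.org/\r\n\r\n"
strsplit(raw_headers, "(?<=\r\n)", perl = TRUE)[[1L]]
#> [1] "HTTP/1.1 301 Moved Permanently\r\n" "Location: https://example.org/\r\n"
#> [3] "\r\n"
```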


.curl_multi_run_worker <-
function(urls, nobody = FALSE, verbose = FALSE, pool = NULL)
{
## Use 'nobody = TRUE' to fetch only headers.

.progress_bar <- function(length, msg = "") {
bar <- new.env(parent = baseenv())
if(is.null(length)) {
@@ -732,14 +790,15 @@ function(x, ...)
if(is.null(pool))
pool <- curl::new_pool()

hs <- vector("list", length(urls))
bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")

out <- vector("list", length(urls))

bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")
for(i in seq_along(hs)) {
for(i in seq_along(out)) {
u <- urls[[i]]
h <- curl::new_handle(url = u)
curl::handle_setopt(h,
nobody = TRUE,
nobody = nobody,
cookiesession = 1L,
followlocation = 1L,
http_version = 2L,
@@ -756,14 +815,14 @@ function(x, ...)
handle_result <- local({
i <- i
function(x) {
hs[[i]] <<- x
out[[i]] <<- x
bar$update()
}
})
handle_error <- local({
i <- i
function(x) {
hs[[i]] <<-
out[[i]] <<-
structure(list(message = x),
class = c("curl_error", "error", "condition"))
bar$update()
@@ -777,21 +836,9 @@ function(x, ...)

curl::multi_run(pool = pool)

out <- vector("list", length(hs))
for(i in seq_along(out)) {
if(inherits(hs[[i]], "error")) {
out[[i]] <- hs[[i]]
} else {
out[[i]] <- strsplit(rawToChar(hs[[i]]$headers),
"(?<=\r\n)",
perl = TRUE)[[1L]]
attr(out[[i]], "status") <- hs[[i]]$status_code
}
}

out
}

.curl_GET_status <-
function(u, verbose = FALSE)
{