Easier way to update urltools.R #35

Merged (1 commit, Jul 6, 2023)
6 changes: 6 additions & 0 deletions R/utils.R
@@ -16,3 +16,9 @@ with_pandoc_available <- function(code) {
}
force(code)
}


update_urltools <- function() {
lines <- readLines("https://raw.githubusercontent.com/wch/r-source/trunk/src/library/tools/R/urltools.R")
writeLines(lines, "inst/tools/urltools.R")
}
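
For reference, a minimal sketch of a variant that pins a specific r-source ref instead of `trunk`, in case a reproducible snapshot is wanted; the helper name and `ref` argument are assumptions, not part of this PR:

```r
## Hypothetical variant: same idea as update_urltools(), but parameterised on
## the r-source ref (branch, tag, or commit SHA) so the copy can be pinned.
update_urltools_at <- function(ref = "trunk") {
  url <- sprintf(
    "https://raw.githubusercontent.com/wch/r-source/%s/src/library/tools/R/urltools.R",
    ref
  )
  writeLines(readLines(url), "inst/tools/urltools.R")
}
```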
8 changes: 1 addition & 7 deletions inst/tools/README.md
@@ -1,7 +1 @@
The file `urltools.R` in this directory is copied from SVN with the following command:

```
svn cat -r 80050 https://svn.r-project.org/R/trunk/src/library/tools/R/urltools.R > urltools.R
```

When you want to update it, run the above command with the latest SVN revision number.
Re-generate `urltools.R` with `update_urltools()`.
115 changes: 81 additions & 34 deletions inst/tools/urltools.R
@@ -1,7 +1,7 @@
# File src/library/tools/R/urltools.R
# Part of the R package, https://www.R-project.org
#
# Copyright (C) 2015-2021 The R Core Team
# Copyright (C) 2015-2023 The R Core Team
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -16,6 +16,9 @@
# A copy of the GNU General Public License is available at
# https://www.R-project.org/Licenses/

## See RFC 3986 <https://www.rfc-editor.org/rfc/rfc3986> and
## <https://url.spec.whatwg.org/>.

get_IANA_URI_scheme_db <-
function()
{
@@ -25,13 +28,13 @@ function()
db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")),
stringsAsFactors = FALSE, encoding = "UTF-8")
names(db) <- chartr(".", "_", names(db))
db$URI_Scheme <- sub(" .*", "", db$URI_Scheme)
db
}

parse_URI_reference <-
function(x)
{
## See RFC_3986 <https://tools.ietf.org/html/rfc3986>.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
if(length(x)) {
y <- do.call(rbind, regmatches(x, regexec(re, x)))
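
As an aside, a minimal sketch of what this regex (essentially the RFC 3986 Appendix B pattern) extracts from a single URI; the example URL is made up:

```r
## Illustrative only: split one URI into its components with the same regex
## used by parse_URI_reference() above.
re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
x <- "https://cran.r-project.org/web/packages?arg=1#top"
m <- regmatches(x, regexec(re, x))[[1L]]
m[c(3L, 5L, 6L, 8L, 10L)]  # scheme, authority, path, query, fragment
#> [1] "https" "cran.r-project.org" "/web/packages" "arg=1" "top"
```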
@@ -204,6 +207,11 @@ function(meta)
"([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])"
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
regmatches(v, m) <- ""
pattern <- "<([A-Za-z][A-Za-z0-9.+-]*:[^>]+)>"
## scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
m <- gregexpr(pattern, v)
urls <- c(urls, .gregexec_at_pos(pattern, v, m, 2L))
}

url_db(urls, rep.int("DESCRIPTION", length(urls)))
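
For illustration, a rough sketch of what the new angle-bracket pattern catches on top of the plain http/ftp pattern; the DESCRIPTION text below is invented:

```r
## Illustrative only: the second pattern also picks up <scheme:...> style
## references (e.g. DOI or arXiv ids) that the http/ftp pattern misses.
v <- "Implements Doe (2020) <doi:10.1000/demo>; see also <arXiv:2001.00001>."
pattern <- "<([A-Za-z][A-Za-z0-9.+-]*:[^>]+)>"
m <- gregexpr(pattern, v)
regmatches(v, m)[[1L]]
#> [1] "<doi:10.1000/demo>" "<arXiv:2001.00001>"
```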
@@ -289,7 +297,7 @@ function(dir, installed = FALSE)

url_db_from_package_sources <-
function(dir, add = FALSE) {
meta <- .read_description(file.path(dir, "DESCRIPTION"))
meta <- .get_package_metadata(dir, FALSE)
db <- rbind(url_db_from_package_metadata(meta),
url_db_from_package_Rd_db(Rd_db(dir = dir)),
url_db_from_package_citation(dir, meta),
@@ -353,9 +361,9 @@ function()
}

## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## and <https://www.rfc-editor.org/rfc/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## and <https://www.rfc-editor.org/rfc/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
table_of_FTP_server_return_codes <-
@@ -463,13 +471,11 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
}
## Look for redirected URLs
## According to
## <https://tools.ietf.org/html/rfc7230#section-3.1.2> the first
## <https://www.rfc-editor.org/rfc/rfc7230#section-3.1.2> the first
## line of a response is the status-line, with "a possibly empty
## textual phrase describing the status code", so only look for
## a 301 status code in the first line.
if(grepl(" 301 ", h[1L], useBytes = TRUE) &&
!startsWith(u, "https://doi.org/") &&
!startsWith(u, "http://dx.doi.org/")) {
if(grepl(" 301 ", h[1L], useBytes = TRUE)) {
## Get the new location from the last consecutive 301
## obtained.
h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)]))
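
A minimal sketch of the status-line check in isolation, assuming a placeholder URL that answers with a permanent redirect:

```r
## Illustrative only: curlGetHeaders() returns the raw header lines of all
## responses; only the first status-line is inspected for a 301.
h <- curlGetHeaders("http://example.org/moved")   # placeholder URL
h[1L]                                    # e.g. "HTTP/1.1 301 Moved Permanently\r\n"
grepl(" 301 ", h[1L], useBytes = TRUE)   # TRUE only if the first response is a 301
```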
@@ -485,7 +491,7 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
## Ouch. According to RFC 7231, the location is a URI
## reference, and may be relative in which case it needs
## resolving against the effective request URI.
## <https://tools.ietf.org/html/rfc7231#section-7.1.2>.
## <https://www.rfc-editor.org/rfc/rfc7231#section-7.1.2>.
## Not quite straightforward, hence do not report such
## 301s.
## (Alternatively, could try reporting the 301 but no
@@ -522,7 +528,7 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
.check_http_B <- function(u) {
ul <- tolower(u)
cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
!grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]_]+(html|pdf|rds)$",
!grepl("^https?://cran.r-project.org/web/packages/([.[:alnum:]_]+(html|pdf|rds))?$",
ul)) ||
(grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
ul)) ||
@@ -568,9 +574,18 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)

## Invalid URI schemes.
schemes <- parts[, 1L]
ind <- is.na(match(schemes,
ind <- is.na(match(tolower(schemes),
c("",
IANA_URI_scheme_db$URI_Scheme,
"arxiv",
## Also allow 'isbn' and 'issn', which in fact
## are registered URN namespaces but not
## registered URI schemes, see
## <https://www.iana.org/assignments/urn-formal/isbn>
## <https://www.iana.org/assignments/urn-formal/issn>
## <https://doi.org/10.17487/rfc3986>
## <https://doi.org/10.17487/rfc8141>.
"isbn", "issn",
## Also allow 'javascript' scheme, see
## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
## (but apparently never registered with IANA).
@@ -584,6 +599,9 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)
bad <- rbind(bad,
.gather(urls[ind], parents[ind], m = msg))
}

## Could check urn URIs at least for appropriate namespaces using
## <https://www.iana.org/assignments/urn-namespaces/urn-namespaces-1.csv>

## ftp.
pos <- which(schemes == "ftp")
@@ -606,8 +624,27 @@ function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL)

## http/https.
pos <- which(schemes == "http" | schemes == "https")
if(length(pos)) {
if(length(pos) && remote) {
urlspos <- urls[pos]
## Check DOI URLs via the DOI handle API, as we nowadays do for
## checking DOIs.
myparts <- parts[pos, , drop = FALSE]
ind <- (((myparts[, 2L] == "doi.org") |
(myparts[, 2L] == "dx.doi.org")) &
startsWith(myparts[, 3L], "/10.") &
!nzchar(myparts[, 4L]) &
!nzchar(myparts[, 5L]))
if(any(ind))
urlspos[ind] <- paste0("https://doi.org/api/handles",
myparts[ind, 3L])
## Could also use regexps, e.g.
## pat <- "^https?://(dx[.])?doi.org/10[.]([^?#]+)$"
## ind <- grep(pat, urlspos)
## if(length(ind))
## urlspos[ind] <-
## paste0("https://doi.org/api/handles/10.",
## sub(pat, "\\2", urlspos[ind]))
## but using the parts is considerably faster ...
headers <- .fetch_headers(urlspos)
results <- do.call(rbind, Map(.check_http, urlspos, headers))
status <- as.numeric(results[, 1L])
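
For illustration, the same DOI rewrite done with a regexp instead of the pre-parsed parts (the DOI is a placeholder):

```r
## Illustrative only: a doi.org (or dx.doi.org) URL is checked against the
## DOI handle API rather than fetched directly.
u <- "https://doi.org/10.1000/demo"
sub("^https?://(dx[.])?doi[.]org/", "https://doi.org/api/handles/", u)
#> [1] "https://doi.org/api/handles/10.1000/demo"
```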
@@ -694,15 +731,36 @@ function(x, ...)
y
}

.fetch_headers_via_base <- function(urls, verbose = FALSE, ids = urls)
.fetch_headers_via_base <-
function(urls, verbose = FALSE, ids = urls)
Map(function(u, verbose, i) {
if(verbose) message(sprintf("processing %s", i))
tryCatch(curlGetHeaders(u), error = identity)
},
urls, verbose, ids)

.fetch_headers_via_curl <- function(urls, verbose = FALSE, pool = NULL) {
.fetch_headers_via_curl <-
function(urls, verbose = FALSE, pool = NULL) {
out <- .curl_multi_run_worker(urls, TRUE, verbose, pool)
ind <- !vapply(out, inherits, NA, "error")
if(any(ind))
out[ind] <- lapply(out[ind],
function(x) {
y <- strsplit(rawToChar(x$headers),
"(?<=\r\n)",
perl = TRUE)[[1L]]
attr(y, "status") <- x$status_code
y
})
out
}
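
A small sketch of the lookbehind split used above: it keeps the trailing `\r\n` on each header line, so the result has the same shape as `curlGetHeaders()` output (the header text is invented):

```r
## Illustrative only: split a raw header block into lines while keeping the
## "\r\n" terminators, as .fetch_headers_via_curl() does.
raw_headers <- "HTTP/1.1 301 Moved Permanently\r\nLocation: https://example.org/\r\n\r\n"
strsplit(raw_headers, "(?<=\r\n)", perl = TRUE)[[1L]]
#> [1] "HTTP/1.1 301 Moved Permanently\r\n" "Location: https://example.org/\r\n"
#> [3] "\r\n"
```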


.curl_multi_run_worker <-
function(urls, nobody = FALSE, verbose = FALSE, pool = NULL)
{
## Use 'nobody = TRUE' to fetch only headers.

.progress_bar <- function(length, msg = "") {
bar <- new.env(parent = baseenv())
if(is.null(length)) {
@@ -732,14 +790,15 @@ function(x, ...)
if(is.null(pool))
pool <- curl::new_pool()

hs <- vector("list", length(urls))
bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")

out <- vector("list", length(urls))

bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ")
for(i in seq_along(hs)) {
for(i in seq_along(out)) {
u <- urls[[i]]
h <- curl::new_handle(url = u)
curl::handle_setopt(h,
nobody = TRUE,
nobody = nobody,
cookiesession = 1L,
followlocation = 1L,
http_version = 2L,
@@ -756,14 +815,14 @@ function(x, ...)
handle_result <- local({
i <- i
function(x) {
hs[[i]] <<- x
out[[i]] <<- x
bar$update()
}
})
handle_error <- local({
i <- i
function(x) {
hs[[i]] <<-
out[[i]] <<-
structure(list(message = x),
class = c("curl_error", "error", "condition"))
bar$update()
@@ -777,21 +836,9 @@ function(x, ...)

curl::multi_run(pool = pool)

out <- vector("list", length(hs))
for(i in seq_along(out)) {
if(inherits(hs[[i]], "error")) {
out[[i]] <- hs[[i]]
} else {
out[[i]] <- strsplit(rawToChar(hs[[i]]$headers),
"(?<=\r\n)",
perl = TRUE)[[1L]]
attr(out[[i]], "status") <- hs[[i]]$status_code
}
}

out
}

.curl_GET_status <-
function(u, verbose = FALSE)
{