diff --git a/.gitignore b/.gitignore
index 88c5a3d..e2fec9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 __pycache__/
 *.py[cod]
 .Rproj.user
+.Rhistory
 /nogit
 /docs
diff --git a/NAMESPACE b/NAMESPACE
index 2291398..688287c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -62,6 +62,8 @@ export(multiqc_tidy_json)
 export(portal_meta_read)
 export(rdf2tab)
 export(read)
+export(s3_files_list_filter_relevant)
+export(s3_search)
 export(session_info_kable)
 export(time_metrics_process)
 export(tso_rmd)
diff --git a/R/s3.R b/R/s3.R
new file mode 100644
index 0000000..c798a70
--- /dev/null
+++ b/R/s3.R
@@ -0,0 +1,101 @@
+#' List Relevant Files In AWS S3 Directory
+#'
+#' Lists relevant files in an AWS S3 directory.
+#'
+#' @param s3dir S3 directory.
+#' @param pattern Pattern to further filter the returned file type tibble.
+#' @param page_size The size of each page to get in the AWS service call (def: 1000).
+#' @param max_items The total number of items to return in the command’s output (def: 1000).
+#' @param presign Include presigned URLs (def: FALSE).
+#' @param expiry_sec Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).
+#'
+#' @return A tibble with path, date, file size, file type, and presigned URL if requested.
+#' @examples
+#' \dontrun{
+#' s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables"
+#' s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE)
+#' }
+#' @export
+s3_files_list_filter_relevant <- function(s3dir, pattern = NULL, page_size = 1000, max_items = 1000, presign = FALSE, expiry_sec = 43200) {
+  assertthat::assert_that(grepl("^s3://", s3dir), rlang::is_logical(presign))
+  pattern <- pattern %||% ".*" # keep all recognisable files by default
+  b <- sub("s3://(.*?)/.*", "\\1", s3dir)
+  p <- sub("s3://(.*?)/(.*)", "\\2", s3dir)
+  cmd <- glue(
+    "aws --output json s3api list-objects-v2 --bucket {b} --prefix {p} ",
+    "--max-items {max_items} --page-size {page_size}"
+  )
+  l <- system(cmd, intern = TRUE)
+  j <- jsonlite::fromJSON(l)
+  assertthat::assert_that("Contents" %in% names(j))
+  d <- j[["Contents"]] |>
+    tibble::as_tibble() |>
+    dplyr::mutate(
+      path = glue("s3://{b}/{.data$Key}"),
+      date_utc = .data$LastModified,
+      size = fs::as_fs_bytes(.data$Size)
+    ) |>
+    dplyr::rowwise() |>
+    dplyr::mutate(
+      bname = basename(.data$path),
+      type = purrr::map_chr(.data$bname, match_regex)
+    ) |>
+    dplyr::ungroup() |>
+    dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
+    dplyr::select("path", "date_utc", "size", "type")
+
+  if (presign) {
+    d <- d |>
+      dplyr::rowwise() |>
+      dplyr::mutate(presigned_url = s3_file_presignedurl(.data$path, expiry_seconds = expiry_sec)) |>
+      dplyr::ungroup()
+  }
+  d
+}
+
+s3_file_presignedurl <- function(s3path, expiry_seconds = 3600) {
+  p <- system(glue("aws s3 presign {s3path} --expires-in {expiry_seconds}"), intern = TRUE)
+  p
+}
+
+#' Search AWS S3 Objects
+#'
+#' Searches for the given pattern in the UMCCR `umccr-primary-data-prod` AWS S3
+#' bucket.
+#'
+#' @param pat Pattern to search for (e.g. 'multiqc_data.json').
+#' @param rows Max number of rows to return.
+#'
+#' @return Tibble with S3 path, object size, date modified, id, unique hash.
+#'
+#' @examples
+#' \dontrun{
+#' pat <- "qc_summary.tsv.gz"
+#' s3_search(pat, 10)
+#' }
+#' @export
+s3_search <- function(pat, rows) {
+  au_tz <- "Australia/Melbourne"
+  utc_tz <- "UTC"
+  base_url <- "https://api.portal.prod.umccr.org/iam/s3"
+  url1 <- utils::URLencode(glue("{base_url}?rowsPerPage={rows}&search={pat}"))
+  awscurl_cmd <- glue(
+    "awscurl '{url1}' ",
+    "--header 'Accept: application/json'"
+  )
+  message(glue("Running {awscurl_cmd}"))
+  j <- system(awscurl_cmd, intern = TRUE)
+  date_fmt <- "%Y-%m-%dT%H:%M:%S"
+  d <- j |>
+    jsonlite::fromJSON() |>
+    purrr::pluck("results") |>
+    tibble::as_tibble()
+  d |>
+    dplyr::mutate(
+      date1 = as.POSIXct(.data$last_modified_date, tz = utc_tz, format = date_fmt),
+      date_aest = lubridate::with_tz(.data$date1, tz = au_tz),
+      path = glue("s3://{bucket}/{key}"),
+      size = fs::as_fs_bytes(.data$size)
+    ) |>
+    dplyr::select("path", "size", "date_aest", "id", "unique_hash")
+}
diff --git a/inst/rmd/umccr_workflows/umccrise/multi.Rmd b/inst/rmd/umccr_workflows/umccrise/multi.Rmd
index 4689ad5..21b7bb2 100644
--- a/inst/rmd/umccr_workflows/umccrise/multi.Rmd
+++ b/inst/rmd/umccr_workflows/umccrise/multi.Rmd
@@ -47,10 +47,10 @@ knitr::opts_chunk$set(
 ```{r load_pkgs}
 {
-  require(dplyr)
+  require(dplyr) # import all dplyr funcs
   require(readr, include.only = c("read_rds"))
   require(purrr, include.only = c("map"))
-  require(tidyr, include.only = c("unnest", "unnest_wider"))
+  require(tidyr, include.only = c("unnest"))
   require(dracarys)
   require(glue, include.only = "glue")
   require(here, include.only = "here")
@@ -60,11 +60,53 @@ knitr::opts_chunk$set(
   require(ggplot2, include.only = c("ggplot", "aes"))
   require(lubridate, include.only = c("as_datetime"))
   require(plotly, include.only = c("ggplotly"))
+  require(openssl, include.only = c("sha256"))
 }
 ```
 
 ```{r data_setup, eval=FALSE}
-options(width = 150)
+#---- S3 ----#
+s3 <- here::here(glue::glue("nogit/umccrise/rds/portal_meta/2023-09-12_pmeta_s3.rds")) |>
+  readr::read_rds()
+s3_get_presigned1 <- function(x, row_slice) {
+  start_time <- Sys.time()
+  s3_map <- x |>
+    slice(row_slice) |>
+    rowwise() |>
+    mutate(
+      s3_contents = list(s3_files_list_filter_relevant(
+        s3dir = .data$dir1, presign = TRUE
+      ))
+    ) |>
+    ungroup() |>
+    tidyr::unnest("s3_contents") |>
+    select(
+      "SubjectID", "LibraryID_tumor", "SampleID_tumor",
+      "date_utc", "type", "size", "path", "presigned_url"
+    )
+  end_time <- Sys.time()
+  total_time <- end_time - start_time
+  print(total_time)
+  s3_map
+}
+# 2 seconds per row
+s3_map1 <- s3_get_presigned1(s3, 1:100)
+s3_map2 <- s3_get_presigned1(s3, 101:200)
+s3_map3 <- s3_get_presigned1(s3, 201:300)
+s3_map4 <- s3_get_presigned1(s3, 301:400)
+s3_map5 <- s3_get_presigned1(s3, 401:449)
+
+saveRDS(s3_map1, here("nogit/umccrise/rds/s3/map1_2023-09-12.rds"))
+saveRDS(s3_map2, here("nogit/umccrise/rds/s3/map2_2023-09-12.rds"))
+saveRDS(s3_map3, here("nogit/umccrise/rds/s3/map3_2023-09-12.rds"))
+saveRDS(s3_map4, here("nogit/umccrise/rds/s3/map4_2023-09-12.rds"))
+saveRDS(s3_map5, here("nogit/umccrise/rds/s3/map5_2023-09-12.rds"))
+s3_map <- fs::dir_ls(here("nogit/umccrise/rds/s3"), regexp = "map.*rds") |>
+  purrr::map(readr::read_rds) |>
+  bind_rows()
+saveRDS(s3_map, here("nogit/umccrise/rds/s3_map_2023-09-12.rds"))
+
+#---- GDS ----#
 token <- dracarys::ica_token_validate(Sys.getenv("ICA_ACCESS_TOKEN_PRO"))
 pmeta <- here("nogit/umccrise/rds/portal_meta/2023-09-04_pmeta_final.rds") |>
   readr::read_rds()
@@ -84,26 +126,38 @@ gds_map <- pmeta |>
   filter(type != "MultiqcFile")
 saveRDS(gds_map, here("nogit/umccrise/rds/gds_map_2023-09-05.rds"))
+```
 
-parse_files <- function(gds_map, row_slice, rds_out) {
+```{r data_parse, eval=FALSE}
+parse_files <- function(x, row_slice, rds_out) {
   start_time <- Sys.time()
-  dat1 <- gds_map |>
-    dplyr::slice(row_slice) |>
-    dplyr::rowwise() |>
-    dplyr::mutate(
+  dat1 <- x |>
+    slice(row_slice) |>
+    rowwise() |>
+    mutate(
       gen = list(dracarys::dr_func_eval(.data$type)),
       obj = list(.data$gen$new(.data$presigned_url)),
       objp = list(.data$obj$read())
     ) |>
-    dplyr::ungroup()
+    ungroup()
   end_time <- Sys.time()
   total_time <- end_time - start_time
   print(total_time)
   readr::write_rds(x = dat1, file = rds_out)
 }
+rds_path_out <- here::here("nogit/umccrise/rds/results")
+#---- S3 ----#
+s3_map <- readr::read_rds(here("nogit/umccrise/rds/s3_map_2023-09-12.rds"))
+s0 <- parse_files(s3_map, 1:10, file.path(rds_path_out, "s0.rds"))
+s1 <- parse_files(s3_map, 1:500, file.path(rds_path_out, "s1.rds"))
+s2 <- parse_files(s3_map, 501:1000, file.path(rds_path_out, "s2.rds"))
+s3 <- parse_files(s3_map, 1001:1500, file.path(rds_path_out, "s3.rds"))
+s4 <- parse_files(s3_map, 1501:2000, file.path(rds_path_out, "s4.rds"))
+s5 <- parse_files(s3_map, 2001:2245, file.path(rds_path_out, "s5.rds"))
+
+#---- GDS ----#
 gds_map <- readr::read_rds(here("nogit/umccrise/rds/gds_map_2023-09-05.rds"))
-rds_path_out <- here("nogit/umccrise/rds/results")
 x0 <- parse_files(gds_map, 1:10, file.path(rds_path_out, "x0.rds"))
 x1 <- parse_files(gds_map, 1:500, file.path(rds_path_out, "x1.rds"))
 x2 <- parse_files(gds_map, 501:1000, file.path(rds_path_out, "x2.rds"))
@@ -115,11 +169,38 @@ x5 <- parse_files(gds_map, 2001:2245, file.path(rds_path_out, "x5.rds"))
 ```{r data_load}
 lims_raw <- here("nogit/umccrise/rds/lims/2023-09-04_lims_raw.rds") |>
   readr::read_rds()
-dat1 <- fs::dir_ls(here("nogit/umccrise/rds/results")) |>
+dat_s3_raw <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "s[1-5]{1}.rds") |>
+  purrr::map(readr::read_rds) |>
+  bind_rows()
+# create sha256 for umccrise directory to distinguish between runs
+# keep first 8 digits and append to umccrise date folder.
+dat_s3 <- dat_s3_raw |>
+  mutate(
+    um_dir = sub("s3://umccr-primary-data-prod/(.*)/cancer_report_tables/.*", "\\1", path),
+    date_dir = basename(dirname(dirname(um_dir))),
+    date_dir = gsub("-", "", date_dir),
+    hash256 = openssl::sha256(um_dir),
+    hash256 = substr(hash256, 1, 8),
+    portal_run_id = glue("fake.{date_dir}{hash256}")
+  ) |>
+  select(-c(um_dir, date_dir, hash256, SampleID_tumor))
+dat_gds <- fs::dir_ls(here("nogit/umccrise/rds/results"), regexp = "x[1-5]{1}.rds") |>
   purrr::map(readr::read_rds) |>
-  dplyr::bind_rows()
+  bind_rows()
 
-o <- dat1 |>
+dat_s3_res <- dat_s3 |>
+  mutate(
+    type = case_when(
+      grepl("snv_2015.tsv.gz", path) ~ "UmSigsSnvFile2015",
+      grepl("snv_2020.tsv.gz", path) ~ "UmSigsSnvFile2020",
+      .default = .data$type
+    ),
+    date_utc2 = lubridate::as_datetime(.data$date_utc, format = "%Y-%m-%dT%H:%M:%S+00:00"),
+    date_analysed_aest = lubridate::with_tz(.data$date_utc2, tz = "Australia/Melbourne"),
+    date_analysed_aest = as.character(.data$date_analysed_aest)
+  ) |>
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
+dat_gds_res <- dat_gds |>
   mutate(
     type = case_when(
       grepl("snv_2015.tsv.gz", bname) ~ "UmSigsSnvFile2015",
@@ -128,35 +209,42 @@ o <- dat1 |>
     ),
     date_analysed_aest = as.character(.data$end),
   ) |>
-  select(
-    date_analysed_aest,
-    SubjectID,
-    LibraryID_tumor,
-    LibraryID_normal,
-    type,
-    objp,
-    portal_run_id
-  )
-
-lims <- lims_raw |>
-  dplyr::filter(LibraryID %in% c(o$LibraryID_tumor)) |>
-  dplyr::select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |>
-  dplyr::distinct()
-
-
-o2 <- o |>
-  dplyr::left_join(lims, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
-  dplyr::mutate(
+  select(date_analysed_aest, SubjectID, LibraryID_tumor, type, objp, portal_run_id)
+
+lims_s3 <- lims_raw |>
+  filter(LibraryID %in% dat_s3_res$LibraryID_tumor) |>
+  select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |>
+  distinct()
+lims_gds <- lims_raw |>
+  filter(LibraryID %in% c(dat_gds_res$LibraryID_tumor)) |>
+  select(SubjectID, LibraryID, ExternalSubjectID, ProjectOwner, ProjectName, Type, Workflow) |>
+  distinct()
+
+o1 <- dat_s3_res |>
+  left_join(lims_s3, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
+  mutate(
+    url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"),
+    sbj_url = glue("{.data$SubjectID}"),
+    url = glue("{.data$url}")
+  ) |>
+  rename(portal_url = url)
+o2 <- dat_gds_res |>
+  left_join(lims_gds, by = c("SubjectID", "LibraryID_tumor" = "LibraryID")) |>
+  mutate(
     url = glue("https://portal.umccr.org/subjects/{.data$SubjectID}/overview"),
     sbj_url = glue("{.data$SubjectID}"),
     url = glue("{.data$url}"),
     portal_run_id = glue("dr.{portal_run_id}")
   ) |>
-  dplyr::rename(portal_url = url)
+  rename(portal_url = url)
+
+d <- list(s3 = o1, gds = o2) |>
+  bind_rows(.id = "s3_or_gds")
 
 dt_view <- function(x, scroll_y = 1000, ...) {
+  options(DT.TOJSON_ARGS = list(na = "string"))
   x |>
-    dplyr::mutate(across(where(is.character), as.factor)) |>
+    mutate(across(where(is.character), as.factor)) |>
     DT::datatable(
       filter = list(position = "top", clear = FALSE, plain = TRUE),
       class = "cell-border display compact",
@@ -173,12 +261,12 @@ dt_view <- function(x, scroll_y = 1000, ...) {
     )
 }
-qcsum <- o2 |>
+qcsum <- d |>
   filter(type == "UmQcSumFile") |>
-  unnest_wider(objp)
-hrd_chord <- o2 |>
+  tidyr::unnest_wider(objp)
+hrd_chord <- d |>
   filter(type == "UmChordTsvFile") |>
-  unnest_wider(objp) |>
+  tidyr::unnest_wider(objp) |>
   select(portal_run_id,
     # chord_p_hrd = p_hrd,
     chord_hr_status = hr_status,
     chord_hrd_type = hrd_type,
@@ -191,22 +279,24 @@ hrd_chord <- o2 |>
 # filter(type == "UmHrdetectTsvFile") |>
 # unnest_wider(objp) |>
 # select(portal_run_id, hrdetect_prob = Probability)
-sigs_2015 <- o2 |>
+sigs_2015 <- d |>
   filter(type == "UmSigsSnvFile2015") |>
-  unnest_wider(objp) |>
-  select(-c(type))
-sigs_2020 <- o2 |>
+  tidyr::unnest_wider(objp) |>
+  select(-c(type)) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
+sigs_2020 <- d |>
   filter(type == "UmSigsSnvFile2020") |>
-  unnest_wider(objp) |>
-  select(-c(type))
+  tidyr::unnest_wider(objp) |>
+  select(-c(type)) |>
+  tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq))
 ```
 
 ## umccrise Results
 
 ```{r final_tab}
-cols_select <- c(
+cols_select1 <- c(
   "date_analysed_aest", "SubjectID", "sbj_url", "LibraryID_tumor", "ExternalSubjectID",
-  "ProjectOwner", "ProjectName", "Type", "Workflow", "LibraryID_normal",
+  "ProjectOwner", "ProjectName", "Type", "Workflow",
   "hrd_chord", "hrd_hrdetect", "chord_hr_status", "chord_hrd_type",
   "chord_p_BRCA1", "chord_p_BRCA2",
   "qc_status_hmf", "sex_hmf", "purity_hmf", "ploidy_hmf", "msi_hmf",
@@ -214,28 +304,65 @@ cols_select <- c(
   "deleted_genes_hmf", "tmb_hmf", "tml_hmf", "wgd_hmf", "hypermutated",
   "bpi_enabled", "portal_run_id", "portal_url"
 )
-d <- qcsum |>
-  dplyr::left_join(hrd_chord, by = "portal_run_id") |>
-  dplyr::select(dplyr::all_of(cols_select), dplyr::everything(), -c("type"))
-dt_view(d, caption = "umccrise Results Summary")
+# signatures
+dsig <- bind_rows(list(s2015 = sigs_2015, s2020 = sigs_2020), .id = "Sig_group") |>
+  select(portal_run_id, Sig_group, Rank, Signature, Contribution, RelFreq)
+
+# keep top two ranked sigs from 2015
+dsig_filt <- dsig |>
+  filter(
+    Sig_group == "s2015"
+  ) |>
+  group_by(portal_run_id) |>
+  mutate(tot_sig_vars = sum(Contribution)) |>
+  arrange(Rank) |>
+  slice_head(n = 2) |>
+  # some sigs have same Rank so use explicit sig_rank
+  mutate(sig_rank = row_number()) |>
+  ungroup() |>
+  mutate(
+    sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})")
+  ) |>
+  select(portal_run_id, sig_rank, sig_summary) |>
+  tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |>
+  mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |>
+  select(portal_run_id, sig_top2)
+
+dall <- qcsum |>
+  left_join(hrd_chord, by = "portal_run_id") |>
+  select(all_of(cols_select1), everything(), -c("type")) |>
+  left_join(dsig_filt, by = "portal_run_id") |>
+  relocate(sig_top2, .before = "hrd_chord") |>
+  relocate(s3_or_gds, .after = "SubjectID")
+dt_view(dall)
 ```
 
+```{r join_excel_layla, eval=FALSE}
+excel_all <- here("nogit/umccrise/Combined analysis Jan22_Aug23.xlsx") |>
+  readxl::read_xlsx(sheet = "All")
+excel_all |>
+  select("...1", portal_run_id) |>
+  left_join(dall |> select(portal_run_id, sig_top2)) |>
+  rename(N = "...1") |>
+  readr::write_csv("sigs_top2_2023-09-08.csv")
+```
+
+
 ### HRD Results
 
-```{r hrd_plot, fig.width=15, fig.height = 10}
-p <- d |>
-  dplyr::mutate(
+```{r hrd_plot, fig.width=15, fig.height = 15}
+p1 <- dall |>
+  mutate(
     sbj = glue("{SubjectID}_{LibraryID_tumor}"),
     date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S")
   ) |>
-  dplyr::select(
+  select(
     date, sbj,
     chord = hrd_chord, hrdetect = hrd_hrdetect,
   ) |>
-  tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability")
-p1 <- p |>
-  ggplot(aes(x = date, y = probability, label = sbj)) +
+  tidyr::pivot_longer(chord:hrdetect, names_to = "method", values_to = "probability") |>
+  ggplot2::ggplot(aes(x = date, y = probability, label = sbj)) +
   ggplot2::geom_point(aes(colour = method)) +
   ggplot2::geom_line(aes(group = sbj), linewidth = 0.05) +
   ggplot2::theme_bw() +
@@ -244,31 +371,79 @@ p1 <- p |>
 plotly::ggplotly(p1)
 ```
 
+### Signature Results
+
+```{r fig.width = 15, fig.height=65, eval=TRUE}
+sig_order2015 <- paste0("Sig", 1:30)
+sig_order2020 <- paste0(
+  "SBS",
+  c(
+    1:6,
+    paste0(7, c("a", "b", "c", "d")),
+    8:9,
+    paste0(10, c("a", "b", "c", "d")),
+    11:16,
+    paste0(17, c("a", "b")),
+    18:60,
+    84:94
+  )
+)
+
+p2_prep <- dsig |>
+  filter(
+    Sig_group == "s2015",
+    Rank %in% c(1:3)
+  ) |>
+  left_join(dall |> select(portal_run_id, date_analysed_aest, SubjectID, LibraryID_tumor), by = "portal_run_id") |>
+  mutate(
+    sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}")),
+    date = lubridate::as_datetime(date_analysed_aest, format = "%Y-%m-%d %H:%M:%S")
+  ) |>
+  select(
+    date, sbj, Sig_group, Rank, Signature, Contribution, RelFreq
+  ) |>
+  mutate(Signature = factor(Signature, levels = c(sig_order2015, sig_order2020)))
+p2 <- p2_prep |>
+  filter(!grepl("ALLOCATE", sbj)) |> # get rid of ALLOCATE subject
+  ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) +
+  ggplot2::geom_bar(position = "fill", stat = "identity") +
+  ggplot2::theme_bw(base_size = 7)
+# ggplot2::facet_wrap(~Sig_group, ncol = 1)
+
+plotly::ggplotly(p2, tooltip = c("x", "text", "fill"))
+```
+
 ## Metadata Summary {.tabset .tabset-pills}
 
 ### ProjectOwner
 
 ```{r ProjectOwner}
-count(d, ProjectOwner) |> dt_view(scroll_y = 400)
+count(dall, ProjectOwner) |> dt_view(scroll_y = 400)
 ```
 
 ### ProjectName
 
 ```{r ProjectName}
-count(d, ProjectName) |> dt_view(scroll_y = 400)
+count(dall, ProjectName) |> dt_view(scroll_y = 400)
 ```
 
 ### Type
 
 ```{r Type}
-count(d, Type) |> dt_view(scroll_y = 400)
+count(dall, Type) |> dt_view(scroll_y = 400)
 ```
 
 ### Workflow
 
 ```{r Workflow}
-count(d, Workflow) |> dt_view(scroll_y = 400)
+count(dall, Workflow) |> dt_view(scroll_y = 400)
+```
+
+### S3orGDS
+
+```{r s3orgds}
+count(dall, s3_or_gds) |> dt_view(scroll_y = 400)
 ```
diff --git a/inst/scripts/umccrise_run.R b/inst/scripts/umccrise_run.R
index 8c40a6d..6602d87 100644
--- a/inst/scripts/umccrise_run.R
+++ b/inst/scripts/umccrise_run.R
@@ -4,6 +4,7 @@ require(glue)
 require(dplyr)
 require(readr)
 
+#---- GDS ----#
 # read last 1000 umccrise runs from portal
 # 475 from 2022-01-24 until 2023-09-03, of which 449 Succeeded
 date1 <- "2023-09-04"
@@ -43,4 +44,31 @@ d <- pmeta |>
 d
 # final portal meta for umccrise runs
+# columns:
+# "id", "wfr_name", "wfr_id", "version", "end_status", "start", "end", "portal_run_id",
+# "SubjectID", "LibraryID_tumor", "LibraryID_normal", "SampleID_tumor", "SampleID_normal",
+# "gds_outdir_umccrise", "gds_indir_dragen_somatic", "gds_indir_dragen_germline", "gds_infile_genomes_tar"
 saveRDS(d, file = here(glue("nogit/umccrise/rds/portal_meta/{date1}_pmeta_final.rds")))
+
+#---- S3 ----#
+pat <- "qc_summary.tsv.gz"
+rows <- 1000
+d_s3_raw <- dracarys::s3_search(pat = pat, rows = rows)
+
+d_s3 <- d_s3_raw |>
+  arrange(desc(date_aest)) |>
+  mutate(
+    bname = basename(path),
+    dir1 = dirname(path), # path/to/dirA/cancer_report_tables
+    dir2 = basename(dirname(dir1)), # dirA
+    sbj_samp_lib = sub(".*__(.*)", "\\1", dir2),
+    SubjectID = sub("(SBJ[0-9]{5})_.*", "\\1", sbj_samp_lib),
+    SampleID_tumor = sub("SBJ.*?_(.*?)_.*", "\\1", sbj_samp_lib),
+    LibraryID_tumor = sub("SBJ.*?_.*?_(.*)", "\\1", sbj_samp_lib),
+    rerun = grepl("rerun", .data$LibraryID_tumor)
+  ) |>
+  select(dir1, SubjectID, LibraryID_tumor, SampleID_tumor, date = date_aest, rerun)
+
+date2 <- "2023-09-12"
+saveRDS(d_s3, file = here(glue("nogit/umccrise/rds/portal_meta/{date2}_pmeta_s3.rds")))
+# now we have S3 paths and metadata, so all we need is to generate presigned URLs and read the data
diff --git a/man/s3_files_list_filter_relevant.Rd b/man/s3_files_list_filter_relevant.Rd
new file mode 100644
index 0000000..1194eea
--- /dev/null
+++ b/man/s3_files_list_filter_relevant.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/s3.R
+\name{s3_files_list_filter_relevant}
+\alias{s3_files_list_filter_relevant}
+\title{List Relevant Files In AWS S3 Directory}
+\usage{
+s3_files_list_filter_relevant(
+  s3dir,
+  pattern = NULL,
+  page_size = 1000,
+  max_items = 1000,
+  presign = FALSE,
+  expiry_sec = 43200
+)
+}
+\arguments{
+\item{s3dir}{S3 directory.}
+
+\item{pattern}{Pattern to further filter the returned file type tibble.}
+
+\item{page_size}{The size of each page to get in the AWS service call (def: 1000).}
+
+\item{max_items}{The total number of items to return in the command’s output (def: 1000).}
+
+\item{presign}{Include presigned URLs (def: FALSE).}
+
+\item{expiry_sec}{Number of seconds the presigned URL will be valid for (if generated) (def: 43200 (12hrs)).}
+}
+\value{
+A tibble with path, date, file size, file type, and presigned URL if requested.
+}
+\description{
+Lists relevant files in an AWS S3 directory.
+}
+\examples{
+\dontrun{
+s3dir <- "s3://umccr-primary-data-prod/Accreditation/ALLOCATE-134131/WGS/2021-07-26/umccrised/ALLOCATE-134131__ALLOCATE-134131_MDx150892_Missing/cancer_report_tables"
+s3_files_list_filter_relevant(s3dir = s3dir, presign = TRUE)
+}
+}
diff --git a/man/s3_search.Rd b/man/s3_search.Rd
new file mode 100644
index 0000000..c0d9f64
--- /dev/null
+++ b/man/s3_search.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/s3.R
+\name{s3_search}
+\alias{s3_search}
+\title{Search AWS S3 Objects}
+\usage{
+s3_search(pat, rows)
+}
+\arguments{
+\item{pat}{Pattern to search for (e.g. 'multiqc_data.json').}
+
+\item{rows}{Max number of rows to return.}
+}
+\value{
+Tibble with S3 path, object size, date modified, id, unique hash.
+}
+\description{
+Searches for the given pattern in the UMCCR \code{umccr-primary-data-prod} AWS S3
+bucket.
+}
+\examples{
+\dontrun{
+pat <- "qc_summary.tsv.gz"
+s3_search(pat, 10)
+}
+}
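
Below is a minimal usage sketch (not part of the diff itself) of how the two new exports can be chained, going from a portal search hit to presigned URLs for its `cancer_report_tables` files. It assumes the dracarys version from this diff is installed, AWS credentials are configured, and the `aws` and `awscurl` CLIs are on the PATH; the search pattern and the `dirname()` step simply mirror what `inst/scripts/umccrise_run.R` does.

```r
# Sketch only: chain s3_search() and s3_files_list_filter_relevant().
require(dracarys)
require(dplyr)

# find recent runs via the portal search endpoint (needs awscurl + AWS creds)
hits <- s3_search(pat = "qc_summary.tsv.gz", rows = 10)

# each hit sits inside a <run>/cancer_report_tables directory
dirs <- hits |>
  mutate(dir1 = dirname(path)) |>
  pull(dir1)

# list the recognised files in the first directory, with 12-hour presigned URLs
s3_files_list_filter_relevant(s3dir = dirs[1], presign = TRUE, expiry_sec = 43200) |>
  select(path, type, size, presigned_url)
```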