Skip to content

[Bug]: Dropping variables when merging datasets with merge_expression_module #262

Open
@llrs-roche

Description

@llrs-roche

What happened?

When extracting and merging from more than two data sources with merge_expression_module it seems like the variables selected from the third that are not part of the primary key are dropped even if they are required/selected by the app:

Details

library(shiny)
library(teal.data)
library(teal.widgets)

ADSL <- data.frame(
  STUDYID = "A",
  USUBJID = LETTERS[1:10],
  SEX = rep(c("F", "M"), 5),
  AGE = rpois(10, 30),
  BMRKR1 = rlnorm(10)
)
ADLB <- expand.grid(
  STUDYID = "A",
  USUBJID = LETTERS[1:10],
  PARAMCD = c("ALT", "CRP", "IGA"),
  AVISIT = c("SCREENING", "BASELINE", "WEEK 1 DAY 8", "WEEK 2 DAY 15")
)
ADLB$AVAL <- rlnorm(120)
ADLB$CHG <- rnorm(120)

data_list <- list(
  ADSL = reactive(ADSL),
  ADLB = reactive(ADLB),
  ADAE = reactive(teal.data::rADAE)
)

join_keys <- join_keys(
  join_key("ADSL", "ADSL", c("STUDYID", "USUBJID")),
  join_key("ADSL", "ADLB", c("STUDYID", "USUBJID")),
  join_key("ADSL", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADLB", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADAE", "ADAE", c("STUDYID", "USUBJID")),
  join_key("ADLB", "ADLB", c("STUDYID", "USUBJID", "PARAMCD", "AVISIT"))
)

adsl_extract <- data_extract_spec(
  dataname = "ADSL",
  select = select_spec(
    label = "Select variable:",
    choices = c("AGE", "BMRKR1"),
    selected = "AGE",
    multiple = TRUE,
    fixed = FALSE
  )
)

adlb_extract <- data_extract_spec(
  dataname = "ADLB",
  filter = filter_spec(vars = "PARAMCD", choices = c("ALT", "CRP", "IGA"), selected = "ALT"),
  select = select_spec(
    label = "Select variable:",
    choices = c("AVAL", "CHG"),
    selected = "AVAL",
    multiple = TRUE,
    fixed = FALSE
  )
)
adae_extract <- data_extract_spec(
  dataname = "ADAE",
  select = select_spec(
    label = "Select variable:",
    choices = c("RACE", "ETHNIC"),
    selected = "RACE",
    multiple = TRUE,
    fixed = FALSE
  )
)

ui <- bslib::page_fluid(
  bslib::layout_sidebar(
    tags$div(
      verbatimTextOutput("expr"),
      dataTableOutput("data")
    ),
    sidebar = tagList(
      data_extract_ui("adsl_var", label = "ADSL selection", adsl_extract),
      data_extract_ui("adlb_var", label = "ADLB selection", adlb_extract),
      data_extract_ui("adae_var", label = "ADAE selection", adae_extract)
    )
  )
)

server <- function(input, output, session) {
  data_q <- qenv()
  
  data_q <- eval_code(
    data_q,
    "ADSL <- data.frame(
        STUDYID = 'A',
        USUBJID = LETTERS[1:10],
        SEX = rep(c('F', 'M'), 5),
        AGE = rpois(10, 30),
        BMRKR1 = rlnorm(10)
      )"
  )
  
  data_q <- eval_code(
    data_q,
    "ADLB <- expand.grid(
        STUDYID = 'A',
        USUBJID = LETTERS[1:10],
        PARAMCD = c('ALT', 'CRP', 'IGA'),
        AVISIT = c('SCREENING', 'BASELINE', 'WEEK 1 DAY 8', 'WEEK 2 DAY 15'),
        AVAL = rlnorm(120),
        CHG = rlnorm(120)
       )"
  )
  
  data_q <- within(data_q, {
    ADAE <- teal.data::rADAE
  })
  
  merged_data <- merge_expression_module(
    data_extract = list(adsl_var = adsl_extract, 
                        adlb_var = adlb_extract, 
                        adae_bar = adae_extract),
    datasets = data_list,
    join_keys = join_keys,
    merge_function = "dplyr::left_join"
  )
  
  code_merge <- reactive({
    for (exp in merged_data()$expr) data_q <- eval_code(data_q, exp)
    data_q
  })
  
  output$expr <- renderText(paste(merged_data()$expr, collapse = "\n"))
  output$data <- renderDataTable(code_merge()[["ANL"]])
}

if (interactive()) {
  shinyApp(ui, server)
}

Code/expression generated by the function:

ANL_1 <- ADSL %>% dplyr::select(STUDYID, USUBJID, AGE)
ANL_2 <- ADLB %>% dplyr::filter(PARAMCD == "ALT") %>% dplyr::select(STUDYID, USUBJID, AVAL)
ANL_3 <- ADAE %>% dplyr::select(STUDYID, USUBJID) # RACE is selected by the APP
ANL <- ANL_1
ANL <- dplyr::left_join(ANL, ANL_2, by = c("STUDYID", "USUBJID"))
ANL <- dplyr::left_join(ANL, ANL_3, by = c("STUDYID", "USUBJID"))

Expected code generated by the app:

ANL_1 <- ADSL %>% dplyr::select(STUDYID, USUBJID, AGE)
ANL_2 <- ADLB %>% dplyr::filter(PARAMCD == "ALT") %>% dplyr::select(STUDYID, USUBJID, AVAL)
ANL_3 <- ADAE # Or %>% dplyr::select(STUDYID, USUBJID, RACE)
ANL <- ANL_1
ANL <- dplyr::left_join(ANL, ANL_2, by = c("STUDYID", "USUBJID"))
ANL <- dplyr::left_join(ANL, ANL_3, by = c("STUDYID", "USUBJID"))

While it is labeled as experimental I saw it first mentioned in teal.transform 0.2.0 (which I don't know how long ago it was) but it is clearly used across the framework for example in TMC modules (where I found it while developing a new module)

sessionInfo()

Relevant log output

Code of Conduct

  • I agree to follow this project's Code of Conduct.

Contribution Guidelines

  • I agree to follow this project's Contribution Guidelines.

Security Policy

  • I agree to follow this project's Security Policy.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions