run.R

#!/usr/local/bin/Rscript

task <- dyncli::main()

# load libraries
library(dyncli, warn.conflicts = FALSE)
library(dynwrap, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(purrr, warn.conflicts = FALSE)
library(readr, warn.conflicts = FALSE)

# calista NEEDS to be in the CALISTA-R folder while at the same time
# requiring to be able to write files there
tmpdir <- dynutils::safe_tempdir("workspace")
file.copy("/CALISTA/CALISTA-R", tmpdir, recursive = TRUE)
setwd(paste0(tmpdir, "/CALISTA-R"))
source("R/initialization.R")

#####################################
###           LOAD DATA           ###
#####################################
expression <- task$expression
parameters <- task$parameters
priors <- task$priors

file_loc <- tempfile(pattern = "expression.csv")

data_df <- data.frame(
  row.names = NULL,
  as.matrix(expression),
  check.names = FALSE
)
write_csv(data_df, file_loc)

# TIMING: done with preproc
timings <- list(method_afterpreproc = Sys.time())

#####################################
###        INFER TRAJECTORY       ###
#####################################

# based on an example script available at
# https://github.com/CABSEL/CALISTA/tree/367d5bdbfa80796145a9feba0f1dffb144ac67bc/CALISTA-R/EXAMPLES/RNA-seq


# Prepare CALISTA for work
INPUTS <- list()
# Specify data types and settings for pre-processing
INPUTS$data_location=file_loc
INPUTS$data_type=1; # Single-cell RT-qPCR CT data
INPUTS$format_data=3; # Rows= cells and Columns= genes (no time/stage info)
INPUTS$data_selection= integer(); # Include data from all time points
INPUTS$perczeros_genes=100; # Remove genes with > 100# of zeros
INPUTS$perczeros_cells=100; # Remove cells with 100# of zeros
INPUTS$cells_2_cut=0; # No manual removal of cells
INPUTS$perc_top_genes=100; # Retain only top X the most variable genes with X=min(200, INPUTS$perc_top_genes * num of cells/100, num of genes)
# Specify single-cell clustering settings
INPUTS$optimize=1; # The number of cluster is known a priori
INPUTS$parallel=0; # Use parallelization option
INPUTS$cluster='kmedoids'; # Use k-medoids in consensus clustering
INPUTS$Cluster='kmedoids'; # Use k-medoids in consensus clustering
# Specify transition genes settings
INPUTS$thr_transition_genes=50; # Set threshold for transition genes determination to

# put parameters into INPUTS
INPUTS$runs=parameters$runs; # Perform 50 independent runs of greedy algorithm
INPUTS$max_iter=parameters$max_iter; # Limit the number of iterations in greedy algorithm to 100

# % Upload and pre-process data
DATA=import_data(INPUTS)

# %% *** 2-SINGLE-CELL CLUSTERING ***
# %
# % Please check comments in 'CALISTA_clustering_main' for more information.
# %

Results=list()
CALISTA_clustering_main_results=CALISTA_clustering_main(DATA,INPUTS)
Results=CALISTA_clustering_main_results$Results
DATA=CALISTA_clustering_main_results$DATA
INPUTS=CALISTA_clustering_main_results$INPUTS

###cluster cutting
cluster_cut=0L

# %% *** 3-RECONSTRUCTION OF LINEAGE PROGRESSION ***
# %
# % Please check comments in 'CALISTA_transition_main' for more information.
Results=CALISTA_transition_main(DATA,Results)

# %% *** 4-DETERMINATION OF TRANSITION GENES ***
# %
# % Please check comments in 'CALISTA_transition_genes_main' for more information.
Results=CALISTA_transition_genes_main(DATA,INPUTS,Results)

# %% *** 5-PSEUDOTEMPORAL ORDERING OF CELLS ***
# %
# % Please check comments in 'CALISTA_ordering_main' for more information.
#
Results=CALISTA_ordering_main(DATA,Results)

# TIMING: done with trajectory inference
timings$method_aftermethod <- Sys.time()

#####################################
###     SAVE OUTPUT TRAJECTORY    ###
#####################################
milestone_network <- inner_join(data.frame(Results$TRANSITION$nodes_connection), data.frame(Results$cluster_distance), by = c("X1", "X2")) %>%
  dplyr::select(from = X1, to = X2, length = X3) %>%
  mutate(directed = FALSE)

cluster_probs <- Results$clustering_struct$all$all$clusterprobabilities

progressions <- map_df(seq_len(nrow(cluster_probs)), function(cell_i) {
  if (!any(is.finite(cluster_probs[cell_i, , drop = TRUE]))) {
    NULL
  } else {
    milestone_network %>%
      mutate(
        from_prob = cluster_probs[cell_i, from, drop = TRUE],
        to_prob = cluster_probs[cell_i, to, drop = TRUE],
        sum = from_prob + to_prob,
        percentage = to_prob / sum,
        cell_id = rownames(expression)[[cell_i]]
      ) %>%
      arrange(desc(sum)) %>%
      slice(1) %>%
      dplyr::select(cell_id, from, to, percentage)
  }
})

milestone_network <- milestone_network %>%
  mutate(from = paste0("M", from), to = paste0("M", to))

progressions <- progressions %>%
  mutate(from = paste0("M", from), to = paste0("M", to))

output <-
  wrap_data(
    cell_ids = unique(progressions$cell_id)
  ) %>%
  add_trajectory(
    milestone_network = milestone_network,
    progressions = progressions
  ) %>%
  add_timings(
    timings = timings
  )

dyncli::write_output(output, task$output)