-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.R
executable file
·150 lines (123 loc) · 4.89 KB
/
run.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/local/bin/Rscript
task <- dyncli::main()
# load libraries
library(dyncli, warn.conflicts = FALSE)
library(dynwrap, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(purrr, warn.conflicts = FALSE)
library(readr, warn.conflicts = FALSE)
# calista NEEDS to be in the CALISTA-R folder while at the same time
# requiring to be able to write files there
tmpdir <- dynutils::safe_tempdir("workspace")
file.copy("/CALISTA/CALISTA-R", tmpdir, recursive = TRUE)
setwd(paste0(tmpdir, "/CALISTA-R"))
source("R/initialization.R")
#####################################
### LOAD DATA ###
#####################################
expression <- task$expression
parameters <- task$parameters
priors <- task$priors
file_loc <- tempfile(pattern = "expression.csv")
data_df <- data.frame(
row.names = NULL,
as.matrix(expression),
check.names = FALSE
)
write_csv(data_df, file_loc)
# TIMING: done with preproc
timings <- list(method_afterpreproc = Sys.time())
#####################################
### INFER TRAJECTORY ###
#####################################
# based on an example script available at
# https://github.com/CABSEL/CALISTA/tree/367d5bdbfa80796145a9feba0f1dffb144ac67bc/CALISTA-R/EXAMPLES/RNA-seq
# Prepare CALISTA for work
INPUTS <- list()
# Specify data types and settings for pre-processing
INPUTS$data_location=file_loc
INPUTS$data_type=1; # Single-cell RT-qPCR CT data
INPUTS$format_data=3; # Rows= cells and Columns= genes (no time/stage info)
INPUTS$data_selection= integer(); # Include data from all time points
INPUTS$perczeros_genes=100; # Remove genes with > 100# of zeros
INPUTS$perczeros_cells=100; # Remove cells with 100# of zeros
INPUTS$cells_2_cut=0; # No manual removal of cells
INPUTS$perc_top_genes=100; # Retain only top X the most variable genes with X=min(200, INPUTS$perc_top_genes * num of cells/100, num of genes)
# Specify single-cell clustering settings
INPUTS$optimize=1; # The number of cluster is known a priori
INPUTS$parallel=0; # Use parallelization option
INPUTS$cluster='kmedoids'; # Use k-medoids in consensus clustering
INPUTS$Cluster='kmedoids'; # Use k-medoids in consensus clustering
# Specify transition genes settings
INPUTS$thr_transition_genes=50; # Set threshold for transition genes determination to
# put parameters into INPUTS
INPUTS$runs=parameters$runs; # Perform 50 independent runs of greedy algorithm
INPUTS$max_iter=parameters$max_iter; # Limit the number of iterations in greedy algorithm to 100
# % Upload and pre-process data
DATA=import_data(INPUTS)
# %% *** 2-SINGLE-CELL CLUSTERING ***
# %
# % Please check comments in 'CALISTA_clustering_main' for more information.
# %
Results=list()
CALISTA_clustering_main_results=CALISTA_clustering_main(DATA,INPUTS)
Results=CALISTA_clustering_main_results$Results
DATA=CALISTA_clustering_main_results$DATA
INPUTS=CALISTA_clustering_main_results$INPUTS
###cluster cutting
cluster_cut=0L
# %% *** 3-RECONSTRUCTION OF LINEAGE PROGRESSION ***
# %
# % Please check comments in 'CALISTA_transition_main' for more information.
Results=CALISTA_transition_main(DATA,Results)
# %% *** 4-DETERMINATION OF TRANSITION GENES ***
# %
# % Please check comments in 'CALISTA_transition_genes_main' for more information.
Results=CALISTA_transition_genes_main(DATA,INPUTS,Results)
# %% *** 5-PSEUDOTEMPORAL ORDERING OF CELLS ***
# %
# % Please check comments in 'CALISTA_ordering_main' for more information.
#
Results=CALISTA_ordering_main(DATA,Results)
# TIMING: done with trajectory inference
timings$method_aftermethod <- Sys.time()
#####################################
### SAVE OUTPUT TRAJECTORY ###
#####################################
milestone_network <- inner_join(data.frame(Results$TRANSITION$nodes_connection), data.frame(Results$cluster_distance), by = c("X1", "X2")) %>%
dplyr::select(from = X1, to = X2, length = X3) %>%
mutate(directed = FALSE)
cluster_probs <- Results$clustering_struct$all$all$clusterprobabilities
progressions <- map_df(seq_len(nrow(cluster_probs)), function(cell_i) {
if (!any(is.finite(cluster_probs[cell_i, , drop = TRUE]))) {
NULL
} else {
milestone_network %>%
mutate(
from_prob = cluster_probs[cell_i, from, drop = TRUE],
to_prob = cluster_probs[cell_i, to, drop = TRUE],
sum = from_prob + to_prob,
percentage = to_prob / sum,
cell_id = rownames(expression)[[cell_i]]
) %>%
arrange(desc(sum)) %>%
slice(1) %>%
dplyr::select(cell_id, from, to, percentage)
}
})
milestone_network <- milestone_network %>%
mutate(from = paste0("M", from), to = paste0("M", to))
progressions <- progressions %>%
mutate(from = paste0("M", from), to = paste0("M", to))
output <-
wrap_data(
cell_ids = unique(progressions$cell_id)
) %>%
add_trajectory(
milestone_network = milestone_network,
progressions = progressions
) %>%
add_timings(
timings = timings
)
dyncli::write_output(output, task$output)