EngreitzLab · mayasheth · Mar 12, 2025 · Mar 13, 2025
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -3,8 +3,10 @@
 configfile: "config/config.yml"
 
 # rules for CRISPR comparisons
+include: "rules/utils.smk"
 include: "rules/crispr_comparison.smk"
 
+
 # perform all comparisons listed in config.yml
 rule all:
   input:

diff --git a/workflow/rules/crispr_comparison.smk b/workflow/rules/crispr_comparison.smk
@@ -65,7 +65,7 @@ rule mergePredictionsWithExperiment:
   log: "results/{comparison}/logs/mergePredictionsWithExperiment.log"
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000
+    mem_mb = determine_mem_mb
   script:
    "../../workflow/scripts/mergePredictionsWithExperiment.R"
 
@@ -80,7 +80,7 @@ rule annotateEnhFeatures:
     "results/{comparison}/expt_pred_merged_annot.txt.gz"
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000
+    mem_mb = determine_mem_mb
   script:
     "../../workflow/scripts/annotateMergedData.R"
 
@@ -99,7 +99,7 @@ rule comparePredictionsToExperiment:
      include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None)
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000,
+    mem_mb = determine_mem_mb,
     runtime = "6h"
   script:
     "../../workflow/scripts/comparePredictionsToExperiment.Rmd"

diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
@@ -0,0 +1,10 @@
+MAX_MEM_MB = 250 * 1000  # 250GB
+
+def determine_mem_mb(wildcards, input, attempt, min_gb=8):
+	# Memory (MB) for a rule: 4x the estimated uncompressed input size, at least min_gb GB, doubled per retry, capped at MAX_MEM_MB
+	input_size_mb = input.size_mb
+	if ".gz" in str(input):
+		input_size_mb *= 8  # assume gzip compressed the input by at most 8x
+	attempt_multiplier = 2 ** (attempt - 1)  # double memory for each retry
+	mem_to_use_mb = attempt_multiplier * max(4 * input_size_mb, min_gb * 1000)
+	return int(min(mem_to_use_mb, MAX_MEM_MB))  # input.size_mb may be fractional; return a whole number of MB
diff --git a/workflow/scripts/crisprComparisonLoadInputData.R b/workflow/scripts/crisprComparisonLoadInputData.R
@@ -198,6 +198,12 @@ load_encode_pred_file <- function(file, showProgress) {
   # load predictions and remove optional "#" in header row
   pred <- fread(file)
   colnames(pred)[[1]] <- sub("^#", "", colnames(pred)[[1]])
+
+  if ("PredictionCellType" %in% colnames(pred)) {
+    pred <- pred %>% rename(CellType = PredictionCellType)
+  }
+
+  pred <- pred %>% mutate(name = paste0(chr, ":", start, "-", end))
 
   return(pred)