From 2e3a472ce37b2413057f114e76ae8392df61b37b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Thu, 11 Apr 2024 11:23:19 +0200 Subject: [PATCH 1/7] feat: reduce fgbio memory usage --- workflow/envs/fgbio.yaml | 2 +- workflow/rules/common.smk | 8 +++---- workflow/rules/mapping.smk | 44 ++++++++++++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/workflow/envs/fgbio.yaml b/workflow/envs/fgbio.yaml index 8cff1a720..d4a74bfcd 100644 --- a/workflow/envs/fgbio.yaml +++ b/workflow/envs/fgbio.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - fgbio =1.4 \ No newline at end of file + - fgbio =2.2 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a22b2531f..1161306d0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -396,7 +396,7 @@ def get_sample_datatype(sample): def get_markduplicates_input(wildcards): aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa" if sample_has_umis(wildcards.sample): - return "results/mapped/{aligner}/{{sample}}.annotated.bam".format( + return "results/mapped/{aligner}/{{sample}}.annotated.sorted.bam".format( aligner=aligner ) else: @@ -1084,12 +1084,12 @@ def get_vembrane_config(wildcards, input): def get_umi_fastq(wildcards): umi_read = extract_unique_sample_column_value(wildcards.sample, "umi_read") if umi_read in ["fq1", "fq2"]: - return "results/untrimmed/{S}_{R}.fastq.gz".format( + return "results/untrimmed/{S}_{R}.sorted.fastq.gz".format( S=wildcards.sample, R=umi_read ) elif umi_read == "both": return expand( - "results/untrimmed/{S}_{R}.fastq.gz", S=wildcards.sample, R=["fq1", "fq2"] + "results/untrimmed/{S}_{R}.sorted.fastq.gz", S=wildcards.sample, R=["fq1", "fq2"] ) else: return umi_read @@ -1100,7 +1100,7 @@ def sample_has_umis(sample): def get_umi_read_structure(wildcards): - return "-r {}".format( + return "-s true -r {}".format( extract_unique_sample_column_value(wildcards.sample, "umi_read_structure") ) diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 52d9bea86..6f5da5505 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -15,6 +15,19 @@ rule map_reads: "v2.3.2/bio/bwa/mem" +rule query_sort_reads: + input: + "results/mapped/{aligner}/{sample}.bam" + output: + temp("results/mapped/{aligner}/{sample}.sorted.bam") + conda: + "../envs/fgbio.yaml" + log: + "logs/fgbio/sort_bam/{aligner}_{sample}.log" + shell: + "fgbio SortBam -i {input} -o {output} -s Queryname 2> {log}" + + rule merge_untrimmed_fastqs: input: get_untrimmed_fastqs, @@ -28,20 +41,43 @@ rule merge_untrimmed_fastqs: "cat {input} > {output} 2> {log}" +rule sort_untrimmed_fastqs: + input: + "results/untrimmed/{sample}_{read}.fastq.gz" + output: + temp("results/untrimmed/{sample}_{read}.sorted.fastq.gz") + conda: + "../envs/fgbio.yaml" + log: + "logs/fgbio/sort_fastq/{sample}_{read}.log" + shell: + "fgbio SortFastq -i {input} -o {output} 2> {log}" + + rule annotate_umis: input: - bam="results/mapped/{aligner}/{sample}.bam", + bam="results/mapped/{aligner}/{sample}.sorted.bam", umi=get_umi_fastq, output: temp("results/mapped/{aligner}/{sample}.annotated.bam"), params: extra=get_umi_read_structure, - resources: - mem_mb=lambda wc, input: 2.5 * input.size_mb, log: "logs/fgbio/annotate_bam/{aligner}/{sample}.log", wrapper: - "v2.3.2/bio/fgbio/annotatebamwithumis" + "v3.7.0/bio/fgbio/annotatebamwithumis" + + +rule sort_annotated_reads: + input: + "results/mapped/{aligner}/{sample}.annotated.bam", + output: + temp("results/mapped/{aligner}/{sample}.annotated.sorted.bam"), + log: + "logs/samtools_sort/{aligner}_{sample}.log", + threads: 8 + wrapper: + "v3.7.0/bio/samtools/sort" rule mark_duplicates: From 876d181297b983577939cbf434879dbc7372662d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Thu, 11 Apr 2024 11:59:47 +0200 Subject: [PATCH 2/7] formatting --- workflow/rules/common.smk | 4 +++- workflow/rules/mapping.smk | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 1161306d0..9c70c9df9 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1089,7 +1089,9 @@ def get_umi_fastq(wildcards): ) elif umi_read == "both": return expand( - "results/untrimmed/{S}_{R}.sorted.fastq.gz", S=wildcards.sample, R=["fq1", "fq2"] + "results/untrimmed/{S}_{R}.sorted.fastq.gz", + S=wildcards.sample, + R=["fq1", "fq2"] ) else: return umi_read diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 6f5da5505..683b97170 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -17,13 +17,13 @@ rule map_reads: rule query_sort_reads: input: - "results/mapped/{aligner}/{sample}.bam" + "results/mapped/{aligner}/{sample}.bam", output: - temp("results/mapped/{aligner}/{sample}.sorted.bam") + temp("results/mapped/{aligner}/{sample}.sorted.bam"), conda: "../envs/fgbio.yaml" log: - "logs/fgbio/sort_bam/{aligner}_{sample}.log" + "logs/fgbio/sort_bam/{aligner}_{sample}.log", shell: "fgbio SortBam -i {input} -o {output} -s Queryname 2> {log}" @@ -43,13 +43,13 @@ rule merge_untrimmed_fastqs: rule sort_untrimmed_fastqs: input: - "results/untrimmed/{sample}_{read}.fastq.gz" + "results/untrimmed/{sample}_{read}.fastq.gz", output: - temp("results/untrimmed/{sample}_{read}.sorted.fastq.gz") + temp("results/untrimmed/{sample}_{read}.sorted.fastq.gz"), conda: "../envs/fgbio.yaml" log: - "logs/fgbio/sort_fastq/{sample}_{read}.log" + "logs/fgbio/sort_fastq/{sample}_{read}.log", shell: "fgbio SortFastq -i {input} -o {output} 2> {log}" From d50a713e963bf5f73e92a4747f1976e56c7f1d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Thu, 11 Apr 2024 12:03:45 +0200 Subject: [PATCH 3/7] formatting --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9c70c9df9..8b767c8d2 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1091,7 +1091,7 @@ def get_umi_fastq(wildcards): return expand( "results/untrimmed/{S}_{R}.sorted.fastq.gz", S=wildcards.sample, - R=["fq1", "fq2"] + R=["fq1", "fq2"], ) else: return umi_read From 8b343fbdac36c1ef6bb28954e71a928a6d6225ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Fri, 12 Apr 2024 12:05:19 +0200 Subject: [PATCH 4/7] replace rule by updated wrapper --- workflow/rules/common.smk | 12 ++++++++++++ workflow/rules/mapping.smk | 23 ++++++----------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 8b767c8d2..ab9b8c2ac 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -589,6 +589,18 @@ def get_read_group(wildcards): ) +def get_map_reads_sorting_params(wildcards, order_param=False): + match (sample_has_umis(wildcards.sample), order_param): + case (True, True): + return "queryname" + case (True, False): + return "fgbio" + case (False, True): + return "coordinate" + case (False, False): + return "samtools" + + def get_mutational_burden_targets(): mutational_burden_targets = [] if is_activated("mutational_burden"): diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 683b97170..ab6b21891 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -8,24 +8,11 @@ rule map_reads: "logs/bwa_mem/{sample}.log", params: extra=get_read_group, - sorting="samtools", - sort_order="coordinate", + sorting=get_map_reads_sorting_params, + sort_order=lambda wc: get_map_reads_sorting_params(wc, order_param=True), threads: 8 wrapper: - "v2.3.2/bio/bwa/mem" - - -rule query_sort_reads: - input: - "results/mapped/{aligner}/{sample}.bam", - output: - temp("results/mapped/{aligner}/{sample}.sorted.bam"), - conda: - "../envs/fgbio.yaml" - log: - "logs/fgbio/sort_bam/{aligner}_{sample}.log", - shell: - "fgbio SortBam -i {input} -o {output} -s Queryname 2> {log}" + "v3.7.0-29-ge7ff82c/bio/bwa/mem" rule merge_untrimmed_fastqs: @@ -33,6 +20,8 @@ rule merge_untrimmed_fastqs: get_untrimmed_fastqs, output: temp("results/untrimmed/{sample}_{read}.fastq.gz"), + conda: + "../envs/fgbio.yaml" log: "logs/merge-fastqs/untrimmed/{sample}_{read}.log", wildcard_constraints: @@ -56,7 +45,7 @@ rule sort_untrimmed_fastqs: rule annotate_umis: input: - bam="results/mapped/{aligner}/{sample}.sorted.bam", + bam="results/mapped/{aligner}/{sample}.bam", umi=get_umi_fastq, output: temp("results/mapped/{aligner}/{sample}.annotated.bam"), From 39c3b7914592541f93cf82638beee2b7c673d247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Mon, 15 Apr 2024 07:47:57 +0200 Subject: [PATCH 5/7] update wrapper --- workflow/envs/fgbio.yaml | 2 +- workflow/rules/common.smk | 4 ++-- workflow/rules/mapping.smk | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/envs/fgbio.yaml b/workflow/envs/fgbio.yaml index d4a74bfcd..9542fda1c 100644 --- a/workflow/envs/fgbio.yaml +++ b/workflow/envs/fgbio.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - fgbio =2.2 \ No newline at end of file + - fgbio-minimal =2.2 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ab9b8c2ac..b0f0b77f6 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -589,8 +589,8 @@ def get_read_group(wildcards): ) -def get_map_reads_sorting_params(wildcards, order_param=False): - match (sample_has_umis(wildcards.sample), order_param): +def get_map_reads_sorting_params(wildcards, ordering=False): + match (sample_has_umis(wildcards.sample), ordering): case (True, True): return "queryname" case (True, False): diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index ab6b21891..11ba23e8c 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -9,10 +9,10 @@ rule map_reads: params: extra=get_read_group, sorting=get_map_reads_sorting_params, - sort_order=lambda wc: get_map_reads_sorting_params(wc, order_param=True), + sort_order=lambda wc: get_map_reads_sorting_params(wc, ordering=True), threads: 8 wrapper: - "v3.7.0-29-ge7ff82c/bio/bwa/mem" + "v3.8.0/bio/bwa/mem" rule merge_untrimmed_fastqs: From 438053bbd85417bbd5f7656c84370c595b796490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Mon, 22 Apr 2024 08:07:31 +0200 Subject: [PATCH 6/7] rename function --- workflow/rules/common.smk | 4 ++-- workflow/rules/mapping.smk | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b0f0b77f6..fde051039 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1113,8 +1113,8 @@ def sample_has_umis(sample): return pd.notna(extract_unique_sample_column_value(sample, "umi_read")) -def get_umi_read_structure(wildcards): - return "-s true -r {}".format( +def get_annotate_umis_params(wildcards): + return "--sorted=true -r {}".format( extract_unique_sample_column_value(wildcards.sample, "umi_read_structure") ) diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 11ba23e8c..2663ca90e 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -50,7 +50,7 @@ rule annotate_umis: output: temp("results/mapped/{aligner}/{sample}.annotated.bam"), params: - extra=get_umi_read_structure, + extra=get_annotate_umis_params, log: "logs/fgbio/annotate_bam/{aligner}/{sample}.log", wrapper: From fd4a0e942ea08ec27c7c62ee9a661a524ba7485b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Mon, 22 Apr 2024 14:11:46 +0200 Subject: [PATCH 7/7] add piping --- workflow/rules/common.smk | 2 +- workflow/rules/mapping.smk | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index fde051039..a622d2544 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -396,7 +396,7 @@ def get_sample_datatype(sample): def get_markduplicates_input(wildcards): aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa" if sample_has_umis(wildcards.sample): - return "results/mapped/{aligner}/{{sample}}.annotated.sorted.bam".format( + return "results/mapped/{aligner}/{{sample}}.annotated.bam".format( aligner=aligner ) else: diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 2663ca90e..6c2a704a3 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -48,7 +48,7 @@ rule annotate_umis: bam="results/mapped/{aligner}/{sample}.bam", umi=get_umi_fastq, output: - temp("results/mapped/{aligner}/{sample}.annotated.bam"), + pipe("pipe/{aligner}/{sample}.annotated.bam"), params: extra=get_annotate_umis_params, log: @@ -59,9 +59,9 @@ rule annotate_umis: rule sort_annotated_reads: input: - "results/mapped/{aligner}/{sample}.annotated.bam", + "pipe/{aligner}/{sample}.annotated.bam", output: - temp("results/mapped/{aligner}/{sample}.annotated.sorted.bam"), + temp("results/mapped/{aligner}/{sample}.annotated.bam"), log: "logs/samtools_sort/{aligner}_{sample}.log", threads: 8