Skip to content

Commit

Permalink
feat: reduce fgbio memory usage (#296)
Browse files Browse the repository at this point in the history
* feat: reduce fgbio memory usage

* formatting

* formatting

* replace rule by updated wrapper

* update wrapper

* rename function

* add piping
  • Loading branch information
FelixMoelder committed Apr 24, 2024
1 parent 246e186 commit 71611c8
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 13 deletions.
2 changes: 1 addition & 1 deletion workflow/envs/fgbio.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- fgbio =1.4
- fgbio-minimal =2.2
22 changes: 18 additions & 4 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,18 @@ def get_read_group(wildcards):
)


def get_map_reads_sorting_params(wildcards, ordering=False):
    """Return the sorting setting the ``map_reads`` rule should use for a sample.

    The rule calls this helper twice: once for its ``sorting`` parameter
    (``ordering=False``) and once for its ``sort_order`` parameter
    (``ordering=True``).

    Args:
        wildcards: Snakemake wildcards object; only ``wildcards.sample`` is read.
        ordering: If falsy, return the name of the sorting tool; if truthy,
            return the sort order.

    Returns:
        With ``ordering`` falsy: ``"fgbio"`` for samples with UMIs, otherwise
        ``"samtools"``.
        With ``ordering`` truthy: ``"queryname"`` for samples with UMIs,
        otherwise ``"coordinate"``.
    """
    # Coerce to a real bool and branch on truthiness.  A ``match`` on the
    # literal patterns True/False compares by identity, so a truthy non-bool
    # (e.g. numpy.bool_ or 1) would match no case and silently return None.
    has_umis = bool(sample_has_umis(wildcards.sample))
    if ordering:
        return "queryname" if has_umis else "coordinate"
    return "fgbio" if has_umis else "samtools"


def get_mutational_burden_targets():
mutational_burden_targets = []
if is_activated("mutational_burden"):
Expand Down Expand Up @@ -1084,12 +1096,14 @@ def get_vembrane_config(wildcards, input):
def get_umi_fastq(wildcards):
umi_read = extract_unique_sample_column_value(wildcards.sample, "umi_read")
if umi_read in ["fq1", "fq2"]:
return "results/untrimmed/{S}_{R}.fastq.gz".format(
return "results/untrimmed/{S}_{R}.sorted.fastq.gz".format(
S=wildcards.sample, R=umi_read
)
elif umi_read == "both":
return expand(
"results/untrimmed/{S}_{R}.fastq.gz", S=wildcards.sample, R=["fq1", "fq2"]
"results/untrimmed/{S}_{R}.sorted.fastq.gz",
S=wildcards.sample,
R=["fq1", "fq2"],
)
else:
return umi_read
Expand All @@ -1099,8 +1113,8 @@ def sample_has_umis(sample):
return pd.notna(extract_unique_sample_column_value(sample, "umi_read"))


def get_umi_read_structure(wildcards):
return "-r {}".format(
def get_annotate_umis_params(wildcards):
return "--sorted=true -r {}".format(
extract_unique_sample_column_value(wildcards.sample, "umi_read_structure")
)

Expand Down
41 changes: 33 additions & 8 deletions workflow/rules/mapping.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,20 @@ rule map_reads:
"logs/bwa_mem/{sample}.log",
params:
extra=get_read_group,
sorting="samtools",
sort_order="coordinate",
sorting=get_map_reads_sorting_params,
sort_order=lambda wc: get_map_reads_sorting_params(wc, ordering=True),
threads: 8
wrapper:
"v2.3.2/bio/bwa/mem"
"v3.8.0/bio/bwa/mem"


rule merge_untrimmed_fastqs:
input:
get_untrimmed_fastqs,
output:
temp("results/untrimmed/{sample}_{read}.fastq.gz"),
conda:
"../envs/fgbio.yaml"
log:
"logs/merge-fastqs/untrimmed/{sample}_{read}.log",
wildcard_constraints:
Expand All @@ -28,20 +30,43 @@ rule merge_untrimmed_fastqs:
"cat {input} > {output} 2> {log}"


# Sort the merged, untrimmed FASTQ of one sample/read with `fgbio SortFastq`.
# The sorted file is the one get_umi_fastq returns as UMI input for
# annotate_umis, which is invoked with `--sorted=true`.
rule sort_untrimmed_fastqs:
input:
# Output of merge_untrimmed_fastqs.
"results/untrimmed/{sample}_{read}.fastq.gz",
output:
# Declared temp(): only an intermediate for UMI annotation.
temp("results/untrimmed/{sample}_{read}.sorted.fastq.gz"),
conda:
"../envs/fgbio.yaml"
log:
"logs/fgbio/sort_fastq/{sample}_{read}.log",
shell:
# Only stderr is redirected into the log file.
"fgbio SortFastq -i {input} -o {output} 2> {log}"


rule annotate_umis:
input:
bam="results/mapped/{aligner}/{sample}.bam",
umi=get_umi_fastq,
output:
temp("results/mapped/{aligner}/{sample}.annotated.bam"),
pipe("pipe/{aligner}/{sample}.annotated.bam"),
params:
extra=get_umi_read_structure,
resources:
mem_mb=lambda wc, input: 2.5 * input.size_mb,
extra=get_annotate_umis_params,
log:
"logs/fgbio/annotate_bam/{aligner}/{sample}.log",
wrapper:
"v2.3.2/bio/fgbio/annotatebamwithumis"
"v3.7.0/bio/fgbio/annotatebamwithumis"


# Sort the UMI-annotated BAM that annotate_umis streams through a pipe and
# write it to the regular results location as a temp() file.
rule sort_annotated_reads:
input:
# Same path annotate_umis declares as its pipe() output.
"pipe/{aligner}/{sample}.annotated.bam",
output:
temp("results/mapped/{aligner}/{sample}.annotated.bam"),
log:
"logs/samtools_sort/{aligner}_{sample}.log",
threads: 8
wrapper:
# NOTE(review): no params are passed, so the wrapper's default sort
# settings apply -- presumably coordinate order; confirm against the
# wrapper documentation.
"v3.7.0/bio/samtools/sort"


rule mark_duplicates:
Expand Down

0 comments on commit 71611c8

Please sign in to comment.