From 4fb85995405488a5ef4a0f99296bbf9bc391abc8 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 15:05:55 +0200 Subject: [PATCH 01/25] fix: code polishing --- workflows/sarek.nf | 68 ++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 993c78a3fd..fbe109b92c 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -62,7 +62,7 @@ else { input_sample = extract_csv(csv_file) -save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false +def save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false // Save AWS IGenomes file containing annotation version def anno_readme = params.genomes[ params.genome ]?.readme @@ -141,7 +141,7 @@ read_structure2 = params.read_structure2 ?: Channel.empty() /* ======================================================================================== - INCLUDE LOCAL SUBWORKFLOWS + INCLUDE SUBWORKFLOWS ======================================================================================== */ @@ -159,6 +159,7 @@ include { BUILD_INDICES } from '../subworkflows/local/build_indices' addParams( tabix_known_indels_options: modules['tabix_known_indels'], tabix_pon_options: modules['tabix_pon'] ) + include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( bwamem1_mem_options: modules['bwa_mem1_mem'], bwamem1_mem_tumor_options: modules['bwa_mem1_mem_tumor'], @@ -197,11 +198,11 @@ include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recali include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_variant_calling' addParams( - concat_gvcf_options: modules['concat_gvcf'], - concat_haplotypecaller_options: modules['concat_haplotypecaller'], - genotypegvcf_options: modules['genotypegvcf'], - haplotypecaller_options: modules['haplotypecaller'], - strelka_options: modules['strelka_germline'] + concat_gvcf_options: modules['concat_gvcf'], + concat_haplotypecaller_options: modules['concat_haplotypecaller'], + genotypegvcf_options: modules['genotypegvcf'], + haplotypecaller_options: modules['haplotypecaller'], + strelka_options: modules['strelka_germline'] ) // // include { TUMOR_VARIANT_CALLING } from '../subworkflows/local/tumor_variant_calling' addParams( // // ) @@ -214,15 +215,20 @@ include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_varian // ) include { ANNOTATE } from '../subworkflows/local/annotate' addParams( - annotation_cache: params.annotation_cache, - bgziptabix_merge_vep_options: modules['bgziptabix_merge_vep'], - bgziptabix_snpeff_options: modules['bgziptabix_snpeff'], - bgziptabix_vep_options: modules['bgziptabix_vep'], - merge_vep_options: modules['merge_vep'], - snpeff_options: modules['snpeff'], - snpeff_tag: "${modules['snpeff'].tag_base}.${params.genome}", - vep_options: modules['ensemblvep'], - vep_tag: "${modules['ensemblvep'].tag_base}.${params.genome}" + annotation_cache: params.annotation_cache, + bgziptabix_merge_vep_options: modules['bgziptabix_merge_vep'], + bgziptabix_snpeff_options: modules['bgziptabix_snpeff'], + bgziptabix_vep_options: modules['bgziptabix_vep'], + merge_vep_options: modules['merge_vep'], + snpeff_options: modules['snpeff'], + snpeff_tag: "${modules['snpeff'].tag_base}.${params.genome}", + vep_options: modules['ensemblvep'], + vep_tag: "${modules['ensemblvep'].tag_base}.${params.genome}" +) + +include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' addParams( + fastqc_options: modules['fastqc'], + trimgalore_options: modules['trimgalore'] ) /* @@ -231,24 +237,12 @@ include { ANNOTATE } from '../subworkflows/local/annotate' addParams( ======================================================================================== */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() -def multiqc_options = modules['multiqc'] -multiqc_options.args += params.multiqc_title ? Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : '' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams( options: multiqc_options ) -include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) - -/* -======================================================================================== - INCLUDE NF-CORE SUBWORKFLOWS -======================================================================================== -*/ - -include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' addParams( - fastqc_options: modules['fastqc'], - trimgalore_options: modules['trimgalore'] -) +modules['multiqc'].args += params.multiqc_title ? Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : '' +include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams( options: modules['multiqc'] ) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) def multiqc_report = [] @@ -271,9 +265,7 @@ workflow SAREK { intervals = BUILD_INDICES.out.intervals num_intervals = 0 - intervals.count().map{ - num_intervals = it - } + intervals.count().map{ num_intervals = it } bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict @@ -285,8 +277,8 @@ workflow SAREK { pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi) : BUILD_INDICES.out.pon_tbi : [] //TODO @Rike, is this working for you? - known_sites = dbsnp ? dbsnp.concat(known_indels).collect() : ch_dummy_file - known_sites_tbi = dbsnp_tbi ? dbsnp_tbi.concat(known_indels_tbi).collect() : ch_dummy_file + known_sites = dbsnp.concat(known_indels).collect() + known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi @@ -490,7 +482,7 @@ workflow SAREK { ch_workflow_summary = Channel.value(workflow_summary) ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) + ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_config) ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(GET_SOFTWARE_VERSIONS.out.yaml.collect()) From 8fc3d3ec0b68e8f0258be0d0fc4b58169323ee98 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 15:29:32 +0200 Subject: [PATCH 02/25] fix: more code polish --- subworkflows/local/build_indices.nf | 30 ++++++++----- workflows/sarek.nf | 70 +++++++++++++---------------- 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/subworkflows/local/build_indices.nf b/subworkflows/local/build_indices.nf index 2063e6a01f..e4aa023287 100644 --- a/subworkflows/local/build_indices.nf +++ b/subworkflows/local/build_indices.nf @@ -136,15 +136,23 @@ workflow BUILD_INDICES { } emit: - bwa = result_bwa - bwa_version = version_bwa - dbsnp_tbi = result_dbsnp_tbi - dict = result_dict - fai = result_fai - germline_resource_tbi = result_germline_resource_tbi - intervals = result_intervals - known_indels_tbi = result_known_indels_tbi - msisensorpro_scan = result_msisensorpro_scan - pon_tbi = result_pon_tbi - target_bed_gz_tbi = result_target_bed + bwa = result_bwa + bwa_version = version_bwa + dbsnp_tbi = result_dbsnp_tbi + dbsnp_tbi_version = version_dbsnp_tbi + dict = result_dict + dict_version = version_dict + fai = result_fai + fai_version = version_fai + germline_resource_tbi = result_germline_resource_tbi + germline_resource_tbi_version = version_germline_resource_tbi + intervals = result_intervals + known_indels_tbi = result_known_indels_tbi.collect() + known_indels_tbi_version = version_known_indels_tbi + msisensorpro_scan = result_msisensorpro_scan + msisensorpro_scan_version = version_msisensorpro_scan + pon_tbi = result_pon_tbi + pon_tbi_version = version_pon_tbi + target_bed_gz_tbi = result_target_bed + target_bed_version = version_target_bed } diff --git a/workflows/sarek.nf b/workflows/sarek.nf index fbe109b92c..2540a0c9f2 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -104,40 +104,35 @@ if (params.skip_markduplicates) { } // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : ch_dummy_file -chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : ch_dummy_file -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : ch_dummy_file -dbsnp_tbi = params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : ch_dummy_file -fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : ch_dummy_file -fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : ch_dummy_file -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : ch_dummy_file -germline_resource_tbi = params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : ch_dummy_file -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : ch_dummy_file -known_indels_tbi = params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : ch_dummy_file -loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : ch_dummy_file -loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : ch_dummy_file -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : ch_dummy_file +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : ch_dummy_file +chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : ch_dummy_file +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : ch_dummy_file +fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : ch_dummy_file +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : ch_dummy_file +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : ch_dummy_file +loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : ch_dummy_file +loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : ch_dummy_file +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : ch_dummy_file // Initialize value channels based on params, defined in the params.genomes[params.genome] scope -snpeff_db = params.snpeff_db ?: Channel.empty() -vep_cache_version = params.vep_cache_version ?: Channel.empty() -vep_genome = params.vep_genome ?: Channel.empty() -vep_species = params.vep_species ?: Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : ch_dummy_file -cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : ch_dummy_file -cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : ch_dummy_file -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : ch_dummy_file -pon = params.pon ? Channel.fromPath(params.pon).collect() : ch_dummy_file -pon_tbi = params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : ch_dummy_file -snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : ch_dummy_file -target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : ch_dummy_file -vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : ch_dummy_file +cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : ch_dummy_file +cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : ch_dummy_file +cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : ch_dummy_file +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : ch_dummy_file +pon = params.pon ? Channel.fromPath(params.pon).collect() : ch_dummy_file +snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : ch_dummy_file +target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : ch_dummy_file +vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : ch_dummy_file // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope -read_structure1 = params.read_structure1 ?: Channel.empty() -read_structure2 = params.read_structure2 ?: Channel.empty() +read_structure1 = params.read_structure1 ?: Channel.empty() +read_structure2 = params.read_structure2 ?: Channel.empty() /* ======================================================================================== @@ -254,7 +249,7 @@ workflow SAREK { BUILD_INDICES( dbsnp, fasta, - fasta_fai, + params.fasta_fai, germline_resource, known_indels, pon, @@ -267,14 +262,13 @@ workflow SAREK { num_intervals = 0 intervals.count().map{ num_intervals = it } - bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa - dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict - fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai - - dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi) : BUILD_INDICES.out.dbsnp_tbi : [] - germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi) : BUILD_INDICES.out.germline_resource_tbi : [] - known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi) : BUILD_INDICES.out.known_indels_tbi.collect() : [] - pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi) : BUILD_INDICES.out.pon_tbi : [] + bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : BUILD_INDICES.out.bwa : ch_dummy_file + dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : BUILD_INDICES.out.dict : ch_dummy_file + fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : BUILD_INDICES.out.fai : ch_dummy_file + dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : BUILD_INDICES.out.dbsnp_tbi : ch_dummy_file + germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : BUILD_INDICES.out.germline_resource_tbi : ch_dummy_file + known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : BUILD_INDICES.out.known_indels_tbi : ch_dummy_file + pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : BUILD_INDICES.out.pon_tbi : ch_dummy_file //TODO @Rike, is this working for you? known_sites = dbsnp.concat(known_indels).collect() @@ -407,7 +401,7 @@ workflow SAREK { tools, cram_variant_calling, dbsnp, - dbsnp_tbi.collect(), + dbsnp_tbi, dict, fai, fasta, From 51bf9948c23c837e7ac683c50546aa5f11c5a528 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 17:01:55 +0200 Subject: [PATCH 03/25] feat: get some versions --- workflows/sarek.nf | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 2540a0c9f2..4739960ff6 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -257,11 +257,6 @@ workflow SAREK { tools, step) - intervals = BUILD_INDICES.out.intervals - - num_intervals = 0 - intervals.count().map{ num_intervals = it } - bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : BUILD_INDICES.out.bwa : ch_dummy_file dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : BUILD_INDICES.out.dict : ch_dummy_file fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : BUILD_INDICES.out.fai : ch_dummy_file @@ -274,9 +269,23 @@ workflow SAREK { known_sites = dbsnp.concat(known_indels).collect() known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + intervals = BUILD_INDICES.out.intervals + num_intervals = 0 + intervals.count().map{ num_intervals = it } + msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi + bwa_version = BUILD_INDICES.out.bwa_version + dbsnp_tbi_version = BUILD_INDICES.out.dbsnp_tbi_version + dict_version = BUILD_INDICES.out.dict_version + fai_version = BUILD_INDICES.out.fai_version + germline_resource_tbi_version = BUILD_INDICES.out.germline_resource_tbi_version + known_indels_tbi_version = BUILD_INDICES.out.known_indels_tbi_version + msisensorpro_scan_version = BUILD_INDICES.out.msisensorpro_scan_version + pon_tbi_version = BUILD_INDICES.out.pon_tbi_version + target_bed_version = BUILD_INDICES.out.target_bed_version + // PREPROCESSING bam_mapped = Channel.empty() @@ -321,10 +330,8 @@ workflow SAREK { } if (step == 'preparerecalibration') { - bam_mapped = Channel.empty() - if (params.skip_markduplicates) { - bam_indexed = input_sample - } else cram_markduplicates = input_sample + if (params.skip_markduplicates) bam_indexed = input_sample + else cram_markduplicates = input_sample } if (step in ['mapping', 'preparerecalibration']) { @@ -333,10 +340,12 @@ workflow SAREK { bam_indexed, ('markduplicates' in params.use_gatk_spark), !('markduplicates' in params.skip_qc), - fasta, fai, dict, + fasta, + fai, + dict, params.skip_markduplicates, - 'bamqc' in params.skip_qc, - 'samtools' in params.skip_qc, + ('bamqc' in params.skip_qc), + ('samtools' in params.skip_qc), target_bed) cram_markduplicates = QC_MARKDUPLICATES.out.cram From bb2be350a023dfe69fd2a6ed8bd7bdc42866057f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 17:13:17 +0200 Subject: [PATCH 04/25] feat: actually uses the versions collected --- workflows/sarek.nf | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 4739960ff6..9bb701462e 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -276,15 +276,12 @@ workflow SAREK { msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi - bwa_version = BUILD_INDICES.out.bwa_version - dbsnp_tbi_version = BUILD_INDICES.out.dbsnp_tbi_version - dict_version = BUILD_INDICES.out.dict_version - fai_version = BUILD_INDICES.out.fai_version - germline_resource_tbi_version = BUILD_INDICES.out.germline_resource_tbi_version - known_indels_tbi_version = BUILD_INDICES.out.known_indels_tbi_version - msisensorpro_scan_version = BUILD_INDICES.out.msisensorpro_scan_version - pon_tbi_version = BUILD_INDICES.out.pon_tbi_version - target_bed_version = BUILD_INDICES.out.target_bed_version + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.bwa_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.dict_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.fai_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.msisensorpro_scan_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.target_bed_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.dbsnp_tbi_version.mix(BUILD_INDICES.out.germline_resource_tbi_version, BUILD_INDICES.out.known_indels_tbi_version, BUILD_INDICES.out.pon_tbi_version).first().ifEmpty(null)) // PREPROCESSING From 1aa84e2d819ed0e84e57b25d33527a7b4e002607 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 17:45:15 +0200 Subject: [PATCH 05/25] feat: use some qc reports --- workflows/sarek.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 9bb701462e..0686087260 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -488,6 +488,8 @@ workflow SAREK { ch_multiqc_files = ch_multiqc_files.mix(GET_SOFTWARE_VERSIONS.out.yaml.collect()) if (step == 'mapping') ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(qc_reports) + MULTIQC(ch_multiqc_files.collect()) multiqc_report = MULTIQC.out.report.toList() ch_software_versions = ch_software_versions.mix(MULTIQC.out.version.ifEmpty(null)) From f0a155ccb780188525e438b37ada3439d3ae96dd Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 19:36:12 +0200 Subject: [PATCH 06/25] fix: use qc reports for multiqc --- conf/modules.config | 2 ++ subworkflows/nf-core/fastqc_trimgalore.nf | 10 ++++----- subworkflows/nf-core/qc_markduplicates.nf | 2 +- subworkflows/nf-core/recalibrate.nf | 2 +- workflows/sarek.nf | 26 +++++++++++------------ 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5c21954be7..933c933eb7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -133,6 +133,7 @@ params { args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' publish_by_meta = true publish_dir = 'reports/qualimap' + suffix = '.mapped' } 'samtools_stats_mapping' { publish_by_meta = true @@ -184,6 +185,7 @@ params { args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' publish_by_meta = true publish_dir = 'reports/qualimap' + suffix = '.recal' } 'samtools_index_recalibrate' { suffix = 'recal' diff --git a/subworkflows/nf-core/fastqc_trimgalore.nf b/subworkflows/nf-core/fastqc_trimgalore.nf index c067ba25c6..321b946486 100644 --- a/subworkflows/nf-core/fastqc_trimgalore.nf +++ b/subworkflows/nf-core/fastqc_trimgalore.nf @@ -1,12 +1,12 @@ -/* - * Read QC and trimming - */ +// +// Read QC and trimming +// params.fastqc_options = [:] params.trimgalore_options = [:] -include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' addParams( options: params.fastqc_options ) -include { TRIMGALORE } from '../../modules/nf-core/modules/trimgalore/main' addParams( options: params.trimgalore_options ) +include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' addParams( options: params.fastqc_options ) +include { TRIMGALORE } from '../../modules/nf-core/modules/trimgalore/main' addParams( options: params.trimgalore_options ) workflow FASTQC_TRIMGALORE { take: diff --git a/subworkflows/nf-core/qc_markduplicates.nf b/subworkflows/nf-core/qc_markduplicates.nf index 46e8dd7abe..8767a6189b 100644 --- a/subworkflows/nf-core/qc_markduplicates.nf +++ b/subworkflows/nf-core/qc_markduplicates.nf @@ -82,7 +82,7 @@ workflow QC_MARKDUPLICATES { if (!skip_bamqc && !skip_markduplicates) { //TODO: after adding CI tests, allow bamqc on mapped bams if no duplicate marking is done QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed) - qualimap_bamqc = QUALIMAP_BAMQC.out + qualimap_bamqc = QUALIMAP_BAMQC.out.results } samtools_stats = Channel.empty() diff --git a/subworkflows/nf-core/recalibrate.nf b/subworkflows/nf-core/recalibrate.nf index c672651384..4d0b514d00 100644 --- a/subworkflows/nf-core/recalibrate.nf +++ b/subworkflows/nf-core/recalibrate.nf @@ -71,7 +71,7 @@ workflow RECALIBRATE { if (!skip_bamqc) { QUALIMAP_BAMQC_CRAM(cram_recalibrated_index,target_bed, params.target_bed,fasta, fai) - qualimap_bamqc = QUALIMAP_BAMQC_CRAM.out + qualimap_bamqc = QUALIMAP_BAMQC_CRAM.out.results } if (!skip_samtools) { diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 0686087260..faf70c5537 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -304,15 +304,15 @@ workflow SAREK { reads_input = FASTQC_TRIMGALORE.out.reads - qc_reports = qc_reports.mix( - FASTQC_TRIMGALORE.out.fastqc_html, - FASTQC_TRIMGALORE.out.fastqc_zip, - FASTQC_TRIMGALORE.out.trim_html, - FASTQC_TRIMGALORE.out.trim_log, - FASTQC_TRIMGALORE.out.trim_zip) + qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_html.collect{it[1]}.ifEmpty([])) + qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) + qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_html.collect{it[1]}.ifEmpty([])) + qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) + qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) // STEP 1: MAPPING READS TO REFERENCE GENOME - MAPPING(params.aligner, + MAPPING( + params.aligner, bwa, fai, fasta, @@ -333,7 +333,8 @@ workflow SAREK { if (step in ['mapping', 'preparerecalibration']) { // STEP 2: MARKING DUPLICATES AND/OR QC, conversion to CRAM - QC_MARKDUPLICATES(bam_mapped, + QC_MARKDUPLICATES( + bam_mapped, bam_indexed, ('markduplicates' in params.use_gatk_spark), !('markduplicates' in params.skip_qc), @@ -350,10 +351,11 @@ workflow SAREK { // Create CSV to restart from this step MARKDUPLICATES_CSV(cram_markduplicates) - qc_reports = qc_reports.mix(QC_MARKDUPLICATES.out.qc) + qc_reports = qc_reports.mix(QC_MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) // STEP 3: CREATING RECALIBRATION TABLES - PREPARE_RECALIBRATION(cram_markduplicates, + PREPARE_RECALIBRATION( + cram_markduplicates, ('bqsr' in params.use_gatk_spark), dict, fai, @@ -391,7 +393,7 @@ workflow SAREK { RECALIBRATE_CSV(cram_recalibrated) - qc_reports = qc_reports.mix(cram_recalibrated_qc) + qc_reports = qc_reports.mix(cram_recalibrated_qc.collect{it[1]}.ifEmpty([])) cram_variant_calling = cram_recalibrated } @@ -486,8 +488,6 @@ workflow SAREK { ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(GET_SOFTWARE_VERSIONS.out.yaml.collect()) - if (step == 'mapping') ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(qc_reports) MULTIQC(ch_multiqc_files.collect()) From 3bc9e8626bff0acb6105dab2b0c213a4bbbc1c51 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 20:38:34 +0200 Subject: [PATCH 07/25] fix: qualimap output --- tests/test_aligner.yml | 2 +- tests/test_default.yml | 2 +- tests/test_gatk_spark.yml | 2 +- tests/test_pair.yml | 4 ++-- tests/test_skip_markduplicates.yml | 4 ++-- tests/test_targeted.yml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index cd2de82960..02f389d7eb 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -22,6 +22,6 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_default.yml b/tests/test_default.yml index 1ecd249447..e9e99d985a 100644 --- a/tests/test_default.yml +++ b/tests/test_default.yml @@ -22,6 +22,6 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_gatk_spark.yml b/tests/test_gatk_spark.yml index 2bc096694b..71017ccee6 100644 --- a/tests/test_gatk_spark.yml +++ b/tests/test_gatk_spark.yml @@ -23,6 +23,6 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_pair.yml b/tests/test_pair.yml index a670c5ccdd..f728c85db2 100644 --- a/tests/test_pair.yml +++ b/tests/test_pair.yml @@ -34,8 +34,8 @@ - path: results/reports/fastqc/9876T-9876T_M2 - path: results/reports/fastqc/9876T-9876T_M4 - path: results/reports/fastqc/9876T-9876T_M5 - - path: results/reports/qualimap/1234N - - path: results/reports/qualimap/9876T + - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/9876T.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - path: results/reports/samtools_stats/9876T/9876T.md.cram.stats diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index a07d2a3d42..a4e4d2112d 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -22,7 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - name: Run default pipeline while skipping MarkDuplicates, starting with prepare_recalibration @@ -45,6 +45,6 @@ - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_1234N.csv - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_targeted.yml b/tests/test_targeted.yml index 70d1861152..1fad5e12f2 100644 --- a/tests/test_targeted.yml +++ b/tests/test_targeted.yml @@ -22,6 +22,6 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats From 7684001ad8dbfda60af22425154c953f48709651 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 20:54:56 +0200 Subject: [PATCH 08/25] fix: output for qualimap --- tests/test_aligner.yml | 3 ++- tests/test_default.yml | 3 ++- tests/test_gatk_spark.yml | 3 ++- tests/test_pair.yml | 6 ++++-- tests/test_skip_markduplicates.yml | 6 ++++-- tests/test_targeted.yml | 3 ++- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index 02f389d7eb..92fa1bb3e2 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -22,6 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_default.yml b/tests/test_default.yml index e9e99d985a..536d136be5 100644 --- a/tests/test_default.yml +++ b/tests/test_default.yml @@ -22,6 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_gatk_spark.yml b/tests/test_gatk_spark.yml index 71017ccee6..50a7c6518f 100644 --- a/tests/test_gatk_spark.yml +++ b/tests/test_gatk_spark.yml @@ -23,6 +23,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_pair.yml b/tests/test_pair.yml index f728c85db2..90913b0cc8 100644 --- a/tests/test_pair.yml +++ b/tests/test_pair.yml @@ -34,8 +34,10 @@ - path: results/reports/fastqc/9876T-9876T_M2 - path: results/reports/fastqc/9876T-9876T_M4 - path: results/reports/fastqc/9876T-9876T_M5 - - path: results/reports/qualimap/1234N.recal - - path: results/reports/qualimap/9876T.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal + - path: results/reports/qualimap/9876T/9876T.mapped + - path: results/reports/qualimap/9876T/9876T.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - path: results/reports/samtools_stats/9876T/9876T.md.cram.stats diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index a4e4d2112d..c0fb6d0d66 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -22,7 +22,8 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - name: Run default pipeline while skipping MarkDuplicates, starting with prepare_recalibration @@ -45,6 +46,7 @@ - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_1234N.csv - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_targeted.yml b/tests/test_targeted.yml index 1fad5e12f2..635b4c4024 100644 --- a/tests/test_targeted.yml +++ b/tests/test_targeted.yml @@ -22,6 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N.recal + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats From 773fcfda0b75ee98dab6feef6c279cb3335cf760 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 21:08:07 +0200 Subject: [PATCH 09/25] fix: qualimap output for skip_marduplicates --- tests/test_skip_markduplicates.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index c0fb6d0d66..1ba3ab179a 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -22,7 +22,6 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N/1234N.mapped - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats @@ -46,7 +45,6 @@ - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_1234N.csv - - path: results/reports/qualimap/1234N/1234N.mapped - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats From ee4514d3767539c2b51d89f5574983dadc5f380b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 21:57:46 +0200 Subject: [PATCH 10/25] fix: code polish and more complete QC reports in CI --- conf/modules.config | 48 +++++++++++------------ subworkflows/nf-core/qc_markduplicates.nf | 19 +++++---- tests/test_skip_markduplicates.yml | 5 +++ workflows/sarek.nf | 7 +++- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 933c933eb7..d940a22618 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,16 +43,16 @@ params { publish_dir = 'reference' publish_files = false } - 'bgziptabix_target_bed' { - suffix = '.bed' + 'msisensorpro_scan' { publish_dir = 'reference' publish_files = false } - 'msisensorpro_scan' { + 'samtools_faidx' { publish_dir = 'reference' publish_files = false } - 'samtools_faidx' { + 'bgziptabix_target_bed' { + suffix = '.bed' publish_dir = 'reference' publish_files = false } @@ -73,26 +73,22 @@ params { publish_files = false } // MAPPING - 'seqkit_split2' { - args = "--by-size ${params.split_fastq}" - publish_files = false - } - 'bwa_mem1_mem' { + 'bwa_mem2_mem' { args = '-K 100000000 -M' args2 = 'sort' publish_files = false } - 'bwa_mem1_mem_tumor' { + 'bwa_mem2_mem_tumor' { args = '-K 100000000 -M -B 3' args2 = 'sort' publish_files = false } - 'bwa_mem2_mem' { + 'bwa_mem1_mem' { args = '-K 100000000 -M' args2 = 'sort' publish_files = false } - 'bwa_mem2_mem_tumor' { + 'bwa_mem1_mem_tumor' { args = '-K 100000000 -M -B 3' args2 = 'sort' publish_files = false @@ -102,32 +98,36 @@ params { publish_files = ['bai':'mapped'] publish_dir = 'preprocessing' } - // MARKDUPLICATES - 'markduplicates' { - args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT' - suffix = '.md' + 'merge_bam_mapping' { publish_by_meta = true + publish_files = ['bam':'mapped'] publish_dir = 'preprocessing' + } + 'seqkit_split2' { + args = "--by-size ${params.split_fastq}" publish_files = false } - 'markduplicatesspark' { - args = '--remove-sequencing-duplicates false -VS LENIENT' + // MARKDUPLICATES + 'estimatelibrarycomplexity' { + args = '' suffix = '.md' publish_by_meta = true publish_dir = 'preprocessing' - publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates'] + publish_files = ['metrics': 'markduplicates'] } - 'estimatelibrarycomplexity' { - args = '' + 'markduplicates' { + args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT' suffix = '.md' publish_by_meta = true publish_dir = 'preprocessing' - publish_files = ['metrics': 'markduplicates'] + publish_files = false } - 'merge_bam_mapping' { + 'markduplicatesspark' { + args = '--remove-sequencing-duplicates false -VS LENIENT' + suffix = '.md' publish_by_meta = true - publish_files = ['bam':'mapped'] publish_dir = 'preprocessing' + publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates'] } 'qualimap_bamqc_mapping' { args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' diff --git a/subworkflows/nf-core/qc_markduplicates.nf b/subworkflows/nf-core/qc_markduplicates.nf index 8767a6189b..92d2887a0c 100644 --- a/subworkflows/nf-core/qc_markduplicates.nf +++ b/subworkflows/nf-core/qc_markduplicates.nf @@ -13,14 +13,14 @@ params.samtools_stats_options = [:] params.samtools_view_options = [:] params.samtools_index_options = [:] +include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options) include { GATK4_MARKDUPLICATES } from '../../modules/local/gatk4/markduplicates/main' addParams(options: params.markduplicates_options) include { GATK4_MARKDUPLICATES_SPARK } from '../../modules/local/gatk4/markduplicatesspark/main' addParams(options: params.markduplicatesspark_options) -include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options) include { QUALIMAP_BAMQC } from '../../modules/local/qualimap/bamqc/main' addParams(options: params.qualimap_bamqc_options) +include { SAMTOOLS_INDEX } from '../../modules/local/samtools/index/main' addParams(options: params.samtools_index_options) include { SAMTOOLS_STATS } from '../../modules/local/samtools/stats/main' addParams(options: params.samtools_stats_options) include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM_SPARK } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) -include { SAMTOOLS_INDEX } from '../../modules/local/samtools/index/main' addParams(options: params.samtools_index_options) workflow QC_MARKDUPLICATES { take: @@ -76,21 +76,20 @@ workflow QC_MARKDUPLICATES { } //If skip_markduplicates then QC tools are run on mapped bams, - //if !skip_markduplicates, then QC tools are run on duplicate marked bams + //if !skip_markduplicates, then QC tools are run on duplicate marked crams //After bamqc finishes, convert to cram for further analysis - qualimap_bamqc = Channel.empty() - if (!skip_bamqc && !skip_markduplicates) { - //TODO: after adding CI tests, allow bamqc on mapped bams if no duplicate marking is done - QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed) - qualimap_bamqc = QUALIMAP_BAMQC.out.results - } - samtools_stats = Channel.empty() if (!skip_samtools) { SAMTOOLS_STATS(cram_markduplicates, fasta) samtools_stats = SAMTOOLS_STATS.out.stats } + qualimap_bamqc = Channel.empty() + if (!skip_bamqc) { + QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed) + qualimap_bamqc = QUALIMAP_BAMQC.out.results + } + qc_reports = samtools_stats.mix(qualimap_bamqc) qc_reports = report_markduplicates.mix(qc_reports) diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index 1ba3ab179a..6ff01a16c4 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -6,6 +6,9 @@ - skip_markduplicates files: - path: results/multiqc + - path: results/preprocessing/1234N/mapped/1234N.bam + - path: results/preprocessing/1234N/mapped/1234N.bai + - path: results/preprocessing/1234N/mapped/1234N.recal.table - path: results/preprocessing/1234N/markduplicates/1234N.md.cram - path: results/preprocessing/1234N/markduplicates/1234N.md.cram.crai - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram @@ -22,6 +25,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 + - path: results/reports/qualimap/1234N/1234N.mapped - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats @@ -45,6 +49,7 @@ - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_1234N.csv + - path: results/reports/qualimap/1234N/1234N.mapped - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/workflows/sarek.nf b/workflows/sarek.nf index faf70c5537..fc90097408 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -39,7 +39,7 @@ def checkPathParamList = [ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// get step and tools +// Get step and tools def step = params.step ? params.step.replaceAll('-', '').replaceAll('_', '') : '' def tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] @@ -160,6 +160,8 @@ include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( bwamem1_mem_tumor_options: modules['bwa_mem1_mem_tumor'], bwamem2_mem_options: modules['bwa_mem2_mem'], bwamem2_mem_tumor_options: modules['bwa_mem2_mem_tumor'], + merge_bam_options: modules['merge_bam_mapping'], + samtools_index_options: modules['samtools_index_mapping'], seqkit_split2_options: modules['seqkit_split2'] ) @@ -418,7 +420,8 @@ workflow SAREK { target_bed, target_bed_gz_tbi) - vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.haplotypecaller_vcf, GERMLINE_VARIANT_CALLING.out.strelka_vcf) + vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.haplotypecaller_vcf) + vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.strelka_vcf) // SOMATIC VARIANT CALLING From fe2bdd6f3f55bfa28012900d79073d4ca245a589 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 22:13:59 +0200 Subject: [PATCH 11/25] fix: CI + code polishing --- conf/modules.config | 12 ++++++------ .../{qc_markduplicates.nf => markduplicates.nf} | 6 +++--- tests/test_skip_markduplicates.yml | 2 +- workflows/sarek.nf | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) rename subworkflows/nf-core/{qc_markduplicates.nf => markduplicates.nf} (99%) diff --git a/conf/modules.config b/conf/modules.config index d940a22618..868ca8698f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -73,17 +73,12 @@ params { publish_files = false } // MAPPING - 'bwa_mem2_mem' { - args = '-K 100000000 -M' - args2 = 'sort' - publish_files = false - } 'bwa_mem2_mem_tumor' { args = '-K 100000000 -M -B 3' args2 = 'sort' publish_files = false } - 'bwa_mem1_mem' { + 'bwa_mem2_mem' { args = '-K 100000000 -M' args2 = 'sort' publish_files = false @@ -93,6 +88,11 @@ params { args2 = 'sort' publish_files = false } + 'bwa_mem1_mem' { + args = '-K 100000000 -M' + args2 = 'sort' + publish_files = false + } 'samtools_index_mapping' { publish_by_meta = true publish_files = ['bai':'mapped'] diff --git a/subworkflows/nf-core/qc_markduplicates.nf b/subworkflows/nf-core/markduplicates.nf similarity index 99% rename from subworkflows/nf-core/qc_markduplicates.nf rename to subworkflows/nf-core/markduplicates.nf index 92d2887a0c..d6488e174e 100644 --- a/subworkflows/nf-core/qc_markduplicates.nf +++ b/subworkflows/nf-core/markduplicates.nf @@ -4,14 +4,14 @@ ======================================================================================== */ +params.estimatelibrarycomplexity_options = [:] params.markduplicates_options = [:] params.markduplicatesspark_options = [:] -params.estimatelibrarycomplexity_options = [:] params.merge_bam_options = [:] params.qualimap_bamqc_options = [:] +params.samtools_index_options = [:] params.samtools_stats_options = [:] params.samtools_view_options = [:] -params.samtools_index_options = [:] include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options) include { GATK4_MARKDUPLICATES } from '../../modules/local/gatk4/markduplicates/main' addParams(options: params.markduplicates_options) @@ -22,7 +22,7 @@ include { SAMTOOLS_STATS } from '../../modules/loca include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM_SPARK } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) -workflow QC_MARKDUPLICATES { +workflow MARKDUPLICATES { take: bam_mapped // channel: [mandatory] meta, bam bam_indexed // channel: [mandatory] meta, bam, bai diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index 6ff01a16c4..95bfb00eb4 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -7,7 +7,7 @@ files: - path: results/multiqc - path: results/preprocessing/1234N/mapped/1234N.bam - - path: results/preprocessing/1234N/mapped/1234N.bai + - path: results/preprocessing/1234N/mapped/1234N.bam.bai - path: results/preprocessing/1234N/mapped/1234N.recal.table - path: results/preprocessing/1234N/markduplicates/1234N.md.cram - path: results/preprocessing/1234N/markduplicates/1234N.md.cram.crai diff --git a/workflows/sarek.nf b/workflows/sarek.nf index fc90097408..199ee0e2dc 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -165,7 +165,7 @@ include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( seqkit_split2_options: modules['seqkit_split2'] ) -include { QC_MARKDUPLICATES } from '../subworkflows/nf-core/qc_markduplicates' addParams( +include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' addParams( estimatelibrarycomplexity_options: modules['estimatelibrarycomplexity'], markduplicates_options: modules['markduplicates'], markduplicatesspark_options: modules['markduplicatesspark'], @@ -335,7 +335,7 @@ workflow SAREK { if (step in ['mapping', 'preparerecalibration']) { // STEP 2: MARKING DUPLICATES AND/OR QC, conversion to CRAM - QC_MARKDUPLICATES( + MARKDUPLICATES( bam_mapped, bam_indexed, ('markduplicates' in params.use_gatk_spark), @@ -348,12 +348,12 @@ workflow SAREK { ('samtools' in params.skip_qc), target_bed) - cram_markduplicates = QC_MARKDUPLICATES.out.cram + cram_markduplicates = MARKDUPLICATES.out.cram // Create CSV to restart from this step MARKDUPLICATES_CSV(cram_markduplicates) - qc_reports = qc_reports.mix(QC_MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) + qc_reports = qc_reports.mix(MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) // STEP 3: CREATING RECALIBRATION TABLES PREPARE_RECALIBRATION( From f236ad64aca4cde3b78ea17bec4dbdc5eead9bc2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 22:57:04 +0200 Subject: [PATCH 12/25] fix: QC/version channels --- subworkflows/nf-core/fastqc_trimgalore.nf | 10 ++- subworkflows/nf-core/mapping.nf | 75 +++++++++++-------- subworkflows/nf-core/prepare_recalibration.nf | 11 +-- workflows/sarek.nf | 5 +- 4 files changed, 60 insertions(+), 41 deletions(-) diff --git a/subworkflows/nf-core/fastqc_trimgalore.nf b/subworkflows/nf-core/fastqc_trimgalore.nf index 321b946486..12360ab980 100644 --- a/subworkflows/nf-core/fastqc_trimgalore.nf +++ b/subworkflows/nf-core/fastqc_trimgalore.nf @@ -19,9 +19,10 @@ workflow FASTQC_TRIMGALORE { fastqc_zip = Channel.empty() fastqc_version = Channel.empty() if (!skip_fastqc) { - FASTQC ( reads ).html.set { fastqc_html } + FASTQC(reads) + fastqc_html = FASTQC.out.html fastqc_zip = FASTQC.out.zip - fastqc_version = FASTQC.out.version + fastqc_version = FASTQC.out.version.first() } trim_reads = reads @@ -30,11 +31,12 @@ workflow FASTQC_TRIMGALORE { trim_log = Channel.empty() trimgalore_version = Channel.empty() if (!skip_trimming) { - TRIMGALORE ( reads ).reads.set { trim_reads } + TRIMGALORE(reads) + trim_reads = TRIMGALORE.out.reads trim_html = TRIMGALORE.out.html trim_zip = TRIMGALORE.out.zip trim_log = TRIMGALORE.out.log - trimgalore_version = TRIMGALORE.out.version + trimgalore_version = TRIMGALORE.out.version.first() } emit: diff --git a/subworkflows/nf-core/mapping.nf b/subworkflows/nf-core/mapping.nf index 44e322446e..42f0c931b5 100644 --- a/subworkflows/nf-core/mapping.nf +++ b/subworkflows/nf-core/mapping.nf @@ -34,58 +34,72 @@ workflow MAPPING { bam_indexed = Channel.empty() if (params.split_fastq > 1) { - reads_input_split = SEQKIT_SPLIT2(reads_input).reads.map{ - key, reads -> - //TODO maybe this can be replaced by a regex to include part_001 etc. - - //sorts list of split fq files by : - //[R1.part_001, R2.part_001, R1.part_002, R2.part_002,R1.part_003, R2.part_003,...] - //TODO: determine whether it is possible to have an uneven number of parts, so remainder: true woud need to be used, I guess this could be possible for unfiltered reads, reads that don't have pairs etc. - return [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}.collate(2)] - }.transpose() - } else { - reads_input_split = reads_input - } + reads_input_split = SEQKIT_SPLIT2(reads_input).reads.map{ key, reads -> + //TODO maybe this can be replaced by a regex to include part_001 etc. + + //sorts list of split fq files by : + //[R1.part_001, R2.part_001, R1.part_002, R2.part_002,R1.part_003, R2.part_003,...] + //TODO: determine whether it is possible to have an uneven number of parts, so remainder: true woud need to be used, I guess this could be possible for unfiltered reads, reads that don't have pairs etc. + [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}.collate(2)] + }.transpose() + } else reads_input_split = reads_input // If meta.status is 1, then sample is tumor // else, (even is no meta.status exist) sample is normal reads_input_split.branch{ - tumor: it[0].status == 1 - normal: true - }.set{ reads_input_status } + tumor: it[0].status == 1 + normal: true + }.set{ reads_input_status } - bam_bwamem1 = Channel.empty() - bam_bwamem2 = Channel.empty() + bam_bwamem = Channel.empty() + bam_bwamem1 = Channel.empty() + bam_bwamem2 = Channel.empty() + tool_versions = Channel.empty() if (aligner == "bwa-mem") { BWAMEM1_MEM(reads_input_status.normal, bwa) - bam_bwamem1_n = BWAMEM1_MEM.out.bam - BWAMEM1_MEM_T(reads_input_status.tumor, bwa) + + bam_bwamem1_n = BWAMEM1_MEM.out.bam bam_bwamem1_t = BWAMEM1_MEM_T.out.bam + bam_bwamem1 = bam_bwamem1_n.mix(bam_bwamem1_t) + + bwamem1_n_version = BWAMEM1_MEM.out.version + bwamem1_t_version = BWAMEM1_MEM_T.out.version + + bwamem1_version = bwamem1_n_version.mix(bwamem1_t_version).first() - bam_bwamem1 = bam_bwamem1_n.mix(bam_bwamem1_t) + tool_versions = tool_versions.mix(bwamem1_version) } else { BWAMEM2_MEM(reads_input_status.normal, bwa) - bam_bwamem2_n = BWAMEM2_MEM.out.bam - BWAMEM2_MEM_T(reads_input_status.tumor, bwa) + + bam_bwamem2_n = BWAMEM2_MEM.out.bam bam_bwamem2_t = BWAMEM2_MEM_T.out.bam + bam_bwamem2 = bam_bwamem2_n.mix(bam_bwamem2_t) - bam_bwamem2 = bam_bwamem2_n.mix(bam_bwamem2_t) + bwamem2_n_version = BWAMEM2_MEM.out.version + bwamem2_t_version = BWAMEM2_MEM_T.out.version + + bwamem2_version = bwamem2_n_version.mix(bwamem2_t_version).first() + tool_versions = tool_versions.mix(bwamem1_version) } - bam_bwa = bam_bwamem1.mix(bam_bwamem2) - bam_bwa.map{ meta, bam -> - meta.remove('read_group') - meta.id = meta.sample + bam_bwamem = bam_bwamem.mix(bam_bwamem1) + bam_bwamem = bam_bwamem.mix(bam_bwamem2) + + bam_bwamem.map{ meta, bam -> + new_meta = meta.clone() + new_meta.remove('read_group') + new_meta.id = meta.sample + // groupKey is to makes sure that the correct group can advance as soon as it is complete // and not stall the workflow until all pieces are mapped def groupKey = groupKey(meta, meta.numLanes * params.split_fastq) tuple(groupKey, bam) - [meta, bam] - }.groupTuple() - .set{bam_mapped} + + [new_meta, bam] + }.groupTuple().set{ bam_mapped } // MarkDuplicates can handle multiple BAMS as input, so no merging/indexing at this step // Except if and only if skipping MarkDuplicates @@ -106,4 +120,5 @@ workflow MAPPING { emit: bam = bam_mapped bam_indexed = bam_indexed + versions = tool_versions } diff --git a/subworkflows/nf-core/prepare_recalibration.nf b/subworkflows/nf-core/prepare_recalibration.nf index 10d0b98b11..f3e35f38a5 100644 --- a/subworkflows/nf-core/prepare_recalibration.nf +++ b/subworkflows/nf-core/prepare_recalibration.nf @@ -4,13 +4,13 @@ ======================================================================================== */ -params.baserecalibrator_options = [:] -params.gatherbqsrreports_options = [:] +params.baserecalibrator_options = [:] +params.gatherbqsrreports_options = [:] params.baserecalibrator_spark_options = [:] -include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../modules/local/gatk4/baserecalibrator/main' addParams(options: params.baserecalibrator_options) -include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../modules/local/gatk4/baserecalibratorspark/main' addParams(options: params.baserecalibrator_spark_options) -include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../modules/local/gatk4/gatherbqsrreports/main' addParams(options: params.gatherbqsrreports_options) +include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../modules/local/gatk4/baserecalibrator/main' addParams(options: params.baserecalibrator_options) +include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../modules/local/gatk4/baserecalibratorspark/main' addParams(options: params.baserecalibrator_spark_options) +include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../modules/local/gatk4/gatherbqsrreports/main' addParams(options: params.gatherbqsrreports_options) workflow PREPARE_RECALIBRATION { take: @@ -26,6 +26,7 @@ workflow PREPARE_RECALIBRATION { no_intervals // value: [mandatory] no_intervals main: + cram_markduplicates.combine(intervals) .map{ meta, cram, crai, intervals -> new_meta = meta.clone() diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 199ee0e2dc..e66b85e8b4 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -312,6 +312,9 @@ workflow SAREK { qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) + ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.fastqc_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.trimgalore_version.ifEmpty(null)) + // STEP 1: MAPPING READS TO REFERENCE GENOME MAPPING( params.aligner, @@ -471,8 +474,6 @@ workflow SAREK { } } - if (step == 'mapping') ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.fastqc_version.first().ifEmpty(null)) - ch_software_versions .map { it -> if (it) [ it.baseName, it ] } .groupTuple() From 518ed464fd61b6daae9f4320006e2d7f6aa66faf Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 4 Aug 2021 23:00:06 +0200 Subject: [PATCH 13/25] fix: typo --- subworkflows/nf-core/mapping.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/nf-core/mapping.nf b/subworkflows/nf-core/mapping.nf index 42f0c931b5..bfaf742741 100644 --- a/subworkflows/nf-core/mapping.nf +++ b/subworkflows/nf-core/mapping.nf @@ -82,7 +82,7 @@ workflow MAPPING { bwamem2_t_version = BWAMEM2_MEM_T.out.version bwamem2_version = bwamem2_n_version.mix(bwamem2_t_version).first() - tool_versions = tool_versions.mix(bwamem1_version) + tool_versions = tool_versions.mix(bwamem2_version) } bam_bwamem = bam_bwamem.mix(bam_bwamem1) From 1d226cefb9042455db9a88575a886b9393d7b613 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 Aug 2021 15:22:15 +0200 Subject: [PATCH 14/25] feat: extreme code polishing --- subworkflows/local/annotate.nf | 8 +- subworkflows/local/build_indices.nf | 55 +++++----- .../local/germline_variant_calling.nf | 8 +- subworkflows/local/input_check.nf | 42 -------- subworkflows/local/mapping_csv.nf | 8 +- subworkflows/local/markduplicates_csv.nf | 8 +- subworkflows/local/pair_variant_calling.nf | 8 +- .../local/prepare_recalibration_csv.nf | 8 +- subworkflows/local/recalibrate_csv.nf | 8 +- subworkflows/nf-core/ensemblvep_annotate.nf | 6 +- subworkflows/nf-core/mapping.nf | 8 +- subworkflows/nf-core/markduplicates.nf | 8 +- subworkflows/nf-core/prepare_recalibration.nf | 8 +- subworkflows/nf-core/recalibrate.nf | 8 +- subworkflows/nf-core/snpeff_annotate.nf | 6 +- workflows/sarek.nf | 102 +++++++++++------- 16 files changed, 127 insertions(+), 172 deletions(-) delete mode 100644 subworkflows/local/input_check.nf diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index c3769d0f4e..2841a277cd 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - ANNOTATION -======================================================================================== -*/ +// +// ANNOTATION +// params.annotation_cache = [:] params.bgziptabix_merge_vep_options = [:] diff --git a/subworkflows/local/build_indices.nf b/subworkflows/local/build_indices.nf index d1e5884f73..67cb855943 100644 --- a/subworkflows/local/build_indices.nf +++ b/subworkflows/local/build_indices.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - BUILDING INDICES -======================================================================================== -*/ +// +// BUILDING INDICES +// params.bgziptabix_options = [:] params.build_intervals_options = [:] @@ -53,23 +51,15 @@ workflow BUILD_INDICES { else (result_bwa, version_bwa) = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() - version_dict = Channel.empty() + version_gatk = Channel.empty() if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) - (result_dict, version_dict) = GATK4_CREATESEQUENCEDICTIONARY(fasta) + (result_dict, version_gatk) = GATK4_CREATESEQUENCEDICTIONARY(fasta) result_fai = Channel.empty() - version_fai = Channel.empty() + version_samtools = Channel.empty() if (fasta_fai) result_fai = fasta_fai if (!(params.fasta_fai) && !('annotate' in step)) - (result_fai, version_fai) = SAMTOOLS_FAIDX(fasta) - - result_dbsnp_tbi = Channel.empty() - version_dbsnp_tbi = Channel.empty() - if (!(params.dbsnp_tbi) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools|| 'mutect2' in tools || 'tnscope' in tools)) { - dbsnp_id = dbsnp.map{ it -> [[id:it[0].baseName], it] } - (result_dbsnp_tbi, version_dbsnp_tbi) = TABIX_DBSNP(dbsnp_id) - result_dbsnp_tbi = result_dbsnp_tbi.map{ meta, tbi -> [tbi] } - } + (result_fai, version_samtools) = SAMTOOLS_FAIDX(fasta) result_target_bed = Channel.empty() version_target_bed = Channel.empty() @@ -79,6 +69,14 @@ workflow BUILD_INDICES { result_target_bed = result_target_bed.map{ meta, bed, tbi -> [bed, tbi] } } + result_dbsnp_tbi = Channel.empty() + version_dbsnp_tbi = Channel.empty() + if (!(params.dbsnp_tbi) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools|| 'mutect2' in tools || 'tnscope' in tools)) { + dbsnp_id = dbsnp.map{ it -> [[id:it[0].baseName], it] } + (result_dbsnp_tbi, version_dbsnp_tbi) = TABIX_DBSNP(dbsnp_id) + result_dbsnp_tbi = result_dbsnp_tbi.map{ meta, tbi -> [tbi] } + } + result_germline_resource_tbi = Channel.empty() version_germline_resource_tbi = Channel.empty() if (!(params.germline_resource_tbi) && params.germline_resource && 'mutect2' in tools) { @@ -94,11 +92,6 @@ workflow BUILD_INDICES { result_known_indels_tbi = result_known_indels_tbi.map{ meta, tbi -> [tbi] } } - result_msisensorpro_scan = Channel.empty() - version_msisensorpro_scan = Channel.empty() - if ('msisensorpro' in tools) - (result_msisensorpro_scan, version_msisensorpro_scan) = MSISENSORPRO_SCAN(fasta) - result_pon_tbi = Channel.empty() version_pon_tbi = Channel.empty() if (!(params.pon_tbi) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) { @@ -106,6 +99,14 @@ workflow BUILD_INDICES { (result_pon_tbi, version_pon_tbi) = TABIX_PON(pon_id) } + version_tabix = Channel.empty() + version_tabix = version_tabix.mix(version_target_bed, version_dbsnp_tbi, version_germline_resource_tbi, version_known_indels_tbi, version_pon_tbi).first() + + result_msisensorpro_scan = Channel.empty() + version_msisensorpro_scan = Channel.empty() + if ('msisensorpro' in tools) + (result_msisensorpro_scan, version_msisensorpro_scan) = MSISENSORPRO_SCAN(fasta) + result_intervals = Channel.empty() if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" @@ -139,20 +140,16 @@ workflow BUILD_INDICES { bwa = result_bwa bwa_version = version_bwa dbsnp_tbi = result_dbsnp_tbi - dbsnp_tbi_version = version_dbsnp_tbi dict = result_dict - dict_version = version_dict fai = result_fai - fai_version = version_fai + gatk_version = version_gatk germline_resource_tbi = result_germline_resource_tbi - germline_resource_tbi_version = version_germline_resource_tbi intervals = result_intervals known_indels_tbi = result_known_indels_tbi.collect() - known_indels_tbi_version = version_known_indels_tbi msisensorpro_scan = result_msisensorpro_scan msisensorpro_scan_version = version_msisensorpro_scan pon_tbi = result_pon_tbi - pon_tbi_version = version_pon_tbi + samtools_version = version_samtools + tabix_version = version_tabix target_bed_gz_tbi = result_target_bed - target_bed_version = version_target_bed } diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index 16a255c5bb..6cc692c924 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - GERMLINE VARIANT CALLING -======================================================================================== -*/ +// +// GERMLINE VARIANT CALLING +// params.haplotypecaller_options = [:] params.deepvariant_options = [:] diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index b664bc8caf..0000000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,42 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -params.options = [:] - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channels(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channels(LinkedHashMap row) { - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - def array = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - array = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return array -} diff --git a/subworkflows/local/mapping_csv.nf b/subworkflows/local/mapping_csv.nf index 8bf8802b55..4bcfb7dbd2 100644 --- a/subworkflows/local/mapping_csv.nf +++ b/subworkflows/local/mapping_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MAPPING_CSV -======================================================================================== -*/ +// +// MAPPING_CSV +// workflow MAPPING_CSV { take: diff --git a/subworkflows/local/markduplicates_csv.nf b/subworkflows/local/markduplicates_csv.nf index 775d646b42..e8aa4214e0 100644 --- a/subworkflows/local/markduplicates_csv.nf +++ b/subworkflows/local/markduplicates_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MARKDUPLICATES_CSV -======================================================================================== -*/ +// +// MARKDUPLICATES_CSV +// workflow MARKDUPLICATES_CSV { take: diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index d32ad17b2f..4dc4ab112f 100644 --- a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - SOMATIC VARIANT CALLING -======================================================================================== -*/ +// +// SOMATIC VARIANT CALLING +// params.manta_options = [:] params.msisensorpro_msi_options = [:] diff --git a/subworkflows/local/prepare_recalibration_csv.nf b/subworkflows/local/prepare_recalibration_csv.nf index 4fb1dca896..7e79b658bf 100644 --- a/subworkflows/local/prepare_recalibration_csv.nf +++ b/subworkflows/local/prepare_recalibration_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - PREPARE_RECALIBRATION_CSV -======================================================================================== -*/ +// +// PREPARE_RECALIBRATION_CSV +// workflow PREPARE_RECALIBRATION_CSV { take: diff --git a/subworkflows/local/recalibrate_csv.nf b/subworkflows/local/recalibrate_csv.nf index 7e96ef76f8..13b0aeca55 100644 --- a/subworkflows/local/recalibrate_csv.nf +++ b/subworkflows/local/recalibrate_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - RECALIBRATE_CSV -======================================================================================== -*/ +// +// RECALIBRATE_CSV +// workflow RECALIBRATE_CSV { take: diff --git a/subworkflows/nf-core/ensemblvep_annotate.nf b/subworkflows/nf-core/ensemblvep_annotate.nf index dff7fd6fa0..a5c043a818 100644 --- a/subworkflows/nf-core/ensemblvep_annotate.nf +++ b/subworkflows/nf-core/ensemblvep_annotate.nf @@ -1,6 +1,6 @@ -/* - * Run VEP to annotate VCF files - */ +// +// Run VEP to annotate VCF files +// params.bgziptabix_vep = [:] params.vep_options = [:] diff --git a/subworkflows/nf-core/mapping.nf b/subworkflows/nf-core/mapping.nf index bfaf742741..c0d954b110 100644 --- a/subworkflows/nf-core/mapping.nf +++ b/subworkflows/nf-core/mapping.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MAPPING -======================================================================================== -*/ +// +// MAPPING +// params.bwamem1_mem_options = [:] params.bwamem1_mem_tumor_options = [:] diff --git a/subworkflows/nf-core/markduplicates.nf b/subworkflows/nf-core/markduplicates.nf index d6488e174e..1582a1b36d 100644 --- a/subworkflows/nf-core/markduplicates.nf +++ b/subworkflows/nf-core/markduplicates.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MARKDUPLICATES AND/OR QC after mapping -======================================================================================== -*/ +// +// MARKDUPLICATES AND/OR QC after mapping +// params.estimatelibrarycomplexity_options = [:] params.markduplicates_options = [:] diff --git a/subworkflows/nf-core/prepare_recalibration.nf b/subworkflows/nf-core/prepare_recalibration.nf index f3e35f38a5..18183f8e18 100644 --- a/subworkflows/nf-core/prepare_recalibration.nf +++ b/subworkflows/nf-core/prepare_recalibration.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - PREPARE RECALIBRATION -======================================================================================== -*/ +// +// PREPARE RECALIBRATION +// params.baserecalibrator_options = [:] params.gatherbqsrreports_options = [:] diff --git a/subworkflows/nf-core/recalibrate.nf b/subworkflows/nf-core/recalibrate.nf index 4d0b514d00..285ab29bdc 100644 --- a/subworkflows/nf-core/recalibrate.nf +++ b/subworkflows/nf-core/recalibrate.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - RECALIBRATE -======================================================================================== -*/ +// +// RECALIBRATE +// params.applybqsr_options = [:] params.applybqsr_spark_options = [:] diff --git a/subworkflows/nf-core/snpeff_annotate.nf b/subworkflows/nf-core/snpeff_annotate.nf index ee10ec0097..577cdefb69 100644 --- a/subworkflows/nf-core/snpeff_annotate.nf +++ b/subworkflows/nf-core/snpeff_annotate.nf @@ -1,6 +1,6 @@ -/* - * Run snpEff to annotate VCF files - */ +// +// Run snpEff to annotate VCF files +// params.bgziptabix_snpeff = [:] params.snpeff_options = [:] diff --git a/workflows/sarek.nf b/workflows/sarek.nf index e66b85e8b4..7520418c7b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -37,25 +37,23 @@ def checkPathParamList = [ params.vep_cache ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +for (param in checkPathParamList) if (param) file(param, checkIfExists: true) // Get step and tools def step = params.step ? params.step.replaceAll('-', '').replaceAll('_', '') : '' def tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] // Check mandatory parameters -input_sample = Channel.empty() - if (params.input) csv_file = file(params.input) else { log.warn "No samplesheet specified, attempting to restart from csv files present in ${params.outdir}" switch (step) { - case 'mapping': break - case 'prepare_recalibration': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates_no_table.csv", checkIfExists: true); break - case 'recalibrate': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates.csv" , checkIfExists: true); break - case 'variant_calling': csv_file = file("${params.outdir}/preprocessing/csv/recalibrated.csv" , checkIfExists: true); break - // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break - case 'annotate': csv_file = file("${params.outdir}/variant_calling/csv/recalibrated.csv" , checkIfExists: true); break + case 'mapping': exit 1, "Can't start with step $step without samplesheet" + case 'preparerecalibration': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates_no_table.csv", checkIfExists: true); break + case 'recalibrate': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates.csv", checkIfExists: true); break + case 'variantcalling': csv_file = file("${params.outdir}/preprocessing/csv/recalibrated.csv", checkIfExists: true); break + // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break + case 'annotate': csv_file = file("${params.outdir}/variant_calling/csv/recalibrated.csv", checkIfExists: true); break default: exit 1, "Unknown step $step" } } @@ -65,23 +63,25 @@ input_sample = extract_csv(csv_file) def save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false // Save AWS IGenomes file containing annotation version -def anno_readme = params.genomes[ params.genome ]?.readme +def anno_readme = params.genomes[params.genome]?.readme if (anno_readme && file(anno_readme).exists()) { file("${params.outdir}/genome/").mkdirs() file(anno_readme).copyTo("${params.outdir}/genome/") } -// Stage dummy file to be used as an optional input where required -ch_dummy_file = Channel.fromPath("$projectDir/assets/dummy_file.txt", checkIfExists: true).collect() - /* ======================================================================================== - UPDATE MODULES OPTIONS BASED ON PARAMS + IMPORT LOCAL MODULES/SUBWORKFLOWS ======================================================================================== */ +// Stage dummy file to be used as an optional input where required +ch_dummy_file = Channel.fromPath("$projectDir/assets/dummy_file.txt", checkIfExists: true).collect() + +// Don't overwrite global params.modules, create a copy instead and use that within the main script def modules = params.modules.clone() +// Update modules options based on params if (params.save_reference) { modules['build_intervals'].publish_files = ['bed':'intervals'] modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] @@ -134,12 +134,15 @@ vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache read_structure1 = params.read_structure1 ?: Channel.empty() read_structure2 = params.read_structure2 ?: Channel.empty() -/* -======================================================================================== - INCLUDE SUBWORKFLOWS -======================================================================================== -*/ +// SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules +// Create samplesheets to restart from different steps +include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' +include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' +include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' +include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' + +// Build indices if needed include { BUILD_INDICES } from '../subworkflows/local/build_indices' addParams( bgziptabix_target_bed_options: modules['bgziptabix_target_bed'], build_intervals_options: modules['build_intervals'], @@ -155,6 +158,7 @@ include { BUILD_INDICES } from '../subworkflows/local/build_indices' addParams( tabix_pon_options: modules['tabix_pon'] ) +// Map input reads to reference genome (+QC) include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( bwamem1_mem_options: modules['bwa_mem1_mem'], bwamem1_mem_tumor_options: modules['bwa_mem1_mem_tumor'], @@ -165,6 +169,7 @@ include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( seqkit_split2_options: modules['seqkit_split2'] ) +// Mark duplicates (+QC) include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' addParams( estimatelibrarycomplexity_options: modules['estimatelibrarycomplexity'], markduplicates_options: modules['markduplicates'], @@ -176,11 +181,14 @@ include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' addPara samtools_view_options: modules['samtools_view'] ) +// Create recalibration tables include { PREPARE_RECALIBRATION } from '../subworkflows/nf-core/prepare_recalibration' addParams( baserecalibrator_options: modules['baserecalibrator'], baserecalibrator_spark_options: modules['baserecalibrator_spark'], gatherbqsrreports_options: modules['gatherbqsrreports'] ) + +// Create recalibrated cram files to use for variant calling include { RECALIBRATE } from '../subworkflows/nf-core/recalibrate' addParams( applybqsr_options: modules['applybqsr'], applybqsr_spark_options: modules['applybqsr_spark'], @@ -189,11 +197,8 @@ include { RECALIBRATE } from '../subworkflows/nf-core/recalibrate' addParams( samtools_index_options: modules['samtools_index_recalibrate'], samtools_stats_options: modules['samtools_stats_recalibrate'] ) -include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' -include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' -include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' -include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' +// Variant calling on a single normal sample include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_variant_calling' addParams( concat_gvcf_options: modules['concat_gvcf'], concat_haplotypecaller_options: modules['concat_haplotypecaller'], @@ -201,8 +206,10 @@ include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_varian haplotypecaller_options: modules['haplotypecaller'], strelka_options: modules['strelka_germline'] ) +// // Variant calling on a single tumor sample // // include { TUMOR_VARIANT_CALLING } from '../subworkflows/local/tumor_variant_calling' addParams( // // ) +// // Variant calling on tumor/normal pair // include { PAIR_VARIANT_CALLING } from '../subworkflows/local/pair_variant_calling' addParams( // manta_options: modules['manta_somatic'], // msisensorpro_msi_options: modules['msisensorpro_msi'], @@ -211,6 +218,7 @@ include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_varian // mutect2_somatic_options: modules['mutect2_somatic'] // ) +// Annotation include { ANNOTATE } from '../subworkflows/local/annotate' addParams( annotation_cache: params.annotation_cache, bgziptabix_merge_vep_options: modules['bgziptabix_merge_vep'], @@ -223,23 +231,32 @@ include { ANNOTATE } from '../subworkflows/local/annotate' addParams( vep_tag: "${modules['ensemblvep'].tag_base}.${params.genome}" ) -include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' addParams( - fastqc_options: modules['fastqc'], - trimgalore_options: modules['trimgalore'] -) - /* ======================================================================================== - INCLUDE NF-CORE MODULES + IMPORT NF-CORE MODULES/SUBWORKFLOWS ======================================================================================== */ +// Config files ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() - modules['multiqc'].args += params.multiqc_title ? Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : '' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams( options: modules['multiqc'] ) -include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) + +// +// SUBWORKFLOWS +// + +include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' addParams( + fastqc_options: modules['fastqc'], + trimgalore_options: modules['trimgalore'] +) + +// +// MODULES: Installed directly from nf-core/modules +// + +include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams(options: modules['multiqc']) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams(options: [publish_files : ['tsv':'']]) def multiqc_report = [] @@ -247,7 +264,7 @@ workflow SAREK { ch_software_versions = Channel.empty() - // BUILD INDICES + // Build indices if needed BUILD_INDICES( dbsnp, fasta, @@ -259,6 +276,7 @@ workflow SAREK { tools, step) + // Gather built indices or get them from the params bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : BUILD_INDICES.out.bwa : ch_dummy_file dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : BUILD_INDICES.out.dict : ch_dummy_file fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : BUILD_INDICES.out.fai : ch_dummy_file @@ -266,24 +284,26 @@ workflow SAREK { germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : BUILD_INDICES.out.germline_resource_tbi : ch_dummy_file known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : BUILD_INDICES.out.known_indels_tbi : ch_dummy_file pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : BUILD_INDICES.out.pon_tbi : ch_dummy_file + msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan + target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi //TODO @Rike, is this working for you? + // known_sites is made by grouping both the dbsnp and the known indels ressources + // Which can either or both be optional known_sites = dbsnp.concat(known_indels).collect() known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + // Intervals for speed up preprocessing/variant calling by spread/gather intervals = BUILD_INDICES.out.intervals num_intervals = 0 intervals.count().map{ num_intervals = it } - msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan - target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi - + // Get versions from all software used ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.bwa_version.ifEmpty(null)) - ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.dict_version.ifEmpty(null)) - ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.fai_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.gatk_version.ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.samtools_version.ifEmpty(null)) ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.msisensorpro_scan_version.ifEmpty(null)) - ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.target_bed_version.ifEmpty(null)) - ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.dbsnp_tbi_version.mix(BUILD_INDICES.out.germline_resource_tbi_version, BUILD_INDICES.out.known_indels_tbi_version, BUILD_INDICES.out.pon_tbi_version).first().ifEmpty(null)) + ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.tabix_version.ifEmpty(null)) // PREPROCESSING @@ -323,6 +343,8 @@ workflow SAREK { fasta, reads_input, params.skip_markduplicates) + // params.skip_markduplicates, + // save_bam_mapped) bam_mapped = MAPPING.out.bam bam_indexed = MAPPING.out.bam_indexed From be0b64e6c740570a7b6b12bbe502e7311961197f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 Aug 2021 16:12:56 +0200 Subject: [PATCH 15/25] feat: more refactoring --- conf/modules.config | 1 - nextflow.config | 2 +- nextflow_schema.json | 3 ++- subworkflows/local/build_indices.nf | 6 +++--- workflows/sarek.nf | 12 ++++++++---- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 868ca8698f..c10aae17f2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,7 +52,6 @@ params { publish_files = false } 'bgziptabix_target_bed' { - suffix = '.bed' publish_dir = 'reference' publish_files = false } diff --git a/nextflow.config b/nextflow.config index a09280320f..c1638bb875 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,7 +37,7 @@ params { three_prime_clip_r2 = 0 trim_nextseq = 0 save_trimmed = false - split_fastq = 0 // Fastq files will not be split by default + split_fastq = 0 // FASTQ files will not be split by default // Preprocessing aligner = 'bwa-mem' diff --git a/nextflow_schema.json b/nextflow_schema.json index 3a95526ae3..cd243887c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -158,7 +158,8 @@ "hidden": true }, "split_fastq": { - "type": "number", + "type": "integer", + "default": 0, "fa_icon": "fas fa-cut", "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn of splitting at all", "help_text": "Use the the tools seqkit/split2 to split FASTQ file by number of reads", diff --git a/subworkflows/local/build_indices.nf b/subworkflows/local/build_indices.nf index 67cb855943..007ee8cf90 100644 --- a/subworkflows/local/build_indices.nf +++ b/subworkflows/local/build_indices.nf @@ -2,7 +2,7 @@ // BUILDING INDICES // -params.bgziptabix_options = [:] +params.bgziptabix_target_bed_options = [:] params.build_intervals_options = [:] params.bwa_index_options = [:] params.bwamem2_index_options = [:] @@ -24,7 +24,7 @@ include { CREATE_INTERVALS_BED } from '../../modules/local/cre include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main' addParams(options: params.gatk4_dict_options) include { MSISENSORPRO_SCAN } from '../../modules/local/msisensorpro/scan/main' addParams(options: params.msisensorpro_scan_options) include { SAMTOOLS_FAIDX } from '../../modules/nf-core/modules/samtools/faidx/main' addParams(options: params.samtools_faidx_options) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/modules/tabix/bgziptabix/main' addParams(options: params.bgziptabix_options) +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/modules/tabix/bgziptabix/main' addParams(options: params.bgziptabix_target_bed_options) include { TABIX_TABIX as TABIX_DBSNP } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_dbsnp_options) include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_germline_resource_options) include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_known_indels_options) @@ -64,7 +64,7 @@ workflow BUILD_INDICES { result_target_bed = Channel.empty() version_target_bed = Channel.empty() if ((params.target_bed) && ('manta' in tools || 'strelka' in tools)) { - target_bed_id = target_bed.map{ it -> [[id:it[0].baseName], it] } + target_bed_id = target_bed.map{ it -> [[id:it[0].getName()], it] } (result_target_bed, version_target_bed) = TABIX_BGZIPTABIX(target_bed_id) result_target_bed = result_target_bed.map{ meta, bed, tbi -> [bed, tbi] } } diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 7520418c7b..2b9301c7c3 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -169,7 +169,7 @@ include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( seqkit_split2_options: modules['seqkit_split2'] ) -// Mark duplicates (+QC) +// Mark duplicates (+QC) + convert to CRAM include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' addParams( estimatelibrarycomplexity_options: modules['estimatelibrarycomplexity'], markduplicates_options: modules['markduplicates'], @@ -263,6 +263,7 @@ def multiqc_report = [] workflow SAREK { ch_software_versions = Channel.empty() + qc_reports = Channel.empty() // Build indices if needed BUILD_INDICES( @@ -311,7 +312,6 @@ workflow SAREK { bam_mapped_qc = Channel.empty() bam_recalibrated_qc = Channel.empty() bam_variant_calling = Channel.empty() - qc_reports = Channel.empty() // STEP 0: QC & TRIM // `--skip_qc fastqc` to skip fastqc @@ -324,14 +324,17 @@ workflow SAREK { ('fastqc' in params.skip_qc), !(params.trim_fastq)) + // Get reads after optional trimming (+QC) reads_input = FASTQC_TRIMGALORE.out.reads + // Get all qc reports for MultiQC qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_html.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_html.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) + // Get versions from all software used ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.fastqc_version.ifEmpty(null)) ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.trimgalore_version.ifEmpty(null)) @@ -346,6 +349,7 @@ workflow SAREK { // params.skip_markduplicates, // save_bam_mapped) + // Get mapped reads (BAM) with and without index bam_mapped = MAPPING.out.bam bam_indexed = MAPPING.out.bam_indexed @@ -359,7 +363,7 @@ workflow SAREK { } if (step in ['mapping', 'preparerecalibration']) { - // STEP 2: MARKING DUPLICATES AND/OR QC, conversion to CRAM + // STEP 2: Mark duplicates (+QC) + convert to CRAM MARKDUPLICATES( bam_mapped, bam_indexed, @@ -380,7 +384,7 @@ workflow SAREK { qc_reports = qc_reports.mix(MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) - // STEP 3: CREATING RECALIBRATION TABLES + // STEP 3: Create recalibration tables PREPARE_RECALIBRATION( cram_markduplicates, ('bqsr' in params.use_gatk_spark), From fe7a7e6832bc69a99435a1d9b297258b77786ad0 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 Aug 2021 17:02:11 +0200 Subject: [PATCH 16/25] feat: add save_mapped tests --- .github/workflows/ci.yml | 11 ++++++++++- nextflow.config | 22 +++++++++++----------- nextflow_schema.json | 18 +++++++++++++++--- subworkflows/nf-core/mapping.nf | 19 +++++++++++-------- tests/test_save_mapped.yml | 30 ++++++++++++++++++++++++++++++ workflows/sarek.nf | 5 ++--- 6 files changed, 79 insertions(+), 26 deletions(-) create mode 100644 tests/test_save_mapped.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3927c31442..30083046f2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,7 +25,16 @@ jobs: # Nextflow versions: check pipeline minimum and current latest nxf_ver: ['21.04.0', ''] engine: ['docker'] - test: ['default', 'aligner', 'gatk4_spark', 'targeted', 'skip_markduplicates', 'tumor_normal_pair', 'variant_calling', 'annotation'] + test: + - 'aligner' + - 'annotation' + - 'default' + - 'gatk4_spark' + - 'saved_bam_mapped' + - 'skip_markduplicates' + - 'targeted' + - 'tumor_normal_pair' + - 'variant_calling' steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/nextflow.config b/nextflow.config index c1638bb875..b0c3411517 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,7 +24,7 @@ params { help = false no_intervals = false // Intervals will be built from the fasta file nucleotides_per_second = 1000 // Default interval size - sentieon = null // Not using Sentieon by default + sentieon = false // Not using Sentieon by default skip_qc = null // All QC tools are used target_bed = false // No default TargetBED file for targeted sequencing tools = null // No default Variant_Calling or Annotation tools @@ -43,24 +43,24 @@ params { aligner = 'bwa-mem' markdup_java_options = '"-Xms4000m -Xmx7g"' // Established values for markDuplicates memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details use_gatk_spark = false // GATK Spark implementation of their tools in local mode not used by default - save_bam_mapped = null // Mapped BAMs not saved + save_bam_mapped = false // Mapped BAMs not saved skip_markduplicates = false // Do not skip markDuplicates by default sequencing_center = null // No sequencing center to be written in BAM header in MapReads process // Variant Calling ascat_ploidy = null // Use default value ascat_purity = null // Use default value - cf_coeff = 0.05 // default value for Control-FREEC - cf_contamination = null // by default not specified in Control-FREEC - cf_contamination_adjustment = null // by default we are not using this in Control-FREEC - cf_ploidy = 2 // you can use 2,3,4 - cf_window = null // by default we are not using this in Control-FREEC - generate_gvcf = null // g.vcf are not produced by HaplotypeCaller by default - no_strelka_bp = null // Strelka will use Manta candidateSmallIndels if available + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = null // by default not specified in Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_ploidy = 2 // you can use 2,3,4 + cf_window = null // by default we are not using this in Control-FREEC + generate_gvcf = false // g.vcf are not produced by HaplotypeCaller by default + no_strelka_bp = false // Strelka will use Manta candidateSmallIndels if available pon = false // No default PON (Panel of Normals) file for GATK Mutect2 / Sentieon TNscope pon_tbi = false // No default PON index for GATK Mutect2 / Sentieon TNscope - ignore_soft_clipped_bases = null // no --dont-use-soft-clipped-bases for GATK Mutect2 - umi = null // no umi + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + umi = false // no umi read_structure1 = null // no umi read_structure2 = null // no umi diff --git a/nextflow_schema.json b/nextflow_schema.json index cd243887c2..5b82e9fcc1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,7 @@ }, "no_intervals": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Disable usage of intervals.", "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling" @@ -77,6 +78,7 @@ }, "sentieon": { "type": "boolean", + "default": false, "fa_icon": "fas fa-tools", "description": "Enable Sentieon if available.", "help_text": "Sentieon is a commercial solution to process genomics data with high computing efficiency, fast turnaround time, exceptional accuracy, and 100% consistency.\n\n> **NB** Adds the following tools for the `--tools` options: `DNAseq`, `DNAscope` and `TNscope`." @@ -90,6 +92,8 @@ }, "target_bed": { "type": "string", + "format": "file-path", + "pattern": "\\.bed$", "fa_icon": "fas fa-crosshairs", "description": "Target BED file for whole exome or targeted sequencing.", "help_text": "This parameter does _not_ imply that the workflow is running alignment or variant calling only for the supplied targets.\nInstead, we are aligning for the whole genome, and selecting variants only at the very end by intersecting with the provided target file.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory.\nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--target_bed` option.\nThe workflow will pick up these intervals, and activate any `--exome` flag in any tools that allow it to process deeper coverage.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow." @@ -109,6 +113,7 @@ "fa_icon": "fas fa-cut", "description": "Run Trim Galore.", "hidden": true, + "default": false, "help_text": "Use this to perform adapter trimming with Trim Galore.\ncf [Trim Galore User Guide](https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md)" }, "clip_r1": { @@ -154,6 +159,7 @@ "save_trimmed": { "type": "boolean", "fa_icon": "fas fa-save", + "default": false, "description": "Save trimmed FastQ file intermediates", "hidden": true }, @@ -204,6 +210,7 @@ "save_bam_mapped": { "type": "boolean", "fa_icon": "fas fa-download", + "default": false, "description": "Save Mapped BAMs" }, "skip_markduplicates": { @@ -223,13 +230,13 @@ "fa_icon": "fas fa-toolbox", "properties": { "ascat_ploidy": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Overwrite ASCAT ploidy", "help_text": "Requires that `--ascat_purity` is set" }, "ascat_purity": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Overwrite ASCAT purity", "help_text": "Requires that `--ascat_ploidy` is set" @@ -242,11 +249,12 @@ }, "cf_contamination_adjustment": { "type": "boolean", + "default": false, "fa_icon": "fas fa-wrench", "description": "Overwrite Control-FREEC contaminationAdjustement" }, "cf_contamination": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Design known contamination value for Control-FREEC" }, @@ -264,11 +272,13 @@ }, "generate_gvcf": { "type": "boolean", + "default": false, "fa_icon": "fas fa-copy", "description": "Generate g.vcf output from GATK HaplotypeCaller" }, "no_strelka_bp": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Will not use Manta candidateSmallIndels for Strelka", "help_text": "Not recommended by Best Practices" @@ -287,12 +297,14 @@ }, "ignore_soft_clipped_bases": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2", "help_text": "use the `--dont-use-soft-clipped-bases` params with GATK." }, "umi": { "type": "boolean", + "default": false, "fa_icon": "fas fa-tape", "description": "If provided, UMIs steps will be run to extract and annotate the reads with UMI and create consensus reads", "help_text": "This part of the pipeline uses fgbio to convert the FASTQ files into a unmapped BAM, where reads are tagged with the UMIs extracted from the FASTQ sequences.\nIn order to allow the correct tagging, the UMI sequence must be contained in the read sequence itself, and not in the FASTQ filename.\nFollowing this step, the unmapped BAM is aligned and reads are then grouped based on mapping position and UMI tag.\nFinally, reads in the same groups are collapsed to create a consensus read.\nTo create consensus, we have chosen to use the adjacency method\n\ncf [fgbio](https://github.com/fulcrumgenomics/fgbio)\ncf [UMIs, the problem, the solution and the proof](https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/)\n\n> **NB** In order for the correct tagging to be performed, a read structure needs to be specified with `--read_structure1` and `--readstructure2`" diff --git a/subworkflows/nf-core/mapping.nf b/subworkflows/nf-core/mapping.nf index c0d954b110..5dbd1e834d 100644 --- a/subworkflows/nf-core/mapping.nf +++ b/subworkflows/nf-core/mapping.nf @@ -26,6 +26,7 @@ workflow MAPPING { fasta // channel: [mandatory] fasta reads_input // channel: [mandatory] meta, reads_input skip_markduplicates // boolean: true/false + save_bam_mapped // boolean: true/false main: @@ -44,10 +45,10 @@ workflow MAPPING { // If meta.status is 1, then sample is tumor // else, (even is no meta.status exist) sample is normal - reads_input_split.branch{ - tumor: it[0].status == 1 + reads_input_split.branch{ meta, reads -> + tumor: meta.status == 1 normal: true - }.set{ reads_input_status } + }.set{reads_input_status} bam_bwamem = Channel.empty() bam_bwamem1 = Channel.empty() @@ -97,16 +98,18 @@ workflow MAPPING { tuple(groupKey, bam) [new_meta, bam] - }.groupTuple().set{ bam_mapped } + }.groupTuple().set{bam_mapped} - // MarkDuplicates can handle multiple BAMS as input, so no merging/indexing at this step - // Except if and only if skipping MarkDuplicates + // GATK markduplicates can handle multiple BAMS as input + // So no merging/indexing at this step + // Except if and only if skipping markduplicates + // Or saving mapped BAMs - if (skip_markduplicates) { + if (save_bam_mapped || skip_markduplicates) { bam_mapped.branch{ single: it[1].size() == 1 multiple: it[1].size() > 1 - }.set{ bam_to_merge } + }.set{bam_to_merge} SAMTOOLS_MERGE(bam_to_merge.multiple) bam_merged = bam_to_merge.single.mix(SAMTOOLS_MERGE.out.bam) diff --git a/tests/test_save_mapped.yml b/tests/test_save_mapped.yml new file mode 100644 index 0000000000..c76ffd149c --- /dev/null +++ b/tests/test_save_mapped.yml @@ -0,0 +1,30 @@ +- name: Run bwa-mem2 + command: nextflow run main.nf -profile test,docker --save_bam_mapped + tags: + - aligner + - bwa-mem2 + files: + - path: results/multiqc + - path: results/preprocessing/1234N/mapped/1234N.bam + - path: results/preprocessing/1234N/mapped/1234N.bam.bai + - path: results/preprocessing/1234N/markduplicates/1234N.md.cram + - path: results/preprocessing/1234N/markduplicates/1234N.md.cram.crai + - path: results/preprocessing/1234N/recal_table/1234N.recal.table + - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram + - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram.crai + - path: results/preprocessing/csv/markduplicates.csv + - path: results/preprocessing/csv/markduplicates_1234N.csv + - path: results/preprocessing/csv/markduplicates_no_table.csv + - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv + - path: results/preprocessing/csv/recalibrated.csv + - path: results/preprocessing/csv/recalibrated_1234N.csv + - path: results/reports/fastqc/1234N-1234N_M1 + - path: results/reports/fastqc/1234N-1234N_M2 + - path: results/reports/fastqc/1234N-1234N_M4 + - path: results/reports/fastqc/1234N-1234N_M5 + - path: results/reports/fastqc/1234N-1234N_M6 + - path: results/reports/fastqc/1234N-1234N_M7 + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal + - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats + - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 2b9301c7c3..f70e447470 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -345,9 +345,8 @@ workflow SAREK { fai, fasta, reads_input, - params.skip_markduplicates) - // params.skip_markduplicates, - // save_bam_mapped) + params.skip_markduplicates, + save_bam_mapped) // Get mapped reads (BAM) with and without index bam_mapped = MAPPING.out.bam From d3c0934baefa7ff8bb2a3a2d5de0ece78258303a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 09:40:37 +0200 Subject: [PATCH 17/25] fix: correct meta data --- tests/test_aligner.yml | 1 + tests/test_save_mapped.yml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index 92fa1bb3e2..424ec2cf81 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -1,6 +1,7 @@ - name: Run bwa-mem2 command: nextflow run main.nf -profile test,docker --aligner bwa-mem2 tags: + - preprocessing - aligner - bwa-mem2 files: diff --git a/tests/test_save_mapped.yml b/tests/test_save_mapped.yml index c76ffd149c..57304f13e5 100644 --- a/tests/test_save_mapped.yml +++ b/tests/test_save_mapped.yml @@ -1,8 +1,8 @@ -- name: Run bwa-mem2 +- name: Run save bam mapped command: nextflow run main.nf -profile test,docker --save_bam_mapped tags: - - aligner - - bwa-mem2 + - preprocessing + - save_bam_mapped files: - path: results/multiqc - path: results/preprocessing/1234N/mapped/1234N.bam From fe5414d24096cef5e9f67e115a463357c7fbd9b7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 09:48:15 +0200 Subject: [PATCH 18/25] fix: typo --- .github/workflows/ci.yml | 2 +- tests/test_aligner.yml | 2 +- tests/{test_save_mapped.yml => test_save_bam_mapped.yml} | 2 +- tests/test_skip_markduplicates.yml | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) rename tests/{test_save_mapped.yml => test_save_bam_mapped.yml} (98%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30083046f2..77a6956a0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - 'annotation' - 'default' - 'gatk4_spark' - - 'saved_bam_mapped' + - 'save_bam_mapped' - 'skip_markduplicates' - 'targeted' - 'tumor_normal_pair' diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index 424ec2cf81..2f452eb2e5 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -1,9 +1,9 @@ - name: Run bwa-mem2 command: nextflow run main.nf -profile test,docker --aligner bwa-mem2 tags: - - preprocessing - aligner - bwa-mem2 + - preprocessing files: - path: results/multiqc - path: results/preprocessing/1234N/markduplicates/1234N.md.cram diff --git a/tests/test_save_mapped.yml b/tests/test_save_bam_mapped.yml similarity index 98% rename from tests/test_save_mapped.yml rename to tests/test_save_bam_mapped.yml index 57304f13e5..ce576aa825 100644 --- a/tests/test_save_mapped.yml +++ b/tests/test_save_bam_mapped.yml @@ -1,4 +1,4 @@ -- name: Run save bam mapped +- name: Run save_bam_mapped command: nextflow run main.nf -profile test,docker --save_bam_mapped tags: - preprocessing diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index 95bfb00eb4..d9525f49b8 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -1,8 +1,8 @@ - name: Run default pipeline while skipping MarkDuplicates command: nextflow run main.nf -profile test,docker,skip_markduplicates tags: - - preprocessing - markduplicates + - preprocessing - skip_markduplicates files: - path: results/multiqc @@ -32,9 +32,9 @@ - name: Run default pipeline while skipping MarkDuplicates, starting with prepare_recalibration command: nextflow run main.nf -profile test,docker,prepare_recalibration,skip_markduplicates tags: - - preprocessing - - prepare_recalibration - markduplicates + - prepare_recalibration + - preprocessing - skip_markduplicates files: - path: results/multiqc From d5a8abd15f874a5e119e989928616f72b4e13ae4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 14:28:55 +0200 Subject: [PATCH 19/25] fix: remove unnecessary reports --- workflows/sarek.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index f70e447470..63eabf6ab6 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -328,9 +328,7 @@ workflow SAREK { reads_input = FASTQC_TRIMGALORE.out.reads // Get all qc reports for MultiQC - qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_html.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) - qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_html.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) From c70e896fc91cd988189fe090357a886071bb2a39 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 14:47:20 +0200 Subject: [PATCH 20/25] feat: enable --skip_qc --- workflows/sarek.nf | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 63eabf6ab6..9ebd76af22 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -42,6 +42,7 @@ for (param in checkPathParamList) if (param) file(param, checkIfExists: true) // Get step and tools def step = params.step ? params.step.replaceAll('-', '').replaceAll('_', '') : '' def tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +def skip_qc = params.skip_qc ? params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] // Check mandatory parameters if (params.input) csv_file = file(params.input) @@ -321,7 +322,7 @@ workflow SAREK { if (step == 'mapping') { FASTQC_TRIMGALORE( input_sample, - ('fastqc' in params.skip_qc), + ('fastqc' in skip_qc), !(params.trim_fastq)) // Get reads after optional trimming (+QC) @@ -365,13 +366,13 @@ workflow SAREK { bam_mapped, bam_indexed, ('markduplicates' in params.use_gatk_spark), - !('markduplicates' in params.skip_qc), + !('markduplicates' in skip_qc), fasta, fai, dict, params.skip_markduplicates, - ('bamqc' in params.skip_qc), - ('samtools' in params.skip_qc), + ('bamqc' in skip_qc), + ('samtools' in skip_qc), target_bed) cram_markduplicates = MARKDUPLICATES.out.cram @@ -406,8 +407,8 @@ workflow SAREK { // STEP 4: RECALIBRATING RECALIBRATE( ('bqsr' in params.use_gatk_spark), - ('bamqc' in params.skip_qc), - ('samtools' in params.skip_qc), + ('bamqc' in skip_qc), + ('samtools' in skip_qc), cram_applybqsr, dict, fai, @@ -505,7 +506,11 @@ workflow SAREK { .collect() .set { ch_software_versions } - GET_SOFTWARE_VERSIONS(ch_software_versions.map{it}.collect()) + ch_version_yaml = Channel.empty() + if (!('versions' in skip_qc)) { + GET_SOFTWARE_VERSIONS(ch_software_versions.map{it}.collect()) + ch_version_yaml = GET_SOFTWARE_VERSIONS.out.yaml.collect() + } workflow_summary = WorkflowSarek.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) @@ -514,12 +519,14 @@ workflow SAREK { ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_config) ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(GET_SOFTWARE_VERSIONS.out.yaml.collect()) + ch_multiqc_files = ch_multiqc_files.mix(ch_version_yaml) ch_multiqc_files = ch_multiqc_files.mix(qc_reports) - MULTIQC(ch_multiqc_files.collect()) - multiqc_report = MULTIQC.out.report.toList() - ch_software_versions = ch_software_versions.mix(MULTIQC.out.version.ifEmpty(null)) + multiqc_report = Channel.empty() + if (!('multiqc' in skip_qc)) { + MULTIQC(ch_multiqc_files.collect()) + multiqc_report = MULTIQC.out.report.toList() + } } workflow.onComplete { From 0648fd072e6aa94910434c9d7ff05cd2422c885a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 14:51:15 +0200 Subject: [PATCH 21/25] update schema_input --- assets/samplesheet.csv | 5 ++--- assets/schema_input.json | 9 +++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7bf..0563fd1125 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,2 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +patient,sample,lane,fastq1,fastq2 +ID_PAIRED_END,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 7a41823168..7681b9eff1 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,6 +7,11 @@ "items": { "type": "object", "properties": { + "patient": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "ID name must be provided and cannot contain spaces" + }, "sample": { "type": "string", "pattern": "^\\S+$", @@ -32,8 +37,8 @@ } }, "required": [ - "sample", - "fastq_1" + "patient", + "sample" ] } } From 68ba43880020733b343ad510eb03dff933252462 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 15:03:20 +0200 Subject: [PATCH 22/25] fix: code polishing --- workflows/sarek.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 9ebd76af22..c050872090 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -546,7 +546,7 @@ def extract_csv(csv_file) { }.groupTuple() .map{ meta, rows -> size = rows.size() - return [rows, size] + [rows, size] }.transpose() .map{ row, numLanes -> //from here do the usual thing for csv parsing def meta = [:] From e0014946d321f91e64a21f3d2ecda835dc2980e8 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 Aug 2021 15:26:27 +0200 Subject: [PATCH 23/25] feat: update input header for more coherence between pipelines --- assets/schema_input.json | 132 ++++++++++++++++++++++++++++++++++++++- workflows/sarek.nf | 8 +-- 2 files changed, 133 insertions(+), 7 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 7681b9eff1..05a1347073 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,10 +17,58 @@ "pattern": "^\\S+$", "errorMessage": "Sample name must be provided and cannot contain spaces" }, + "gender": { + "errorMessage": "Gender cannot contain spaces", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "status": { + "errorMessage": "Status can only be 0 or 1", + "anyOf": [ + { + "type": "string", + "pattern": "^(0|1)*$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "lane": { + "errorMessage": "Lane cannot contain spaces", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + + }, "fastq_1": { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ] }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", @@ -34,6 +82,84 @@ "maxLength": 0 } ] + }, + "table": { + "errorMessage": "Recalibration table cannot contain spaces and must have extension '.table'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.table$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "cram": { + "errorMessage": "CRAM file cannot contain spaces and must have extension '.cram'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.cram$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "crai": { + "errorMessage": "CRAM index file cannot contain spaces and must have extension '.crai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.crai$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "bam": { + "errorMessage": "BAM file cannot contain spaces and must have extension '.bam'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bam$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "bai": { + "errorMessage": "BAM index file cannot contain spaces and must have extension '.bai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bai$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "vcf": { + "errorMessage": "VCF file for reads 1 cannot contain spaces and must have extension '.vcf' or '.vcf.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.vcf(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ] } }, "required": [ diff --git a/workflows/sarek.nf b/workflows/sarek.nf index c050872090..b963697aaa 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -569,15 +569,15 @@ def extract_csv(csv_file) { else meta.status = 0 // mapping with fastq - if (row.lane && row.fastq2) { + if (row.lane && row.fastq_2) { meta.id = "${row.sample}-${row.lane}".toString() - def fastq1 = file(row.fastq1, checkIfExists: true) - def fastq2 = file(row.fastq2, checkIfExists: true) + def fastq_1 = file(row.fastq_1, checkIfExists: true) + def fastq_2 = file(row.fastq_2, checkIfExists: true) def CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : '' def read_group = "\"@RG\\tID:${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:ILLUMINA\"" meta.numLanes = numLanes.toInteger() meta.read_group = read_group.toString() - return [meta, [fastq1, fastq2]] + return [meta, [fastq_1, fastq_2]] // recalibration } else if (row.table && row.cram) { meta.id = meta.sample From 954ef3c8a446ead659fb0b60554ba15df8366e88 Mon Sep 17 00:00:00 2001 From: "Maxime U. Garcia" Date: Fri, 6 Aug 2021 15:32:35 +0200 Subject: [PATCH 24/25] Update samplesheet.csv --- assets/samplesheet.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 0563fd1125..5b0aafe7d2 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,2 +1,2 @@ -patient,sample,lane,fastq1,fastq2 +patient,sample,lane,fastq_1,fastq_2 ID_PAIRED_END,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz From 181319be91dbe950ed2de7ff4df7c1223a4fbf39 Mon Sep 17 00:00:00 2001 From: "Maxime U. Garcia" Date: Fri, 6 Aug 2021 15:45:31 +0200 Subject: [PATCH 25/25] Update samplesheet.csv --- assets/samplesheet.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5b0aafe7d2..171d070180 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,2 +1,2 @@ patient,sample,lane,fastq_1,fastq_2 -ID_PAIRED_END,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz +PATIENT_ID,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz