diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3927c31442..77a6956a0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,7 +25,16 @@ jobs: # Nextflow versions: check pipeline minimum and current latest nxf_ver: ['21.04.0', ''] engine: ['docker'] - test: ['default', 'aligner', 'gatk4_spark', 'targeted', 'skip_markduplicates', 'tumor_normal_pair', 'variant_calling', 'annotation'] + test: + - 'aligner' + - 'annotation' + - 'default' + - 'gatk4_spark' + - 'save_bam_mapped' + - 'skip_markduplicates' + - 'targeted' + - 'tumor_normal_pair' + - 'variant_calling' steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7bf..171d070180 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,2 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +patient,sample,lane,fastq_1,fastq_2 +PATIENT_ID,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 7a41823168..05a1347073 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,15 +7,68 @@ "items": { "type": "object", "properties": { + "patient": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Patient ID must be provided and cannot contain spaces" + }, "sample": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided and cannot contain spaces" }, + "gender": { + "errorMessage": "Gender cannot contain spaces", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "status": { + "errorMessage": "Status can only be 0 or 1", + "anyOf": [ + { + "type": "string", + "pattern": "^(0|1)$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "lane": { + "errorMessage": "Lane cannot contain spaces", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, "fastq_1": { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ] }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", @@ -29,11 +82,89 @@ "maxLength": 0 } ] + }, + "table": { + "errorMessage": "Recalibration table cannot contain spaces and must have extension '.table'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.table$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "cram": { + "errorMessage": "CRAM file cannot contain spaces and must have extension '.cram'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.cram$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "crai": { + "errorMessage": "CRAM index file cannot contain spaces and must have extension '.crai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.crai$" + }, + { + "type": "string", 
+ "maxLength": 0 + } + ] + }, + "bam": { + "errorMessage": "BAM file cannot contain spaces and must have extension '.bam'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bam$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "bai": { + "errorMessage": "BAM index file cannot contain spaces and must have extension '.bai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bai$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "vcf": { + "errorMessage": "VCF file for reads 1 cannot contain spaces and must have extension '.vcf' or '.vcf.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.vcf(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ] } }, "required": [ - "sample", - "fastq_1" + "patient", + "sample" ] } } diff --git a/conf/modules.config b/conf/modules.config index 5c21954be7..c10aae17f2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,16 +43,15 @@ params { publish_dir = 'reference' publish_files = false } - 'bgziptabix_target_bed' { - suffix = '.bed' + 'msisensorpro_scan' { publish_dir = 'reference' publish_files = false } - 'msisensorpro_scan' { + 'samtools_faidx' { publish_dir = 'reference' publish_files = false } - 'samtools_faidx' { + 'bgziptabix_target_bed' { publish_dir = 'reference' publish_files = false } @@ -73,11 +72,12 @@ params { publish_files = false } // MAPPING - 'seqkit_split2' { - args = "--by-size ${params.split_fastq}" + 'bwa_mem2_mem_tumor' { + args = '-K 100000000 -M -B 3' + args2 = 'sort' publish_files = false } - 'bwa_mem1_mem' { + 'bwa_mem2_mem' { args = '-K 100000000 -M' args2 = 'sort' publish_files = false @@ -87,52 +87,52 @@ params { args2 = 'sort' publish_files = false } - 'bwa_mem2_mem' { + 'bwa_mem1_mem' { args = '-K 100000000 -M' args2 = 'sort' publish_files = false } - 'bwa_mem2_mem_tumor' { - args = '-K 100000000 -M -B 3' - args2 = 'sort' - publish_files = false - } 'samtools_index_mapping' { publish_by_meta = true publish_files = ['bai':'mapped'] publish_dir = 'preprocessing' } - // MARKDUPLICATES - 'markduplicates' { - args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT' - suffix = '.md' + 'merge_bam_mapping' { publish_by_meta = true + publish_files = ['bam':'mapped'] publish_dir = 'preprocessing' + } + 'seqkit_split2' { + args = "--by-size ${params.split_fastq}" publish_files = false } - 'markduplicatesspark' { - args = '--remove-sequencing-duplicates false -VS LENIENT' + // MARKDUPLICATES + 'estimatelibrarycomplexity' { + args = '' suffix = '.md' publish_by_meta = true publish_dir = 'preprocessing' - publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates'] + publish_files = ['metrics': 'markduplicates'] } - 'estimatelibrarycomplexity' { - args = '' + 'markduplicates' { + args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT' suffix = '.md' publish_by_meta = true publish_dir = 'preprocessing' - publish_files = ['metrics': 'markduplicates'] + publish_files = false } - 'merge_bam_mapping' { + 'markduplicatesspark' { + args = '--remove-sequencing-duplicates false -VS LENIENT' + suffix = '.md' publish_by_meta = true - publish_files = ['bam':'mapped'] publish_dir = 'preprocessing' + publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates'] } 'qualimap_bamqc_mapping' { args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' publish_by_meta = true publish_dir = 'reports/qualimap' + suffix = '.mapped' } 'samtools_stats_mapping' { publish_by_meta = true @@ -184,6 +184,7 @@ 
params { args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' publish_by_meta = true publish_dir = 'reports/qualimap' + suffix = '.recal' } 'samtools_index_recalibrate' { suffix = 'recal' diff --git a/nextflow.config b/nextflow.config index a09280320f..b0c3411517 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,7 +24,7 @@ params { help = false no_intervals = false // Intervals will be built from the fasta file nucleotides_per_second = 1000 // Default interval size - sentieon = null // Not using Sentieon by default + sentieon = false // Not using Sentieon by default skip_qc = null // All QC tools are used target_bed = false // No default TargetBED file for targeted sequencing tools = null // No default Variant_Calling or Annotation tools @@ -37,30 +37,30 @@ params { three_prime_clip_r2 = 0 trim_nextseq = 0 save_trimmed = false - split_fastq = 0 // Fastq files will not be split by default + split_fastq = 0 // FASTQ files will not be split by default // Preprocessing aligner = 'bwa-mem' markdup_java_options = '"-Xms4000m -Xmx7g"' // Established values for markDuplicates memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details use_gatk_spark = false // GATK Spark implementation of their tools in local mode not used by default - save_bam_mapped = null // Mapped BAMs not saved + save_bam_mapped = false // Mapped BAMs not saved skip_markduplicates = false // Do not skip markDuplicates by default sequencing_center = null // No sequencing center to be written in BAM header in MapReads process // Variant Calling ascat_ploidy = null // Use default value ascat_purity = null // Use default value - cf_coeff = 0.05 // default value for Control-FREEC - cf_contamination = null // by default not specified in Control-FREEC - cf_contamination_adjustment = null // by default we are not using this in Control-FREEC - cf_ploidy = 2 // you can use 2,3,4 - cf_window = null // by default we are not using this in Control-FREEC - generate_gvcf = null // g.vcf are not produced by HaplotypeCaller by default - no_strelka_bp = null // Strelka will use Manta candidateSmallIndels if available + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = null // by default not specified in Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_ploidy = 2 // you can use 2,3,4 + cf_window = null // by default we are not using this in Control-FREEC + generate_gvcf = false // g.vcf are not produced by HaplotypeCaller by default + no_strelka_bp = false // Strelka will use Manta candidateSmallIndels if available pon = false // No default PON (Panel of Normals) file for GATK Mutect2 / Sentieon TNscope pon_tbi = false // No default PON index for GATK Mutect2 / Sentieon TNscope - ignore_soft_clipped_bases = null // no --dont-use-soft-clipped-bases for GATK Mutect2 - umi = null // no umi + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + umi = false // no umi read_structure1 = null // no umi read_structure2 = null // no umi diff --git a/nextflow_schema.json b/nextflow_schema.json index 3a95526ae3..5b82e9fcc1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,7 @@ }, "no_intervals": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Disable usage of intervals.", "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling" @@ -77,6 +78,7 @@ 
}, "sentieon": { "type": "boolean", + "default": false, "fa_icon": "fas fa-tools", "description": "Enable Sentieon if available.", "help_text": "Sentieon is a commercial solution to process genomics data with high computing efficiency, fast turnaround time, exceptional accuracy, and 100% consistency.\n\n> **NB** Adds the following tools for the `--tools` options: `DNAseq`, `DNAscope` and `TNscope`." @@ -90,6 +92,8 @@ }, "target_bed": { "type": "string", + "format": "file-path", + "pattern": "\\.bed$", "fa_icon": "fas fa-crosshairs", "description": "Target BED file for whole exome or targeted sequencing.", "help_text": "This parameter does _not_ imply that the workflow is running alignment or variant calling only for the supplied targets.\nInstead, we are aligning for the whole genome, and selecting variants only at the very end by intersecting with the provided target file.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory.\nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--target_bed` option.\nThe workflow will pick up these intervals, and activate any `--exome` flag in any tools that allow it to process deeper coverage.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow." @@ -109,6 +113,7 @@ "fa_icon": "fas fa-cut", "description": "Run Trim Galore.", "hidden": true, + "default": false, "help_text": "Use this to perform adapter trimming with Trim Galore.\ncf [Trim Galore User Guide](https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md)" }, "clip_r1": { @@ -154,11 +159,13 @@ "save_trimmed": { "type": "boolean", "fa_icon": "fas fa-save", + "default": false, "description": "Save trimmed FastQ file intermediates", "hidden": true }, "split_fastq": { - "type": "number", + "type": "integer", + "default": 0, "fa_icon": "fas fa-cut", "description": "Specify how many reads each split of a FastQ file contains. 
Set 0 to turn of splitting at all", "help_text": "Use the the tools seqkit/split2 to split FASTQ file by number of reads", @@ -203,6 +210,7 @@ "save_bam_mapped": { "type": "boolean", "fa_icon": "fas fa-download", + "default": false, "description": "Save Mapped BAMs" }, "skip_markduplicates": { @@ -222,13 +230,13 @@ "fa_icon": "fas fa-toolbox", "properties": { "ascat_ploidy": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Overwrite ASCAT ploidy", "help_text": "Requires that `--ascat_purity` is set" }, "ascat_purity": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Overwrite ASCAT purity", "help_text": "Requires that `--ascat_ploidy` is set" @@ -241,11 +249,12 @@ }, "cf_contamination_adjustment": { "type": "boolean", + "default": false, "fa_icon": "fas fa-wrench", "description": "Overwrite Control-FREEC contaminationAdjustement" }, "cf_contamination": { - "type": "string", + "type": "number", "fa_icon": "fas fa-wrench", "description": "Design known contamination value for Control-FREEC" }, @@ -263,11 +272,13 @@ }, "generate_gvcf": { "type": "boolean", + "default": false, "fa_icon": "fas fa-copy", "description": "Generate g.vcf output from GATK HaplotypeCaller" }, "no_strelka_bp": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Will not use Manta candidateSmallIndels for Strelka", "help_text": "Not recommended by Best Practices" @@ -286,12 +297,14 @@ }, "ignore_soft_clipped_bases": { "type": "boolean", + "default": false, "fa_icon": "fas fa-ban", "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2", "help_text": "use the `--dont-use-soft-clipped-bases` params with GATK." }, "umi": { "type": "boolean", + "default": false, "fa_icon": "fas fa-tape", "description": "If provided, UMIs steps will be run to extract and annotate the reads with UMI and create consensus reads", "help_text": "This part of the pipeline uses fgbio to convert the FASTQ files into a unmapped BAM, where reads are tagged with the UMIs extracted from the FASTQ sequences.\nIn order to allow the correct tagging, the UMI sequence must be contained in the read sequence itself, and not in the FASTQ filename.\nFollowing this step, the unmapped BAM is aligned and reads are then grouped based on mapping position and UMI tag.\nFinally, reads in the same groups are collapsed to create a consensus read.\nTo create consensus, we have chosen to use the adjacency method\n\ncf [fgbio](https://github.com/fulcrumgenomics/fgbio)\ncf [UMIs, the problem, the solution and the proof](https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/)\n\n> **NB** In order for the correct tagging to be performed, a read structure needs to be specified with `--read_structure1` and `--readstructure2`" diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index c3769d0f4e..2841a277cd 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - ANNOTATION -======================================================================================== -*/ +// +// ANNOTATION +// params.annotation_cache = [:] params.bgziptabix_merge_vep_options = [:] diff --git a/subworkflows/local/build_indices.nf b/subworkflows/local/build_indices.nf index 201b2ce6a1..007ee8cf90 100644 --- a/subworkflows/local/build_indices.nf +++ 
b/subworkflows/local/build_indices.nf @@ -1,10 +1,8 @@ -/* -======================================================================================== - BUILDING INDICES -======================================================================================== -*/ +// +// BUILDING INDICES +// -params.bgziptabix_options = [:] +params.bgziptabix_target_bed_options = [:] params.build_intervals_options = [:] params.bwa_index_options = [:] params.bwamem2_index_options = [:] @@ -26,7 +24,7 @@ include { CREATE_INTERVALS_BED } from '../../modules/local/cre include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main' addParams(options: params.gatk4_dict_options) include { MSISENSORPRO_SCAN } from '../../modules/local/msisensorpro/scan/main' addParams(options: params.msisensorpro_scan_options) include { SAMTOOLS_FAIDX } from '../../modules/nf-core/modules/samtools/faidx/main' addParams(options: params.samtools_faidx_options) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/modules/tabix/bgziptabix/main' addParams(options: params.bgziptabix_options) +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/modules/tabix/bgziptabix/main' addParams(options: params.bgziptabix_target_bed_options) include { TABIX_TABIX as TABIX_DBSNP } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_dbsnp_options) include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_germline_resource_options) include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../modules/nf-core/modules/tabix/tabix/main' addParams(options: params.tabix_known_indels_options) @@ -53,15 +51,23 @@ workflow BUILD_INDICES { else (result_bwa, version_bwa) = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() - version_dict = Channel.empty() + version_gatk = Channel.empty() if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) - (result_dict, version_dict) = GATK4_CREATESEQUENCEDICTIONARY(fasta) + (result_dict, version_gatk) = GATK4_CREATESEQUENCEDICTIONARY(fasta) result_fai = Channel.empty() - version_fai = Channel.empty() + version_samtools = Channel.empty() if (fasta_fai) result_fai = fasta_fai if (!(params.fasta_fai) && !('annotate' in step)) - (result_fai, version_fai) = SAMTOOLS_FAIDX(fasta) + (result_fai, version_samtools) = SAMTOOLS_FAIDX(fasta) + + result_target_bed = Channel.empty() + version_target_bed = Channel.empty() + if ((params.target_bed) && ('manta' in tools || 'strelka' in tools)) { + target_bed_id = target_bed.map{ it -> [[id:it[0].getName()], it] } + (result_target_bed, version_target_bed) = TABIX_BGZIPTABIX(target_bed_id) + result_target_bed = result_target_bed.map{ meta, bed, tbi -> [bed, tbi] } + } result_dbsnp_tbi = Channel.empty() version_dbsnp_tbi = Channel.empty() @@ -71,14 +77,6 @@ workflow BUILD_INDICES { result_dbsnp_tbi = result_dbsnp_tbi.map{ meta, tbi -> [tbi] } } - result_target_bed = Channel.empty() - version_target_bed = Channel.empty() - if ((params.target_bed) && ('manta' in tools || 'strelka' in tools)) { - target_bed_id = target_bed.map{ it -> [[id:it[0].baseName], it] } - (result_target_bed, version_target_bed) = TABIX_BGZIPTABIX(target_bed_id) - result_target_bed = result_target_bed.map{ meta, bed, tbi -> [bed, tbi] } - } - result_germline_resource_tbi = Channel.empty() version_germline_resource_tbi = Channel.empty() if (!(params.germline_resource_tbi) && params.germline_resource && 'mutect2' in tools) { @@ -94,11 
+92,6 @@ workflow BUILD_INDICES { result_known_indels_tbi = result_known_indels_tbi.map{ meta, tbi -> [tbi] } } - result_msisensorpro_scan = Channel.empty() - version_msisensorpro_scan = Channel.empty() - if ('msisensorpro' in tools) - (result_msisensorpro_scan, version_msisensorpro_scan) = MSISENSORPRO_SCAN(fasta) - result_pon_tbi = Channel.empty() version_pon_tbi = Channel.empty() if (!(params.pon_tbi) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) { @@ -106,6 +99,14 @@ workflow BUILD_INDICES { (result_pon_tbi, version_pon_tbi) = TABIX_PON(pon_id) } + version_tabix = Channel.empty() + version_tabix = version_tabix.mix(version_target_bed, version_dbsnp_tbi, version_germline_resource_tbi, version_known_indels_tbi, version_pon_tbi).first() + + result_msisensorpro_scan = Channel.empty() + version_msisensorpro_scan = Channel.empty() + if ('msisensorpro' in tools) + (result_msisensorpro_scan, version_msisensorpro_scan) = MSISENSORPRO_SCAN(fasta) + result_intervals = Channel.empty() if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" @@ -136,15 +137,19 @@ workflow BUILD_INDICES { } emit: - bwa = result_bwa - bwa_version = version_bwa - dbsnp_tbi = result_dbsnp_tbi - dict = result_dict - fai = result_fai - germline_resource_tbi = result_germline_resource_tbi - intervals = result_intervals - known_indels_tbi = result_known_indels_tbi - msisensorpro_scan = result_msisensorpro_scan - pon_tbi = result_pon_tbi - target_bed_gz_tbi = result_target_bed + bwa = result_bwa + bwa_version = version_bwa + dbsnp_tbi = result_dbsnp_tbi + dict = result_dict + fai = result_fai + gatk_version = version_gatk + germline_resource_tbi = result_germline_resource_tbi + intervals = result_intervals + known_indels_tbi = result_known_indels_tbi.collect() + msisensorpro_scan = result_msisensorpro_scan + msisensorpro_scan_version = version_msisensorpro_scan + pon_tbi = result_pon_tbi + samtools_version = version_samtools + tabix_version = version_tabix + target_bed_gz_tbi = result_target_bed } diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index 16a255c5bb..6cc692c924 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - GERMLINE VARIANT CALLING -======================================================================================== -*/ +// +// GERMLINE VARIANT CALLING +// params.haplotypecaller_options = [:] params.deepvariant_options = [:] diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index b664bc8caf..0000000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,42 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -params.options = [:] - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channels(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channels(LinkedHashMap row) { - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - def array = [] - if 
(!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - array = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return array -} diff --git a/subworkflows/local/mapping_csv.nf b/subworkflows/local/mapping_csv.nf index 8bf8802b55..4bcfb7dbd2 100644 --- a/subworkflows/local/mapping_csv.nf +++ b/subworkflows/local/mapping_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MAPPING_CSV -======================================================================================== -*/ +// +// MAPPING_CSV +// workflow MAPPING_CSV { take: diff --git a/subworkflows/local/markduplicates_csv.nf b/subworkflows/local/markduplicates_csv.nf index 775d646b42..e8aa4214e0 100644 --- a/subworkflows/local/markduplicates_csv.nf +++ b/subworkflows/local/markduplicates_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MARKDUPLICATES_CSV -======================================================================================== -*/ +// +// MARKDUPLICATES_CSV +// workflow MARKDUPLICATES_CSV { take: diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index d32ad17b2f..4dc4ab112f 100644 --- a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - SOMATIC VARIANT CALLING -======================================================================================== -*/ +// +// SOMATIC VARIANT CALLING +// params.manta_options = [:] params.msisensorpro_msi_options = [:] diff --git a/subworkflows/local/prepare_recalibration_csv.nf b/subworkflows/local/prepare_recalibration_csv.nf index 4fb1dca896..7e79b658bf 100644 --- a/subworkflows/local/prepare_recalibration_csv.nf +++ b/subworkflows/local/prepare_recalibration_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - PREPARE_RECALIBRATION_CSV -======================================================================================== -*/ +// +// PREPARE_RECALIBRATION_CSV +// workflow PREPARE_RECALIBRATION_CSV { take: diff --git a/subworkflows/local/recalibrate_csv.nf b/subworkflows/local/recalibrate_csv.nf index 7e96ef76f8..13b0aeca55 100644 --- a/subworkflows/local/recalibrate_csv.nf +++ b/subworkflows/local/recalibrate_csv.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - RECALIBRATE_CSV -======================================================================================== -*/ +// +// RECALIBRATE_CSV +// workflow RECALIBRATE_CSV { take: diff --git a/subworkflows/nf-core/ensemblvep_annotate.nf b/subworkflows/nf-core/ensemblvep_annotate.nf index dff7fd6fa0..a5c043a818 100644 --- a/subworkflows/nf-core/ensemblvep_annotate.nf +++ b/subworkflows/nf-core/ensemblvep_annotate.nf @@ -1,6 +1,6 @@ -/* - * Run VEP to annotate VCF files - */ +// +// Run VEP to annotate VCF files +// params.bgziptabix_vep = [:] params.vep_options = [:] diff --git a/subworkflows/nf-core/fastqc_trimgalore.nf 
b/subworkflows/nf-core/fastqc_trimgalore.nf index c067ba25c6..12360ab980 100644 --- a/subworkflows/nf-core/fastqc_trimgalore.nf +++ b/subworkflows/nf-core/fastqc_trimgalore.nf @@ -1,12 +1,12 @@ -/* - * Read QC and trimming - */ +// +// Read QC and trimming +// params.fastqc_options = [:] params.trimgalore_options = [:] -include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' addParams( options: params.fastqc_options ) -include { TRIMGALORE } from '../../modules/nf-core/modules/trimgalore/main' addParams( options: params.trimgalore_options ) +include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' addParams( options: params.fastqc_options ) +include { TRIMGALORE } from '../../modules/nf-core/modules/trimgalore/main' addParams( options: params.trimgalore_options ) workflow FASTQC_TRIMGALORE { take: @@ -19,9 +19,10 @@ workflow FASTQC_TRIMGALORE { fastqc_zip = Channel.empty() fastqc_version = Channel.empty() if (!skip_fastqc) { - FASTQC ( reads ).html.set { fastqc_html } + FASTQC(reads) + fastqc_html = FASTQC.out.html fastqc_zip = FASTQC.out.zip - fastqc_version = FASTQC.out.version + fastqc_version = FASTQC.out.version.first() } trim_reads = reads @@ -30,11 +31,12 @@ workflow FASTQC_TRIMGALORE { trim_log = Channel.empty() trimgalore_version = Channel.empty() if (!skip_trimming) { - TRIMGALORE ( reads ).reads.set { trim_reads } + TRIMGALORE(reads) + trim_reads = TRIMGALORE.out.reads trim_html = TRIMGALORE.out.html trim_zip = TRIMGALORE.out.zip trim_log = TRIMGALORE.out.log - trimgalore_version = TRIMGALORE.out.version + trimgalore_version = TRIMGALORE.out.version.first() } emit: diff --git a/subworkflows/nf-core/mapping.nf b/subworkflows/nf-core/mapping.nf index 44e322446e..5dbd1e834d 100644 --- a/subworkflows/nf-core/mapping.nf +++ b/subworkflows/nf-core/mapping.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - MAPPING -======================================================================================== -*/ +// +// MAPPING +// params.bwamem1_mem_options = [:] params.bwamem1_mem_tumor_options = [:] @@ -28,73 +26,90 @@ workflow MAPPING { fasta // channel: [mandatory] fasta reads_input // channel: [mandatory] meta, reads_input skip_markduplicates // boolean: true/false + save_bam_mapped // boolean: true/false main: bam_indexed = Channel.empty() if (params.split_fastq > 1) { - reads_input_split = SEQKIT_SPLIT2(reads_input).reads.map{ - key, reads -> - //TODO maybe this can be replaced by a regex to include part_001 etc. - - //sorts list of split fq files by : - //[R1.part_001, R2.part_001, R1.part_002, R2.part_002,R1.part_003, R2.part_003,...] - //TODO: determine whether it is possible to have an uneven number of parts, so remainder: true woud need to be used, I guess this could be possible for unfiltered reads, reads that don't have pairs etc. - return [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}.collate(2)] - }.transpose() - } else { - reads_input_split = reads_input - } + reads_input_split = SEQKIT_SPLIT2(reads_input).reads.map{ key, reads -> + //TODO maybe this can be replaced by a regex to include part_001 etc. + + //sorts list of split fq files by : + //[R1.part_001, R2.part_001, R1.part_002, R2.part_002,R1.part_003, R2.part_003,...] 
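+ //e.g. (illustrative name only, actual seqkit/split2 output may differ): "sample_R1.part_002.fastq.gz".tokenize('.') gives [sample_R1, part_002, fastq, gz], so index (size - 3) picks "part_002"; sorting on that key makes R1/R2 of the same part adjacent so collate(2) can pair them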
+ //TODO: determine whether it is possible to have an uneven number of parts, so remainder: true would need to be used, I guess this could be possible for unfiltered reads, reads that don't have pairs etc. + [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}.collate(2)] + }.transpose() + } else reads_input_split = reads_input // If meta.status is 1, then sample is tumor // else, (even if no meta.status exists) sample is normal - reads_input_split.branch{ - tumor: it[0].status == 1 - normal: true - }.set{ reads_input_status } + reads_input_split.branch{ meta, reads -> + tumor: meta.status == 1 + normal: true + }.set{reads_input_status} - bam_bwamem1 = Channel.empty() - bam_bwamem2 = Channel.empty() + bam_bwamem = Channel.empty() + bam_bwamem1 = Channel.empty() + bam_bwamem2 = Channel.empty() + tool_versions = Channel.empty() if (aligner == "bwa-mem") { BWAMEM1_MEM(reads_input_status.normal, bwa) - bam_bwamem1_n = BWAMEM1_MEM.out.bam - BWAMEM1_MEM_T(reads_input_status.tumor, bwa) + + bam_bwamem1_n = BWAMEM1_MEM.out.bam bam_bwamem1_t = BWAMEM1_MEM_T.out.bam + bam_bwamem1 = bam_bwamem1_n.mix(bam_bwamem1_t) + + bwamem1_n_version = BWAMEM1_MEM.out.version + bwamem1_t_version = BWAMEM1_MEM_T.out.version + + bwamem1_version = bwamem1_n_version.mix(bwamem1_t_version).first() - bam_bwamem1 = bam_bwamem1_n.mix(bam_bwamem1_t) + tool_versions = tool_versions.mix(bwamem1_version) } else { BWAMEM2_MEM(reads_input_status.normal, bwa) - bam_bwamem2_n = BWAMEM2_MEM.out.bam - BWAMEM2_MEM_T(reads_input_status.tumor, bwa) + + bam_bwamem2_n = BWAMEM2_MEM.out.bam bam_bwamem2_t = BWAMEM2_MEM_T.out.bam + bam_bwamem2 = bam_bwamem2_n.mix(bam_bwamem2_t) - bam_bwamem2 = bam_bwamem2_n.mix(bam_bwamem2_t) + bwamem2_n_version = BWAMEM2_MEM.out.version + bwamem2_t_version = BWAMEM2_MEM_T.out.version + + bwamem2_version = bwamem2_n_version.mix(bwamem2_t_version).first() + tool_versions = tool_versions.mix(bwamem2_version) } - bam_bwa = bam_bwamem1.mix(bam_bwamem2) - bam_bwa.map{ meta, bam -> - meta.remove('read_group') - meta.id = meta.sample + bam_bwamem = bam_bwamem.mix(bam_bwamem1) + bam_bwamem = bam_bwamem.mix(bam_bwamem2) + + bam_bwamem.map{ meta, bam -> + new_meta = meta.clone() + new_meta.remove('read_group') + new_meta.id = meta.sample + // groupKey is to make sure that the correct group can advance as soon as it is complete // and not stall the workflow until all pieces are mapped def groupKey = groupKey(meta, meta.numLanes * params.split_fastq) tuple(groupKey, bam) - [meta, bam] - }.groupTuple() - .set{bam_mapped} - // MarkDuplicates can handle multiple BAMS as input, so no merging/indexing at this step - // Except if and only if skipping MarkDuplicates + [new_meta, bam] + }.groupTuple().set{bam_mapped} + + // GATK markduplicates can handle multiple BAMs as input + // So no merging/indexing at this step + // Except if and only if skipping markduplicates + // Or saving mapped BAMs - if (skip_markduplicates) { + if (save_bam_mapped || skip_markduplicates) { bam_mapped.branch{ single: it[1].size() == 1 multiple: it[1].size() > 1 - }.set{ bam_to_merge } + }.set{bam_to_merge} SAMTOOLS_MERGE(bam_to_merge.multiple) bam_merged = bam_to_merge.single.mix(SAMTOOLS_MERGE.out.bam) @@ -106,4 +121,5 @@ workflow MAPPING { emit: bam = bam_mapped bam_indexed = bam_indexed + versions = tool_versions } diff --git a/subworkflows/nf-core/qc_markduplicates.nf b/subworkflows/nf-core/markduplicates.nf similarity index 91% rename 
from subworkflows/nf-core/qc_markduplicates.nf rename to subworkflows/nf-core/markduplicates.nf index 46e8dd7abe..1582a1b36d 100644 --- a/subworkflows/nf-core/qc_markduplicates.nf +++ b/subworkflows/nf-core/markduplicates.nf @@ -1,28 +1,26 @@ -/* -======================================================================================== - MARKDUPLICATES AND/OR QC after mapping -======================================================================================== -*/ +// +// MARKDUPLICATES AND/OR QC after mapping +// +params.estimatelibrarycomplexity_options = [:] params.markduplicates_options = [:] params.markduplicatesspark_options = [:] -params.estimatelibrarycomplexity_options = [:] params.merge_bam_options = [:] params.qualimap_bamqc_options = [:] +params.samtools_index_options = [:] params.samtools_stats_options = [:] params.samtools_view_options = [:] -params.samtools_index_options = [:] +include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options) include { GATK4_MARKDUPLICATES } from '../../modules/local/gatk4/markduplicates/main' addParams(options: params.markduplicates_options) include { GATK4_MARKDUPLICATES_SPARK } from '../../modules/local/gatk4/markduplicatesspark/main' addParams(options: params.markduplicatesspark_options) -include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options) include { QUALIMAP_BAMQC } from '../../modules/local/qualimap/bamqc/main' addParams(options: params.qualimap_bamqc_options) +include { SAMTOOLS_INDEX } from '../../modules/local/samtools/index/main' addParams(options: params.samtools_index_options) include { SAMTOOLS_STATS } from '../../modules/local/samtools/stats/main' addParams(options: params.samtools_stats_options) include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM_SPARK } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options) -include { SAMTOOLS_INDEX } from '../../modules/local/samtools/index/main' addParams(options: params.samtools_index_options) -workflow QC_MARKDUPLICATES { +workflow MARKDUPLICATES { take: bam_mapped // channel: [mandatory] meta, bam bam_indexed // channel: [mandatory] meta, bam, bai @@ -76,21 +74,20 @@ workflow QC_MARKDUPLICATES { } //If skip_markduplicates then QC tools are run on mapped bams, - //if !skip_markduplicates, then QC tools are run on duplicate marked bams + //if !skip_markduplicates, then QC tools are run on duplicate marked crams //After bamqc finishes, convert to cram for further analysis - qualimap_bamqc = Channel.empty() - if (!skip_bamqc && !skip_markduplicates) { - //TODO: after adding CI tests, allow bamqc on mapped bams if no duplicate marking is done - QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed) - qualimap_bamqc = QUALIMAP_BAMQC.out - } - samtools_stats = Channel.empty() if (!skip_samtools) { SAMTOOLS_STATS(cram_markduplicates, fasta) samtools_stats = SAMTOOLS_STATS.out.stats } + qualimap_bamqc = Channel.empty() + if (!skip_bamqc) { + QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed) + qualimap_bamqc = QUALIMAP_BAMQC.out.results + } + qc_reports = samtools_stats.mix(qualimap_bamqc) qc_reports = report_markduplicates.mix(qc_reports) diff --git 
a/subworkflows/nf-core/prepare_recalibration.nf b/subworkflows/nf-core/prepare_recalibration.nf index 10d0b98b11..18183f8e18 100644 --- a/subworkflows/nf-core/prepare_recalibration.nf +++ b/subworkflows/nf-core/prepare_recalibration.nf @@ -1,16 +1,14 @@ -/* -======================================================================================== - PREPARE RECALIBRATION -======================================================================================== -*/ - -params.baserecalibrator_options = [:] -params.gatherbqsrreports_options = [:] +// +// PREPARE RECALIBRATION +// + +params.baserecalibrator_options = [:] +params.gatherbqsrreports_options = [:] params.baserecalibrator_spark_options = [:] -include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../modules/local/gatk4/baserecalibrator/main' addParams(options: params.baserecalibrator_options) -include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../modules/local/gatk4/baserecalibratorspark/main' addParams(options: params.baserecalibrator_spark_options) -include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../modules/local/gatk4/gatherbqsrreports/main' addParams(options: params.gatherbqsrreports_options) +include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../modules/local/gatk4/baserecalibrator/main' addParams(options: params.baserecalibrator_options) +include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../modules/local/gatk4/baserecalibratorspark/main' addParams(options: params.baserecalibrator_spark_options) +include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../modules/local/gatk4/gatherbqsrreports/main' addParams(options: params.gatherbqsrreports_options) workflow PREPARE_RECALIBRATION { take: @@ -26,6 +24,7 @@ workflow PREPARE_RECALIBRATION { no_intervals // value: [mandatory] no_intervals main: + cram_markduplicates.combine(intervals) .map{ meta, cram, crai, intervals -> new_meta = meta.clone() diff --git a/subworkflows/nf-core/recalibrate.nf b/subworkflows/nf-core/recalibrate.nf index c672651384..285ab29bdc 100644 --- a/subworkflows/nf-core/recalibrate.nf +++ b/subworkflows/nf-core/recalibrate.nf @@ -1,8 +1,6 @@ -/* -======================================================================================== - RECALIBRATE -======================================================================================== -*/ +// +// RECALIBRATE +// params.applybqsr_options = [:] params.applybqsr_spark_options = [:] @@ -71,7 +69,7 @@ workflow RECALIBRATE { if (!skip_bamqc) { QUALIMAP_BAMQC_CRAM(cram_recalibrated_index,target_bed, params.target_bed,fasta, fai) - qualimap_bamqc = QUALIMAP_BAMQC_CRAM.out + qualimap_bamqc = QUALIMAP_BAMQC_CRAM.out.results } if (!skip_samtools) { diff --git a/subworkflows/nf-core/snpeff_annotate.nf b/subworkflows/nf-core/snpeff_annotate.nf index ee10ec0097..577cdefb69 100644 --- a/subworkflows/nf-core/snpeff_annotate.nf +++ b/subworkflows/nf-core/snpeff_annotate.nf @@ -1,6 +1,6 @@ -/* - * Run snpEff to annotate VCF files - */ +// +// Run snpEff to annotate VCF files +// params.bgziptabix_snpeff = [:] params.snpeff_options = [:] diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index cd2de82960..2f452eb2e5 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -3,6 +3,7 @@ tags: - aligner - bwa-mem2 + - preprocessing files: - path: results/multiqc - path: results/preprocessing/1234N/markduplicates/1234N.md.cram @@ -22,6 +23,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: 
results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_default.yml b/tests/test_default.yml index 1ecd249447..536d136be5 100644 --- a/tests/test_default.yml +++ b/tests/test_default.yml @@ -22,6 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_gatk_spark.yml b/tests/test_gatk_spark.yml index 2bc096694b..50a7c6518f 100644 --- a/tests/test_gatk_spark.yml +++ b/tests/test_gatk_spark.yml @@ -23,6 +23,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_pair.yml b/tests/test_pair.yml index a670c5ccdd..90913b0cc8 100644 --- a/tests/test_pair.yml +++ b/tests/test_pair.yml @@ -34,8 +34,10 @@ - path: results/reports/fastqc/9876T-9876T_M2 - path: results/reports/fastqc/9876T-9876T_M4 - path: results/reports/fastqc/9876T-9876T_M5 - - path: results/reports/qualimap/1234N - - path: results/reports/qualimap/9876T + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal + - path: results/reports/qualimap/9876T/9876T.mapped + - path: results/reports/qualimap/9876T/9876T.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - path: results/reports/samtools_stats/9876T/9876T.md.cram.stats diff --git a/tests/test_save_bam_mapped.yml b/tests/test_save_bam_mapped.yml new file mode 100644 index 0000000000..ce576aa825 --- /dev/null +++ b/tests/test_save_bam_mapped.yml @@ -0,0 +1,30 @@ +- name: Run save_bam_mapped + command: nextflow run main.nf -profile test,docker --save_bam_mapped + tags: + - preprocessing + - save_bam_mapped + files: + - path: results/multiqc + - path: results/preprocessing/1234N/mapped/1234N.bam + - path: results/preprocessing/1234N/mapped/1234N.bam.bai + - path: results/preprocessing/1234N/markduplicates/1234N.md.cram + - path: results/preprocessing/1234N/markduplicates/1234N.md.cram.crai + - path: results/preprocessing/1234N/recal_table/1234N.recal.table + - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram + - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram.crai + - path: results/preprocessing/csv/markduplicates.csv + - path: results/preprocessing/csv/markduplicates_1234N.csv + - path: results/preprocessing/csv/markduplicates_no_table.csv + - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv + - path: results/preprocessing/csv/recalibrated.csv + - path: results/preprocessing/csv/recalibrated_1234N.csv 
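+ # Hypothetical local invocation of just this test, assuming pytest-workflow is installed: pytest --tag save_bam_mapped --symlink --kwdof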
+ - path: results/reports/fastqc/1234N-1234N_M1 + - path: results/reports/fastqc/1234N-1234N_M2 + - path: results/reports/fastqc/1234N-1234N_M4 + - path: results/reports/fastqc/1234N-1234N_M5 + - path: results/reports/fastqc/1234N-1234N_M6 + - path: results/reports/fastqc/1234N-1234N_M7 + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal + - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats + - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index a07d2a3d42..d9525f49b8 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -1,11 +1,14 @@ - name: Run default pipeline while skipping MarkDuplicates command: nextflow run main.nf -profile test,docker,skip_markduplicates tags: - - preprocessing - markduplicates + - preprocessing - skip_markduplicates files: - path: results/multiqc + - path: results/preprocessing/1234N/mapped/1234N.bam + - path: results/preprocessing/1234N/mapped/1234N.bam.bai + - path: results/preprocessing/1234N/mapped/1234N.recal.table - path: results/preprocessing/1234N/markduplicates/1234N.md.cram - path: results/preprocessing/1234N/markduplicates/1234N.md.cram.crai - path: results/preprocessing/1234N/recalibrated/1234N.recal.cram @@ -22,15 +25,16 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats - name: Run default pipeline while skipping MarkDuplicates, starting with prepare_recalibration command: nextflow run main.nf -profile test,docker,prepare_recalibration,skip_markduplicates tags: - - preprocessing - - prepare_recalibration - markduplicates + - prepare_recalibration + - preprocessing - skip_markduplicates files: - path: results/multiqc @@ -45,6 +49,7 @@ - path: results/preprocessing/csv/markduplicates_no_table_1234N.csv - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_1234N.csv - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/tests/test_targeted.yml b/tests/test_targeted.yml index 70d1861152..635b4c4024 100644 --- a/tests/test_targeted.yml +++ b/tests/test_targeted.yml @@ -22,6 +22,7 @@ - path: results/reports/fastqc/1234N-1234N_M5 - path: results/reports/fastqc/1234N-1234N_M6 - path: results/reports/fastqc/1234N-1234N_M7 - - path: results/reports/qualimap/1234N + - path: results/reports/qualimap/1234N/1234N.mapped + - path: results/reports/qualimap/1234N/1234N.recal - path: results/reports/samtools_stats/1234N/1234N.md.cram.stats - path: results/reports/samtools_stats/1234N/1234N.recal.cram.stats diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 993c78a3fd..b963697aaa 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -37,51 +37,52 @@ def checkPathParamList = [ params.vep_cache ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +for (param in 
checkPathParamList) if (param) file(param, checkIfExists: true) -// get step and tools +// Get step and tools def step = params.step ? params.step.replaceAll('-', '').replaceAll('_', '') : '' def tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +def skip_qc = params.skip_qc ? params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] // Check mandatory parameters -input_sample = Channel.empty() - if (params.input) csv_file = file(params.input) else { log.warn "No samplesheet specified, attempting to restart from csv files present in ${params.outdir}" switch (step) { - case 'mapping': break - case 'prepare_recalibration': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates_no_table.csv", checkIfExists: true); break - case 'recalibrate': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates.csv" , checkIfExists: true); break - case 'variant_calling': csv_file = file("${params.outdir}/preprocessing/csv/recalibrated.csv" , checkIfExists: true); break - // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break - case 'annotate': csv_file = file("${params.outdir}/variant_calling/csv/recalibrated.csv" , checkIfExists: true); break + case 'mapping': exit 1, "Can't start with step $step without samplesheet" + case 'preparerecalibration': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates_no_table.csv", checkIfExists: true); break + case 'recalibrate': csv_file = file("${params.outdir}/preprocessing/csv/markduplicates.csv", checkIfExists: true); break + case 'variantcalling': csv_file = file("${params.outdir}/preprocessing/csv/recalibrated.csv", checkIfExists: true); break + // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break + case 'annotate': csv_file = file("${params.outdir}/variant_calling/csv/recalibrated.csv", checkIfExists: true); break default: exit 1, "Unknown step $step" } } input_sample = extract_csv(csv_file) -save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false +def save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? 
true : false // Save AWS IGenomes file containing annotation version -def anno_readme = params.genomes[ params.genome ]?.readme +def anno_readme = params.genomes[params.genome]?.readme if (anno_readme && file(anno_readme).exists()) { file("${params.outdir}/genome/").mkdirs() file(anno_readme).copyTo("${params.outdir}/genome/") } -// Stage dummy file to be used as an optional input where required -ch_dummy_file = Channel.fromPath("$projectDir/assets/dummy_file.txt", checkIfExists: true).collect() - /* ======================================================================================== - UPDATE MODULES OPTIONS BASED ON PARAMS + IMPORT LOCAL MODULES/SUBWORKFLOWS ======================================================================================== */ +// Stage dummy file to be used as an optional input where required +ch_dummy_file = Channel.fromPath("$projectDir/assets/dummy_file.txt", checkIfExists: true).collect() + +// Don't overwrite global params.modules, create a copy instead and use that within the main script def modules = params.modules.clone() +// Update modules options based on params if (params.save_reference) { modules['build_intervals'].publish_files = ['bed':'intervals'] modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] @@ -104,47 +105,45 @@ if (params.skip_markduplicates) { } // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : ch_dummy_file -chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : ch_dummy_file -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : ch_dummy_file -dbsnp_tbi = params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : ch_dummy_file -fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : ch_dummy_file -fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : ch_dummy_file -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : ch_dummy_file -germline_resource_tbi = params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : ch_dummy_file -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : ch_dummy_file -known_indels_tbi = params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : ch_dummy_file -loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : ch_dummy_file -loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : ch_dummy_file -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : ch_dummy_file +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : ch_dummy_file +chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : ch_dummy_file +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : ch_dummy_file +fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : ch_dummy_file +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : ch_dummy_file +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : ch_dummy_file +loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : ch_dummy_file +loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : ch_dummy_file +mappability = params.mappability ? 
Channel.fromPath(params.mappability).collect() : ch_dummy_file // Initialize value channels based on params, defined in the params.genomes[params.genome] scope -snpeff_db = params.snpeff_db ?: Channel.empty() -vep_cache_version = params.vep_cache_version ?: Channel.empty() -vep_genome = params.vep_genome ?: Channel.empty() -vep_species = params.vep_species ?: Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : ch_dummy_file -cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : ch_dummy_file -cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : ch_dummy_file -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : ch_dummy_file -pon = params.pon ? Channel.fromPath(params.pon).collect() : ch_dummy_file -pon_tbi = params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : ch_dummy_file -snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : ch_dummy_file -target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : ch_dummy_file -vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : ch_dummy_file +cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : ch_dummy_file +cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : ch_dummy_file +cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : ch_dummy_file +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : ch_dummy_file +pon = params.pon ? Channel.fromPath(params.pon).collect() : ch_dummy_file +snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : ch_dummy_file +target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : ch_dummy_file +vep_cache = params.vep_cache ? 
Channel.fromPath(params.vep_cache).collect() : ch_dummy_file // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope -read_structure1 = params.read_structure1 ?: Channel.empty() -read_structure2 = params.read_structure2 ?: Channel.empty() +read_structure1 = params.read_structure1 ?: Channel.empty() +read_structure2 = params.read_structure2 ?: Channel.empty() -/* -======================================================================================== - INCLUDE LOCAL SUBWORKFLOWS -======================================================================================== -*/ +// SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules + +// Create samplesheets to restart from different steps +include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' +include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' +include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' +include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' +// Build indices if needed include { BUILD_INDICES } from '../subworkflows/local/build_indices' addParams( bgziptabix_target_bed_options: modules['bgziptabix_target_bed'], build_intervals_options: modules['build_intervals'], @@ -159,15 +158,20 @@ include { BUILD_INDICES } from '../subworkflows/local/build_indices' addParams( tabix_known_indels_options: modules['tabix_known_indels'], tabix_pon_options: modules['tabix_pon'] ) + +// Map input reads to reference genome (+QC) include { MAPPING } from '../subworkflows/nf-core/mapping' addParams( bwamem1_mem_options: modules['bwa_mem1_mem'], bwamem1_mem_tumor_options: modules['bwa_mem1_mem_tumor'], bwamem2_mem_options: modules['bwa_mem2_mem'], bwamem2_mem_tumor_options: modules['bwa_mem2_mem_tumor'], + merge_bam_options: modules['merge_bam_mapping'], + samtools_index_options: modules['samtools_index_mapping'], seqkit_split2_options: modules['seqkit_split2'] ) -include { QC_MARKDUPLICATES } from '../subworkflows/nf-core/qc_markduplicates' addParams( +// Mark duplicates (+QC) + convert to CRAM +include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' addParams( estimatelibrarycomplexity_options: modules['estimatelibrarycomplexity'], markduplicates_options: modules['markduplicates'], markduplicatesspark_options: modules['markduplicatesspark'], @@ -178,11 +182,14 @@ include { QC_MARKDUPLICATES } from '../subworkflows/nf-core/qc_markduplicates' a samtools_view_options: modules['samtools_view'] ) +// Create recalibration tables include { PREPARE_RECALIBRATION } from '../subworkflows/nf-core/prepare_recalibration' addParams( baserecalibrator_options: modules['baserecalibrator'], baserecalibrator_spark_options: modules['baserecalibrator_spark'], gatherbqsrreports_options: modules['gatherbqsrreports'] ) + +// Create recalibrated cram files to use for variant calling include { RECALIBRATE } from '../subworkflows/nf-core/recalibrate' addParams( applybqsr_options: modules['applybqsr'], applybqsr_spark_options: modules['applybqsr_spark'], @@ -191,20 +198,19 @@ include { RECALIBRATE } from '../subworkflows/nf-core/recalibrate' addParams( samtools_index_options: modules['samtools_index_recalibrate'], samtools_stats_options: modules['samtools_stats_recalibrate'] ) -include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' -include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' -include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' 
-include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv'
+// Variant calling on a single normal sample
 include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_variant_calling' addParams(
-    concat_gvcf_options: modules['concat_gvcf'],
-    concat_haplotypecaller_options: modules['concat_haplotypecaller'],
-    genotypegvcf_options: modules['genotypegvcf'],
-    haplotypecaller_options: modules['haplotypecaller'],
-    strelka_options: modules['strelka_germline']
+    concat_gvcf_options: modules['concat_gvcf'],
+    concat_haplotypecaller_options: modules['concat_haplotypecaller'],
+    genotypegvcf_options: modules['genotypegvcf'],
+    haplotypecaller_options: modules['haplotypecaller'],
+    strelka_options: modules['strelka_germline']
 )
+// // Variant calling on a single tumor sample
 // // include { TUMOR_VARIANT_CALLING } from '../subworkflows/local/tumor_variant_calling' addParams(
 // // )
+// // Variant calling on tumor/normal pair
 // include { PAIR_VARIANT_CALLING } from '../subworkflows/local/pair_variant_calling' addParams(
 //     manta_options: modules['manta_somatic'],
 //     msisensorpro_msi_options: modules['msisensorpro_msi'],
@@ -213,54 +219,58 @@ include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_varian
 //     mutect2_somatic_options: modules['mutect2_somatic']
 // )
+// Annotation
 include { ANNOTATE } from '../subworkflows/local/annotate' addParams(
-    annotation_cache: params.annotation_cache,
-    bgziptabix_merge_vep_options: modules['bgziptabix_merge_vep'],
-    bgziptabix_snpeff_options: modules['bgziptabix_snpeff'],
-    bgziptabix_vep_options: modules['bgziptabix_vep'],
-    merge_vep_options: modules['merge_vep'],
-    snpeff_options: modules['snpeff'],
-    snpeff_tag: "${modules['snpeff'].tag_base}.${params.genome}",
-    vep_options: modules['ensemblvep'],
-    vep_tag: "${modules['ensemblvep'].tag_base}.${params.genome}"
+    annotation_cache: params.annotation_cache,
+    bgziptabix_merge_vep_options: modules['bgziptabix_merge_vep'],
+    bgziptabix_snpeff_options: modules['bgziptabix_snpeff'],
+    bgziptabix_vep_options: modules['bgziptabix_vep'],
+    merge_vep_options: modules['merge_vep'],
+    snpeff_options: modules['snpeff'],
+    snpeff_tag: "${modules['snpeff'].tag_base}.${params.genome}",
+    vep_options: modules['ensemblvep'],
+    vep_tag: "${modules['ensemblvep'].tag_base}.${params.genome}"
 )

 /*
 ========================================================================================
-    INCLUDE NF-CORE MODULES
+    IMPORT NF-CORE MODULES/SUBWORKFLOWS
 ========================================================================================
 */

-ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true)
+// Config files
+ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yaml", checkIfExists: true)
 ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty()
+modules['multiqc'].args += params.multiqc_title ? Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : ''

-def multiqc_options = modules['multiqc']
-multiqc_options.args += params.multiqc_title ? Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : ''
-include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams( options: multiqc_options )
-include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] )
-
-/*
-========================================================================================
-    INCLUDE NF-CORE SUBWORKFLOWS
-========================================================================================
-*/
+//
+// SUBWORKFLOWS
+//
 include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' addParams(
     fastqc_options: modules['fastqc'],
     trimgalore_options: modules['trimgalore']
 )
+//
+// MODULES: Installed directly from nf-core/modules
+//
+
+include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams(options: modules['multiqc'])
+include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams(options: [publish_files : ['tsv':'']])
+
 def multiqc_report = []

 workflow SAREK {

     ch_software_versions = Channel.empty()
+    qc_reports = Channel.empty()

-    // BUILD INDICES
+    // Build indices if needed
     BUILD_INDICES(
         dbsnp,
         fasta,
-        fasta_fai,
+        params.fasta_fai,
         germline_resource,
         known_indels,
         pon,
@@ -268,28 +278,34 @@ workflow SAREK {
         tools,
         step)

-    intervals = BUILD_INDICES.out.intervals
-
-    num_intervals = 0
-    intervals.count().map{
-        num_intervals = it
-    }
-
-    bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa
-    dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict
-    fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai
-
-    dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi) : BUILD_INDICES.out.dbsnp_tbi : []
-    germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi) : BUILD_INDICES.out.germline_resource_tbi : []
-    known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi) : BUILD_INDICES.out.known_indels_tbi.collect() : []
-    pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi) : BUILD_INDICES.out.pon_tbi : []
+    // Gather built indices or get them from the params
+    bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : BUILD_INDICES.out.bwa : ch_dummy_file
+    dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : BUILD_INDICES.out.dict : ch_dummy_file
+    fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : BUILD_INDICES.out.fai : ch_dummy_file
+    dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : BUILD_INDICES.out.dbsnp_tbi : ch_dummy_file
+    germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : BUILD_INDICES.out.germline_resource_tbi : ch_dummy_file
+    known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : BUILD_INDICES.out.known_indels_tbi : ch_dummy_file
+    pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : BUILD_INDICES.out.pon_tbi : ch_dummy_file
+    msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan
+    target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi

     //TODO @Rike, is this working for you?
-    known_sites = dbsnp ? dbsnp.concat(known_indels).collect() : ch_dummy_file
-    known_sites_tbi = dbsnp_tbi ? dbsnp_tbi.concat(known_indels_tbi).collect() : ch_dummy_file
+    // known_sites is made by grouping both the dbsnp and the known indels resources
+    // Either or both of them can be optional
+    known_sites = dbsnp.concat(known_indels).collect()
+    known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect()

-    msisensorpro_scan = BUILD_INDICES.out.msisensorpro_scan
-    target_bed_gz_tbi = BUILD_INDICES.out.target_bed_gz_tbi
+    // Intervals to speed up preprocessing/variant calling by spread/gather
+    intervals = BUILD_INDICES.out.intervals
+    num_intervals = 0
+    intervals.count().map{ num_intervals = it }
+
+    // Get versions from all software used
+    ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.bwa_version.ifEmpty(null))
+    ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.gatk_version.ifEmpty(null))
+    ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.samtools_version.ifEmpty(null))
+    ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.msisensorpro_scan_version.ifEmpty(null))
+    ch_software_versions = ch_software_versions.mix(BUILD_INDICES.out.tabix_version.ifEmpty(null))

     // PREPROCESSING

@@ -297,7 +313,6 @@ workflow SAREK {
     bam_mapped_qc = Channel.empty()
     bam_recalibrated_qc = Channel.empty()
     bam_variant_calling = Channel.empty()
-    qc_reports = Channel.empty()

     // STEP 0: QC & TRIM
     // `--skip_qc fastqc` to skip fastqc
@@ -307,26 +322,32 @@
     if (step == 'mapping') {
         FASTQC_TRIMGALORE(
             input_sample,
-            ('fastqc' in params.skip_qc),
+            ('fastqc' in skip_qc),
             !(params.trim_fastq))

+        // Get reads after optional trimming (+QC)
         reads_input = FASTQC_TRIMGALORE.out.reads

-        qc_reports = qc_reports.mix(
-            FASTQC_TRIMGALORE.out.fastqc_html,
-            FASTQC_TRIMGALORE.out.fastqc_zip,
-            FASTQC_TRIMGALORE.out.trim_html,
-            FASTQC_TRIMGALORE.out.trim_log,
-            FASTQC_TRIMGALORE.out.trim_zip)
+        // Get all QC reports for MultiQC
+        qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([]))
+        qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([]))
+        qc_reports = qc_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([]))
+
+        // Get versions from all software used
+        ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.fastqc_version.ifEmpty(null))
+        ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.trimgalore_version.ifEmpty(null))

         // STEP 1: MAPPING READS TO REFERENCE GENOME
-        MAPPING(params.aligner,
+        MAPPING(
+            params.aligner,
             bwa,
             fai,
             fasta,
             reads_input,
-            params.skip_markduplicates)
+            params.skip_markduplicates,
+            save_bam_mapped)

+        // Get mapped reads (BAM) with and without index
         bam_mapped = MAPPING.out.bam
         bam_indexed = MAPPING.out.bam_indexed

@@ -335,33 +356,35 @@
     }

     if (step == 'preparerecalibration') {
-        bam_mapped = Channel.empty()
-        if (params.skip_markduplicates) {
-            bam_indexed = input_sample
-        } else cram_markduplicates = input_sample
+        if (params.skip_markduplicates) bam_indexed = input_sample
+        else cram_markduplicates = input_sample
     }

     if (step in ['mapping', 'preparerecalibration']) {
-        // STEP 2: MARKING DUPLICATES AND/OR QC, conversion to CRAM
-        QC_MARKDUPLICATES(bam_mapped,
+        // STEP 2: Mark duplicates (+QC) + convert to CRAM
+        MARKDUPLICATES(
+            bam_mapped,
             bam_indexed,
             ('markduplicates' in params.use_gatk_spark),
-            !('markduplicates' in params.skip_qc),
-            fasta, fai, dict,
+            !('markduplicates' in skip_qc),
+            fasta,
+            fai,
+            dict,
             params.skip_markduplicates,
-            'bamqc' in params.skip_qc,
-            'samtools' in params.skip_qc,
+            ('bamqc' in skip_qc),
+            ('samtools' in skip_qc),
             target_bed)

-        cram_markduplicates = QC_MARKDUPLICATES.out.cram
+        cram_markduplicates = MARKDUPLICATES.out.cram

         // Create CSV to restart from this step
         MARKDUPLICATES_CSV(cram_markduplicates)

-        qc_reports = qc_reports.mix(QC_MARKDUPLICATES.out.qc)
+        qc_reports = qc_reports.mix(MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([]))

-        // STEP 3: CREATING RECALIBRATION TABLES
-        PREPARE_RECALIBRATION(cram_markduplicates,
+        // STEP 3: Create recalibration tables
+        PREPARE_RECALIBRATION(
+            cram_markduplicates,
             ('bqsr' in params.use_gatk_spark),
             dict,
             fai,
@@ -384,8 +407,8 @@
         // STEP 4: RECALIBRATING
         RECALIBRATE(
             ('bqsr' in params.use_gatk_spark),
-            ('bamqc' in params.skip_qc),
-            ('samtools' in params.skip_qc),
+            ('bamqc' in skip_qc),
+            ('samtools' in skip_qc),
             cram_applybqsr,
             dict,
             fai,
@@ -399,7 +422,7 @@

         RECALIBRATE_CSV(cram_recalibrated)

-        qc_reports = qc_reports.mix(cram_recalibrated_qc)
+        qc_reports = qc_reports.mix(cram_recalibrated_qc.collect{it[1]}.ifEmpty([]))

         cram_variant_calling = cram_recalibrated
     }

@@ -415,7 +438,7 @@
             tools,
             cram_variant_calling,
             dbsnp,
-            dbsnp_tbi.collect(),
+            dbsnp_tbi,
             dict,
             fai,
             fasta,
@@ -424,7 +447,8 @@
             target_bed,
             target_bed_gz_tbi)

-        vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.haplotypecaller_vcf, GERMLINE_VARIANT_CALLING.out.strelka_vcf)
+        vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.haplotypecaller_vcf)
+        vcf_to_annotate = vcf_to_annotate.mix(GERMLINE_VARIANT_CALLING.out.strelka_vcf)

         // SOMATIC VARIANT CALLING

@@ -474,8 +498,6 @@
         }
     }

-    if (step == 'mapping') ch_software_versions = ch_software_versions.mix(FASTQC_TRIMGALORE.out.fastqc_version.first().ifEmpty(null))
-
     ch_software_versions
         .map { it -> if (it) [ it.baseName, it ] }
         .groupTuple()
@@ -484,21 +506,27 @@
         .collect()
         .set { ch_software_versions }

-    GET_SOFTWARE_VERSIONS(ch_software_versions.map{it}.collect())
+    ch_version_yaml = Channel.empty()
+    if (!('versions' in skip_qc)) {
+        GET_SOFTWARE_VERSIONS(ch_software_versions.map{it}.collect())
+        ch_version_yaml = GET_SOFTWARE_VERSIONS.out.yaml.collect()
+    }

     workflow_summary = WorkflowSarek.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)

     ch_multiqc_files = Channel.empty()
-    ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_config)
     ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(GET_SOFTWARE_VERSIONS.out.yaml.collect())
-    if (step == 'mapping') ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(ch_version_yaml)
+    ch_multiqc_files = ch_multiqc_files.mix(qc_reports)

-    MULTIQC(ch_multiqc_files.collect())
-    multiqc_report = MULTIQC.out.report.toList()
-    ch_software_versions = ch_software_versions.mix(MULTIQC.out.version.ifEmpty(null))
+    multiqc_report = Channel.empty()
+    if (!('multiqc' in skip_qc)) {
+        MULTIQC(ch_multiqc_files.collect())
+        multiqc_report = MULTIQC.out.report.toList()
+    }
 }

 workflow.onComplete {
@@ -518,7 +546,7 @@ def extract_csv(csv_file) {
         }.groupTuple()
         .map{ meta, rows ->
             size = rows.size()
-            return [rows, size]
+            [rows, size]
         }.transpose()
         .map{ row, numLanes -> //from here do the usual thing for csv parsing
             def meta = [:]
@@ -541,15 +569,15 @@ def extract_csv(csv_file) {
         else meta.status = 0

         // mapping with fastq
-        if (row.lane && row.fastq2) {
+        if (row.lane && row.fastq_2) {
             meta.id = "${row.sample}-${row.lane}".toString()
-            def fastq1 = file(row.fastq1, checkIfExists: true)
-            def fastq2 = file(row.fastq2, checkIfExists: true)
+            def fastq_1 = file(row.fastq_1, checkIfExists: true)
+            def fastq_2 = file(row.fastq_2, checkIfExists: true)
             def CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ''
             def read_group = "\"@RG\\tID:${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:ILLUMINA\""
             meta.numLanes = numLanes.toInteger()
             meta.read_group = read_group.toString()
-            return [meta, [fastq1, fastq2]]
+            return [meta, [fastq_1, fastq_2]]
         // recalibration
         } else if (row.table && row.cram) {
             meta.id = meta.sample
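A note on the known_sites change above: the old code guarded the concatenation with a ternary, while the new code appears to rely on the dbsnp and known_indels channels already defaulting to a dummy file, so it can simply concatenate whichever resources are present and collect them into a single value channel reusable by every BaseRecalibrator task. The snippet below is a minimal standalone sketch of that concat/collect pattern; the file name dbsnp.vcf.gz is made up for illustration, not a pipeline input:

    // sketch.nf -- illustrative only, not part of the pipeline
    nextflow.enable.dsl = 2

    workflow {
        dbsnp        = Channel.fromPath('dbsnp.vcf.gz') // stand-in resource
        known_indels = Channel.empty()                  // resource not provided

        // concat emits whatever is present; collect folds the result into one
        // reusable value channel (a list of paths)
        known_sites = dbsnp.concat(known_indels).collect()
        known_sites.view() // => [dbsnp.vcf.gz]
    }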
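For the extract_csv() hunks: the groupTuple / [rows, size] / transpose chain exists so that every FASTQ row comes back out stamped with the total number of lanes observed for its sample, later stored as meta.numLanes. A self-contained sketch of the trick, using invented sample and lane values instead of a real samplesheet:

    // lanes.nf -- illustrative only, not part of the pipeline
    nextflow.enable.dsl = 2

    workflow {
        Channel.of(
            [sample: 'S1', lane: 'L001'],
            [sample: 'S1', lane: 'L002'],
            [sample: 'S2', lane: 'L001'])
            .map{ row -> [row.sample, row] }            // key each row by sample
            .groupTuple()                               // [sample, [rows...]]
            .map{ sample, rows -> [rows, rows.size()] } // attach the group size
            .transpose()                                // one [row, numLanes] per row
            .map{ row, numLanes -> "${row.sample}-${row.lane}: ${numLanes} lane(s)" }
            .view()
    }

Because transpose() repeats scalar tuple elements for each entry of the list element, every row is re-emitted paired with its sample's lane count.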