diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 2607f9286a..5a3eda8264 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,4 +1,3 @@ - name: Bug report description: Report something that is broken or incorrect labels: bug diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 97e0356171..e25d253329 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -18,7 +18,7 @@ jobs: # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters - + with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 60c5d6b971..38b8a5e644 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,7 +12,6 @@ jobs: steps: - name: Launch workflow via tower uses: nf-core/tower-action@v2 - with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c04e3c9d0b..7bb63d02f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,7 +66,7 @@ jobs: run: python -m pip install --upgrade pip pytest-workflow - name: Run pipeline with tests settings - run: pytest --tag ${{ matrix.test }} --kwdof + run: pytest --tag ${{ matrix.test }} --kwdof --git-aware - name: Output log on failure if: failure() diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3b448773c4..72fadf1ddb 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -12,9 +12,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 + - uses: actions/setup-node@v2 with: - node-version: '10' + node-version: '17' - name: Install markdownlint run: npm install -g markdownlint-cli - name: Run Markdownlint @@ -51,9 +51,9 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 + - uses: actions/setup-node@v2 with: - node-version: '10' + node-version: '17' - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -65,13 +65,13 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 - - uses: actions/setup-node@v1 + - uses: actions/setup-node@v2 with: - node-version: '10' + node-version: '17' - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") -c ${GITHUB_WORKSPACE}/.yamllint.yml # If the above check failed, post a comment on the PR explaining the failure - name: Post PR comment @@ -87,7 +87,7 @@ jobs: * Install `yaml-lint` * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) * Fix the markdown errors - * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` + * Run the test locally: `yamllint $(find . 
-type f -name "*.yml" -o -name "*.yaml") -c ./.yamllint.yml` * Fix any reported errors in your YAML files Once you push these changes the test should pass, and you can hide this comment :+1: @@ -120,7 +120,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install git+https://github.com/nf-core/tools.git@dev - name: Run nf-core lint env: @@ -142,4 +142,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 44d72994b0..90d1577082 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -2,7 +2,6 @@ name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork - on: workflow_run: workflows: ["nf-core linting"] diff --git a/.github/workflows/local_modules.yml b/.github/workflows/local_modules.yml index 838351536d..1e68e39f95 100644 --- a/.github/workflows/local_modules.yml +++ b/.github/workflows/local_modules.yml @@ -1,6 +1,5 @@ name: Local Modules pytest-workflow on: [push, pull_request] - jobs: changes: name: Check for changes diff --git a/.nf-core.yml b/.nf-core.yml index b994ae2cce..7b702a060a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,6 +1,9 @@ +repository_type: pipeline lint: files_unchanged: + - .github/workflows/branch.yml - .github/workflows/linting.yml + - .github/workflows/linting_comment.yml - assets/multiqc_config.yaml - assets/nf-core-sarek_logo_light.png - docs/images/nf-core-sarek_logo_dark.png diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 0000000000..d466deec92 --- /dev/null +++ b/.yamllint.yml @@ -0,0 +1,6 @@ +extends: default + +rules: + document-start: disable + line-length: disable + truthy: disable diff --git a/LICENSE b/LICENSE index 6060922966..f257f6ad4c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Maxime Garcia, Szilveszter Juhos +Copyright (c) Maxime Garcia, Szilveszter Juhos, Friederike Hanssen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 60a50311bc..e6fb23e06a 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations If you use `nf-core/sarek` for your analysis, please cite the `Sarek` article as follows: -> Garcia M, Juhos S, Larsson M et al. **Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants [version 2; peer review: 2 approved]** *F1000Research* 2020, 9:63 [doi: 10.12688/f1000research.16665.2](http://dx.doi.org/10.12688/f1000research.16665.2). +> Garcia M, Juhos S, Larsson M et al. **Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants [version 2; peer review: 2 approved]** _F1000Research_ 2020, 9:63 [doi: 10.12688/f1000research.16665.2](http://dx.doi.org/10.12688/f1000research.16665.2). 
You can cite the sarek zenodo record for a specific version using the following [doi: 10.5281/zenodo.3476426](https://zenodo.org/badge/latestdoi/184289291) diff --git a/conf/modules.config b/conf/modules.config index 731e9d8db8..bb471ecf54 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -71,6 +71,15 @@ process { ] } + withName: 'GATK4_INTERVALLISTTOBED' { + publishDir = [ + enabled: "${params.save_reference}", + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/reference/intervals" }, + pattern: "*bed" + ] + } + withName: 'MSISENSORPRO_SCAN' { ext.when = { params.tools && params.tools.contains('msisensorpro') } publishDir = [ @@ -142,73 +151,42 @@ process { } } -// UMI Subworkflow -process{ - withName: 'BAM2FASTQ' { - ext.args = '-T RX' - } - - withName: 'SAMBLASTER' { - ext.args = '-M --addMateTags' - ext.prefix = {"${meta.id}_unsorted_tagged"} - } +// BAM TO FASTQ +process { - withName: 'CALLUMICONSENSUS' { - ext.args = '-M 1 -S Coordinate' - ext.prefix = {"${meta.id}_umi-consensus"} + withName: 'SAMTOOLS_FASTQ_MAPPED'{ + ext.args2 = '-N' + ext.prefix = {"${meta.id}.mapped"} } -} -if (params.umi_read_structure) { - process { - withName: "NFCORE_SAREK:SAREK:CREATE_UMI_CONSENSUS:BWA.*_MEM" { - ext.args = '-p -C -M' - ext.args2 = '-bS' - ext.prefix = {"${meta.id}.umi_unsorted"} - } + withName: 'SAMTOOLS_FASTQ_UNMAPPED'{ + ext.args2 = '-N' + ext.prefix = {"${meta.id}.unmapped"} } -} -//BAMTOFASTQ process { withName: 'SAMTOOLS_VIEW_MAP_MAP' { ext.args = '-b -f1 -F12' ext.prefix = {"${meta.id}.map_map"} } - withName: 'SAMTOOLS_VIEW_UNMAP_UNMAP' { - ext.args = '-b -f12 -F256' - ext.prefix = {"${meta.id}.unmap_unmap"} - } - withName: 'SAMTOOLS_VIEW_UNMAP_MAP' { - ext.args = '-b -f4 -F264' - ext.prefix = {"${meta.id}.unmap_map"} - } + + withName: 'SAMTOOLS_VIEW_MAP_UNMAP' { ext.args = '-b -f8 -F260' ext.prefix = {"${meta.id}.map_unmap"} } - withName: 'SAMTOOLS_FASTQ_UNMAPPED'{ - ext.args2 = '-N' - ext.prefix = {"${meta.id}.unmapped"} + + withName: 'SAMTOOLS_VIEW_UNMAP_MAP' { + ext.args = '-b -f4 -F264' + ext.prefix = {"${meta.id}.unmap_map"} } - withName: 'SAMTOOLS_FASTQ_MAPPED'{ - ext.args2 = '-N' - ext.prefix = {"${meta.id}.mapped"} + + withName: 'SAMTOOLS_VIEW_UNMAP_UNMAP' { + ext.args = '-b -f12 -F256' + ext.prefix = {"${meta.id}.unmap_unmap"} } } -// FASTQ_QC_TRIM +// TRIMMING process { - withName: 'FASTQC' { - errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} - ext.args = '--quiet' - ext.when = { !(params.skip_tools && params.skip_tools.contains('fastqc')) } - publishDir = [ - enabled: true, - mode: "${params.publish_dir_mode}", - path: { "${params.outdir}/reports/fastqc/${meta.id}" } - ] - } withName: 'TRIMGALORE' { ext.args = '--fastqc' ext.when = { params.trim_fastq } @@ -220,7 +198,32 @@ process { } } -// MAPPING +// UMI Subworkflow +process{ + + withName: 'BAM2FASTQ' { + ext.args = '-T RX' + ext.when = { params.umi_read_structure } + } + + withName: 'CALLUMICONSENSUS' { + ext.args = '-M 1 -S Coordinate' + ext.prefix = {"${meta.id}_umi-consensus"} + } + + withName: "NFCORE_SAREK:SAREK:CREATE_UMI_CONSENSUS:MAPPING_UMI:BWAMEM.*_MEM" { + ext.args = '-p -C -M' + ext.args2 = '-bS' + ext.prefix = {"${meta.id}.umi_unsorted"} + } + + withName: 'SAMBLASTER' { + ext.args = '-M --addMateTags' + ext.prefix = {"${meta.id}_unsorted_tagged"} + } +} + +// SPLIT FASTQ process { withName: "SEQKIT_SPLIT2" { @@ -233,6 +236,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ?
null : filename } ] } +} + +// MAPPING +process { withName: "BWAMEM1_MEM" { ext.when = { params.aligner == "bwa-mem" } @@ -241,17 +248,20 @@ process { ext.when = { params.aligner == "bwa-mem2" } } - withName: "BWA.*MEM" { + withName: "NFCORE_SAREK:SAREK:GATK4_MAPPING:BWAMEM.*_MEM" { + // Using -B 3 for tumor samples ext.args = { meta.status == 1 ? '-K 100000000 -M -B 3' : '-K 100000000 -M' } - ext.prefix = { params.split_fastq > 1 ? "${meta.id}".concat('.').concat(reads.get(0).name.findAll(/part_([0-9]+)?/).last()) : "" } - } - - withName: 'MERGE_MAPPING' { - ext.when = { params.save_bam_mapped || (params.skip_tools && params.skip_tools.contains('markduplicates')) } + // Markduplicates Spark NEEDS name-sorted reads or runtime goes through the roof + // However if it's skipped, reads need to be coordinate-sorted + // Only name sort if Spark for Markduplicates + duplicate marking is not skipped + ext.args2 = { params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates') && (!params.skip_tools || (params.skip_tools && !params.skip_tools.contains('markduplicates'))) ? '-n' : '' } + ext.prefix = { params.split_fastq > 1 ? "${meta.id}".concat('.').concat(reads.get(0).name.findAll(/part_([0-9]+)?/).last()) : "" } + publishDir = [ + enabled: false + ] } - withName: 'INDEX_MAPPING' { - ext.when = { params.save_bam_mapped || (params.skip_tools && params.skip_tools.contains('markduplicates')) } + withName: 'INDEX_MERGE_BAM' { publishDir = [ enabled: true, mode: "${params.publish_dir_mode}", @@ -261,22 +271,11 @@ process { } } -// Markduplicates Spark NEEDS name-sorted reads or runtime goes through the roof -// However if it's skipped, reads need to be coordinate-sorted -// Only name sort if Spark for Markduplicates + duplicate marking is not skipped -if ( params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates') && (!params.skip_tools || (params.skip_tools && !params.skip_tools.contains('markduplicates')))) { - process { - withName: "BWA.*_MEM" { - ext.args2 = '-n' - } - } -} - // MARKDUPLICATES process { withName: 'GATK4_ESTIMATELIBRARYCOMPLEXITY' { ext.prefix = { "${meta.id}.md" } - ext.when = { (params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates')) && (!(params.skip_tools && params.skip_tools.contains('markduplicates_report'))) } + ext.when = { !(params.skip_tools && params.skip_tools.contains('markduplicates_report')) } publishDir = [ enabled: true, mode: "${params.publish_dir_mode}", @@ -287,16 +286,16 @@ process { withName: 'GATK4_MARKDUPLICATES' { ext.args = '-REMOVE_DUPLICATES false -VALIDATION_STRINGENCY LENIENT' ext.prefix = { "${meta.id}.md" } - //publishDir = [ - // enabled: false - // mode: "${params.publish_dir_mode}", - // path: { "${params.outdir}/preprocessing/${meta.id}/markduplicates" }, - //] + publishDir = [ + enabled: true, + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/reports/${meta.id}/markduplicates" }, + pattern: "*{metrics}" + ] } withName: 'GATK4_MARKDUPLICATES_SPARK' { ext.args = '--remove-sequencing-duplicates false -VS LENIENT' ext.prefix = { !(params.skip_tools && (params.skip_tools.contains('bamqc') || params.skip_tools.contains('deeptools'))) ? 
"${meta.id}.md.bam" : "${meta.id}.md.cram" } - ext.when = { params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates') } publishDir = [ enabled: true, mode: "${params.publish_dir_mode}", @@ -304,42 +303,10 @@ process { pattern: "*{cram,crai}" ] } - withName: 'QUALIMAP_BAMQC' { - ext.args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' - ext.prefix = { "${meta.id}.mapped" } - ext.when = { !(params.skip_tools && params.skip_tools.contains('bamqc')) } - publishDir = [ - enabled: true, - mode: "${params.publish_dir_mode}", - path: { "${params.outdir}/reports/qualimap/${meta.id}" } - ] - } - withName: 'SAMTOOLS_STATS' { - ext.when = { !(params.skip_tools && params.skip_tools.contains('samtools')) } - publishDir = [ - enabled: true, - mode: "${params.publish_dir_mode}", - path: { "${params.outdir}/reports/samtools_stats/${meta.id}" } - ] - } - withName: 'DEEPTOOLS_BAMCOVERAGE' { - ext.when = { !(params.skip_tools && params.skip_tools.contains('deeptools')) } - publishDir = [ - enabled: true, - mode: "${params.publish_dir_mode}", - path: { "${params.outdir}/reports/deeptools/${meta.id}" } - ] - } - withName: 'SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES' { - ext.when = { params.skip_tools && params.skip_tools.contains('markduplicates') } - } - withName: 'GATK4_MARKDUPLICATES|SAMTOOLS_BAM_TO_CRAM_DUPLICATES' { - ext.when = { !(params.skip_tools && params.skip_tools.contains('markduplicates')) && !(params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates')) } - } - withName: 'SAMTOOLS_BAM_TO_CRAM_SPARK' { - ext.when = { !(params.skip_tools && (params.skip_tools.contains('bamqc') || params.skip_tools.contains('deeptools'))) } + withName: 'GATK4_MARKDUPLICATES' { + ext.when = { !(params.skip_tools && params.skip_tools.contains('markduplicates')) } } - withName: 'SAMTOOLS_BAM_TO_CRAM_DUPLICATES|SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES|SAMTOOLS_BAM_TO_CRAM_SPARK' { + withName: 'SAMTOOLS_BAM_TO_CRAM' { ext.prefix = { "${meta.id}.md" } publishDir = [ enabled: true, @@ -369,13 +336,6 @@ process { pattern: "*.table" ] } - - withName: 'BASERECALIBRATOR' { - ext.when = { !params.use_gatk_spark || (params.use_gatk_spark && !params.use_gatk_spark.contains('baserecalibrator')) } - } - withName: 'BASERECALIBRATOR_SPARK' { - ext.when = { params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator') } - } withName: 'GATHERBQSRREPORTS' { ext.when = { !params.no_intervals } } @@ -384,28 +344,47 @@ process { // RECALIBRATE process { - withName: 'APPLYBQSR' { - ext.when = { !params.use_gatk_spark || (params.use_gatk_spark && !params.use_gatk_spark.contains('baserecalibrator')) } - } - withName: 'APPLYBQSR_SPARK' { - ext.when = { params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator') } - } withName: 'APPLYBQSR|APPLYBQSR_SPARK' { ext.prefix = {"${meta.id}.recal"} + publishDir = [ + enabled: false + ] } - withName: 'SAMTOOLS_MERGE_CRAM' { + withName: 'NFCORE_SAREK:SAREK:RECALIBRATE:MERGE_INDEX_CRAM:MERGE_CRAM' { ext.prefix = { "${meta.id}.recal" } ext.when = { !params.no_intervals } publishDir = [ - enabled: true, + enabled: false, mode: "${params.publish_dir_mode}", path: { "${params.outdir}/preprocessing/${meta.id}/recalibrated" }, pattern: "*cram" ] } - withName: 'QUALIMAP_BAMQCCRAM' { + withName: 'NFCORE_SAREK:SAREK:RECALIBRATE:MERGE_INDEX_CRAM:INDEX_CRAM' { + publishDir = [ + enabled: true, + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/preprocessing/${meta.id}/recalibrated" }, + pattern: 
"*{recal.cram,recal.cram.crai}" + ] + } +} + +// QC +process{ + withName: 'FASTQC' { + errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} + ext.args = '--quiet' + ext.when = { !(params.skip_tools && params.skip_tools.contains('fastqc')) } + publishDir = [ + enabled: true, + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/reports/fastqc/${meta.id}" } + ] + } + withName: 'QUALIMAP_BAMQC' { ext.args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' - ext.prefix = { "${meta.id}.recal" } + ext.prefix = { "${meta.id}.mapped" } ext.when = { !(params.skip_tools && params.skip_tools.contains('bamqc')) } publishDir = [ enabled: true, @@ -413,16 +392,33 @@ process { path: { "${params.outdir}/reports/qualimap/${meta.id}" } ] } - withName: 'INDEX_RECALIBRATE' { + withName: 'SAMTOOLS_STATS' { + ext.when = { !(params.skip_tools && params.skip_tools.contains('samtools')) } + publishDir = [ + enabled: true, + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/reports/samtools_stats/${meta.id}" } + ] + } + withName: 'DEEPTOOLS_BAMCOVERAGE' { + ext.when = { !(params.skip_tools && params.skip_tools.contains('deeptools')) } + publishDir = [ + enabled: true, + mode: "${params.publish_dir_mode}", + path: { "${params.outdir}/reports/deeptools/${meta.id}" } + ] + } + withName: 'QUALIMAP_BAMQCCRAM' { + ext.args = '--paint-chromosome-limits --genome-gc-distr HUMAN -skip-duplicated --skip-dup-mode 0 -outformat HTML' ext.prefix = { "${meta.id}.recal" } + ext.when = { !(params.skip_tools && params.skip_tools.contains('bamqc')) } publishDir = [ enabled: true, mode: "${params.publish_dir_mode}", - path: { "${params.outdir}/preprocessing/${meta.id}/recalibrated" }, - pattern: "*{recal.cram,recal.cram.crai}" + path: { "${params.outdir}/reports/qualimap/${meta.id}" } ] } - withName: 'SAMTOOLS_STATS' { + withName: 'NFCORE_SAREK:SAREK:CRAM_QC:SAMTOOLS_STATS' { ext.when = { !(params.skip_tools && params.skip_tools.contains('samtools')) } publishDir = [ enabled: true, @@ -453,7 +449,7 @@ process{ // DEEPVARIANT withName: 'BGZIP_VC_DEEPVARIANT_GVCF' { - ext.when = { params.generate_gvcf } + ext.when = { params.generate_gvcf && !params.no_intervals } } withName: 'CONCAT_DEEPVARIANT_.*' { publishDir = [ diff --git a/docs/output.md b/docs/output.md index 55b2965749..6916048284 100644 --- a/docs/output.md +++ b/docs/output.md @@ -592,7 +592,7 @@ For all samples: - `sample_R1_XXX_fastqc.zip` and `sample_R2_XXX_fastqc.zip` - Zip archive containing the FastQC report, tab-delimited data file and plot images -> **NB:** The `FastQC` plots displayed in the `MultiQC` report shows _untrimmed_ reads. +> **NB:** The `FastQC` plots displayed in the `MultiQC` report shows *untrimmed* reads. > They may contain adapter sequence and potentially regions with low quality. - `fastqc/` @@ -621,7 +621,7 @@ For further reading and documentation see the [Qualimap bamqc manual](http://qua More information in the [GATK MarkDuplicates section](#gatk-markduplicates) -Duplicates can arise during sample preparation _e.g._ library construction using PCR. +Duplicates can arise during sample preparation *e.g.* library construction using PCR. Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. These duplication artifacts are referred to as optical duplicates. 
diff --git a/docs/usage.md b/docs/usage.md index 643db4b71a..9a45aa43f7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -175,7 +175,14 @@ Work dir: Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` ``` -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so based on the search results the file we want is `modules/nf-core/software/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. Providing you haven't set any other standard nf-core parameters to __cap__ the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. +To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. +The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). +We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so based on the search results the file we want is `modules/nf-core/software/star/align/main.nf`. +If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). +The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. +The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. 
+Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. +The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. ```nextflow process { @@ -522,7 +529,7 @@ Each `FASTQ` file pair gets its own read group (`@RG`) in the resulting `BAM` fi * The sample name (`SM`) is derived from the the last component of the path given to `--input`. That is, you should make sure that that directory has a meaningful name! For example, with `--input=/my/fastqs/sample123`, the sample name will be `sample123`. -* The read group id is set to *flowcell.samplename.lane*. +* The read group id is set to _flowcell.samplename.lane_. The flowcell id and lane number are auto-detected from the name of the first read in the `FASTQ` file. ### --input <VCF> --step annotate diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 40ab65f205..b3d092f809 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -27,7 +27,7 @@ class NfcoreSchema { /* groovylint-disable-next-line UnusedPrivateMethodParameter */ public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { def has_error = false - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Check for nextflow core params and unexpected params def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') @@ -135,7 +135,7 @@ class NfcoreSchema { } } - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Validate parameters against the schema InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) diff --git a/modules/local/gatk4/applybqsrspark/main.nf b/modules/local/gatk4/applybqsrspark/main.nf index c8aa7ca84f..a50509d59a 100644 --- a/modules/local/gatk4/applybqsrspark/main.nf +++ b/modules/local/gatk4/applybqsrspark/main.nf @@ -2,10 +2,10 @@ process GATK4_APPLYBQSR_SPARK { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::gatk4=4.2.4.1" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gatk4:4.2.4.1--hdfd78af_0' : - 'quay.io/biocontainers/gatk4:4.2.4.1--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' : + 'quay.io/biocontainers/gatk4:4.2.3.0--hdfd78af_0' }" input: tuple val(meta), path(cram), path(crai), path(bqsr_table), path(intervals_bed) path fasta diff --git a/modules/local/gatk4/baserecalibratorspark/main.nf b/modules/local/gatk4/baserecalibratorspark/main.nf index 68c4084e26..1002a8b55c 100644 --- a/modules/local/gatk4/baserecalibratorspark/main.nf +++ b/modules/local/gatk4/baserecalibratorspark/main.nf @@ -2,10 +2,10 @@ process GATK4_BASERECALIBRATOR_SPARK { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::gatk4=4.2.4.1" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.2.4.1--hdfd78af_0' : - 'quay.io/biocontainers/gatk4:4.2.4.1--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' : + 'quay.io/biocontainers/gatk4:4.2.3.0--hdfd78af_0' }" input: tuple val(meta), path(cram), path(crai), path(intervals_bed) path fasta diff --git a/modules/local/gatk4/markduplicatesspark/main.nf b/modules/local/gatk4/markduplicatesspark/main.nf index 33b51a8611..9c8d50b587 100644 --- a/modules/local/gatk4/markduplicatesspark/main.nf +++ b/modules/local/gatk4/markduplicatesspark/main.nf @@ -2,10 +2,10 @@ process GATK4_MARKDUPLICATES_SPARK { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.2.5.0--hdfd78af_0' : - 'broadinstitute/gatk:4.2.5.0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' : + 'broadinstitute/gatk:4.2.3.0' }" input: tuple val(meta), path(bams) diff --git a/modules/local/samtools/fastq/main.nf b/modules/local/samtools/fastq/main.nf index caba63bd86..87978d05d6 100644 --- a/modules/local/samtools/fastq/main.nf +++ b/modules/local/samtools/fastq/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FASTQ { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::samtools=1.14" : null) + conda (params.enable_conda ? "bioconda::samtools=1.15" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.14--hb421002_0' : - 'quay.io/biocontainers/samtools:1.14--hb421002_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" input: tuple val(meta), path(input) diff --git a/modules/local/samtools/index/main.nf b/modules/local/samtools/index/main.nf index 02852d3f63..8d8f91dbe9 100644 --- a/modules/local/samtools/index/main.nf +++ b/modules/local/samtools/index/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::samtools=1.14" : null) + conda (params.enable_conda ? "bioconda::samtools=1.15" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.14--hb421002_0' : - 'quay.io/biocontainers/samtools:1.14--hb421002_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" input: tuple val(meta), path(input) diff --git a/modules/local/samtools/mergecram/main.nf b/modules/local/samtools/mergecram/main.nf index f1ab6f9679..9e91e09964 100644 --- a/modules/local/samtools/mergecram/main.nf +++ b/modules/local/samtools/mergecram/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_MERGE_CRAM { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::samtools=1.14" : null) + conda (params.enable_conda ? "bioconda::samtools=1.15" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.14--hb421002_0' : - 'quay.io/biocontainers/samtools:1.14--hb421002_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" input: tuple val(meta), path(crams) diff --git a/modules/local/samtools/viewindex/main.nf b/modules/local/samtools/viewindex/main.nf index 443c06a989..d37735d1f6 100644 --- a/modules/local/samtools/viewindex/main.nf +++ b/modules/local/samtools/viewindex/main.nf @@ -3,10 +3,10 @@ process SAMTOOLS_VIEWINDEX { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::samtools=1.14" : null) + conda (params.enable_conda ? "bioconda::samtools=1.15" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.14--hb421002_0' : - 'quay.io/biocontainers/samtools:1.14--hb421002_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" input: tuple val(meta), path(input), path(index) diff --git a/nextflow.config b/nextflow.config index ff77dbdaae..6a28d61fa0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,15 +40,15 @@ params { save_split_fastqs = false // UMI tagged reads - umi_read_structure = null // no umi - group_by_umi_strategy = 'Adjacency' // + umi_read_structure = null // no UMI + group_by_umi_strategy = 'Adjacency' // default strategy when UMI // Preprocessing - aligner = 'bwa-mem' + aligner = 'bwa-mem' // Default is bwa mem, bwa-mem2 mem can be used too markdup_java_options = '"-Xms4000m -Xmx7g"' // Established values for markDuplicates memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details use_gatk_spark = null // GATK Spark implementation of their tools in local mode not used by default save_bam_mapped = false // Mapped BAMs not saved - sequencing_center = null // No sequencing center to be written in BAM header in MapReads process + sequencing_center = null // No sequencing center to be written in BAM header by bwa/bwa-mem2 mem // Variant Calling ascat_ploidy = null // Use default value diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index e6e8898a78..11bcfa7280 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -2,9 +2,9 @@ // ANNOTATION // -include { ANNOTATION_SNPEFF } from '../nf-core/annotation_snpeff/main' -include { ANNOTATION_ENSEMBLVEP as MERGE_ANNOTATE } from '../nf-core/annotation_ensemblvep/main' -include { ANNOTATION_ENSEMBLVEP } from '../nf-core/annotation_ensemblvep/main' +include { 
ANNOTATION_SNPEFF } from '../nf-core/annotation/snpeff/main' +include { ANNOTATION_ENSEMBLVEP as MERGE_ANNOTATE } from '../nf-core/annotation/ensemblvep/main' +include { ANNOTATION_ENSEMBLVEP } from '../nf-core/annotation/ensemblvep/main' workflow ANNOTATE { take: diff --git a/subworkflows/local/bam2fastq.nf b/subworkflows/local/bam2fastq.nf index 2347eb00dc..a875db6d36 100644 --- a/subworkflows/local/bam2fastq.nf +++ b/subworkflows/local/bam2fastq.nf @@ -2,16 +2,14 @@ // BAM/CRAM to FASTQ conversion, paired end only // -//include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' -include { SAMTOOLS_INDEX } from '../../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_MAP } from '../../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_UNMAP } from '../../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_MAP } from '../../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_UNMAP } from '../../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_UNMAPPED } from '../../modules/nf-core/modules/samtools/merge/main' -include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_UNMAPPED } from '../../modules/local/samtools/fastq/main' -include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_MAPPED } from '../../modules/local/samtools/fastq/main' -include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_MAP } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_UNMAP } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_MAP } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_UNMAP } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_UNMAPPED } from '../../modules/nf-core/modules/samtools/merge/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_UNMAPPED } from '../../modules/local/samtools/fastq/main' +include { SAMTOOLS_FASTQ as SAMTOOLS_FASTQ_MAPPED } from '../../modules/local/samtools/fastq/main' +include { CAT_FASTQ } from '../../modules/nf-core/modules/cat/fastq/main' workflow ALIGNMENT_TO_FASTQ { take: @@ -20,25 +18,21 @@ workflow ALIGNMENT_TO_FASTQ { main: ch_versions = Channel.empty() - //Index File if not PROVIDED -> this also requires updates to samtools view possibly URGH + // Index File if not PROVIDED -> this also requires updates to samtools view possibly URGH //QC input BAM? 
-> needs another FASTQC module implementation - //MAP - MAP + // MAP - MAP SAMTOOLS_VIEW_MAP_MAP(input, fasta) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_MAP_MAP.out.versions) // UNMAP - UNMAP SAMTOOLS_VIEW_UNMAP_UNMAP(input, fasta) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_UNMAP_UNMAP.out.versions) // UNMAP - MAP SAMTOOLS_VIEW_UNMAP_MAP(input, fasta) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_UNMAP_MAP.out.versions) - //MAP - UNMAP + // MAP - UNMAP SAMTOOLS_VIEW_MAP_UNMAP(input, fasta) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_MAP_UNMAP.out.versions) // Merge UNMAP SAMTOOLS_VIEW_UNMAP_UNMAP.out.bam.join(SAMTOOLS_VIEW_UNMAP_MAP.out.bam, remainder: true) @@ -48,15 +42,12 @@ workflow ALIGNMENT_TO_FASTQ { }.set{ all_unmapped_bam } SAMTOOLS_MERGE_UNMAPPED(all_unmapped_bam, fasta) - ch_versions = ch_versions.mix(SAMTOOLS_MERGE_UNMAPPED.out.versions) // Collate & convert unmapped SAMTOOLS_FASTQ_UNMAPPED(SAMTOOLS_MERGE_UNMAPPED.out.bam) - ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_UNMAPPED.out.versions) // Collate & convert mapped SAMTOOLS_FASTQ_MAPPED(SAMTOOLS_VIEW_MAP_MAP.out.bam) - ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_MAPPED.out.versions) // join Mapped & unmapped fastq SAMTOOLS_FASTQ_UNMAPPED.out.reads.map{ meta, reads -> @@ -77,7 +68,16 @@ workflow ALIGNMENT_TO_FASTQ { // Concatenate Mapped_R1 with Unmapped_R1 and Mapped_R2 with Unmapped_R2 CAT_FASTQ(reads_to_concat) + + // Gather versions of all tools used ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_MAPPED.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ_UNMAPPED.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_MERGE_UNMAPPED.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_MAP_MAP.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_MAP_UNMAP.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_UNMAP_MAP.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_UNMAP_UNMAP.out.versions) emit: reads = CAT_FASTQ.out.reads diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index 3c8e1871de..cd940fc40f 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -24,7 +24,7 @@ include { DEEPVARIANT } from '../../modules/nf-cor include { FREEBAYES } from '../../modules/nf-core/modules/freebayes/main' include { GATK4_GENOTYPEGVCFS as GENOTYPEGVCFS } from '../../modules/nf-core/modules/gatk4/genotypegvcfs/main' include { GATK4_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../modules/nf-core/modules/gatk4/haplotypecaller/main' -include { GATK_JOINT_GERMLINE_VARIANT_CALLING } from '../../subworkflows/nf-core/joint_germline_variant_calling/main' +include { GATK_JOINT_GERMLINE_VARIANT_CALLING } from '../../subworkflows/nf-core/gatk4/joint_germline_variant_calling/main' include { MANTA_GERMLINE } from '../../modules/nf-core/modules/manta/germline/main' include { STRELKA_GERMLINE } from '../../modules/nf-core/modules/strelka/germline/main' include { TABIX_BGZIPTABIX as TABIX_BGZIP_TIDDIT_SV } from '../../modules/nf-core/modules/tabix/bgziptabix/main' diff --git a/subworkflows/local/mapping_csv.nf b/subworkflows/local/mapping_csv.nf index 4bcfb7dbd2..98e021a6ab 100644 --- a/subworkflows/local/mapping_csv.nf +++ b/subworkflows/local/mapping_csv.nf @@ -5,36 +5,17 @@ workflow MAPPING_CSV { take: bam_indexed // channel: [mandatory] meta, bam, bai - save_bam_mapped // boolean: [mandatory] save_bam_mapped - 
skip_markduplicates // boolean: [mandatory] skip_markduplicates main: - if (save_bam_mapped) { - csv_bam_mapped = bam_indexed.map { meta, bam, bai -> [meta] } - // Creating csv files to restart from this step - csv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/csv") { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" - bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" - ["mapped_${sample}.csv", "patient,gender,status,sample,bam,bai\n${patient},${gender},${status},${sample},${bam},${bai}\n"] - }.collectFile(name: "mapped.csv", keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/preprocessing/csv") - } - - if (skip_markduplicates) { - csv_bam_mapped = bam_indexed.map { meta, bam, bai -> [meta] } - // Creating csv files to restart from this step - csv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/csv") { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" - bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" - table = "${params.outdir}/preprocessing/${sample}/recal_table/${sample}.recal.table" - ["mapped_no_markduplicates_${sample}.csv", "patient,gender,status,sample,bam,bai,table\n${patient},${gender},${status},${sample},${bam},${bai},${table}\n"] - }.collectFile(name: 'mapped_no_markduplicates.csv', keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/preprocessing/csv") - } + csv_bam_mapped = bam_indexed.map { meta, bam, bai -> [meta] } + // Creating csv files to restart from this step + csv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/csv") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + ["mapped_${sample}.csv", "patient,gender,status,sample,bam,bai\n${patient},${gender},${status},${sample},${bam},${bai}\n"] + }.collectFile(name: "mapped.csv", keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/preprocessing/csv") } diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index 8cfd89f722..af1b22e245 100644 --- a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -13,7 +13,7 @@ include { CONCAT_VCF as CONCAT_MANTA_SOMATIC } from '../../modules/local/ include { CONCAT_VCF as CONCAT_MANTA_SV } from '../../modules/local/concat_vcf/main' include { CONCAT_VCF as CONCAT_STRELKA_INDELS } from '../../modules/local/concat_vcf/main' include { CONCAT_VCF as CONCAT_STRELKA_SNVS } from '../../modules/local/concat_vcf/main' -include { GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING } from '../../subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main' +include { GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING } from '../../subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main' include { MANTA_SOMATIC } from '../../modules/nf-core/modules/manta/somatic/main' include { MSISENSORPRO_MSI_SOMATIC } from '../../modules/nf-core/modules/msisensorpro/msi_somatic/main' include { STRELKA_SOMATIC } from '../../modules/nf-core/modules/strelka/somatic/main' diff --git 
a/subworkflows/local/prepare_intervals.nf b/subworkflows/local/prepare_intervals.nf index 505516f6ff..2af55711fe 100644 --- a/subworkflows/local/prepare_intervals.nf +++ b/subworkflows/local/prepare_intervals.nf @@ -14,7 +14,7 @@ include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_ALL } from '../../modu workflow PREPARE_INTERVALS { take: - fasta_fai // channel: [optional] fasta_fai + fasta_fai // channel: [mandatory] fasta_fai main: diff --git a/subworkflows/local/tumor_variant_calling.nf b/subworkflows/local/tumor_variant_calling.nf index 475afe5050..46d02d34e3 100644 --- a/subworkflows/local/tumor_variant_calling.nf +++ b/subworkflows/local/tumor_variant_calling.nf @@ -17,7 +17,7 @@ include { CONCAT_VCF as CONCAT_MANTA_TUMOR } from '../../modules/local/co include { CONCAT_VCF as CONCAT_STRELKA } from '../../modules/local/concat_vcf/main' include { CONCAT_VCF as CONCAT_STRELKA_GENOME } from '../../modules/local/concat_vcf/main' include { FREEBAYES } from '../../modules/nf-core/modules/freebayes/main' -include { GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING } from '../../subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/main' +include { GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING } from '../../subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main' include { MANTA_TUMORONLY } from '../../modules/nf-core/modules/manta/tumoronly/main' include { STRELKA_GERMLINE as STRELKA_TUMORONLY } from '../../modules/nf-core/modules/strelka/germline/main' include { TABIX_TABIX as TABIX_VC_FREEBAYES } from '../../modules/nf-core/modules/tabix/tabix/main' diff --git a/subworkflows/nf-core/annotation_ensemblvep/main.nf b/subworkflows/nf-core/annotation/ensemblvep/main.nf similarity index 90% rename from subworkflows/nf-core/annotation_ensemblvep/main.nf rename to subworkflows/nf-core/annotation/ensemblvep/main.nf index d8f21f10aa..479fed888a 100644 --- a/subworkflows/nf-core/annotation_ensemblvep/main.nf +++ b/subworkflows/nf-core/annotation/ensemblvep/main.nf @@ -2,8 +2,8 @@ // Run VEP to annotate VCF files // -include { ENSEMBLVEP } from '../../../modules/nf-core/modules/ensemblvep/main' -include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from '../../../modules/nf-core/modules/tabix/bgziptabix/main' +include { ENSEMBLVEP } from '../../../../modules/nf-core/modules/ensemblvep/main' +include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from '../../../../modules/nf-core/modules/tabix/bgziptabix/main' workflow ANNOTATION_ENSEMBLVEP { take: diff --git a/subworkflows/nf-core/annotation_ensemblvep/meta.yml b/subworkflows/nf-core/annotation/ensemblvep/meta.yml similarity index 100% rename from subworkflows/nf-core/annotation_ensemblvep/meta.yml rename to subworkflows/nf-core/annotation/ensemblvep/meta.yml diff --git a/subworkflows/nf-core/annotation_snpeff/main.nf b/subworkflows/nf-core/annotation/snpeff/main.nf similarity index 89% rename from subworkflows/nf-core/annotation_snpeff/main.nf rename to subworkflows/nf-core/annotation/snpeff/main.nf index d2625c1d9c..48f87afb8b 100644 --- a/subworkflows/nf-core/annotation_snpeff/main.nf +++ b/subworkflows/nf-core/annotation/snpeff/main.nf @@ -2,8 +2,8 @@ // Run SNPEFF to annotate VCF files // -include { SNPEFF } from '../../../modules/nf-core/modules/snpeff/main' -include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from '../../../modules/nf-core/modules/tabix/bgziptabix/main' +include { SNPEFF } from '../../../../modules/nf-core/modules/snpeff/main' +include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from 
'../../../../modules/nf-core/modules/tabix/bgziptabix/main' workflow ANNOTATION_SNPEFF { take: diff --git a/subworkflows/nf-core/annotation_snpeff/meta.yml b/subworkflows/nf-core/annotation/snpeff/meta.yml similarity index 100% rename from subworkflows/nf-core/annotation_snpeff/meta.yml rename to subworkflows/nf-core/annotation/snpeff/meta.yml diff --git a/subworkflows/nf-core/bam_to_cram.nf b/subworkflows/nf-core/bam_to_cram.nf new file mode 100644 index 0000000000..8f27bf1e50 --- /dev/null +++ b/subworkflows/nf-core/bam_to_cram.nf @@ -0,0 +1,48 @@ +// +// BAM TO CRAM and optional QC +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { DEEPTOOLS_BAMCOVERAGE } from '../../modules/nf-core/modules/deeptools/bamcoverage/main' +include { QUALIMAP_BAMQC } from '../../modules/nf-core/modules/qualimap/bamqc/main' +include { SAMTOOLS_VIEWINDEX as SAMTOOLS_BAM_TO_CRAM } from '../../modules/local/samtools/viewindex/main' + +workflow BAM_TO_CRAM { + take: + bam_indexed // channel: [mandatory] meta, bam, bai + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals_combined_bed_gz_tbi // channel: [optional] intervals_bed.gz, intervals_bed.gz.tbi + + main: + ch_versions = Channel.empty() + qc_reports = Channel.empty() + + // remap to have channel without bam index + bam_no_index = bam_indexed.map{ meta, bam, bai -> [meta, bam] } + + // Convert bam input to cram + SAMTOOLS_BAM_TO_CRAM(bam_indexed, fasta, fasta_fai) + + // Reports on bam input + DEEPTOOLS_BAMCOVERAGE(bam_indexed) + QUALIMAP_BAMQC(bam_no_index, intervals_combined_bed_gz_tbi) + + // Other reports run on cram + + // Gather all reports generated + qc_reports = qc_reports.mix(DEEPTOOLS_BAMCOVERAGE.out.bigwig) + qc_reports = qc_reports.mix(QUALIMAP_BAMQC.out.results) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(DEEPTOOLS_BAMCOVERAGE.out.versions.first()) + ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) + ch_versions = ch_versions.mix(SAMTOOLS_BAM_TO_CRAM.out.versions.first()) + + emit: + cram = SAMTOOLS_BAM_TO_CRAM.out.cram_crai + qc = qc_reports + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/cram_qc.nf b/subworkflows/nf-core/cram_qc.nf new file mode 100644 index 0000000000..8acd2d65c1 --- /dev/null +++ b/subworkflows/nf-core/cram_qc.nf @@ -0,0 +1,37 @@ +// +// QC on CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_STATS } from '../../modules/nf-core/modules/samtools/stats/main' +include { QUALIMAP_BAMQCCRAM } from '../../modules/nf-core/modules/qualimap/bamqccram/main' + +workflow CRAM_QC { + take: + cram // channel: [mandatory] meta, cram, crai + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals_combined_bed_gz_tbi + + main: + ch_versions = Channel.empty() + qc_reports = Channel.empty() + + // Reports run on cram + SAMTOOLS_STATS(cram, fasta) + QUALIMAP_BAMQCCRAM(cram, intervals_combined_bed_gz_tbi, fasta, fasta_fai) + + // Gather all reports generated + qc_reports = qc_reports.mix(SAMTOOLS_STATS.out.stats) + qc_reports = qc_reports.mix(QUALIMAP_BAMQCCRAM.out.results) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(QUALIMAP_BAMQCCRAM.out.versions.first()) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + + emit:
qc = qc_reports + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf index f8c2eaad61..785ebd7e5d 100644 --- a/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf +++ b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf @@ -3,15 +3,15 @@ // Convert them to unmapped BAM file, map them to the reference genome, // use the mapped information to group UMIs and generate consensus reads // +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { BWAMEM2_MEM } from '../../../modules/nf-core/modules/bwamem2/mem/main' -include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/nf-core/modules/bwa/mem/main' -include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/nf-core/modules/fgbio/callmolecularconsensusreads/main.nf' -include { FGBIO_FASTQTOBAM as FASTQTOBAM } from '../../../modules/nf-core/modules/fgbio/fastqtobam/main' -include { FGBIO_GROUPREADSBYUMI as GROUPREADSBYUMI } from '../../../modules/nf-core/modules/fgbio/groupreadsbyumi/main' -include { SAMBLASTER } from '../../../modules/nf-core/modules/samblaster/main' -include { SAMTOOLS_BAM2FQ as BAM2FASTQ } from '../../../modules/nf-core/modules/samtools/bam2fq/main.nf' +include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/nf-core/modules/fgbio/callmolecularconsensusreads/main.nf' +include { FGBIO_FASTQTOBAM as FASTQTOBAM } from '../../../modules/nf-core/modules/fgbio/fastqtobam/main' +include { FGBIO_GROUPREADSBYUMI as GROUPREADSBYUMI } from '../../../modules/nf-core/modules/fgbio/groupreadsbyumi/main' +include { GATK4_MAPPING as MAPPING_UMI } from '../gatk4/mapping/main' +include { SAMBLASTER } from '../../../modules/nf-core/modules/samblaster/main' +include { SAMTOOLS_BAM2FQ as BAM2FASTQ } from '../../../modules/nf-core/modules/samtools/bam2fq/main.nf' workflow CREATE_UMI_CONSENSUS { take: @@ -20,54 +20,43 @@ workflow CREATE_UMI_CONSENSUS { bwa // channel: [mandatory] Pre-computed BWA index (either bwa-mem or bwa-mem2; MUST be matching to chosen aligner) read_structure // string: [mandatory] "read_structure" groupreadsbyumi_strategy // string: [mandatory] grouping strategy - default: "Adjacency" - aligner // string: [mandatory] "bwa-mem" or "bwa-mem2" main: ch_versions = Channel.empty() // using information in val(read_structure) FASTQ reads are converted into // a tagged unmapped BAM file (uBAM) - FASTQTOBAM ( reads, read_structure ) - ch_versions = ch_versions.mix(FASTQTOBAM.out.versions) + FASTQTOBAM(reads, read_structure) // in order to map uBAM using BWA MEM, we need to convert uBAM to FASTQ // but keep the appropriate UMI tags in the FASTQ comment field and produce // an interleaved FASQT file (hence, split = false) split = false - BAM2FASTQ ( FASTQTOBAM.out.umibam, split ) - ch_versions = ch_versions.mix(BAM2FASTQ.out.versions) - - // the user can choose here to use either bwa-mem (default) or bwa-mem2 - aligned_bam = Channel.empty() + BAM2FASTQ(FASTQTOBAM.out.umibam, split) - if (aligner == "bwa-mem") { - - // appropriately tagged interleaved FASTQ reads are mapped to the reference - BWAMEM1_MEM ( BAM2FASTQ.out.reads, bwa, false ) - ch_versions = ch_versions.mix(BWAMEM1_MEM.out.versions) - aligned_bam = BWAMEM1_MEM.out.bam - } else { - - // appropriately tagged interleaved FASTQ reads are mapped to the reference - BWAMEM2_MEM ( 
BAM2FASTQ.out.reads, bwa, false ) - ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions) - aligned_bam = BWAMEM2_MEM.out.bam - } + // appropriately tagged interleaved FASTQ reads are mapped to the reference + // bams will not be sorted (hence, sort = false) + sort = false + MAPPING_UMI(BAM2FASTQ.out.reads, bwa, sort) // samblaster is used in order to tag mates information in the BAM file // this is used in order to group reads by UMI - SAMBLASTER ( aligned_bam ) - ch_versions = ch_versions.mix(SAMBLASTER.out.versions) + SAMBLASTER(MAPPING_UMI.out.bam) // appropriately tagged reads are now grouped by UMI information - GROUPREADSBYUMI ( SAMBLASTER.out.bam, groupreadsbyumi_strategy ) - ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions) + GROUPREADSBYUMI(SAMBLASTER.out.bam, groupreadsbyumi_strategy) + + // Using newly created groups + // To call a consensus across reads in the same group + // And emit a consensus BAM file + CALLUMICONSENSUS(GROUPREADSBYUMI.out.bam) - // using the above created groups, a consensus across reads in the same grou - // can be called - // this will emit a consensus BAM file - CALLUMICONSENSUS ( GROUPREADSBYUMI.out.bam ) + ch_versions = ch_versions.mix(BAM2FASTQ.out.versions) + ch_versions = ch_versions.mix(MAPPING_UMI.out.versions) ch_versions = ch_versions.mix(CALLUMICONSENSUS.out.versions) + ch_versions = ch_versions.mix(FASTQTOBAM.out.versions) + ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions) + ch_versions = ch_versions.mix(SAMBLASTER.out.versions) emit: umibam = FASTQTOBAM.out.umibam // channel: [ val(meta), [ bam ] ] diff --git a/subworkflows/nf-core/joint_germline_variant_calling/main.nf b/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf similarity index 95% rename from subworkflows/nf-core/joint_germline_variant_calling/main.nf rename to subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf index 195c6bb92a..3e2240423d 100644 --- a/subworkflows/nf-core/joint_germline_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf @@ -2,11 +2,11 @@ // Run GATK haplotypecaller for all input samples, merge them with genomicsdbimport, perform joint genotyping with genotypeGVCFS and recalibrate with variantrecalibrator & applyvqsr. 
// -include { GATK4_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../../modules/nf-core/modules/gatk4/haplotypecaller/main' -include { GATK4_GENOMICSDBIMPORT as GENOMICSDBIMPORT } from '../../../modules/nf-core/modules/gatk4/genomicsdbimport/main' -include { GATK4_GENOTYPEGVCFS as GENOTYPEGVCFS } from '../../../modules/nf-core/modules/gatk4/genotypegvcfs/main' -include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR } from '../../../modules/nf-core/modules/gatk4/variantrecalibrator/main' -include { GATK4_APPLYVQSR as APPLYVQSR } from '../../../modules/nf-core/modules/gatk4/applyvqsr/main' +include { GATK4_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../../../modules/nf-core/modules/gatk4/haplotypecaller/main' +include { GATK4_GENOMICSDBIMPORT as GENOMICSDBIMPORT } from '../../../../modules/nf-core/modules/gatk4/genomicsdbimport/main' +include { GATK4_GENOTYPEGVCFS as GENOTYPEGVCFS } from '../../../../modules/nf-core/modules/gatk4/genotypegvcfs/main' +include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR } from '../../../../modules/nf-core/modules/gatk4/variantrecalibrator/main' +include { GATK4_APPLYVQSR as APPLYVQSR } from '../../../../modules/nf-core/modules/gatk4/applyvqsr/main' workflow GATK_JOINT_GERMLINE_VARIANT_CALLING { take: diff --git a/subworkflows/nf-core/joint_germline_variant_calling/meta.yml b/subworkflows/nf-core/gatk4/joint_germline_variant_calling/meta.yml similarity index 100% rename from subworkflows/nf-core/joint_germline_variant_calling/meta.yml rename to subworkflows/nf-core/gatk4/joint_germline_variant_calling/meta.yml diff --git a/subworkflows/nf-core/gatk4/mapping/main.nf b/subworkflows/nf-core/gatk4/mapping/main.nf new file mode 100644 index 0000000000..77f1599ede --- /dev/null +++ b/subworkflows/nf-core/gatk4/mapping/main.nf @@ -0,0 +1,43 @@ +// +// MAPPING +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BWAMEM2_MEM } from '../../../../modules/nf-core/modules/bwamem2/mem/main' +include { BWA_MEM as BWAMEM1_MEM } from '../../../../modules/nf-core/modules/bwa/mem/main' + +workflow GATK4_MAPPING { + take: + ch_reads // channel: [mandatory] meta, reads + ch_bwa // channel: [mandatory] bwa + sort // boolean: [mandatory] true -> sort, false -> don't sort + + main: + + ch_versions = Channel.empty() + + // Only one of the following should be run + BWAMEM1_MEM(ch_reads, ch_bwa, sort) // If aligner is bwa-mem + BWAMEM2_MEM(ch_reads, ch_bwa, sort) // If aligner is bwa-mem2 + + // Grouping the bams from the same samples not to stall the workflow + ch_bam_mapped = BWAMEM1_MEM.out.bam.mix(BWAMEM2_MEM.out.bam).map{ meta, bam -> + new_meta = meta.clone() + + // Use groupKey to make sure that the correct group can advance as soon as it is complete + // and not stall the workflow until all reads from all channels are mapped + def groupKey = groupKey(new_meta, meta.numLanes * meta.size) + + //Returns the values we need + tuple(groupKey, new_meta, bam) + }.groupTuple(by:[0,1]).map{ groupKey, new_meta, bam -> [new_meta, bam] } + + // Gather versions of all tools used + ch_versions = ch_versions.mix(BWAMEM1_MEM.out.versions.first()) + ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions.first()) + + emit: + bam = ch_bam_mapped // channel: [ [meta], bam ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/gatk4/markduplicates/main.nf b/subworkflows/nf-core/gatk4/markduplicates/main.nf new file mode 100644 index 0000000000..0d7393cb3a 
diff --git a/subworkflows/nf-core/gatk4/markduplicates/main.nf b/subworkflows/nf-core/gatk4/markduplicates/main.nf new file mode 100644 index 0000000000..0d7393cb3a --- /dev/null +++ b/subworkflows/nf-core/gatk4/markduplicates/main.nf @@ -0,0 +1,39 @@ +// +// MARKDUPLICATES +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_MARKDUPLICATES } from '../../../../modules/nf-core/modules/gatk4/markduplicates/main' +include { BAM_TO_CRAM } from '../../bam_to_cram' + +workflow MARKDUPLICATES { + take: + bam // channel: [mandatory] meta, bam + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals_combined_bed_gz_tbi // channel: [optional] intervals_bed.gz, intervals_bed.gz.tbi + + main: + ch_versions = Channel.empty() + qc_reports = Channel.empty() + + // Run Markduplicates + GATK4_MARKDUPLICATES(bam) + + // Convert output to cram + BAM_TO_CRAM(GATK4_MARKDUPLICATES.out.bam.join(GATK4_MARKDUPLICATES.out.bai), fasta, fasta_fai, intervals_combined_bed_gz_tbi) + + // Gather all reports generated + qc_reports = qc_reports.mix(GATK4_MARKDUPLICATES.out.metrics) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES.out.versions.first()) + ch_versions = ch_versions.mix(BAM_TO_CRAM.out.versions) + + emit: + cram = BAM_TO_CRAM.out.cram + qc = qc_reports + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/gatk4/markduplicates_spark/main.nf b/subworkflows/nf-core/gatk4/markduplicates_spark/main.nf new file mode 100644 index 0000000000..e1fc6d5c1a --- /dev/null +++ b/subworkflows/nf-core/gatk4/markduplicates_spark/main.nf @@ -0,0 +1,59 @@ +// +// MARKDUPLICATES AND/OR QC after mapping +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../../../modules/nf-core/modules/gatk4/estimatelibrarycomplexity/main' +include { GATK4_MARKDUPLICATES_SPARK } from '../../../../modules/local/gatk4/markduplicatesspark/main' +include { SAMTOOLS_INDEX as INDEX_MARKDUPLICATES } from '../../../../modules/local/samtools/index/main' +include { BAM_TO_CRAM } from '../../bam_to_cram' + +workflow MARKDUPLICATES_SPARK { + take: + bam // channel: [mandatory] meta, bam + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals_combined_bed_gz_tbi // channel: [optional] intervals_bed.gz, intervals_bed.gz.tbi + + main: + ch_versions = Channel.empty() + qc_reports = Channel.empty() + + // Run Markduplicates spark + // When running bamqc and/or deeptools, output is bam, else cram + GATK4_MARKDUPLICATES_SPARK(bam, fasta, fasta_fai, dict) + INDEX_MARKDUPLICATES(GATK4_MARKDUPLICATES_SPARK.out.output) + + // Convert Markduplicates spark bam output to cram when running bamqc and/or deeptools + BAM_TO_CRAM(INDEX_MARKDUPLICATES.out.bam_bai, fasta, fasta_fai, intervals_combined_bed_gz_tbi) + + // Only one of these channels is not empty: + // - running Markduplicates spark with bam output + // - running Markduplicates spark with cram output + cram_markduplicates = Channel.empty().mix( + BAM_TO_CRAM.out.cram, + GATK4_MARKDUPLICATES_SPARK.out.output.join(INDEX_MARKDUPLICATES.out.cram_crai)) + + // When running Markduplicates spark and saving reports + GATK4_ESTIMATELIBRARYCOMPLEXITY(bam, fasta, fasta_fai, dict) + + // Other reports done either with BAM_TO_CRAM subworkflow + // or CRAM_QC subworkflow + + // Gather all reports generated + qc_reports =
qc_reports.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.metrics) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.versions.first()) + ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES_SPARK.out.versions.first()) + ch_versions = ch_versions.mix(INDEX_MARKDUPLICATES.out.versions.first()) + ch_versions = ch_versions.mix(BAM_TO_CRAM.out.versions.first()) + + emit: + cram = cram_markduplicates + qc = qc_reports + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf b/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf new file mode 100644 index 0000000000..e05e873b97 --- /dev/null +++ b/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf @@ -0,0 +1,58 @@ +// +// PREPARE RECALIBRATION +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../../../modules/nf-core/modules/gatk4/baserecalibrator/main' +include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../../../modules/nf-core/modules/gatk4/gatherbqsrreports/main' + +workflow PREPARE_RECALIBRATION { + take: + cram // channel: [mandatory] cram_markduplicates + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals // channel: [mandatory] intervals + known_sites // channel: [optional] known_sites + known_sites_tbi // channel: [optional] known_sites_tbi + num_intervals // value: [mandatory] number of intervals + + main: + ch_versions = Channel.empty() + + cram_intervals = cram.combine(intervals) + .map{ meta, cram, crai, intervals -> + new_meta = meta.clone() + new_meta.id = num_intervals == 1 ? 
meta.sample : meta.sample + "_" + intervals.baseName + [new_meta, cram, crai, intervals] + } + + // Run Baserecalibrator + BASERECALIBRATOR(cram_intervals, fasta, fasta_fai, dict, known_sites, known_sites_tbi) + + // Figuring out whether there is a single table or multiple tables from the same sample + ch_table = BASERECALIBRATOR.out.table + .map{ meta, table -> + meta.id = meta.sample + [meta, table] + }.groupTuple(size: num_intervals) + .branch{ + single: it[1].size() == 1 + multiple: it[1].size() > 1 + }.set{table_to_merge} + + // STEP 3.5: MERGING RECALIBRATION TABLES + + // Merge the tables only when we have intervals + GATHERBQSRREPORTS(table_to_merge.multiple) + table_bqsr = table_to_merge.single.mix(GATHERBQSRREPORTS.out.table) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(BASERECALIBRATOR.out.versions) + ch_versions = ch_versions.mix(GATHERBQSRREPORTS.out.versions) + + emit: + table_bqsr = table_bqsr + versions = ch_versions // channel: [versions.yml] +}
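The single/multiple branch above deserves a quick illustration: after groupTuple, every sample carries a list of per-interval tables, and samples with a single table bypass GATHERBQSRREPORTS entirely, being mixed back with the merged output downstream. A minimal sketch with hypothetical table names:

ch_tables = Channel.of(
    [[id: 'sampleA'], ['sampleA_chr1.table', 'sampleA_chr2.table']],
    [[id: 'sampleB'], ['sampleB.table']])

ch_tables.branch{
    single: it[1].size() == 1
    multiple: it[1].size() > 1
}.set{ table_to_merge }

// table_to_merge.multiple would feed the merging step,
// table_to_merge.single passes through untouched
table_to_merge.single.view{ "no merge needed: ${it[0].id}" }
table_to_merge.multiple.view{ "to merge: ${it[0].id}" }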
diff --git a/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf b/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf new file mode 100644 index 0000000000..8020ccb595 --- /dev/null +++ b/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf @@ -0,0 +1,58 @@ +// +// PREPARE RECALIBRATION with SPARK +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../../../modules/local/gatk4/baserecalibratorspark/main' +include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../../../modules/nf-core/modules/gatk4/gatherbqsrreports/main' + +workflow PREPARE_RECALIBRATION_SPARK { + take: + cram // channel: [mandatory] cram_markduplicates + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals // channel: [mandatory] intervals + known_sites // channel: [optional] known_sites + known_sites_tbi // channel: [optional] known_sites_tbi + num_intervals // value: [mandatory] number of intervals + + main: + ch_versions = Channel.empty() + + cram_intervals = cram.combine(intervals) + .map{ meta, cram, crai, intervals -> + new_meta = meta.clone() + new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName + [new_meta, cram, crai, intervals] + } + + // Run Baserecalibrator spark + BASERECALIBRATOR_SPARK(cram_intervals, fasta, fasta_fai, dict, known_sites, known_sites_tbi) + + // Figuring out whether there is a single table or multiple tables from the same sample + ch_table = BASERECALIBRATOR_SPARK.out.table + .map{ meta, table -> + meta.id = meta.sample + [meta, table] + }.groupTuple(size: num_intervals) + .branch{ + single: it[1].size() == 1 + multiple: it[1].size() > 1 + }.set{table_to_merge} + + // STEP 3.5: MERGING RECALIBRATION TABLES + + // Merge the tables only when we have intervals + GATHERBQSRREPORTS(table_to_merge.multiple) + table_bqsr = table_to_merge.single.mix(GATHERBQSRREPORTS.out.table) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(BASERECALIBRATOR_SPARK.out.versions) + ch_versions = ch_versions.mix(GATHERBQSRREPORTS.out.versions) + + emit: + table_bqsr = table_bqsr + versions = ch_versions // channel: [versions.yml] +}
diff --git a/subworkflows/nf-core/gatk4/recalibrate/main.nf b/subworkflows/nf-core/gatk4/recalibrate/main.nf new file mode 100644 index 0000000000..ec2a18026c --- /dev/null +++ b/subworkflows/nf-core/gatk4/recalibrate/main.nf @@ -0,0 +1,43 @@ +// +// RECALIBRATE +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_APPLYBQSR as APPLYBQSR } from '../../../../modules/nf-core/modules/gatk4/applybqsr/main' +include { MERGE_INDEX_CRAM } from '../../merge_index_cram' + +workflow RECALIBRATE { + take: + cram // channel: [mandatory] cram + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals // channel: [mandatory] intervals + num_intervals // value: [mandatory] number of intervals + + main: + ch_versions = Channel.empty() + + cram_intervals = cram.combine(intervals) + .map{ meta, cram, crai, recal, intervals -> + new_meta = meta.clone() + new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName + [new_meta, cram, crai, recal, intervals] + } + + // Run Applybqsr + APPLYBQSR(cram_intervals, fasta, fasta_fai, dict) + + // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED CRAM FILES + MERGE_INDEX_CRAM(APPLYBQSR.out.cram, fasta, num_intervals) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(APPLYBQSR.out.versions) + ch_versions = ch_versions.mix(MERGE_INDEX_CRAM.out.versions) + + emit: + cram = MERGE_INDEX_CRAM.out.cram_crai + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf b/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf new file mode 100644 index 0000000000..fdb60fdf58 --- /dev/null +++ b/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf @@ -0,0 +1,43 @@ +// +// RECALIBRATE SPARK +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_APPLYBQSR_SPARK as APPLYBQSR_SPARK } from '../../../../modules/local/gatk4/applybqsrspark/main' +include { MERGE_INDEX_CRAM } from '../../merge_index_cram' + +workflow RECALIBRATE_SPARK { + take: + cram // channel: [mandatory] cram + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals // channel: [mandatory] intervals + num_intervals // value: [mandatory] number of intervals + + main: + ch_versions = Channel.empty() + + cram_intervals = cram.combine(intervals) + .map{ meta, cram, crai, recal, intervals -> + new_meta = meta.clone() + new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName + [new_meta, cram, crai, recal, intervals] + } + + // Run Applybqsr spark + APPLYBQSR_SPARK(cram_intervals, fasta, fasta_fai, dict) + + // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED CRAM FILES + MERGE_INDEX_CRAM(APPLYBQSR_SPARK.out.cram, fasta, num_intervals) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(APPLYBQSR_SPARK.out.versions) + ch_versions = ch_versions.mix(MERGE_INDEX_CRAM.out.versions) + + emit: + cram = MERGE_INDEX_CRAM.out.cram_crai + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main.nf b/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf similarity index 91% rename from subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main.nf rename to subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf index a211c74dba..754d65dc5b 100644 --- a/subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf @@ -2,18 +2,17 @@ // Run GATK mutect2 in tumor normal mode, getepileupsummaries, calculatecontamination, learnreadorientationmodel and filtermutectcalls // -include { BGZIP as BGZIP_MUTECT2 } from '../../../modules/local/bgzip' -include { CONCAT_VCF as CONCAT_MUTECT2 } from '../../../modules/local/concat_vcf/main' - -include { GATK4_MUTECT2 as MUTECT2 } from '../../../modules/nf-core/modules/gatk4/mutect2/main' -include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../modules/nf-core/modules/gatk4/mergemutectstats/main' -include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../modules/nf-core/modules/gatk4/learnreadorientationmodel/main' -include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_TUMOR }
from '../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' -include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_NORMAL} from '../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' -include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_TUMOR } from '../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' -include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_NORMAL } from '../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' -include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/nf-core/modules/gatk4/calculatecontamination/main' -include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/nf-core/modules/gatk4/filtermutectcalls/main' +include { BGZIP as BGZIP_MUTECT2 } from '../../../../modules/local/bgzip' +include { CONCAT_VCF as CONCAT_MUTECT2 } from '../../../../modules/local/concat_vcf/main' +include { GATK4_MUTECT2 as MUTECT2 } from '../../../../modules/nf-core/modules/gatk4/mutect2/main' +include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../../modules/nf-core/modules/gatk4/mergemutectstats/main' +include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../../modules/nf-core/modules/gatk4/learnreadorientationmodel/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_TUMOR } from '../../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_NORMAL} from '../../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_TUMOR } from '../../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_NORMAL } from '../../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' +include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../../modules/nf-core/modules/gatk4/calculatecontamination/main' +include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../../modules/nf-core/modules/gatk4/filtermutectcalls/main' workflow GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING { take: diff --git a/subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/meta.yml b/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/meta.yml similarity index 100% rename from subworkflows/nf-core/gatk_tumor_normal_somatic_variant_calling/meta.yml rename to subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/meta.yml diff --git a/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/main.nf b/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf similarity index 89% rename from subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/main.nf rename to subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf index c8d81f02fe..d8b44a46e8 100644 --- a/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf @@ -2,15 +2,14 @@ // Run GATK mutect2 in tumor only mode, getepileupsummaries, calculatecontamination and filtermutectcalls // -include { BGZIP as BGZIP_MUTECT2 } from '../../../modules/local/bgzip' -include { CONCAT_VCF as CONCAT_MUTECT2 } from '../../../modules/local/concat_vcf/main' - -include { GATK4_MUTECT2 as MUTECT2 } from '../../../modules/nf-core/modules/gatk4/mutect2/main' -include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from 
'../../../modules/nf-core/modules/gatk4/mergemutectstats/main' -include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES } from '../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' -include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES } from '../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' -include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/nf-core/modules/gatk4/calculatecontamination/main' -include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/nf-core/modules/gatk4/filtermutectcalls/main' +include { BGZIP as BGZIP_MUTECT2 } from '../../../../modules/local/bgzip' +include { CONCAT_VCF as CONCAT_MUTECT2 } from '../../../../modules/local/concat_vcf/main' +include { GATK4_MUTECT2 as MUTECT2 } from '../../../../modules/nf-core/modules/gatk4/mutect2/main' +include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../../modules/nf-core/modules/gatk4/mergemutectstats/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES } from '../../../../modules/nf-core/modules/gatk4/getpileupsummaries/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES } from '../../../../modules/nf-core/modules/gatk4/gatherpileupsummaries/main' +include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../../modules/nf-core/modules/gatk4/calculatecontamination/main' +include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../../modules/nf-core/modules/gatk4/filtermutectcalls/main' workflow GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING { take: diff --git a/subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/meta.yml b/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/meta.yml similarity index 100% rename from subworkflows/nf-core/gatk_tumor_only_somatic_variant_calling/meta.yml rename to subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/meta.yml diff --git a/subworkflows/nf-core/gatk4_mapping/main.nf b/subworkflows/nf-core/gatk4_mapping/main.nf deleted file mode 100644 index 027d571b72..0000000000 --- a/subworkflows/nf-core/gatk4_mapping/main.nf +++ /dev/null @@ -1,66 +0,0 @@ -// -// MAPPING -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { BWAMEM2_MEM } from '../../../modules/nf-core/modules/bwamem2/mem/main' -include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/nf-core/modules/bwa/mem/main' -include { SAMTOOLS_INDEX as INDEX_MAPPING } from '../../../modules/local/samtools/index/main' -include { SAMTOOLS_MERGE as MERGE_MAPPING } from '../../../modules/nf-core/modules/samtools/merge/main' - -workflow GATK4_MAPPING { - take: - reads // channel: [mandatory] meta, reads - bwa // channel: [mandatory] bwa - fasta // channel: [mandatory] fasta - fasta_fai // channel: [mandatory] fasta_fai - - main: - ch_versions = Channel.empty() - - // Only one of the following will be run - BWAMEM1_MEM(reads, bwa, true) // If aligner is bwa-mem - BWAMEM2_MEM(reads, bwa, true) // If aligner is bwa-mem2 - - // Grouping the bams from the same samples not to stall the workflow - // Removing unneeded fields in the new_meta map - bam_mapped = BWAMEM1_MEM.out.bam.mix(BWAMEM2_MEM.out.bam).map{ meta, bam -> - new_meta = meta.clone() - new_meta.remove('read_group') - new_meta.remove('size') - new_meta.id = meta.sample - - // groupKey is to makes sure that the correct group can advance as soon as it is complete - // and not stall the workflow until all 
reads from all channels are mapped - def groupKey = groupKey(new_meta, meta.numLanes * meta.size) - - //Returns the values we need - tuple(groupKey, new_meta, bam) - }.groupTuple(by:[0,1]).map{ groupKey, new_meta, bam -> [new_meta, bam] } - - // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here - // Except if and only if skipping markduplicates or saving mapped bams - - // Figuring out if there is one or more bam(s) from the same sample - // Is done for all, but not blocking - bam_mapped.branch{ - single: it[1].size() == 1 - multiple: it[1].size() > 1 - }.set{bam_to_merge} - - // Only if the when clause is true - MERGE_MAPPING(bam_to_merge.multiple, []) - INDEX_MAPPING(bam_to_merge.single.mix(MERGE_MAPPING.out.bam)) - - // Gather versions of all tools used - ch_versions = ch_versions.mix(BWAMEM1_MEM.out.versions.first()) - ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions.first()) - ch_versions = ch_versions.mix(INDEX_MAPPING.out.versions.first()) - ch_versions = ch_versions.mix(MERGE_MAPPING.out.versions.first()) - - emit: - bam = bam_mapped - bam_indexed = INDEX_MAPPING.out.bam_bai - versions = ch_versions -} diff --git a/subworkflows/nf-core/markduplicates.nf b/subworkflows/nf-core/markduplicates.nf deleted file mode 100644 index 118da59bea..0000000000 --- a/subworkflows/nf-core/markduplicates.nf +++ /dev/null @@ -1,90 +0,0 @@ -// -// MARKDUPLICATES AND/OR QC after mapping -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { DEEPTOOLS_BAMCOVERAGE } from '../../modules/nf-core/modules/deeptools/bamcoverage/main' -include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/nf-core/modules/gatk4/estimatelibrarycomplexity/main' -include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/modules/gatk4/markduplicates/main' -include { GATK4_MARKDUPLICATES_SPARK } from '../../modules/local/gatk4/markduplicatesspark/main' -include { QUALIMAP_BAMQC } from '../../modules/nf-core/modules/qualimap/bamqc/main' -include { SAMTOOLS_INDEX as INDEX_MARKDUPLICATES } from '../../modules/local/samtools/index/main' -include { SAMTOOLS_STATS } from '../../modules/nf-core/modules/samtools/stats/main' -include { SAMTOOLS_VIEWINDEX as SAMTOOLS_BAM_TO_CRAM_DUPLICATES } from '../../modules/local/samtools/viewindex/main' -include { SAMTOOLS_VIEWINDEX as SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES } from '../../modules/local/samtools/viewindex/main' -include { SAMTOOLS_VIEWINDEX as SAMTOOLS_BAM_TO_CRAM_SPARK } from '../../modules/local/samtools/viewindex/main' - -workflow MARKDUPLICATES { - take: - bam_mapped // channel: [mandatory, when running Markduplicates, else optional] meta, bam - bam_indexed // channel: [mandatory, when skipping Markduplicates, else optional] meta, bam, bai - dict // channel: [mandatory] dict - fasta // channel: [mandatory] fasta - fasta_fai // channel: [mandatory] fasta_fai - intervals_combined_bed_gz_tbi // channel: [optional] intervals_bed.gz, intervals_bed.gz.tbi - - main: - ch_versions = Channel.empty() - qc_reports = Channel.empty() - - // When skipping Markduplicates converting bam input to cram - SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES(bam_indexed, fasta, fasta_fai) - - // Run Markupduplicates spark - // When running bamqc and/or deeptools output is bam, else cram - GATK4_MARKDUPLICATES_SPARK(bam_mapped, fasta, fasta_fai, dict) - INDEX_MARKDUPLICATES(GATK4_MARKDUPLICATES_SPARK.out.output) - - // Convert Markupduplicates spark bam output to cram when 
running bamqc and/or deeptools - SAMTOOLS_BAM_TO_CRAM_SPARK(GATK4_MARKDUPLICATES_SPARK.out.output.join(INDEX_MARKDUPLICATES.out.bam_bai), fasta, fasta_fai) - - // Run Markupduplicates - GATK4_MARKDUPLICATES(bam_mapped) - // Convert output to cram - SAMTOOLS_BAM_TO_CRAM_DUPLICATES(GATK4_MARKDUPLICATES.out.bam.join(GATK4_MARKDUPLICATES.out.bai), fasta, fasta_fai) - - // Only one of these channel is not empty: - // - skipping Markduplicates - // - running Markupduplicates spark with bam output - // - running Markupduplicates spark with cram output - // - running Markupduplicates - cram_markduplicates = Channel.empty().mix( - SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES.out.cram_crai, - SAMTOOLS_BAM_TO_CRAM_SPARK.out.cram_crai, - GATK4_MARKDUPLICATES_SPARK.out.output.join(INDEX_MARKDUPLICATES.out.cram_crai), - SAMTOOLS_BAM_TO_CRAM_DUPLICATES.out.cram_crai) - - // When running Marduplicates spark, and saving reports - GATK4_ESTIMATELIBRARYCOMPLEXITY(bam_mapped, fasta, fasta_fai, dict) - // Reports on Marduplicates spark bam output or on bam input - QUALIMAP_BAMQC(bam_mapped.mix(GATK4_MARKDUPLICATES.out.bam), intervals_combined_bed_gz_tbi) - DEEPTOOLS_BAMCOVERAGE(bam_indexed.mix(GATK4_MARKDUPLICATES.out.bam.join(GATK4_MARKDUPLICATES.out.bai))) - // Other reports run on cram - SAMTOOLS_STATS(cram_markduplicates, fasta) - - // Gather all reports generated - qc_reports = qc_reports.mix(DEEPTOOLS_BAMCOVERAGE.out.bigwig) - qc_reports = qc_reports.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.metrics) - qc_reports = qc_reports.mix(GATK4_MARKDUPLICATES.out.metrics) - qc_reports = qc_reports.mix(QUALIMAP_BAMQC.out.results) - qc_reports = qc_reports.mix(SAMTOOLS_STATS.out.stats) - - // Gather versions of all tools used - ch_versions = ch_versions.mix(DEEPTOOLS_BAMCOVERAGE.out.versions.first()) - ch_versions = ch_versions.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.versions.first()) - ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES.out.versions.first()) - ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES_SPARK.out.versions.first()) - ch_versions = ch_versions.mix(INDEX_MARKDUPLICATES.out.versions.first()) - ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) - ch_versions = ch_versions.mix(SAMTOOLS_BAM_TO_CRAM_DUPLICATES.out.versions.first()) - ch_versions = ch_versions.mix(SAMTOOLS_BAM_TO_CRAM_NO_DUPLICATES.out.versions.first()) - ch_versions = ch_versions.mix(SAMTOOLS_BAM_TO_CRAM_SPARK.out.versions.first()) - ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) - - emit: - cram = cram_markduplicates - qc = qc_reports - - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/nf-core/merge_index_bam.nf b/subworkflows/nf-core/merge_index_bam.nf new file mode 100644 index 0000000000..9839248250 --- /dev/null +++ b/subworkflows/nf-core/merge_index_bam.nf @@ -0,0 +1,33 @@ +// +// MERGE INDEX BAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from '../../modules/local/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_BAM } from '../../modules/nf-core/modules/samtools/merge/main' + +workflow MERGE_INDEX_BAM { + take: + bam // channel: [mandatory] meta, bam + + main: + ch_versions = Channel.empty() + + // Figuring out if there is one or more bam(s) from the same sample + bam.branch{ + single: it[1].size() == 1 + multiple: it[1].size() > 1 + }.set{bam_to_merge} + + MERGE_BAM(bam_to_merge.multiple, []) + 
INDEX_MERGE_BAM(bam_to_merge.single.mix(MERGE_BAM.out.bam)) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(INDEX_MERGE_BAM.out.versions.first()) + ch_versions = ch_versions.mix(MERGE_BAM.out.versions.first()) + + emit: + bam_bai = INDEX_MERGE_BAM.out.bam_bai + versions = ch_versions +} diff --git a/subworkflows/nf-core/merge_index_cram.nf b/subworkflows/nf-core/merge_index_cram.nf new file mode 100644 index 0000000000..04394afc26 --- /dev/null +++ b/subworkflows/nf-core/merge_index_cram.nf @@ -0,0 +1,40 @@ +// +// MERGE INDEX CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_CRAM } from '../../modules/local/samtools/index/main' +include { SAMTOOLS_MERGE_CRAM as MERGE_CRAM } from '../../modules/local/samtools/mergecram/main' + +workflow MERGE_INDEX_CRAM { + take: + ch_cram // channel: [mandatory] meta, cram + fasta // channel: [mandatory] fasta + num_intervals + + main: + ch_versions = Channel.empty() + + // Figuring out whether there is a single cram or multiple crams from the same sample + ch_cram = ch_cram + .map{ meta, cram -> + meta.id = meta.sample + [meta, cram] + }.groupTuple(size: num_intervals) + .branch{ + single: it[1].size() == 1 + multiple: it[1].size() > 1 + }.set{cram_to_merge} + + MERGE_CRAM(cram_to_merge.multiple, fasta) + INDEX_CRAM(cram_to_merge.single.mix(MERGE_CRAM.out.cram)) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(INDEX_CRAM.out.versions.first()) + ch_versions = ch_versions.mix(MERGE_CRAM.out.versions.first()) + + emit: + cram_crai = INDEX_CRAM.out.cram_crai + versions = ch_versions +} diff --git a/subworkflows/nf-core/prepare_recalibration.nf b/subworkflows/nf-core/prepare_recalibration.nf deleted file mode 100644 index 4d0f082b2c..0000000000 --- a/subworkflows/nf-core/prepare_recalibration.nf +++ /dev/null @@ -1,64 +0,0 @@ -// -// PREPARE RECALIBRATION -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { GATK4_BASERECALIBRATOR as BASERECALIBRATOR } from '../../modules/nf-core/modules/gatk4/baserecalibrator/main' -include { GATK4_BASERECALIBRATOR_SPARK as BASERECALIBRATOR_SPARK } from '../../modules/local/gatk4/baserecalibratorspark/main' -include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../modules/nf-core/modules/gatk4/gatherbqsrreports/main' - -workflow PREPARE_RECALIBRATION { - take: - cram_markduplicates // channel: [mandatory] cram_markduplicates - dict // channel: [mandatory] dict - fasta // channel: [mandatory] fasta - fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals - num_intervals - known_sites // channel: [optional] known_sites - known_sites_tbi // channel: [optional] known_sites_tbi - no_intervals // value: [mandatory] no_intervals - - main: - - ch_versions = Channel.empty() - - cram_markduplicates_intervals = cram_markduplicates.combine(intervals) - .map{ meta, cram, crai, intervals -> - new_meta = meta.clone() - new_meta.id = intervals.baseName != "no_intervals" ?
meta.sample + "_" + intervals.baseName : meta.sample - [new_meta, cram, crai, intervals] - } - - // Run Baserecalibrator or Baserecalibrator spark - BASERECALIBRATOR(cram_markduplicates_intervals, fasta, fasta_fai, dict, known_sites, known_sites_tbi) - BASERECALIBRATOR_SPARK(cram_markduplicates_intervals, fasta, fasta_fai, dict, known_sites, known_sites_tbi) - - table_baserecalibrator = BASERECALIBRATOR.out.table.mix(BASERECALIBRATOR_SPARK.out.table) - .map{ meta, table -> - meta.id = meta.sample - [meta, table] - } - - // Only one of the two channels will be used - table_no_intervals = table_baserecalibrator - table_intervals = table_baserecalibrator.groupTuple(size: num_intervals) - - // STEP 3.5: MERGING RECALIBRATION TABLES - // Empty the no intervals table channel if we have intervals - if (!no_intervals) table_no_intervals = Channel.empty() - - // Merge the tables only when we have intervals - GATHERBQSRREPORTS(table_intervals) - table_bqsr = table_no_intervals.mix(GATHERBQSRREPORTS.out.table) - - // Gather versions of all tools used - ch_versions = ch_versions.mix(BASERECALIBRATOR.out.versions) - ch_versions = ch_versions.mix(BASERECALIBRATOR_SPARK.out.versions) - ch_versions = ch_versions.mix(GATHERBQSRREPORTS.out.versions) - - emit: - table_bqsr = table_bqsr - versions = ch_versions // channel: [versions.yml] -} diff --git a/subworkflows/nf-core/recalibrate.nf b/subworkflows/nf-core/recalibrate.nf deleted file mode 100644 index fa0b6870a7..0000000000 --- a/subworkflows/nf-core/recalibrate.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// RECALIBRATE -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - -include { GATK4_APPLYBQSR as APPLYBQSR } from '../../modules/nf-core/modules/gatk4/applybqsr/main' -include { GATK4_APPLYBQSR_SPARK as APPLYBQSR_SPARK } from '../../modules/local/gatk4/applybqsrspark/main' -include { QUALIMAP_BAMQCCRAM } from '../../modules/nf-core/modules/qualimap/bamqccram/main' -include { SAMTOOLS_INDEX as INDEX_RECALIBRATE } from '../../modules/local/samtools/index/main' -include { SAMTOOLS_MERGE_CRAM } from '../../modules/local/samtools/mergecram/main' -include { SAMTOOLS_STATS } from '../../modules/nf-core/modules/samtools/stats/main' - -workflow RECALIBRATE { - take: - cram // channel: [mandatory] cram - dict // channel: [mandatory] dict - fasta // channel: [mandatory] fasta - fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals - num_intervals - no_intervals - intervals_combined_bed_gz_tbi - - main: - ch_versions = Channel.empty() - qc_reports = Channel.empty() - - cram_intervals = cram.combine(intervals) - .map{ meta, cram, crai, recal, intervals -> - new_meta = meta.clone() - new_meta.id = intervals.baseName != "no_intervals" ? 
meta.sample + "_" + intervals.baseName : meta.sample - [new_meta, cram, crai, recal, intervals] - } - - // Run Applybqsr spark or Applybqsr - APPLYBQSR_SPARK(cram_intervals, fasta, fasta_fai, dict) - APPLYBQSR(cram_intervals, fasta, fasta_fai, dict) - - cram_recalibrated_no_intervals = APPLYBQSR_SPARK.out.cram.mix(APPLYBQSR.out.cram) - - // Empty the no intervals cram channel if we have intervals - if (!no_intervals) cram_recalibrated_no_intervals = Channel.empty() - - // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - cram_recalibrated_interval = APPLYBQSR_SPARK.out.cram.mix(APPLYBQSR.out.cram) - .map{ meta, cram -> - meta.id = meta.sample - [meta, cram] - }.groupTuple(size: num_intervals) - - // Only when we have intervals - SAMTOOLS_MERGE_CRAM(cram_recalibrated_interval, fasta) - - INDEX_RECALIBRATE(cram_recalibrated_no_intervals.mix(SAMTOOLS_MERGE_CRAM.out.cram)) - - // Reports on recalibrated cram - QUALIMAP_BAMQCCRAM(INDEX_RECALIBRATE.out.cram_crai, intervals_combined_bed_gz_tbi, fasta, fasta_fai) - SAMTOOLS_STATS(INDEX_RECALIBRATE.out.cram_crai, fasta) - - // Gather all reports generated - qc_reports = qc_reports.mix(SAMTOOLS_STATS.out.stats) - qc_reports = qc_reports.mix(QUALIMAP_BAMQCCRAM.out.results) - - // Gather versions of all tools used - ch_versions = ch_versions.mix(APPLYBQSR.out.versions) - ch_versions = ch_versions.mix(APPLYBQSR_SPARK.out.versions) - ch_versions = ch_versions.mix(INDEX_RECALIBRATE.out.versions) - ch_versions = ch_versions.mix(QUALIMAP_BAMQCCRAM.out.versions) - ch_versions = ch_versions.mix(SAMTOOLS_MERGE_CRAM.out.versions) - ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) - - emit: - cram = INDEX_RECALIBRATE.out.cram_crai - qc = qc_reports - - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 7b871eb358..90965b1443 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -57,18 +57,12 @@ else { } } -input_sample = extract_csv(csv_file) - -// def save_bam_mapped = 'markduplicates' in params.skip_tools ? true : params.save_bam_mapped ? true : false +ch_input_sample = extract_csv(csv_file) if (params.wes) { - if (!params.intervals.endsWith("bed")) { - exit 1, "Target file must be in BED format" - } + if (params.intervals && !params.intervals.endsWith("bed")) exit 1, "Target file must be in BED format" } else { - if (!params.intervals.endsWith("bed") && !params.intervals.endsWith("interval_list")) { - exit 1, "Interval file must end with .bed or .interval_list" - } + if (params.intervals && !params.intervals.endsWith("bed") && !params.intervals.endsWith("interval_list")) exit 1, "Interval file must end with .bed or .interval_list" } // Save AWS IGenomes file containing annotation version @@ -85,109 +79,122 @@ if (anno_readme && file(anno_readme).exists()) { */ // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : [] -chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : [] -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.empty() -fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : Channel.empty() -fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.empty() -known_indels = params.known_indels ? 
Channel.fromPath(params.known_indels).collect() : Channel.empty() -loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : [] -loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : [] -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : [] +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : [] +chr_length = params.chr_length ? Channel.fromPath(params.chr_length).collect() : [] +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.empty() +fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : Channel.empty() +fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.empty() +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.empty() +loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : [] +loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : [] +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : [] // Initialize value channels based on params, defined in the params.genomes[params.genome] scope -snpeff_db = params.snpeff_db ?: Channel.empty() -vep_cache_version = params.vep_cache_version ?: Channel.empty() -vep_genome = params.vep_genome ?: Channel.empty() -vep_species = params.vep_species ?: Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : [] -cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : [] -cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : [] -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : [] -pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.empty() -snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] -//target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : [] -vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : [] +cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : [] +cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : [] +cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : [] +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : [] +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.empty() +snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] +//target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : [] +vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : [] // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope -umi_read_structure = params.umi_read_structure ? 
"${params.umi_read_structure} ${params.umi_read_structure}": Channel.empty() - +umi_read_structure = params.umi_read_structure ? "${params.umi_read_structure} ${params.umi_read_structure}" : Channel.empty() -// SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules +/* +======================================================================================== + IMPORT LOCAL/NF-CORE MODULES/SUBWORKFLOWS +======================================================================================== +*/ // Create samplesheets to restart from different steps -include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' -include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' -include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' -include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' +include { MAPPING_CSV } from '../subworkflows/local/mapping_csv' +include { MARKDUPLICATES_CSV } from '../subworkflows/local/markduplicates_csv' +include { PREPARE_RECALIBRATION_CSV } from '../subworkflows/local/prepare_recalibration_csv' +include { RECALIBRATE_CSV } from '../subworkflows/local/recalibrate_csv' // Build indices if needed -include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' // Build intervals if needed -include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals' +include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals' // Convert BAM files to FASTQ files -include { ALIGNMENT_TO_FASTQ } from '../subworkflows/local/bam2fastq' +include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_INPUT } from '../subworkflows/local/bam2fastq' +include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_UMI } from '../subworkflows/local/bam2fastq' // Split FASTQ files -include { SPLIT_FASTQ } from '../subworkflows/local/split_fastq' +include { SPLIT_FASTQ } from '../subworkflows/local/split_fastq' + +// Run FASTQC and/or TRIMGALORE +include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' + +// Create umi consensus bams from fastq +include { CREATE_UMI_CONSENSUS } from '../subworkflows/nf-core/fgbio_create_umi_consensus/main' // Map input reads to reference genome -include { GATK4_MAPPING } from '../subworkflows/nf-core/gatk4_mapping/main' +include { GATK4_MAPPING } from '../subworkflows/nf-core/gatk4/mapping/main' + +// Merge and index BAM files (optional) +include { MERGE_INDEX_BAM } from '../subworkflows/nf-core/merge_index_bam' -// Mark duplicates (+QC) + convert to CRAM -include { MARKDUPLICATES } from '../subworkflows/nf-core/markduplicates' +// Mark Duplicates (+QC) +include { MARKDUPLICATES } from '../subworkflows/nf-core/gatk4/markduplicates/main' + +// Mark Duplicates SPARK (+QC) +include { MARKDUPLICATES_SPARK } from '../subworkflows/nf-core/gatk4/markduplicates_spark/main' + +// Convert to CRAM (+QC) +include { BAM_TO_CRAM } from '../subworkflows/nf-core/bam_to_cram' + +// QC on CRAM +include { SAMTOOLS_STATS as SAMTOOLS_STATS_CRAM } from '../modules/nf-core/modules/samtools/stats/main' +include { CRAM_QC } from '../subworkflows/nf-core/cram_qc' // Create recalibration tables -include { PREPARE_RECALIBRATION } from '../subworkflows/nf-core/prepare_recalibration' +include { PREPARE_RECALIBRATION } from '../subworkflows/nf-core/gatk4/prepare_recalibration/main' + +// Create recalibration tables SPARK +include { PREPARE_RECALIBRATION_SPARK } from 
'../subworkflows/nf-core/gatk4/prepare_recalibration_spark/main' // Create recalibrated cram files to use for variant calling (+QC) -include { RECALIBRATE } from '../subworkflows/nf-core/recalibrate' +include { RECALIBRATE } from '../subworkflows/nf-core/gatk4/recalibrate/main' + +// Create recalibrated cram files to use for variant calling (+QC) +include { RECALIBRATE_SPARK } from '../subworkflows/nf-core/gatk4/recalibrate_spark/main' // Variant calling on a single normal sample -include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_variant_calling' +include { GERMLINE_VARIANT_CALLING } from '../subworkflows/local/germline_variant_calling' // Variant calling on a single tumor sample -include { TUMOR_ONLY_VARIANT_CALLING } from '../subworkflows/local/tumor_variant_calling' +include { TUMOR_ONLY_VARIANT_CALLING } from '../subworkflows/local/tumor_variant_calling' // Variant calling on tumor/normal pair -include { PAIR_VARIANT_CALLING } from '../subworkflows/local/pair_variant_calling' +include { PAIR_VARIANT_CALLING } from '../subworkflows/local/pair_variant_calling' // Annotation -include { ANNOTATE } from '../subworkflows/local/annotate' addParams( +include { ANNOTATE } from '../subworkflows/local/annotate' addParams( annotation_cache: params.annotation_cache ) -/* -======================================================================================== - IMPORT NF-CORE MODULES/SUBWORKFLOWS -======================================================================================== -*/ +// REPORTING VERSIONS OF SOFTWARE USED +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' + +// MULTIQC +include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' // Config files ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() -// -// SUBWORKFLOWS -// - -include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore' - -// Create umi consensus bams from fastq -include { CREATE_UMI_CONSENSUS } from '../subworkflows/nf-core/fgbio_create_umi_consensus/main' - -// -// MODULES: Installed directly from nf-core/modules -// - -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' - def multiqc_report = [] workflow SAREK { @@ -234,19 +241,14 @@ workflow SAREK { intervals_bed_combined_gz = intervals_bed_combined_gz_tbi.map{ bed, tbi -> [bed]}.collect() // one file containing all intervals interval.bed.gz file intervals_for_preprocessing = (!params.wes || params.no_intervals) ? [] : PREPARE_INTERVALS.out.intervals_bed //TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too - num_intervals = 0 - intervals.count().map{ num_intervals = it } + num_intervals = params.intervals ? 
count_intervals(file(params.intervals)) : 1 // Gather used softwares versions ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) + ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions) // PREPROCESSING - bam_mapped = Channel.empty() - bam_mapped_qc = Channel.empty() - bam_recalibrated_qc = Channel.empty() - bam_variant_calling = Channel.empty() - // STEP 0: QC & TRIM // `--d fastqc` to skip fastqc // trim only with `--trim_fastq` @@ -254,148 +256,268 @@ workflow SAREK { if (params.step == 'mapping') { - if (params.is_bam_input) { - ALIGNMENT_TO_FASTQ(input_sample, []) - input_sample_converted = ALIGNMENT_TO_FASTQ.out.reads - ch_versions = ch_versions.mix(ALIGNMENT_TO_FASTQ.out.versions) - } else input_sample_converted = input_sample + // Figure out if input is fastq or bam + ch_input_sample.branch{ + fastq: it[0].data_type == "fastq" + bam: it[0].data_type == "bam" + }.set{ch_input_sample_type} - FASTQC_TRIMGALORE(input_sample_converted) + // convert any bam input to fastq + ALIGNMENT_TO_FASTQ_INPUT(ch_input_sample_type.bam, []) - // Get reads after optional trimming (+QC) - reads_input = FASTQC_TRIMGALORE.out.reads + // Optional trimming (+ QC) + // Done on fastq (input or converted) + // Theoretically this could work on mixed input (fastq for one sample and bam for another) + // But not sure how to handle that with the samplesheet + // Or if we really want users to be able to do that + FASTQC_TRIMGALORE(ch_input_sample_type.fastq.mix(ALIGNMENT_TO_FASTQ_INPUT.out.reads)) - // Gather QC reports - ch_reports = ch_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) - ch_reports = ch_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) + // Optional UMI consensus calling + CREATE_UMI_CONSENSUS(FASTQC_TRIMGALORE.out.reads, + fasta, + bwa, + umi_read_structure, + params.group_by_umi_strategy) - //Since read need additional mapping afterwards, I would argue for having the process here - if (params.umi_read_structure) { - CREATE_UMI_CONSENSUS(reads_input, fasta, bwa, umi_read_structure, params.group_by_umi_strategy, params.aligner) - ALIGNMENT_TO_FASTQ(CREATE_UMI_CONSENSUS.out.consensusbam, []) - reads_input = ALIGNMENT_TO_FASTQ.out.reads + // convert back to fastq for further preprocessing + ALIGNMENT_TO_FASTQ_UMI(CREATE_UMI_CONSENSUS.out.consensusbam, []) - // Gather used softwares versions - ch_versions = ch_versions.mix(CREATE_UMI_CONSENSUS.out.versions) - ch_versions = ch_versions.mix(ALIGNMENT_TO_FASTQ.out.versions) - } + // Get fastq files for further preprocessing + // either out of optional trimming or optional UMI consensus calling + ch_input_sample_to_split = params.umi_read_structure ? ALIGNMENT_TO_FASTQ_UMI.out.reads : FASTQC_TRIMGALORE.out.reads
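The data_type branch at the top of this hunk is what lets fastq and bam rows coexist in a single samplesheet; a minimal sketch of its behaviour, with hypothetical meta maps:

ch_input = Channel.of(
    [[id: 'sampleA', data_type: 'fastq'], ['sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz']],
    [[id: 'sampleB', data_type: 'bam'], 'sampleB.bam'])

ch_input.branch{
    fastq: it[0].data_type == "fastq"
    bam: it[0].data_type == "bam"
}.set{ ch_input_type }

// bam rows take a detour through bam-to-fastq conversion,
// fastq rows go straight to trimming
ch_input_type.fastq.view{ "direct to trimming: ${it[0].id}" }
ch_input_type.bam.view{ "convert to fastq first: ${it[0].id}" }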
// OPTIONNAL SPLIT OF FASTQ FILES WITH SEQKIT_SPLIT2 - SPLIT_FASTQ(reads_input) + SPLIT_FASTQ(ch_input_sample_to_split) // STEP 1: MAPPING READS TO REFERENCE GENOME - GATK4_MAPPING( - SPLIT_FASTQ.out.reads, - bwa, - fasta, - fasta_fai) + // reads will be sorted + GATK4_MAPPING(SPLIT_FASTQ.out.reads, bwa, true) - // Get mapped reads (BAM) with and without index - // without index: always contains mapped_bams, only used if duplicate marking is done - // with Index: Duplicate marking is skipped and/or bams are saved, else empty Channel - bam_mapped = GATK4_MAPPING.out.bam - bam_indexed = GATK4_MAPPING.out.bam_indexed + ch_bam_mapped = GATK4_MAPPING.out.bam.map{ meta, bam -> + new_meta = meta.clone() + // remove no longer necessary fields + new_meta.remove('read_group') // Now in the BAM header + new_meta.remove('size') // Was only needed for mapping - // Create CSV to restart from this step - // TODO: How is this handeled if not save_bam is set (no index should be present) - //MAPPING_CSV(bam_indexed, save_bam_mapped, 'markduplicates' in params.skip_tools) + // update ID to be based on the sample name + new_meta.id = meta.sample + + [new_meta, bam] + } + + // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here + // Except if and only if skipping markduplicates or saving mapped bams + if (params.save_bam_mapped || (params.skip_tools && params.skip_tools.contains('markduplicates'))) { + + // bams are merged (when multiple lanes from the same sample), indexed and then converted to cram + MERGE_INDEX_BAM(ch_bam_mapped) + + // Create CSV to restart from this step + MAPPING_CSV(MERGE_INDEX_BAM.out.bam_bai) + + // Gather used software versions + ch_versions = ch_versions.mix(MERGE_INDEX_BAM.out.versions) + } + + // Gather QC reports + ch_reports = ch_reports.mix(FASTQC_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([])) + ch_reports = ch_reports.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) // Gather used softwares versions + ch_versions = ch_versions.mix(ALIGNMENT_TO_FASTQ_INPUT.out.versions) + ch_versions = ch_versions.mix(ALIGNMENT_TO_FASTQ_UMI.out.versions) + ch_versions = ch_versions.mix(CREATE_UMI_CONSENSUS.out.versions) ch_versions = ch_versions.mix(FASTQC_TRIMGALORE.out.versions) - ch_versions = ch_versions.mix(SPLIT_FASTQ.out.versions) ch_versions = ch_versions.mix(GATK4_MAPPING.out.versions) + ch_versions = ch_versions.mix(SPLIT_FASTQ.out.versions) } - // Comment out till we get the tests to pass - if (params.step == 'prepare_recalibration') { - bam_indexed = Channel.empty() - bam_mapped = Channel.empty() + if (params.step in ['mapping', 'prepare_recalibration']) { + ch_cram_markduplicates_no_spark = Channel.empty() + ch_cram_markduplicates_spark = Channel.empty() + ch_cram_no_markduplicates = Channel.empty() - // index will be created down the road from Markduplicates - // bam_indexed should only have indexes if Markduplicates is skipped + // STEP 2: markduplicates (+QC) + convert to CRAM - if (params.skip_tools && 'markduplicates' in params.skip_tools) bam_indexed = input_sample - else bam_mapped = input_sample.map{ meta, bam, bai -> [meta, bam] } - } + // ch_bam_for_markduplicates will contain bams mapped with GATK4_MAPPING when step is mapping + // Or bams that are specified in the samplesheet.csv when step is prepare_recalibration + ch_bam_for_markduplicates = params.step == 'mapping' ? ch_bam_mapped : ch_input_sample.map{ meta, bam, bai -> [meta, bam] }
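Throughout this hunk, membership tests like 'markduplicates' in params.skip_tools are replaced by an explicit null guard plus contains(). A minimal Groovy sketch of the new test, assuming (as the usage elsewhere in this diff suggests) that params.skip_tools is either unset or a comma-separated string:

def skip_tools = 'markduplicates,baserecalibrator'            // hypothetical value
assert skip_tools && skip_tools.contains('markduplicates')    // substring match succeeds

skip_tools = null                                             // parameter not provided
assert !(skip_tools && skip_tools.contains('markduplicates')) // && short-circuits on null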
- if (params.step in ['mapping', 'prepare_recalibration']) { - // STEP 2: markduplicates (+QC) + convert to CRAM - MARKDUPLICATES( - bam_mapped, - bam_indexed, - dict, - fasta, - fasta_fai, - intervals_for_preprocessing) + if (params.skip_tools && params.skip_tools.contains('markduplicates')) { + + // ch_bam_indexed will contain bams mapped with GATK4_MAPPING when step is mapping + // which are then merged and indexed + // Or bams that are specified in the samplesheet.csv when step is prepare_recalibration + ch_bam_indexed = params.step == 'mapping' ? MERGE_INDEX_BAM.out.bam_bai : ch_input_sample + + BAM_TO_CRAM(ch_bam_indexed, + fasta, + fasta_fai, + intervals_for_preprocessing) + + ch_cram_no_markduplicates = BAM_TO_CRAM.out.cram + + // Gather QC reports + ch_reports = ch_reports.mix(BAM_TO_CRAM.out.qc) + + // Gather used software versions + ch_versions = ch_versions.mix(BAM_TO_CRAM.out.versions) + } else if (params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates')) { + MARKDUPLICATES_SPARK(ch_bam_for_markduplicates, + dict, + fasta, + fasta_fai, + intervals_for_preprocessing) + ch_cram_markduplicates_spark = MARKDUPLICATES_SPARK.out.cram - cram_markduplicates = MARKDUPLICATES.out.cram + // Gather QC reports + ch_reports = ch_reports.mix(MARKDUPLICATES_SPARK.out.qc.collect{it[1]}.ifEmpty([])) + + // Gather used software versions + ch_versions = ch_versions.mix(MARKDUPLICATES_SPARK.out.versions) + } else { + MARKDUPLICATES(ch_bam_for_markduplicates, + fasta, + fasta_fai, + intervals_for_preprocessing) + + ch_cram_markduplicates_no_spark = MARKDUPLICATES.out.cram + + // Gather QC reports + ch_reports = ch_reports.mix(MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) + + // Gather used software versions + ch_versions = ch_versions.mix(MARKDUPLICATES.out.versions) + } + + // ch_cram_for_prepare_recalibration contains either: + // - crams from markduplicates + // - crams from markduplicates_spark + // - crams converted from bam mapped when skipping markduplicates + ch_cram_for_prepare_recalibration = Channel.empty().mix( + ch_cram_markduplicates_no_spark, + ch_cram_markduplicates_spark, + ch_cram_no_markduplicates) + + // Run Samtools stats on CRAM + SAMTOOLS_STATS_CRAM(ch_cram_for_prepare_recalibration, fasta) // Create CSV to restart from this step - MARKDUPLICATES_CSV(cram_markduplicates) + MARKDUPLICATES_CSV(ch_cram_for_prepare_recalibration) // Gather QC reports - ch_reports = ch_reports.mix(MARKDUPLICATES.out.qc.collect{it[1]}.ifEmpty([])) + ch_reports = ch_reports.mix(SAMTOOLS_STATS_CRAM.out.stats.collect{it[1]}.ifEmpty([])) // Gather used softwares versions - ch_versions = ch_versions.mix(MARKDUPLICATES.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_STATS_CRAM.out.versions) // STEP 3: Create recalibration tables - if (!('baserecalibrator' in params.skip_tools)) { - PREPARE_RECALIBRATION( - cram_markduplicates, + if (!(params.skip_tools && params.skip_tools.contains('baserecalibrator'))) { + ch_table_bqsr_no_spark = Channel.empty() + ch_table_bqsr_spark = Channel.empty() + + if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) { + PREPARE_RECALIBRATION_SPARK(ch_cram_for_prepare_recalibration, dict, fasta, fasta_fai, intervals, - num_intervals, known_sites, known_sites_tbi, - params.no_intervals) + num_intervals) - PREPARE_RECALIBRATION_CSV(PREPARE_RECALIBRATION.out.table_bqsr) + ch_table_bqsr_spark = PREPARE_RECALIBRATION_SPARK.out.table_bqsr - cram_applybqsr = cram_markduplicates.join(PREPARE_RECALIBRATION.out.table_bqsr)
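The join in the removed cram_applybqsr line above, kept in ch_cram_applybqsr below, pairs each sample's cram back with its recalibration table by matching on the first tuple element, the meta map. A minimal sketch with hypothetical files:

ch_cram = Channel.of([[id: 'sampleA'], 'sampleA.cram', 'sampleA.cram.crai'])
ch_table = Channel.of([[id: 'sampleA'], 'sampleA.recal.table'])

// join matches on the first element and concatenates the rest:
// [[id:sampleA], sampleA.cram, sampleA.cram.crai, sampleA.recal.table]
ch_cram.join(ch_table).view()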
+ // Gather used software versions + ch_versions = ch_versions.mix(PREPARE_RECALIBRATION_SPARK.out.versions) + } else { + PREPARE_RECALIBRATION(ch_cram_for_prepare_recalibration, + dict, + fasta, + fasta_fai, + intervals, + known_sites, + known_sites_tbi, + num_intervals) - // Gather used softwares versions - ch_versions = ch_versions.mix(PREPARE_RECALIBRATION.out.versions) + ch_table_bqsr_no_spark = PREPARE_RECALIBRATION.out.table_bqsr + + // Gather used software versions + ch_versions = ch_versions.mix(PREPARE_RECALIBRATION.out.versions) + } + + // ch_table_bqsr contains either: + // - bqsr table from baserecalibrator + // - bqsr table from baserecalibrator_spark + ch_table_bqsr = Channel.empty().mix( + ch_table_bqsr_no_spark, + ch_table_bqsr_spark) + + // Create CSV to restart from this step + PREPARE_RECALIBRATION_CSV(ch_table_bqsr) } } - if (params.step == 'recalibrate') cram_applybqsr = input_sample - + // STEP 4: RECALIBRATING if (params.step in ['mapping', 'prepare_recalibration', 'recalibrate']) { - if (!('baserecalibrator' in params.skip_tools)) { - // STEP 4: RECALIBRATING - RECALIBRATE( - cram_applybqsr, - dict, + if (!(params.skip_tools && params.skip_tools.contains('baserecalibrator'))) { + ch_cram_applybqsr = params.step == 'recalibrate' ? ch_input_sample : ch_cram_for_prepare_recalibration.join(ch_table_bqsr) + ch_cram_variant_calling_no_spark = Channel.empty() + ch_cram_variant_calling_spark = Channel.empty() + + if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) { + RECALIBRATE_SPARK(ch_cram_applybqsr, + dict, + fasta, + fasta_fai, + intervals, + num_intervals) + + ch_cram_variant_calling_spark = RECALIBRATE_SPARK.out.cram + + // Gather used software versions + ch_versions = ch_versions.mix(RECALIBRATE_SPARK.out.versions) + + } else { + RECALIBRATE(ch_cram_applybqsr, + dict, + fasta, + fasta_fai, + intervals, + num_intervals) + + ch_cram_variant_calling_no_spark = RECALIBRATE.out.cram + + // Gather used software versions + ch_versions = ch_versions.mix(RECALIBRATE.out.versions) + } + cram_variant_calling = Channel.empty().mix( + ch_cram_variant_calling_no_spark, + ch_cram_variant_calling_spark) + + CRAM_QC(cram_variant_calling, fasta, fasta_fai, - intervals, - num_intervals, - params.no_intervals, intervals_for_preprocessing) - RECALIBRATE_CSV(RECALIBRATE.out.cram) - - cram_variant_calling = RECALIBRATE.out.cram + // Create CSV to restart from this step + RECALIBRATE_CSV(cram_variant_calling) // Gather QC reports - ch_reports = ch_reports.mix(RECALIBRATE.out.qc.collect{it[1]}.ifEmpty([])) + ch_reports = ch_reports.mix(CRAM_QC.out.qc.collect{it[1]}.ifEmpty([])) // Gather used softwares versions - ch_versions = ch_versions.mix(RECALIBRATE.out.versions) - - } else cram_variant_calling = cram_markduplicates + ch_versions = ch_versions.mix(CRAM_QC.out.versions) + } else cram_variant_calling = ch_cram_for_prepare_recalibration } - if (params.step in 'variant_calling') cram_variant_calling = input_sample + if (params.step == 'variant_calling') cram_variant_calling = ch_input_sample if (params.tools) { - if (params.step in 'annotate') cram_variant_calling = Channel.empty() + if (params.step == 'annotate') cram_variant_calling = Channel.empty() // // Logic to separate germline samples, tumor samples with no matched normal, and combine tumor-normal pairs @@ -519,7 +641,7 @@ workflow SAREK { ch_versions = ch_versions.mix(TUMOR_ONLY_VARIANT_CALLING.out.versions) // ANNOTATE - if
+            ch_cram_variant_calling_no_spark = Channel.empty()
+            ch_cram_variant_calling_spark    = Channel.empty()
+
+            if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) {
+                RECALIBRATE_SPARK(ch_cram_applybqsr,
+                    dict,
+                    fasta,
+                    fasta_fai,
+                    intervals,
+                    num_intervals)
+
+                ch_cram_variant_calling_spark = RECALIBRATE_SPARK.out.cram
+
+                // Gather used softwares versions
+                ch_versions = ch_versions.mix(RECALIBRATE_SPARK.out.versions)
+
+            } else {
+                RECALIBRATE(ch_cram_applybqsr,
+                    dict,
+                    fasta,
+                    fasta_fai,
+                    intervals,
+                    num_intervals)
+
+                ch_cram_variant_calling_no_spark = RECALIBRATE.out.cram
+
+                // Gather used softwares versions
+                ch_versions = ch_versions.mix(RECALIBRATE.out.versions)
+            }
+            cram_variant_calling = Channel.empty().mix(
+                ch_cram_variant_calling_no_spark,
+                ch_cram_variant_calling_spark)
+
+            CRAM_QC(cram_variant_calling,
                 fasta,
                 fasta_fai,
-                intervals,
-                num_intervals,
-                params.no_intervals,
                 intervals_for_preprocessing)
-            RECALIBRATE_CSV(RECALIBRATE.out.cram)
-
-            cram_variant_calling = RECALIBRATE.out.cram
+            // Create CSV to restart from this step
+            RECALIBRATE_CSV(cram_variant_calling)
            // Gather QC reports
-            ch_reports = ch_reports.mix(RECALIBRATE.out.qc.collect{it[1]}.ifEmpty([]))
+            ch_reports = ch_reports.mix(CRAM_QC.out.qc.collect{it[1]}.ifEmpty([]))
            // Gather used softwares versions
-            ch_versions = ch_versions.mix(RECALIBRATE.out.versions)
-
-        } else cram_variant_calling = cram_markduplicates
+            ch_versions = ch_versions.mix(CRAM_QC.out.versions)
+        } else cram_variant_calling = ch_cram_for_prepare_recalibration
     }
-    if (params.step in 'variant_calling') cram_variant_calling = input_sample
+    if (params.step == 'variant_calling') cram_variant_calling = ch_input_sample
     if (params.tools) {
-        if (params.step in 'annotate') cram_variant_calling = Channel.empty()
+        if (params.step == 'annotate') cram_variant_calling = Channel.empty()
         //
         // Logic to separate germline samples, tumor samples with no matched normal, and combine tumor-normal pairs
@@ -519,7 +641,7 @@ workflow SAREK {
         ch_versions = ch_versions.mix(TUMOR_ONLY_VARIANT_CALLING.out.versions)

         // ANNOTATE
-        if (params.step == 'annotate') vcf_to_annotate = input_sample
+        if (params.step == 'annotate') vcf_to_annotate = ch_input_sample

         if (params.tools.contains('merge') || params.tools.contains('snpeff') || params.tools.contains('vep')) {
@@ -539,7 +661,7 @@ workflow SAREK {
     }

     ch_version_yaml = Channel.empty()
-    if (!('versions' in params.skip_tools)) {
+    if (!(params.skip_tools && params.skip_tools.contains('versions'))) {
         CUSTOM_DUMPSOFTWAREVERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml'))
         ch_version_yaml = CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()
     }
@@ -555,7 +677,7 @@
    // ch_multiqc_files = ch_multiqc_files.mix(ch_reports)

    // multiqc_report = Channel.empty()
-    // if (!('multiqc' in params.skip_tools)) {
+    // if (!(params.skip_tools && params.skip_tools.contains('multiqc'))) {
    //     MULTIQC(ch_multiqc_files.collect())
    //     multiqc_report = MULTIQC.out.report.toList()
@@ -570,8 +692,6 @@ workflow.onComplete {
 }

 // Function to extract information (meta data + file(s)) from csv file(s)
-
-is_bam_input = false
 def extract_csv(csv_file) {
     Channel.from(csv_file).splitCsv(header: true)
         //Retrieves number of lanes by grouping together by patient and sample and counting how many entries there are for this combination
@@ -610,8 +730,9 @@ def extract_csv(csv_file) {
             def fastq_2     = file(row.fastq_2, checkIfExists: true)
             def CN          = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ''
             def read_group  = "\"@RG\\tID:${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:ILLUMINA\""
-            meta.numLanes = numLanes.toInteger()
+            meta.numLanes   = numLanes.toInteger()
             meta.read_group = read_group.toString()
+            meta.data_type  = "fastq"
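+            // (data_type records how the sample entered the pipeline; the
+            // branches below set it to "bam", "cram" or "vcf" accordingly)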
"CN:${params.sequencing_center}\\t" : '' def read_group = "\"@RG\\tID:${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:ILLUMINA\"" - meta.numLanes = numLanes.toInteger() + meta.numLanes = numLanes.toInteger() meta.read_group = read_group.toString() - is_bam_input = true + meta.data_type = "bam" return [meta, bam] // recalibration } else if (row.table && row.cram) { @@ -629,6 +750,7 @@ def extract_csv(csv_file) { def cram = file(row.cram, checkIfExists: true) def crai = file(row.crai, checkIfExists: true) def table = file(row.table, checkIfExists: true) + meta.data_type = "cram" return [meta, cram, crai, table] // recalibration when skipping MarkDuplicates } else if (row.table && row.bam) { @@ -636,26 +758,41 @@ def extract_csv(csv_file) { def bam = file(row.bam, checkIfExists: true) def bai = file(row.bai, checkIfExists: true) def table = file(row.table, checkIfExists: true) + meta.data_type = "bam" return [meta, bam, bai, table] // prepare_recalibration or variant_calling } else if (row.cram) { meta.id = meta.sample def cram = file(row.cram, checkIfExists: true) def crai = file(row.crai, checkIfExists: true) + meta.data_type = "cram" return [meta, cram, crai] // prepare_recalibration when skipping MarkDuplicates } else if (row.bam) { meta.id = meta.sample def bam = file(row.bam, checkIfExists: true) def bai = file(row.bai, checkIfExists: true) + meta.data_type = "bam" return [meta, bam, bai] // annotation } else if (row.vcf) { meta.id = meta.sample def vcf = file(row.vcf, checkIfExists: true) + meta.data_type = "vcf" return [meta, vcf] } else { log.warn "Missing or unknown field in csv file header" } } } + +// Function to count number of intervals +def count_intervals(intervals_file) { + count = 0 + + intervals_file.eachLine{ it -> + count += it.startsWith("@") ? 0 : 1 + } + + return count +}