diff --git a/.circleci/config.yml b/.circleci/config.yml index 2da763cf15..ebce2b06eb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,11 +10,11 @@ jobs: - checkout - setup_remote_docker - run: - command: docker build -t nfcore/sareksnpeff:2.5.1.${GENOME} containers/snpeff/. --build-arg GENOME=${GENOME} --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION} + command: docker build -t nfcore/sareksnpeff:2.5.2.${GENOME} containers/snpeff/. --build-arg GENOME=${GENOME} --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION} - run: command: | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push nfcore/sareksnpeff:2.5.1.${GENOME} + docker push nfcore/sareksnpeff:2.5.2.${GENOME} snpeffgrch38: << : *buildsnpeff @@ -45,10 +45,10 @@ jobs: - checkout - setup_remote_docker - run: - command: docker build -t nfcore/sarekvep:2.5.1.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION} + command: docker build -t nfcore/sarekvep:2.5.2.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION} no_output_timeout: 3h - run: - command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:2.5.1.${GENOME} + command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:2.5.2.${GENOME} vepgrch38: << : *buildvep diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..04c39ab2b5 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @MaxUlysse \ No newline at end of file diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c8210fcedb..36fed625b8 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,47 +1,57 @@ # nf-core/sarek: Contributing Guidelines -Hi there! Many thanks for taking an interest in improving nf-core/sarek. +Hi there! +Many thanks for taking an interest in improving nf-core/sarek. -We try to manage the required tasks for nf-core/sarek using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. - -However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) - -> If you need help using or modifying nf-core/sarek then the best place to ask is on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). +We try to manage the required tasks for nf-core/sarek using GitHub issues, you probably came to this page when creating one. +Please use the pre-filled template to save time. +However, don't be put off by this template - other more general issues and suggestions are welcome! +Contributions to the code are even more welcome ;) +> If you need help using or modifying nf-core/sarek then the best place to ask is on the nf-core Slack [#sarek](https://nfcore.slack.com/channels/sarek) channel ([join our Slack here](https://nf-co.re/join/slack)). ## Contribution workflow -If you'd like to write some code for nf-core/sarek, the standard workflow -is as follows: -1. Check that there isn't already an issue about your idea in the - [nf-core/sarek issues](https://github.com/nf-core/sarek/issues) to avoid - duplicating work. +If you'd like to write some code for nf-core/sarek, the standard workflow is as follows: + +1. 
Check that there isn't already an issue about your idea in the [nf-core/sarek issues](https://github.com/nf-core/sarek/issues) to avoid duplicating work * If there isn't one already, please create one so that others know you're working on this -2. Fork the [nf-core/sarek repository](https://github.com/nf-core/sarek) to your GitHub account +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/sarek repository](https://github.com/nf-core/sarek) to your GitHub account 3. Make the necessary changes / additions within your forked repository -4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. - -If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). +4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged +If you're not used to this workflow with git, you can start with some [docs from GitHub](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) or even their [excellent `git` resources](https://try.github.io/). ## Tests -When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. + +When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. There are typically two types of tests that run: ### Lint Tests -The nf-core has a [set of guidelines](http://nf-co.re/guidelines) which all pipelines must adhere to. + +`nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint <pipeline-directory>` command. If any failures or warnings are encountered, please follow the listed URL for more documentation. ### Pipeline Tests -Each nf-core pipeline should be set up with a minimal set of test-data. -Travis CI then runs the pipeline on this data to ensure that it exists successfully. + +Each `nf-core` pipeline should be set up with a minimal set of test-data. +`GitHub Actions` then runs the pipeline on this data to ensure that it exits successfully. If there are any failures then the automated tests fail. -These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. +These tests are run both with the latest available version of `Nextflow` and also the minimum required version that is stated in the pipeline code. + +## Patch + +:warning: Only in the unlikely and regretful event of a release happening with a bug. + +* On your own fork, make a new branch `patch` based on `upstream/master`. +* Fix the bug, and bump version (X.Y.Z+1). +* A PR should be made on `master` from patch to directly fix this particular bug.
## Getting help -For further information/help, please consult the [nf-core/sarek documentation](https://github.com/nf-core/sarek#documentation) and don't hesitate to get in touch on the [sarek pipeline channel](https://nfcore.slack.com/channels/sarek) on [Slack](https://nf-co.re/join/slack). + +For further information/help, please consult the [nf-core/sarek documentation](https://nf-co.re/sarek/docs) and don't hesitate to get in touch on the nf-core Slack [#sarek](https://nfcore.slack.com/channels/sarek) channel ([join our Slack here](https://nf-co.re/join/slack)). diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 6df6c5052c..c6643b83ab 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,31 +1,42 @@ +# nf-core/sarek bug report + Hi there! -Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: +Thanks for telling us about a problem with the pipeline. +Please delete this text and anything that's not relevant from the template below: + +## Describe the bug -#### Describe the bug A clear and concise description of what the bug is. -#### Steps to reproduce +## Steps to reproduce + Steps to reproduce the behaviour: + 1. Command line: `nextflow run ...` 2. See error: _Please provide your error message_ -#### Expected behaviour +## Expected behaviour + A clear and concise description of what you expected to happen. -#### System: - - Hardware: [e.g. HPC, Desktop, Cloud...] - - Executor: [e.g. slurm, local, awsbatch...] - - OS: [e.g. CentOS Linux, macOS, Linux Mint...] - - Version [e.g. 7, 10.13.6, 18.3...] +## System + +- Hardware: +- Executor: +- OS: +- Version + +## Nextflow Installation + +- Version: + +## Container engine -#### Nextflow Installation: - - Version: [e.g. 0.31.0] +- Engine: +- version: +- Image tag: -#### Container engine: - - Engine: [e.g. Conda, Docker or Singularity] - - version: [e.g. 1.0.0] - - Image tag: [e.g. nfcore/sarek:1.0.0] +## Additional context -#### Additional context Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 1f025b779c..e3f009a723 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,16 +1,24 @@ +# nf-core/sarek feature request + Hi there! -Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: +Thanks for suggesting a new feature for the pipeline! +Please delete this text and anything that's not relevant from the template below: + +## Is your feature request related to a problem? Please describe -#### Is your feature request related to a problem? Please describe. A clear and concise description of what the problem is. + Ex. I'm always frustrated when [...] -#### Describe the solution you'd like +## Describe the solution you'd like + A clear and concise description of what you want to happen. -#### Describe alternatives you've considered +## Describe alternatives you've considered + A clear and concise description of any alternative solutions or features you've considered. -#### Additional context +## Additional context + Add any other context about the feature request here. 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 15c9a1b1df..913a3fa83c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,16 +1,19 @@ -Many thanks to contributing to nf-core/sarek! +# nf-core/sarek pull request + +Many thanks for contributing to nf-core/sarek! Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). ## PR checklist - - [ ] This comment contains a description of changes (with reason) - - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If necessary, also make a PR on the [nf-core/sarek branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/sarek) - - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - - [ ] Make sure your code lints (`nf-core lint .`). - - [ ] Documentation in `docs` is updated - - [ ] `CHANGELOG.md` is updated - - [ ] `README.md` is updated -**Learn more about contributing:** [guidelines](https://github.com/nf-core/sarek/tree/master/.github/CONTRIBUTING.md) \ No newline at end of file +- [ ] This comment contains a description of changes (with reason) +- [ ] If you've fixed a bug or added code that should be tested, add tests! +- [ ] If necessary, also make a PR on the [nf-core/sarek branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/sarek) +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Make sure your code lints (`nf-core lint .`). +- [ ] Documentation in `docs` is updated +- [ ] `CHANGELOG.md` is updated +- [ ] `README.md` is updated + +**Learn more about contributing:** [CONTRIBUTING.md](https://github.com/nf-core/sarek/tree/master/.github/CONTRIBUTING.md) \ No newline at end of file diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index 8f97a170e8..dea8627539 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -9,4 +9,6 @@ no-duplicate-header: siblings_only: true no-inline-html: allowed_elements: - - img \ No newline at end of file + - a + - img + - p \ No newline at end of file diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 741af20632..855201d049 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -7,10 +7,10 @@ on: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: # PRs are only ok if coming from an nf-core dev branch - uses: actions/checkout@v1 - name: Check PRs run: | - { [[ $(git remote get-url origin) == *nf-core/sarek ]] && [[ ${GITHUB_BASE_REF} = "master" ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == patch* ]] + { [[ $(git remote get-url origin) == *nf-core/sarek ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] diff --git a/.github/workflows/ci-extra.yml b/.github/workflows/ci-extra.yml deleted file mode 100644 index 442be114bb..0000000000 --- a/.github/workflows/ci-extra.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: sarek extra CI -# This workflow is triggered on pushes and PRs to the repository. 
-on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - test: [ANNOTATESNPEFF, GERMLINE, SOMATIC, TARGETED] - nxf_ver: ['19.04.0', ''] - steps: - - uses: actions/checkout@v1 - - name: Install Nextflow - run: | - export NXF_VER=${{ matrix.nxf_ver }} - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - name: Download image - run: | - ${GITHUB_WORKSPACE}/scripts/download_image.sh -n docker --source-version 2.5 --target-version 2.5.1 --test ${{ matrix.test }} - - name: Run test - run: | - ${GITHUB_WORKSPACE}/scripts/run_tests.sh --test ${{ matrix.test }} --verbose \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16fc90d4d1..844d217b43 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,10 +4,10 @@ on: [push, pull_request] jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 strategy: matrix: - nxf_ver: ['19.04.0', ''] + nxf_ver: ['19.10.0', ''] steps: - uses: actions/checkout@v1 - name: Install Nextflow @@ -17,8 +17,114 @@ jobs: sudo mv nextflow /usr/local/bin/ - name: Download and tag image run: | - docker pull nfcore/sarek:2.5 - docker tag nfcore/sarek:2.5 nfcore/sarek:2.5.1 + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 - name: Run test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker \ No newline at end of file + nextflow run ${GITHUB_WORKSPACE} -profile test,docker + annotation: + runs-on: ubuntu-18.04 + strategy: + matrix: + annotator: [snpeff] + specie: [GRCh37] + steps: + - uses: actions/checkout@v1 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + NXF_VER: '19.10.0' + - name: Download and tag images + run: | + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 + docker pull nfcore/sarek${{ matrix.annotator }}:dev.${{ matrix.specie }} + docker tag nfcore/sarek${{ matrix.annotator }}:dev.${{ matrix.specie }} nfcore/sarek${{ matrix.annotator }}:2.5.2.${{ matrix.specie }} + - name: Run annotation test + run: | + nextflow run . -profile test_annotation,docker --verbose --tools ${{ matrix.annotator }} + germline: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v1 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + NXF_VER: '19.10.0' + - name: Download and tag image + run: | + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 + - name: Get test data + run: | + git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data + - name: Run germline test + run: | + nextflow run . -profile test,docker --input data/testdata/tiny/normal + nextflow run . -profile test,docker --input=false --step recalibrate -resume + nextflow run . -profile test,docker --input=false --step variantCalling + minimal: + runs-on: ubuntu-18.04 + strategy: + matrix: + genome: [smallerGRCh37, minimalGRCh37] + intervals: [--no_intervals, ''] + steps: + - uses: actions/checkout@v1 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + NXF_VER: '19.10.0' + - name: Download and tag image + run: | + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 + - name: Run test for minimal genomes + run: | + nextflow run . 
-profile test,docker --skipQC all --verbose --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka + profile: + runs-on: ubuntu-18.04 + strategy: + matrix: + profile: [test_splitfastq, test_targeted] + steps: + - uses: actions/checkout@v1 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + NXF_VER: '19.10.0' + - name: Download and tag image + run: | + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 + - name: Run targeted and splitfastq tests + run: | + nextflow run . -profile ${{ matrix.profile }},docker --verbose + tools: + runs-on: ubuntu-18.04 + strategy: + matrix: + tool: [Haplotypecaller, Freebayes, Manta, mpileup, Mutect2, Strelka, TIDDIT] + steps: + - uses: actions/checkout@v1 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + NXF_VER: '19.10.0' + - name: Download and tag image + run: | + docker pull nfcore/sarek:dev + docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 + - name: Run variant calling test on specific tools + run: | + nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }} diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3892e41f2f..53f1efd031 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -4,7 +4,7 @@ on: [push, pull_request] jobs: Markdown: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v1 - uses: actions/setup-node@v1 @@ -17,7 +17,7 @@ jobs: run: | markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml YAML: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v1 - uses: actions/setup-node@v1 @@ -30,7 +30,7 @@ jobs: run: | yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") nf-core: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v1 - name: Install Nextflow diff --git a/.travis.yml b/.travis.yml index fdc9ffb066..e4f4c29ff2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - docker pull nfcore/sarek:dev # Fake the tag locally so that the pipeline runs properly # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - - docker tag nfcore/sarek:dev nfcore/sarek:2.5.1 + - docker tag nfcore/sarek:dev nfcore/sarek:2.5.2 install: # Install Nextflow @@ -30,7 +30,7 @@ install: - sudo apt-get install npm && npm install -g markdownlint-cli env: - - NXF_VER='19.04.0' # Specify a minimum NF version that should be tested and work + - NXF_VER='19.10.0' # Specify a minimum NF version that should be tested and work - NXF_VER='' # Plus: get the latest NF version and check that it works script: diff --git a/CHANGELOG.md b/CHANGELOG.md index e35bf7ad3c..7f0dcf5edf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,65 @@ All notable changes to this project will be documented in this file. -The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) -and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## [2.5.2] - Jåkkåtjkaskajekna + +Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif. 
+ +### `Added` + +- [#45](https://github.com/nf-core/sarek/pull/45) - Include Workflow figure in `README.md` +- [#46](https://github.com/nf-core/sarek/pull/46) - Add location to abstracts +- [#52](https://github.com/nf-core/sarek/pull/52) - Add support for mouse data `GRCm38` +- [#60](https://github.com/nf-core/sarek/pull/60) - Add `no_intervals` params +- [#60](https://github.com/nf-core/sarek/pull/60) - Add automatic generation of `intervals` file with `BuildIntervals` process +- [#60](https://github.com/nf-core/sarek/pull/60) - Add minimal support for minimal genome (only `fasta`, or `fasta` + `knownIndels`) +- [#60](https://github.com/nf-core/sarek/pull/60) - Add new processes (`IndexBamFile`, `IndexBamRecal`) to deal with optional usage of interval files and minimal genome +- [#60](https://github.com/nf-core/sarek/pull/60) - Add tests for minimal genome usage +- [#60](https://github.com/nf-core/sarek/pull/60) - Add new minimal genomes (`TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`) to `igenomes.config` +- [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq` +- [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ` + +### `Changed` + +- [#54](https://github.com/nf-core/sarek/pull/54) - Bump version to `2.5.2dev` +- [#60](https://github.com/nf-core/sarek/pull/60) - Some processes (`BaseRecalibrator`, `ApplyBQSR`, `Mpileup`) now have optional usage of interval files +- [#60](https://github.com/nf-core/sarek/pull/60) - Update documentation +- [#71](https://github.com/nf-core/sarek/pull/71) - Update `README` +- [#71](https://github.com/nf-core/sarek/pull/71) - Update `CHANGELOG` +- [#74](https://github.com/nf-core/sarek/pull/74) - Update docs +- [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub Actions tests) +- [#74](https://github.com/nf-core/sarek/pull/74) - Move all CI from `ci-extra.yml` to `ci.yml` + +### `Removed` + +- [#46](https://github.com/nf-core/sarek/pull/46) - Remove mention of old `build.nf` script which was included in `main.nf` +- [#74](https://github.com/nf-core/sarek/pull/74) - Remove `download_image.sh` and `run_tests.sh` scripts + +### `Fixed` + +- [#40](https://github.com/nf-core/sarek/pull/40) - Fix issue with `publishDirMode` within `test` profile +- [#42](https://github.com/nf-core/sarek/pull/42) - Fix typos, and minor updates in `README.md` +- [#43](https://github.com/nf-core/sarek/pull/43) - Fix automated `VEP` builds with `CircleCI` +- [#54](https://github.com/nf-core/sarek/pull/54) - Apply fixes from release `2.5.1` +- [#58](https://github.com/nf-core/sarek/pull/58) - Fix issue with `.interval_list` file from the `GATK` bundle [#56](https://github.com/nf-core/sarek/issues/56) that was not recognized in the `CreateIntervalsBed` process +- [#71](https://github.com/nf-core/sarek/pull/71) - Fix typos in `CHANGELOG` +- [#73](https://github.com/nf-core/sarek/pull/73) - Fix issue with label `memory_max` for `BaseRecalibrator` process [#72](https://github.com/nf-core/sarek/issues/72) + +## [2.5.1] - Årjep-Ålkatjjekna + +Årjep-Ålkatjjekna is one of the two glaciers of the Ålkatj Massif.
+ +### `Added` + +- [#53](https://github.com/nf-core/sarek/pull/53) - Release `2.5.1` + +### `Fixed` + +- [#48](https://github.com/nf-core/sarek/issues/48) - Fix `singularity.autoMounts` issue. +- [#49](https://github.com/nf-core/sarek/issues/49) - Use correct tag for annotation containers. +- [#50](https://github.com/nf-core/sarek/issues/50) - Fix paths for scripts. ## [2.5.1] - Årjep-Ålkatjjekna @@ -17,6 +74,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [2.5] - Ålkatj +Ålkatj is one of the main massifs in the Sarek National Park. + Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) template. ### `Added` @@ -25,37 +84,37 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#2](https://github.com/nf-core/sarek/pull/2), [#3](https://github.com/nf-core/sarek/pull/3), [#4](https://github.com/nf-core/sarek/pull/4), [#5](https://github.com/nf-core/sarek/pull/5), [#7](https://github.com/nf-core/sarek/pull/7), [#9](https://github.com/nf-core/sarek/pull/9), [#10](https://github.com/nf-core/sarek/pull/10), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add CI for `nf-core/sarek` - [#3](https://github.com/nf-core/sarek/pull/3) - Add preprocessing to `nf-core/sarek` - [#4](https://github.com/nf-core/sarek/pull/4) - Add variant calling to `nf-core/sarek` with `HaplotypeCaller`, and single mode `Manta` and `Strelka` -- [#5](https://github.com/nf-core/sarek/pull/5), [#34](https://github.com/nf-core/sarek/pull/34) - Add variant calling to `nf-core/sarek` with `Manta`, `Strelka`, `Strelka Best Practices`, `MuTecT2`, `FreeBayes`, `ASCAT`, `ControlFREEC` +- [#5](https://github.com/nf-core/sarek/pull/5), [#34](https://github.com/nf-core/sarek/pull/34) - Add variant calling to `nf-core/sarek` with `Manta`, `Strelka`, `Strelka Best Practices`, `Mutect2`, `FreeBayes`, `ASCAT`, `ControlFREEC` - [#6](https://github.com/nf-core/sarek/pull/6) - Add default containers for annotation to `nf-core/sarek` -- [#7](https://github.com/nf-core/sarek/pull/7) - Add MultiQC +- [#7](https://github.com/nf-core/sarek/pull/7) - Add `MultiQC` - [#7](https://github.com/nf-core/sarek/pull/7) - Add annotation - [#7](https://github.com/nf-core/sarek/pull/7) - Add social preview image in `png` and `svg` format - [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#11](https://github.com/nf-core/sarek/pull/11), [#21](https://github.com/nf-core/sarek/pull/21) - Add helper script `run_tests.sh` to run different tests - [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9) - Add automatic build of specific containers for annotation for `GRCh37`, `GRCh38` and `GRCm38` using `CircleCI` - [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11) - Add helper script `build_reference.sh` to build small reference from [nf-core/test-datasets:sarek](https://github.com/nf-core/test-datasets/tree/sarek) - [#7](https://github.com/nf-core/sarek/pull/7), [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add helper script `download_image.sh` to download containers for testing -- [#8](https://github.com/nf-core/sarek/pull/8) - Add test configation for
easier testing +- [#8](https://github.com/nf-core/sarek/pull/8) - Add test configuration for easier testing - [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11) - Add scripts for `ASCAT` - [#10](https://github.com/nf-core/sarek/pull/10) - Add `TIDDIT` to detect structural variants - [#11](https://github.com/nf-core/sarek/pull/11) - Add automatic build of specific containers for annotation for `CanFam3.1` using `CircleCI` - [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add posters and abstracts - [#12](https://github.com/nf-core/sarek/pull/12) - Add helper script `make_snapshot.sh` to make an archive for usage on a secure cluster - [#12](https://github.com/nf-core/sarek/pull/12) - Add helper scripts `filter_locifile.py` and `selectROI.py` -- [#12](https://github.com/nf-core/sarek/pull/12) - Use `label` for processes configation +- [#12](https://github.com/nf-core/sarek/pull/12) - Use `label` for processes configuration - [#13](https://github.com/nf-core/sarek/pull/13) - Add Citation documentation - [#13](https://github.com/nf-core/sarek/pull/13) - Add `BamQC` process - [#13](https://github.com/nf-core/sarek/pull/13) - Add `CompressVCFsnpEff` and `CompressVCFvep` processes - [#18](https://github.com/nf-core/sarek/pull/18) - Add `--no-reports` option for tests + add snpEff,VEP,merge to MULTIPLE test -- [#18](https://github.com/nf-core/sarek/pull/18) - Add logo to MultiQC report +- [#18](https://github.com/nf-core/sarek/pull/18) - Add logo to `MultiQC` report - [#18](https://github.com/nf-core/sarek/pull/18), [#29](https://github.com/nf-core/sarek/pull/29) - Add params `--skipQC` to skip specified QC tools - [#18](https://github.com/nf-core/sarek/pull/18) - Add possibility to download other genome for `sareksnpeff` and `sarekvep` containers - [#20](https://github.com/nf-core/sarek/pull/20) - Add `markdownlint` config file -- [#21](https://github.com/nf-core/sarek/pull/21) - Add tests for latest Nextflow version as well -- [#21](https://github.com/nf-core/sarek/pull/21) - Add `genomes.config` for genomes without AWS iGenomes -- [#24](https://github.com/nf-core/sarek/pull/24) - Added GATK4 Mutect2 calling and filtering +- [#21](https://github.com/nf-core/sarek/pull/21) - Add tests for latest `Nextflow` version as well +- [#21](https://github.com/nf-core/sarek/pull/21) - Add `genomes.config` for genomes without `AWS iGenomes` +- [#24](https://github.com/nf-core/sarek/pull/24) - Added `GATK4 Mutect2` calling and filtering - [#27](https://github.com/nf-core/sarek/pull/27), [#30](https://github.com/nf-core/sarek/pull/30) - Use Github actions for CI, linting and branch protection -- [#31](https://github.com/nf-core/sarek/pull/31) - Add nf-core lint -- [#31](https://github.com/nf-core/sarek/pull/31) - Add extra CI to GitHub Actions nf-core extra CI +- [#31](https://github.com/nf-core/sarek/pull/31) - Add `nf-core lint` +- [#31](https://github.com/nf-core/sarek/pull/31) - Add extra CI to `GitHub Actions` nf-core extra CI - [#35](https://github.com/nf-core/sarek/pull/35) - Building indexes from [nf-core/test-datasets:sarek](https://github.com/nf-core/test-datasets/tree/sarek) for CI and small tests ### `Changed` @@ -67,10 +126,10 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#7](https://github.com/nf-core/sarek/pull/8), [#23](https://github.com/nf-core/sarek/pull/23) - `--annotateVCF` is now deprecated, use `--input` instead - 
[#8](https://github.com/nf-core/sarek/pull/8), [#12](https://github.com/nf-core/sarek/pull/12) - Improve helper script `build.nf` for downloading and building reference files - [#9](https://github.com/nf-core/sarek/pull/9) - `ApplyBQSR` is now parallelized -- [#9](https://github.com/nf-core/sarek/pull/9) - Fastq files are named following "${idRun}_R1.fastq.gz" in the FastQC output for easier reporting +- [#9](https://github.com/nf-core/sarek/pull/9) - Fastq files are named following "${idRun}_R1.fastq.gz" in the `FastQC` output for easier reporting - [#9](https://github.com/nf-core/sarek/pull/9) - Status is now a map with `idpatient`, `idsample` as keys (ie: `status = statusMap[idPatient, idSample]`) - [#9](https://github.com/nf-core/sarek/pull/9) - Use `ensembl-vep` `95.2` instead of `96.0` -- [#11](https://github.com/nf-core/sarek/pull/11) - Summary HTML from VWP is now in the `Reports` directory +- [#11](https://github.com/nf-core/sarek/pull/11) - Summary HTML from `VEP` is now in the `Reports` directory - [#12](https://github.com/nf-core/sarek/pull/12) - Update configuration files - [#12](https://github.com/nf-core/sarek/pull/12) - Disable `Docker` in `singularity` profile - [#12](https://github.com/nf-core/sarek/pull/12) - Disable `Singularity` in `docker` profile @@ -83,23 +142,23 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#18](https://github.com/nf-core/sarek/pull/18) - Add `--no-reports` for all tests but MULTIPLE in Jenkins - [#18](https://github.com/nf-core/sarek/pull/18), [#29](https://github.com/nf-core/sarek/pull/29) - `--noReports` is now `--skipQC all` - [#18](https://github.com/nf-core/sarek/pull/18), [#21](https://github.com/nf-core/sarek/pull/21) - Update logo -- [#21](https://github.com/nf-core/sarek/pull/21) - Moved smallGRCh37 path to `genomes.config` +- [#21](https://github.com/nf-core/sarek/pull/21) - Moved `smallGRCh37` path to `genomes.config` - [#23](https://github.com/nf-core/sarek/pull/23) - Rename `genomeFile`, `genomeIndex` and `genomeDict` by `fasta`, `fastaFai` and `dict` - [#23](https://github.com/nf-core/sarek/pull/23) - `--sample` is now deprecated, use `--input` instead - [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeFile` is now deprecated, use `--fasta` instead - [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeIndex` is now deprecated, use `--fastaFai` instead - [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeDict` is now deprecated, use `--dict` instead -- [#24](https://github.com/nf-core/sarek/pull/24) - iGenomes config now contains germline resource for GATK4 Mutect2 +- [#24](https://github.com/nf-core/sarek/pull/24) - `AWS iGenomes` config now contains germline resource for `GATK4 Mutect2` - [#30](https://github.com/nf-core/sarek/pull/30) - Simplify code for `MapReads` process -- [#24](https://github.com/nf-core/sarek/pull/24) - iGenomes config now contains germline resource for `GATK4 Mutect2` -- [#31](https://github.com/nf-core/sarek/pull/31) - Move extra CI to GitHub Actions nf-core extra CI +- [#24](https://github.com/nf-core/sarek/pull/24) - `AWS iGenomes` config now contains germline resource for `GATK4 Mutect2` +- [#31](https://github.com/nf-core/sarek/pull/31) - Move extra CI to `GitHub Actions` nf-core extra CI - [#32](https://github.com/nf-core/sarek/pull/32), [#33](https://github.com/nf-core/sarek/pull/33) - Install `ASCAT` with `conda` in the `environment.yml` file -- [#33](https://github.com/nf-core/sarek/pull/33) - use `workflow.manifest.version` to specify 
workflow version in path to scripts for `ControlFREEC` and `VEP` processes - [#35](https://github.com/nf-core/sarek/pull/35) - Building indexes is now done in `main.nf` - [#35](https://github.com/nf-core/sarek/pull/35) - `build.nf` script now only downloads cache, so renamed to `downloadcache.nf` - [#35](https://github.com/nf-core/sarek/pull/35) - Use `tabix` instead of `IGVtools` to build vcf indexes - [#35](https://github.com/nf-core/sarek/pull/35) - Refactor references handling -- [#35](https://github.com/nf-core/sarek/pull/35) - use Channel values instead of `referenceMap` +- [#35](https://github.com/nf-core/sarek/pull/35) - Use Channel values instead of `referenceMap` - [#37](https://github.com/nf-core/sarek/pull/37) - Bump version for Release - [#38](https://github.com/nf-core/sarek/pull/38) - File names before merge are based on `${idSample}_${idRun}` instead of `${idRun}` @@ -109,8 +168,8 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#13](https://github.com/nf-core/sarek/pull/13) - Removed `BamQCmapped` and `BamQCrecalibrated` processes - [#13](https://github.com/nf-core/sarek/pull/13) - Removed `CompressVCF` - [#18](https://github.com/nf-core/sarek/pull/18) - Removed params `--noReports` -- [#24](https://github.com/nf-core/sarek/pull/18) - Removed GATK3.X MuTect2 -- [#31](https://github.com/nf-core/sarek/pull/31) - Remove extra CI from Travis CI and GitHub Actions nf-core CI +- [#24](https://github.com/nf-core/sarek/pull/24) - Removed `GATK3.X Mutect2` +- [#31](https://github.com/nf-core/sarek/pull/31) - Remove extra CI from `Travis CI` and `GitHub Actions` nf-core CI - [#32](https://github.com/nf-core/sarek/pull/32), [#35](https://github.com/nf-core/sarek/pull/35) - Clean up `environment.yml` file - [#35](https://github.com/nf-core/sarek/pull/35) - Remove building indexes from `build.nf` script - [#35](https://github.com/nf-core/sarek/pull/35) - Remove helper script `build_reference.sh` @@ -120,21 +179,21 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` -- [#3](https://github.com/nf-core/sarek/pull/3) - Fix Docker ownership +- [#3](https://github.com/nf-core/sarek/pull/3) - Fix `Docker` ownership - [#11](https://github.com/nf-core/sarek/pull/11) - Fix `MergeMpileup` PublishDir - [#13](https://github.com/nf-core/sarek/pull/13) - Fix merge in annotation - [#14](https://github.com/nf-core/sarek/pull/14) - Fix output name for vcf files -- [#16](https://github.com/nf-core/sarek/pull/16) - Fix path to Rscript +- [#16](https://github.com/nf-core/sarek/pull/16) - Fix path to `Rscript` - [#18](https://github.com/nf-core/sarek/pull/18) - Improve cpu usage -- [#18](https://github.com/nf-core/sarek/pull/18) - Use same font for nf-core and sarek in ascii art +- [#18](https://github.com/nf-core/sarek/pull/18) - Use same font for `nf-core` and `sarek` in ascii art - [#20](https://github.com/nf-core/sarek/pull/20) - Use new logo in README - [#20](https://github.com/nf-core/sarek/pull/20) - Fix path to reference genomes - [#22](https://github.com/nf-core/sarek/pull/22) - Fix `--singleCPUMem` issue -- [#30](https://github.com/nf-core/sarek/pull/30) - fix choice between `inputPairReadsFastQC` and `inputBAMFastQC` channels +- [#30](https://github.com/nf-core/sarek/pull/30) - Fix choice between `inputPairReadsFastQC` and `inputBAMFastQC` channels
- [#31](https://github.com/nf-core/sarek/pull/31) - Fix badges according to nf-core lint -- [#31](https://github.com/nf-core/sarek/pull/31) - Fix rcolorbrewer version according to nf-core lint +- [#31](https://github.com/nf-core/sarek/pull/31) - Fix `rcolorbrewer` version according to nf-core lint - [#33](https://github.com/nf-core/sarek/pull/33) - Fix MD Linting -- [#38](https://github.com/nf-core/sarek/pull/38) - Avoid collision in MultiQC +- [#38](https://github.com/nf-core/sarek/pull/38) - Avoid collision in `MultiQC` - [#39](https://github.com/nf-core/sarek/pull/39) - Fix `ch_dbsnp` channel ### `Deprecated` @@ -149,10 +208,12 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` -- [#742](https://github.com/SciLifeLab/Sarek/pull/742) - Fix output dirs (HaplotypeCaller that was not recognized by annotate.nf introduced by [#728](https://github.com/SciLifeLab/Sarek/pull/728)) +- [#742](https://github.com/SciLifeLab/Sarek/pull/742) - Fix output dirs (`HaplotypeCaller` that was not recognized by `annotate.nf` introduced by [#728](https://github.com/SciLifeLab/Sarek/pull/728)) ## [2.3] - Äpar - 2019-02-27 +Äpar is one of the main massifs in the Sarek National Park. + ### `Added` - [#628](https://github.com/SciLifeLab/Sarek/pull/628), [#722](https://github.com/SciLifeLab/Sarek/pull/722) - `ASCAT` now uses `.gc` file @@ -164,50 +225,50 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Update `Sarek-data` submodule with multiple patients TSV file - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Add `cadd_WG_SNVs`, `cadd_WG_SNVs_tbi`, `cadd_InDels`, `cadd_InDels_tbi` and `cadd_cache` params -- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Add tabix indexed cache for VEP +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Add `tabix` indexed cache for `VEP` - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - New `DownloadCADD` process to download CADD files - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Specify values for `cadd_WG_SNVs`, `cadd_WG_SNVs_tbi`, `cadd_InDels`, `cadd_InDels_tbi` and `cadd_cache` params in `munin.conf` file - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Use `cadd_cache` param for optional use of CADD VEP plugin in `annotate.nf` -- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - VEP cache has now fasta files for `--HGVS` -- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added `--exome` for Manta, and for StrelkaBP -- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added Travis CI test for targeted +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - `VEP` cache now has fasta files for `--HGVS` +- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added `--exome` for `Manta`, and for `StrelkaBP` +- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added `Travis CI` test for targeted ### `Changed` - [#710](https://github.com/SciLifeLab/Sarek/pull/710) - Improve release checklist and script - [#711](https://github.com/SciLifeLab/Sarek/pull/711) - Improve configuration priorities -- [#716](https://github.com/SciLifeLab/Sarek/pull/716) - Update paths to containers and iGenomes +- [#716](https://github.com/SciLifeLab/Sarek/pull/716) - Update paths to containers and `AWS iGenomes` - [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `checkFileExtension` has
changed to `hasExtension`, and now only verifies if file has extension - [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `fastqFiles` renamed to `inputFiles` - [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `mapping` step can now map BAM files too - [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `MapReads` can now convert BAM to FASTQ and feed it to BWA on the fly - [#717](https://github.com/SciLifeLab/Sarek/pull/717), [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Update documentation -- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with conda +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with `conda` - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `vepCacheVersion` is now defined in `conf/genomes.config` or `conf/igenomes.config` - [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` - [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Update `Sarek-data` submodule - [#723](https://github.com/SciLifeLab/Sarek/pull/723), [#725](https://github.com/SciLifeLab/Sarek/pull/725) - Update docs -- [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved AwsBatch configuration +- [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved `AWS Batch` configuration - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Improved usage of `targetBED` params -- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Strelka Best Practices output is now prefixed with `StrelkaBP_` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - `Strelka` Best Practices output is now prefixed with `StrelkaBP_` - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - VCFs and Annotated VCFs are now ordered by Patient, then tools - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Merge `buildContainers.nf` and `buildReferences.nf` in `build.nf` - [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Reduce number of CPUs for `RunVEP` to `4` cf: [VEP docs](https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html) -- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Update VEP from `95.1` to `95.2` +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Update `VEP` from `95.1` to `95.2` ### `Removed` - [#715](https://github.com/SciLifeLab/Sarek/pull/715) - Remove `defReferencesFiles` function from `buildReferences.nf` - [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpEff` base container is no longer used -- [#721](https://github.com/SciLifeLab/Sarek/pull/721) - Remove COSMIC docs +- [#721](https://github.com/SciLifeLab/Sarek/pull/721) - Remove `COSMIC` docs - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Remove `defineDirectoryMap()` -- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Removed `--database` option for VEP cf: [VEP docs](https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html) +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Remove `--database` option for VEP cf: [VEP docs](https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html) ### `Fixed` -- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - bamQC is now run on the recalibrated bams, and not after MarkDuplicates -- [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix Ascat ref file input (one file can't be a set) -- [#727](https://github.com/SciLifeLab/Sarek/pull/727) - bamQC outputs are no longer overwritten (name of dir is now the file instead of
sample) +- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated bams, and not after `MarkDuplicates` +- [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix `Ascat` ref file input (one file can't be a set) +- [#727](https://github.com/SciLifeLab/Sarek/pull/727) - `bamQC` outputs are no longer overwritten (name of dir is now the file instead of sample) - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix multi sample TSV file [#691](https://github.com/SciLifeLab/Sarek/issues/691) - [#733](https://github.com/SciLifeLab/Sarek/pull/733) - Fix the possibility to specify reference files on the command line @@ -219,31 +280,31 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs - [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds` -- [#692](https://github.com/SciLifeLab/Sarek/pull/692), [#697](https://github.com/SciLifeLab/Sarek/pull/697) - Add AWS iGenomes possibilities (within `conf/igenomes.conf`) +- [#692](https://github.com/SciLifeLab/Sarek/pull/692), [#697](https://github.com/SciLifeLab/Sarek/pull/697) - Add `AWS iGenomes` possibilities (within `conf/igenomes.conf`) - [#694](https://github.com/SciLifeLab/Sarek/pull/694) - Add monochrome and grey logos for light or dark background - [#698](https://github.com/SciLifeLab/Sarek/pull/698) - Add btb profile for munin server -- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Add font-ttf-dejavu-sans-mono `2.37` and fontconfig `2.12.6` to container +- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Add `font-ttf-dejavu-sans-mono` `2.37` and `fontconfig` `2.12.6` to container ### `Changed` - [#663](https://github.com/SciLifeLab/Sarek/pull/663) - Update `do_release.sh` script -- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params +- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - `publishDir` modes are now params - [#677](https://github.com/SciLifeLab/Sarek/pull/677), [#698](https://github.com/SciLifeLab/Sarek/pull/698), [#703](https://github.com/SciLifeLab/Sarek/pull/703) - Update docs -- [#678](https://github.com/SciLifeLab/Sarek/pull/678) - Changing VEP to v92 and adjusting CPUs for VEP -- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration -- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for awsbatch -- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - Qualimap bamQC is now ran after mapping and after recalibration for better QC -- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update GATK to `4.0.9.0` -- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Update FastQC to `0.11.8` -- [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Change `--TMP_DIR` by `--tmp-dir` for GATK `4.0.9.0` BaseRecalibrator -- [#706](https://github.com/SciLifeLab/Sarek/pull/706) - Update TravisCI testing +- [#678](https://github.com/SciLifeLab/Sarek/pull/678) - Changing `VEP` to `v92` and adjusting CPUs for `VEP` +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - 
Update old `awsbatch` configuration +- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for `awsbatch` +- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - `Qualimap bamQC` is now run after mapping and after recalibration for better QC +- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update `GATK` to `4.0.9.0` +- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Update `FastQC` to `0.11.8` +- [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Change `--TMP_DIR` to `--tmp-dir` for `GATK` `4.0.9.0` `BaseRecalibrator` +- [#706](https://github.com/SciLifeLab/Sarek/pull/706) - Update `Travis CI` testing ### `Fixed` -- [#665](https://github.com/SciLifeLab/Sarek/pull/665) - Input bam file now has always the same name (whether it is from a single fastq pair or multiple) in the MarkDuplicates process, so metrics too -- [#672](https://github.com/SciLifeLab/Sarek/pull/672) - process `PullSingularityContainers` from `buildContainers.nf` now expect a file with the correct `.simg` extension for singularity images, and no longer the `.img` one. -- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add publishDirMode for `germlineVC.nf` -- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Fix [#699](https://github.com/SciLifeLab/Sarek/issues/699) missing DP in the FORMAT column VCFs for MuTect2 +- [#665](https://github.com/SciLifeLab/Sarek/pull/665) - Input bam file now always has the same name (whether it is from a single fastq pair or multiple) in the `MarkDuplicates` process, so metrics too +- [#672](https://github.com/SciLifeLab/Sarek/pull/672) - Process `PullSingularityContainers` from `buildContainers.nf` now expects a file with the correct `.simg` extension for singularity images, and no longer the `.img` one. +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add `publishDirMode` for `germlineVC.nf` +- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Fix [#699](https://github.com/SciLifeLab/Sarek/issues/699) missing DP in the FORMAT column VCFs for Mutect2 - [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Fix [#701](https://github.com/SciLifeLab/Sarek/issues/701) - [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Fix [#704](https://github.com/SciLifeLab/Sarek/issues/704) @@ -252,16 +313,18 @@ ### `Changed` - [#646](https://github.com/SciLifeLab/Sarek/pull/646) - Update [`pathfindr`](https://github.com/NBISweden/pathfindr) submodule -- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Update Nextflow to `0.32.0` +- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Update `Nextflow` to `0.32.0` - [#660](https://github.com/SciLifeLab/Sarek/pull/660) - Update docs ### `Fixed` - [#657](https://github.com/SciLifeLab/Sarek/pull/657) - Fix `RunMultiQC.nf` bug -- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Fix bugs due to updating Nextflow +- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Fix bugs due to updating `Nextflow` ## [2.2.0] - Skårki - 2018-09-21 +Skårki is one of the main massifs in the Sarek National Park.
+ ### `Added` - [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request) @@ -276,23 +339,23 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Changed` -- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version +- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update `Nextflow` required version - [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines` -- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG +- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update `CHANGELOG` - [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Improve install script - [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Simplify tests - [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs - [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config -- [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes +- [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus for `FastQC` processes - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs - [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST -- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - MultiQC 1.5 -> 1.6 -- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Qualimap 2.2.2a -> 2.2.2b -- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update conda channel order priorities -- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - VCFanno 0.2.8 -> 0.3.0 -- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - VCFtools 0.1.15 -> 0.1.16 +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `MultiQC` 1.5 -> 1.6 +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `Qualimap` 2.2.2a -> 2.2.2b +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update `conda` channel order priorities +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `VCFanno` 0.2.8 -> 0.3.0 +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `VCFtools` 0.1.15 -> 0.1.16 ### `Removed` @@ -302,34 +365,36 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` -- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix VEP tests +- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix `VEP` tests - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Fix links in MD files ## [2.1.0] - Ruotes - 2018-08-14 +Ruotes is one of the main massifs in the Sarek National Park.
+ ### `Added` - [#555](https://github.com/SciLifeLab/Sarek/pull/555) - `snpEff` output into `VEP` - [#556](https://github.com/SciLifeLab/Sarek/pull/556) - `Strelka` Best Practices - [#563](https://github.com/SciLifeLab/Sarek/pull/563) - Use `SnpEFF` reports in `MultiQC` - [#568](https://github.com/SciLifeLab/Sarek/pull/568) - `VCFTools` process `RunVcftools` for QC -- [#574](https://github.com/SciLifeLab/Sarek/pull/574), [#580](https://github.com/SciLifeLab/Sarek/pull/580) - Abstracts for NPMI, JOBIM and EACR25 +- [#574](https://github.com/SciLifeLab/Sarek/pull/574), [#580](https://github.com/SciLifeLab/Sarek/pull/580) - Abstracts for `NPMI`, `JOBIM` and `EACR25` - [#577](https://github.com/SciLifeLab/Sarek/pull/577) - New repository for testing: [Sarek-data](https://github.com/SciLifeLab/Sarek-data) - [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New library `QC` for functions `bamQC`, `bcftools`, `samtoolsStats`, `vcftools`, `getVersionBCFtools`, `getVersionGATK`, `getVersionManta`, `getVersionSnpEFF`, `getVersionStrelka`, `getVersionVCFtools`, `getVersionVEP` - [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New Processes `GetVersionBCFtools`, `GetVersionGATK`, `GetVersionManta`, `GetVersionSnpEFF`, `GetVersionStrelka`, `GetVersionVCFtools`, `GetVersionVEP` -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - new Python script `bin/scrape_tool_versions.py` inspired by @ewels and @apeltzer +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New `Python` script `bin/scrape_tool_versions.py` inspired by @ewels and @apeltzer - [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New QC Process `RunVcftools` -- [#596](https://github.com/SciLifeLab/Sarek/pull/596) - New profile for BinAC cluster +- [#596](https://github.com/SciLifeLab/Sarek/pull/596) - New profile for `BinAC` cluster - [#597](https://github.com/SciLifeLab/Sarek/pull/597) - New function `sarek_ascii()` in `SarekUtils` - [#599](https://github.com/SciLifeLab/Sarek/pull/599), [#602](https://github.com/SciLifeLab/Sarek/pull/602) - New Process `CompressVCF` -- [#601](https://github.com/SciLifeLab/Sarek/pull/601), [#603](https://github.com/SciLifeLab/Sarek/pull/603) - Container for GATK4 +- [#601](https://github.com/SciLifeLab/Sarek/pull/601), [#603](https://github.com/SciLifeLab/Sarek/pull/603) - Container for `GATK4` - [#606](https://github.com/SciLifeLab/Sarek/pull/606) - Add test data as a submodule from [`Sarek-data`](https://github.com/SciLifeLab/Sarek-data) - [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Add documentation on how to install Nextflow on `bianca` ### `Changed` - [#557](https://github.com/SciLifeLab/Sarek/pull/557), [#583](https://github.com/SciLifeLab/Sarek/pull/583), [#585](https://github.com/SciLifeLab/Sarek/pull/585), [#588](https://github.com/SciLifeLab/Sarek/pull/588) - Update help -- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - GitHub langage for the repository is now `Nextflow` +- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - `GitHub` language for the repository is now `Nextflow` - [#561](https://github.com/SciLifeLab/Sarek/pull/561) - `do_all.sh` builds containers for only one genome reference (default `GRCh38`) - [#571](https://github.com/SciLifeLab/Sarek/pull/571) - Only one container for all QC tools - [#582](https://github.com/SciLifeLab/Sarek/pull/582), [#587](https://github.com/SciLifeLab/Sarek/pull/587) - Update figures @@ -342,10 +407,10 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) -
- [#599](https://github.com/SciLifeLab/Sarek/pull/599) - Merge is tested with `ANNOTATEALL`
- [#604](https://github.com/SciLifeLab/Sarek/pull/604) - Synching `GRCh38` `wgs_calling_regions` bedfiles
- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - One container approach
-- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Update to GATK4
-- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version
-- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG
-- [#617](https://github.com/SciLifeLab/Sarek/pull/617) - Replace deprecated $name syntax with withName
+- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Update to `GATK4`
+- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update `Nextflow` required version
+- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update `CHANGELOG`
+- [#617](https://github.com/SciLifeLab/Sarek/pull/617) - Replace deprecated `Nextflow` `$name` syntax with `withName`

### `Fixed`

@@ -353,24 +418,26 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/)
- [#566](https://github.com/SciLifeLab/Sarek/pull/566) - `slurmDownload` profile
- [#579](https://github.com/SciLifeLab/Sarek/pull/579), [#584](https://github.com/SciLifeLab/Sarek/pull/584) - `Manta` output reorganized after modification for `Strelka Best Practices` process
- [#585](https://github.com/SciLifeLab/Sarek/pull/585) - Trace file is plain txt
-- [#590](https://github.com/SciLifeLab/Sarek/pull/590), [#593](https://github.com/SciLifeLab/Sarek/pull/593) - Fix Singularity installation in Travis CI testing
-- [#598](https://github.com/SciLifeLab/Sarek/pull/598), [#601](https://github.com/SciLifeLab/Sarek/pull/601) - Fixes for Python script `selectROI.py` to work with CLC viewer
+- [#590](https://github.com/SciLifeLab/Sarek/pull/590), [#593](https://github.com/SciLifeLab/Sarek/pull/593) - Fix `Singularity` installation in `Travis CI` testing
+- [#598](https://github.com/SciLifeLab/Sarek/pull/598), [#601](https://github.com/SciLifeLab/Sarek/pull/601) - Fixes for `Python` script `selectROI.py` to work with `CLC` viewer

### `Removed`

-- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Remove Mutect1
+- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Remove `Mutect1`

## [2.0.0] - 2018-03-23

+First release under the `Sarek` name, from the National Park in Northern Sweden.
+
### `Added`

-- basic wrapper script
+- Basic wrapper script
- Abstract, posters and figures
-- ROI selector and FreeBayes sanitizer scripts
+- ROI selector and `FreeBayes` sanitizer scripts
- New logo and icon for the project
-- check for existing tumor/normal channel
+- Check for existing tumor/normal channel
- `SarekUtils` with `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` functions
-- some `runOptions` for `docker` (prevent some user right problem)
+- Some `runOptions` for `docker` (prevents some user rights problems)
- This `CHANGELOG`

### `Changed`

@@ -379,22 +446,22 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/)
- Dissect Workflow in 5 new scripts: `annotate.nf`, `main.nf`, `germlineVC.nf`, `runMultiQC.nf` and `somaticVC.nf`
- `report.html`, `timeline.html` and `trace.html` are generated in `Reports/`
- `--version` is now used to define the workflow version
-- most params are now defined in the base.config file instead of in the scripts
-- update RELEASE_CHECKLIST.md
+- Most params are now defined in the `base.config` file instead of in the 
scripts
+- Update `RELEASE_CHECKLIST.md`
- `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` in script functions are now called within `SarekUtils`
- `nf_required_version` is now `params.nfRequiredVersion`
-- in `buildReferences.nf` script, channels now begin by `ch_`, and files by `f_`
-- use `PublishDir mode: 'link'` instead of `copy`
+- In the `buildReferences.nf` script, channels now begin with `ch_`, and files with `f_`
+- Use `PublishDir mode: 'link'` instead of `copy`
- `directoryMap` now contains `params.outDir`
-- [#539](https://github.com/SciLifeLab/Sarek/issues/539) - use Nextflow support of scratch
-- reordered Travis CI tests
-- update documentation
+- [#539](https://github.com/SciLifeLab/Sarek/issues/539) - Use Nextflow support of scratch
+- Reordered `Travis CI` tests
+- Update documentation
- `MultiQC` version in container from v`1.4` to v`1.5`
- `vepgrch37` container base image from `release_90.6` to `release_92`
- `vepgrch38` container base image from `release_90.6` to `release_92`
- `VEP` version in containers from v`90` to v`91`
- `nucleotidesPerSecond` is now `params.nucleotidesPerSecond`
-- default `params.tag` is now `latest` instead of current version, so --tag needs to be specified with the right version to be sure of using the `containers` corresponding
+- Default `params.tag` is now `latest` instead of the current version, so `--tag` needs to be specified with the right version to be sure of using the corresponding containers

### `Deprecated`

@@ -404,22 +471,22 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/)

### `Removed`

- `scripts/skeleton_batch.sh`
-- old data and tsv files
-- UPPMAX directories from containers
+- Old data and tsv files
+- `UPPMAX` directories from containers
- `--step` in `annotate.nf`, `germlineVC.nf` and `somatic.nf`
-- some `runOptions` for Singularity (binding not needed anymore on UPPMAX)
+- Some `runOptions` for `Singularity` (binding not needed anymore on `UPPMAX`)
- `download` profile

### `Fixed`

-- [#530](https://github.com/SciLifeLab/Sarek/issues/530) - use `$PWD` for default `outDir`
+- [#530](https://github.com/SciLifeLab/Sarek/issues/530) - Use `$PWD` for default `outDir`
- [#533](https://github.com/SciLifeLab/Sarek/issues/533) - Replace `VEP` `--pick` option by `--per_gene`

## [1.2.5] - 2018-01-18

### `Added`

-- Zenodo for DOI
+- `Zenodo` for DOI
- Delivery README
- Document use of the `--sampleDir` option
- Contributing Guidelines
@@ -428,22 +495,22 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/)
- `--outDir`
- `awsbatch` profile
- `aws-batch.config` config file
-- `--noBAMQC` params (failing sometimes on Bianca)
+- `--noBAMQC` params (failing sometimes on `Bianca`)

### `Changed`

-- Update `Nextflow` to `0.26.0` (new fancy report + AWS Batch)
-- Extra time on Travis CI testing
+- Update `Nextflow` to `0.26.0` (new fancy report + `AWS Batch`)
+- Extra time on `Travis CI` testing
- Replace `bundleDir` by `params.genome_base`
-- Update `MultiQC` to `1.3` (MEGAQC FTW)
+- Update `MultiQC` to `1.3` (`MEGAQC` FTW)
- Move and rename some test files

### `Fixed`

-- Version of COSMIC GRCh37 v83
+- Version of `COSMIC` `GRCh37` `v83`
- Write an error message when `--sampleDir` does not find any FASTQ files
-- `base.config` for ConcatVCF process
-- File specification for recalibrationReport in RecalibrateBam process (got error on AWS Batch)
+- `base.config` for `ConcatVCF` process
+- File specification for `recalibrationReport` in 
`RecalibrateBam` process (got error on `AWS Batch`) ## [1.2.4] - 2017-10-27 @@ -457,7 +524,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` -- [#357](https://github.com/SciLifeLab/Sarek/issues/357) - `ASCAT` works for GRCh38 +- [#357](https://github.com/SciLifeLab/Sarek/issues/357) - `ASCAT` works for `GRCh38` - [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on `/scratch` - [#475](https://github.com/SciLifeLab/Sarek/issues/475) - 16 cpus for local executor - [#480](https://github.com/SciLifeLab/Sarek/issues/480) - No `tsv` file needed for step `annotate` @@ -483,7 +550,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Fixed` - [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on /scratch -- [#472](https://github.com/SciLifeLab/Sarek/issues/472) - Update function to check Nextflow version +- [#472](https://github.com/SciLifeLab/Sarek/issues/472) - Update function to check `Nextflow` version - [#473](https://github.com/SciLifeLab/Sarek/issues/473) - Remove `returnMin()` function ## [1.2.0] - 2017-10-02 @@ -496,14 +563,14 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Added` -- Singularity possibilities +- `Singularity` possibilities ### `Changed` - Reports made by default - Intervals file can be a bed file -- Normal sample preprocessing + HaplotypeCaller is possible -- Better Travis CI tests +- Normal sample preprocessing + `HaplotypeCaller` is possible +- Better `Travis CI` tests ### `Fixed` @@ -513,7 +580,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### `Added` -- Docker possibilities +- `Docker` possibilities ## [0.9] - 2016-11-16 diff --git a/Dockerfile b/Dockerfile index c493eab98f..84d313d85a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,4 +4,4 @@ LABEL authors="Maxime Garcia, Szilveszter Juhos" \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-sarek-2.5.1/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-sarek-2.5.2/bin:$PATH diff --git a/Jenkinsfile b/Jenkinsfile index ffe06115b0..71ea4d76b7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -8,35 +8,46 @@ pipeline { stages { stage('Docker setup') { steps { - sh "./scripts/download_image.sh -n docker -t ALL --source-version 2.5 --target-version 2.5.1 -g smallGRCh37" + sh "docker pull nfcore/sarek:dev" + sh "docker tag nfcore/sarek:dev nfcore/sarek:2.5.2" + sh "docker pull nfcore/sareksnpeff:dev.GRCh37" + sh "docker tag nfcore/sareksnpeff:dev.GRCh37 nfcore/sareksnpeff:2.5.2.GRCh37" + sh "docker pull nfcore/sarekvep:dev.GRCh37" + sh "docker tag nfcore/sarekvep:dev.GRCh37 nfcore/sarekvep:2.5.2.GRCh37" + } + } + stage('Annotation') { + steps { + sh "nextflow run . -profile test_annotation,kraken --verbose --tools snpeff,vep,merge" } } stage('Germline') { steps { sh "rm -rf data/" sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data" - sh "./scripts/run_tests.sh --profile kraken --test GERMLINE --no-reports" + sh "nextflow run . -profile test,kraken --input data/testdata/tiny/normal" + sh "nextflow run . -profile test,kraken --input=false --step recalibrate -resume" + sh "nextflow run . -profile test,kraken --input=false --step variantCalling" sh "rm -rf data/" } } - stage('Somatic') { + stage('Minimal') { steps { - sh "./scripts/run_tests.sh --profile kraken --test SOMATIC --no-reports" + sh "nextflow run . 
-profile test,kraken --skipQC all --verbose --genome smallerGRCh37 --no_intervals --tools Manta,mpileup,Strelka" + sh "nextflow run . -profile test,kraken --skipQC all --verbose --genome smallerGRCh37 --tools Manta,mpileup,Strelka" + sh "nextflow run . -profile test,kraken --skipQC all --verbose --genome minimalGRCh37 --no_intervals --tools Manta,mpileup,Strelka" + sh "nextflow run . -profile test,kraken --skipQC all --verbose --genome minimalGRCh37 --tools Manta,mpileup,Strelka" } } - stage('Targeted') { - steps { - sh "./scripts/run_tests.sh --profile kraken --test TARGETED --no-reports" - } - } - stage('Annotation') { + stage('Profile') { steps { - sh "./scripts/run_tests.sh --profile kraken --test ANNOTATEBOTH --no-reports" + sh "nextflow run . -profile test_splitfastq,kraken --verbose" + sh "nextflow run . -profile test_targeted,kraken --verbose" } } - stage('Multiple') { + stage('Tools') { steps { - sh "./scripts/run_tests.sh --profile kraken --test MULTIPLE" + sh "nextflow run . -profile test_tool,kraken --verbose --tools Haplotypecaller,Freebayes,Manta,mpileup,Mutect2,Strelka" } } } diff --git a/README.md b/README.md index bbf9718ad7..f2528f51e3 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,23 @@ -# [![Sarek](docs/images/nf-core_sarek_logo.png "Sarek")](https://sarek.scilifelab.se/) +# [![Sarek](docs/images/nf-core_sarek_logo.png "Sarek")](https://nf-co.re/sarek) > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) -[![Travis build status](https://img.shields.io/travis/nf-core/sarek.svg)](https://travis-ci.com/nf-core/sarek/) -[![CircleCi build status](https://img.shields.io/circleci/project/github/nf-core/sarek.svg)](https://circleci.com/gh/nf-core/sarek/) +[![GitHub Actions CI status](https://github.com/nf-core/sarek/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22sarek+CI%22) +[![GitHub Actions Linting status](https://github.com/nf-core/sarek/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22sarek+linting%22) +[![CircleCi build status](https://img.shields.io/circleci/project/github/nf-core/sarek?logo=circleci)](https://circleci.com/gh/nf-core/sarek/) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) [![Docker Container available](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek/) [![Install with Singularity](https://img.shields.io/badge/use%20with-singularity-purple.svg)](https://www.sylabs.io/docs/) -[![Join us on Slack](https://img.shields.io/badge/slack-nfcore/sarek-blue.svg)](https://nfcore.slack.com/messages/CGFUX04HZ/) - +[![Join us on Slack](https://img.shields.io/badge/slack-nfcore/sarek-blue.svg)](https://nfcore.slack.com/channels/sarek) ## Introduction - - -Previously known as the Cancer Analysis Workflow (CAW), Sarek is a workflow designed to run analyses on whole genome or targeted sequencing data from regular samples or tumour / normal pairs and could include additional 
relapses. It's built using [Nextflow](https://www.nextflow.io), @@ -29,7 +26,11 @@ across multiple compute infrastructures in a very portable manner. Software dependencies are handled using [Conda](https://conda.io/), [Docker](https://www.docker.com) or [Singularity](https://www.sylabs.io/singularity/) - environment/container technologies that provide excellent reproducibility and ease of use. Thus making installation trivial and results highly reproducible. -It's listed on the [Elixir - Tools and Data Services Registry](https://bio.tools/Sarek), [Dockstore](https://dockstore.org/workflows/github.com/SciLifeLab/Sarek/) and [omicX - Bioinformatics tools](https://omictools.com/sarek-tool). +

+[Sarek workflow figure: docs/images/sarek_workflow.png (inline image markup stripped)]

+ +It's listed on the [Elixir - Tools and Data Services Registry](https://bio.tools/Sarek), [Dockstore](https://dockstore.org/workflows/github.com/nf-core/sarek) and [omicX - Bioinformatics tools](https://omictools.com/sarek-tool). ## Documentation @@ -95,7 +96,7 @@ For further information or help, don't hesitate to get in touch on [Slack](https ## Citation -If you use nf-core/sarek for your analysis, please cite the `Sarek` pre-print as follows: +If you use `nf-core/sarek` for your analysis, please cite the `Sarek` pre-print as follows: > Garcia MU, Juhos S, Larsson M, Olason PI, Martin M, Eisfeldt J, DiLorenzo S, Sandgren J, de Ståhl TD, Wirta V, Nistér M, Nystedt B, Käller M. **Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants**. *bioRxiv*. 2018. p. 316976. [doi: 10.1101/316976](https://www.biorxiv.org/content/10.1101/316976v1). You can cite the sarek zenodo record for a specific version using the following [doi: 10.5281/zenodo.3476426](https://zenodo.org/badge/latestdoi/184289291) diff --git a/conf/awsbatch.config b/conf/awsbatch.config deleted file mode 100644 index 14af5866f5..0000000000 --- a/conf/awsbatch.config +++ /dev/null @@ -1,18 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running on AWS batch - * ------------------------------------------------- - * Base config needed for running with -profile awsbatch - */ -params { - config_profile_name = 'AWSBATCH' - config_profile_description = 'AWSBATCH Cloud Profile' - config_profile_contact = 'Alexander Peltzer (@apeltzer)' - config_profile_url = 'https://aws.amazon.com/de/batch/' -} - -aws.region = params.awsregion -process.executor = 'awsbatch' -process.queue = params.awsqueue -executor.awscli = '/home/ec2-user/miniconda/bin/aws' -params.tracedir = './' diff --git a/conf/base.config b/conf/base.config index 833f7e8afe..8640cad463 100644 --- a/conf/base.config +++ b/conf/base.config @@ -54,10 +54,7 @@ process { // (exit code 141). Rerunning the process will usually work. errorStrategy = {task.exitStatus == 141 ? 'retry' : 'terminate'} } - withName:FastQCBAM { - errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} - } - withName:FastQCFQ { + withLabel:FastQC { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } withName:MapReads { @@ -68,11 +65,11 @@ process { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } withName:Snpeff { - container = {(params.annotation_cache && params.snpEff_cache) ? 'nfcore/sarek:2.5.1' : "nfcore/sareksnpeff:2.5.1.${params.genome}"} + container = {(params.annotation_cache && params.snpEff_cache) ? 'nfcore/sarek:2.5.2' : "nfcore/sareksnpeff:2.5.2.${params.genome}"} errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } withLabel:VEP { - container = {(params.annotation_cache && params.vep_cache) ? 'nfcore/sarek:2.5.1' : "nfcore/sarekvep:2.5.1.${params.genome}"} + container = {(params.annotation_cache && params.vep_cache) ? 'nfcore/sarek:2.5.2' : "nfcore/sarekvep:2.5.2.${params.genome}"} errorStrategy = {task.exitStatus == 143 ? 
'retry' : 'ignore'} } } \ No newline at end of file diff --git a/conf/genomes.config b/conf/genomes.config index 7d7527b21a..11c449ba55 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -25,8 +25,9 @@ params { intervals = "${params.genomes_base}/wgs_calling_regions_Sarek.list" knownIndels = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" knownIndelsIndex = "${params.genomes_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" - snpeffDb = "GRCh37.75" - vepCacheVersion = "95" + snpeffDb = 'GRCh37.75' + species = 'homo_sapiens' + vepCacheVersion = '95' } 'GRCh38' { acLoci = "${params.genomes_base}/1000G_phase3_GRCh38_maf0.3.loci" @@ -44,19 +45,26 @@ params { intervals = "${params.genomes_base}/wgs_calling_regions.hg38.bed" knownIndels = "${params.genomes_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.genomes_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" - snpeffDb = "GRCh38.86" - vepCacheVersion = "95" + snpeffDb = 'GRCh38.86' + species = 'homo_sapiens' + vepCacheVersion = '95' + } + 'minimalGRCh37' { + fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" } 'smallGRCh37' { - acLoci = "${params.genomes_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" - acLociGC = "${params.genomes_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc" dbsnp = "${params.genomes_base}/dbsnp_138.b37.small.vcf.gz" fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" germlineResource = "${params.genomes_base}/dbsnp_138.b37.small.vcf.gz" intervals = "${params.genomes_base}/small.intervals" knownIndels = ["${params.genomes_base}/1000G_phase1.indels.b37.small.vcf.gz", "${params.genomes_base}/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"] - snpeffDb = "GRCh37.75" - vepCacheVersion = "95" + snpeffDb = 'GRCh37.75' + species = 'homo_sapiens' + vepCacheVersion = '95' + } + 'smallerGRCh37' { + fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" + knownIndels = "${params.genomes_base}/dbsnp_138.b37.small.vcf.gz" } } } diff --git a/conf/igenomes.config b/conf/igenomes.config index 24b12fa4ed..3f48f34210 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -25,8 +25,9 @@ params { intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/intervals/wgs_calling_regions_Sarek.list" knownIndels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" knownIndelsIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" - snpeffDb = "GRCh37.75" - vepCacheVersion = "95" + snpeffDb = 'GRCh37.75' + species = 'homo_sapiens' + vepCacheVersion = '95' } 'GRCh38' { acLoci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" @@ -44,8 +45,161 @@ params { intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/intervals/wgs_calling_regions.hg38.bed" knownIndels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" knownIndelsIndex = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" - snpeffDb = "GRCh38.86" - 
vepCacheVersion = "95" + snpeffDb = 'GRCh38.86' + species = 'homo_sapiens' + vepCacheVersion = '95' + } + 'GRCm38' { + bwaIndex = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + chrDir = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Chromosomes" + chrLength = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Length/GRCm38.len" + dbsnp = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/Annotation/mgp.v5.merged.snps_all.dbSNP142.vcf.gz" + dbsnpIndex = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/Annotation/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi" + dict = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.dict" + fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + fastaFai = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa.fai" + intervals = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/intervals/GRCm38_calling_list.bed" + knownIndels = "${params.igenomes_base}/Mus_musculus/Annotation/MouseGenomeProject/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz" + knownIndelsIndex = "${params.igenomes_base}/Mus_musculus/Annotation/MouseGenomeProject/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi" + snpeffDb = 'GRCm38.86' + species = 'mus_musculus' + vepCacheVersion = '98' + } + 'TAIR10' { + bwaIndex = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + } + 'EB2' { + bwaIndex = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + } + 'UMD3.1' { + bwaIndex = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + } + 'bosTau8' { + bwaIndex = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" + } + 'WBcel235' { + bwaIndex = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + } + 'ce10' { + bwaIndex = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" + } + 'CanFam3.1' { + bwaIndex = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + } + 'canFam3' { + bwaIndex = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" + } + 'GRCz10' { + bwaIndex = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" + } + 'danRer10' { + bwaIndex = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" + } + 'BDGP6' { + bwaIndex = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" + } + 'dm6' { + bwaIndex = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" + } + 'EquCab2' { + bwaIndex = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + } + 'equCab2' { + bwaIndex = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" + } + 'EB1' { + bwaIndex = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + } + 'Galgal4' { + bwaIndex = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" + } + 'galGal4' { + bwaIndex = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" + } + 'Gm01' { + bwaIndex = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + } + 'hg38' { + bwaIndex = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + } + 'hg19' { + bwaIndex = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + } + 'Mmul_1' { + bwaIndex = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + } + 'mm10' { + bwaIndex = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" + } + 'IRGSP-1.0' { + bwaIndex = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" + } + 'CHIMP2.1.4' { + bwaIndex = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = 
"${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + } + 'panTro4' { + bwaIndex = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" + } + 'Rnor_6.0' { + bwaIndex = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" + } + 'rn6' { + bwaIndex = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" + } + 'R64-1-1' { + bwaIndex = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + } + 'sacCer3' { + bwaIndex = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" + } + 'EF2' { + bwaIndex = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + } + 'Sbi1' { + bwaIndex = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + } + 'Sscrofa10.2' { + bwaIndex = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + } + 'susScr3' { + bwaIndex = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" + } + 'AGPv3' { + bwaIndex = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/genome.fa.{amb,ann,bwt,pac,sa}" + fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" } 'GRCh38_PGP_UK' { acLoci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" diff --git a/conf/test.config b/conf/test.config index 9c32937aca..55b2defe4d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,16 +20,15 @@ params { igenomesIgnore = true genome = 'smallGRCh37' genomes_base = "https://github.com/nf-core/test-datasets/raw/sarek/reference" - - // Use publishDir mode link so that work can be removed - publishDirMode = 'link' } process { withName:Snpeff { + container = 'nfcore/sareksnpeff:2.5.2.GRCh37' maxForks = 1 } - withName:VEP { + withLabel:VEP { + container = 'nfcore/sarekvep:2.5.2.GRCh37' maxForks = 1 } } diff --git a/conf/test_annotation.config b/conf/test_annotation.config new file mode 100644 index 0000000000..6dc8799d22 --- /dev/null +++ b/conf/test_annotation.config @@ -0,0 +1,15 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input 
files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/sarek -profile test_annotation
+ */
+
+includeConfig 'test.config'
+
+params {
+  input = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/vcf/Strelka_1234N_variants.vcf.gz'
+  genome = 'GRCh37'
+}
\ No newline at end of file
diff --git a/conf/test_splitfastq.config b/conf/test_splitfastq.config
new file mode 100644
index 0000000000..5209ec401b
--- /dev/null
+++ b/conf/test_splitfastq.config
@@ -0,0 +1,14 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/sarek -profile test_splitfastq
+ */
+
+includeConfig 'test.config'
+
+params {
+  split_fastq = 500
+}
\ No newline at end of file
diff --git a/conf/test_targeted.config b/conf/test_targeted.config
new file mode 100644
index 0000000000..b3575300c1
--- /dev/null
+++ b/conf/test_targeted.config
@@ -0,0 +1,15 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/sarek -profile test_targeted
+ */
+
+includeConfig 'test.config'
+
+params {
+  targetBed = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv'
+  tools = 'manta,strelka'
+}
\ No newline at end of file
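The targeted profile above only layers a couple of parameters on top of `test.config`, so roughly the same run can be launched directly; a sketch, with parameter names taken from the config above:

```bash
# Approximate command-line equivalent of -profile test_targeted
nextflow run nf-core/sarek -profile test \
    --targetBed 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv' \
    --tools manta,strelka
```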
diff --git a/conf/test_tool.config b/conf/test_tool.config
new file mode 100644
index 0000000000..fa51779592
--- /dev/null
+++ b/conf/test_tool.config
@@ -0,0 +1,16 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/sarek -profile test_tool
+ */
+
+includeConfig 'test.config'
+
+params {
+  // Input data
+  input = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-recal-pair-https.tsv'
+  step = 'variantcalling'
+}
\ No newline at end of file
diff --git a/containers/snpeff/Dockerfile b/containers/snpeff/Dockerfile
index 769ef317e9..65aea05046 100644
--- a/containers/snpeff/Dockerfile
+++ b/containers/snpeff/Dockerfile
@@ -7,7 +7,7 @@ LABEL \
 COPY environment.yml /
 RUN conda env create -f /environment.yml && conda clean -a
-ENV PATH /opt/conda/envs/sarek-snpeff-2.5.1/bin:$PATH
+ENV PATH /opt/conda/envs/sarek-snpeff-2.5.2/bin:$PATH
 # Setup default ARG variables
 ARG GENOME=GRCh38
diff --git a/containers/snpeff/environment.yml b/containers/snpeff/environment.yml
index 31428bb7a4..35279d068c 100644
--- a/containers/snpeff/environment.yml
+++ b/containers/snpeff/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 # conda env create -f environment.yml
-name: sarek-snpeff-2.5.1
+name: sarek-snpeff-2.5.2
 channels:
   - conda-forge
   - bioconda
diff --git a/containers/vep/Dockerfile b/containers/vep/Dockerfile
index e140f8e77b..30d3ba5378 100644
--- a/containers/vep/Dockerfile
+++ b/containers/vep/Dockerfile
@@ -7,7 +7,7 @@ LABEL \
 COPY environment.yml /
 RUN conda env create -f /environment.yml && conda clean -a
-ENV PATH /opt/conda/envs/sarek-vep-2.5.1/bin:$PATH
+ENV PATH /opt/conda/envs/sarek-vep-2.5.2/bin:$PATH
 # Setup default ARG variables
 ARG GENOME=GRCh38
diff --git a/containers/vep/environment.yml b/containers/vep/environment.yml
index b1d32f1d02..f5c437d330 100644
--- a/containers/vep/environment.yml
+++ b/containers/vep/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 # conda env create -f environment.yml
-name: sarek-vep-2.5.1
+name: sarek-vep-2.5.2
 channels:
   - conda-forge
   - bioconda
diff --git a/docs/abstracts/2016-09-KICR.md b/docs/abstracts/2016-09-KICR.md
index 3b3df35eec..a5758c5288 100644
--- a/docs/abstracts/2016-09-KICR.md
+++ b/docs/abstracts/2016-09-KICR.md
@@ -1,4 +1,4 @@
-# The XVth KICancer Retreat 2016
+# The XVth KICancer Retreat - Djurö, Sweden, 2016/09

## Cancer Analysis Workflow Of Tumor/Normal Pairs At The National Genomics Infrastructure Of SciLifeLab
diff --git a/docs/abstracts/2017-05-ESHG.md b/docs/abstracts/2017-05-ESHG.md
index 76e86db02f..8098600c34 100644
--- a/docs/abstracts/2017-05-ESHG.md
+++ b/docs/abstracts/2017-05-ESHG.md
@@ -1,4 +1,4 @@
-# European Human Genetics Conference 2017
+# European Human Genetics Conference - Copenhagen, Denmark, 2017/05

## CAW - Cancer Analysis Workflow to process normal/tumor WGS data
diff --git a/docs/abstracts/2018-05-PMC.md b/docs/abstracts/2018-05-PMC.md
index 7e25e269b4..6f4ab2a166 100644
--- a/docs/abstracts/2018-05-PMC.md
+++ b/docs/abstracts/2018-05-PMC.md
@@ -1,4 +1,4 @@
-# Keystone Symposia - Precision Medicine in Cancer
+# Keystone Symposia - Precision Medicine in Cancer - Stockholm, Sweden, 2018/05

## Sarek, a workflow for WGS analysis of germline and somatic mutations
diff --git a/docs/abstracts/2018-06-EACR25.md b/docs/abstracts/2018-06-EACR25.md
index 57af817151..6a4be1c8df 100644
--- a/docs/abstracts/2018-06-EACR25.md
+++ b/docs/abstracts/2018-06-EACR25.md
@@ -1,4 +1,4 @@
-# 25th Biennial Congress Of The European Association For Cancer Research 2018
+# 25th Biennial Congress Of The European Association For Cancer Research - Amsterdam, Netherlands, 2018/06-07

## Somatic and germline calls from tumour/normal whole genome data: bioinformatics workflow for reproducible research
diff --git a/docs/abstracts/2018-06-NPMI.md b/docs/abstracts/2018-06-NPMI.md
index ef15cc035f..fbb3d97d8c 100644
--- a/docs/abstracts/2018-06-NPMI.md
+++ b/docs/abstracts/2018-06-NPMI.md
@@ -1,4 +1,4 @@
-# The Nordic Precision Medicine Initiative - Meeting No 5
+# The Nordic Precision Medicine Initiative - Meeting No 5 - Reykjavík, Iceland, 2018/06

## Sarek, a portable workflow for WGS analysis of germline and somatic mutations
diff --git a/docs/abstracts/2018-07-JOBIM.md b/docs/abstracts/2018-07-JOBIM.md
index 19e42d372c..9a6257cddb 100644
--- a/docs/abstracts/2018-07-JOBIM.md
+++ b/docs/abstracts/2018-07-JOBIM.md
@@ -1,4 +1,4 @@
-# Journées Ouvertes en Biologie, Informatique et Mathématiques 2018
+# Journées Ouvertes en Biologie, Informatique et Mathématiques - Marseille, France, 2018/07

## Sarek, a portable workflow for WGS analysis of germline and somatic mutations
diff --git a/docs/annotation.md b/docs/annotation.md
index b7babedc7c..8500a6d59e 100644
--- a/docs/annotation.md
+++ b/docs/annotation.md
@@ -14,6 +14,7 @@ With Sarek, annotation is done using `snpEff`, `VEP`, or even both consecutively

- To annotate using `snpEff` followed by `VEP`

VCF produced by Sarek will be annotated if `snpEff` or `VEP` are specified with the `--tools` command.
+As Sarek will use `bgzip` and `tabix` to compress and index the annotated VCF files, it expects the input VCF files to be sorted.

In these examples, all command lines will be launched starting with step `annotate`.
It can of course be started directly from any other step instead.

@@ -23,7 +24,7 @@

Sarek has already designed containers with `snpEff` and `VEP` files for `GRCh37`, `GRCh38` and `GRCm38`.
Default settings will run using these containers.
-The main Sarek container has also `snpEff` and `VEP` installed, but without the cache files that can be downloaded separatelly.
+The main Sarek container also has `snpEff` and `VEP` installed, but without the cache files, which can be downloaded separately.
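The sorted-VCF expectation mentioned above can be met up front; a minimal sketch, assuming `bcftools` and `tabix` are available and using a hypothetical `variants.vcf` file:

```bash
# Sort, bgzip-compress and index a VCF before running --step annotate
bcftools sort variants.vcf -O z -o variants.sorted.vcf.gz  # -O z writes bgzipped output
tabix -p vcf variants.sorted.vcf.gz                        # creates the .tbi index
```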
## Using downloaded cache

@@ -35,8 +36,8 @@ The cache will only be used when `--annotation_cache` and cache directories are

Example:

```bash
-nextflow run nf-core/sarek/main.nf --tools snpEff --step annotate --sample file.vcf.gz --snpEff_cache /Path/To/snpEffCache --annotation_cache
-nextflow run nf-core/sarek/main.nf --tools VEP --step annotate --sample file.vcf.gz --vep_cache /Path/To/vepCache --annotation_cache
+nextflow run nf-core/sarek --tools snpEff --step annotate --sample file.vcf.gz --snpEff_cache /Path/To/snpEffCache --annotation_cache
+nextflow run nf-core/sarek --tools VEP --step annotate --sample file.vcf.gz --vep_cache /Path/To/vepCache --annotation_cache
```

## Using VEP CADD plugin

@@ -50,7 +51,7 @@ To enable the use of the VEP CADD plugin:

Example:

```bash
-nextflow run nf-core/sarek/main.nf --step annotate --tools VEP --sample file.vcf.gz --cadd_cache \
+nextflow run nf-core/sarek --step annotate --tools VEP --sample file.vcf.gz --cadd_cache \
 --cadd_InDels /PathToCADD/InDels.tsv.gz \
 --cadd_InDels_tbi /PathToCADD/InDels.tsv.gz.tbi \
 --cadd_WG_SNVs /PathToCADD/whole_genome_SNVs.tsv.gz \
@@ -75,5 +76,5 @@ To enable the use of the VEP GeneSplicer plugin:

Example:

```bash
-nextflow run annotate.nf --tools VEP --sample file.vcf.gz --genesplicer
+nextflow run nf-core/sarek --step annotate --tools VEP --sample file.vcf.gz --genesplicer
```
diff --git a/docs/containers.md b/docs/containers.md
index 87b7f05b87..16e6291612 100644
--- a/docs/containers.md
+++ b/docs/containers.md
@@ -48,42 +48,96 @@ For annotation, the main container can be used, but the cache has to be downloaded
 - Contain **[VEP](https://github.com/Ensembl/ensembl-vep)** 95.2
 - Contain cache for `GRCh37`, `GRCh38`, `GRCm38` or `CanFam3.1`

-## Using helper script
+## Building your own

-A helper script, used for testing can also be used to help with pulling docker containers, or building singularity images.
-The following parameters can be used:
+Our containers are designed using [Conda](https://conda.io/).
+The [`environment.yml`](../environment.yml) file can be modified if particular versions of tools are more suited to your needs.

-### Engine: -n
+The following commands can be used to build/download containers on your own system:

-Specify which container engine to use: `docker` or `singularity`.
-Default:`docker`
+- Adjust `VERSION` for sarek version (typically a release or `dev`).

-### Containers: -c
+### Build with Conda

-Specify which containers to build: `SNPEFF`, `VEP` or `ALL`.
-Default:`ALL`
+```Bash
+conda env create -f environment.yml
+```

-### Version: -T
+### Build with Docker

-Specify which release to pull or build: any tagged release, or `dev`.
-Default:`dev`
+- `sarek`

-### Genome: -g
+```Bash
+docker build -t nfcore/sarek:<VERSION> .
+```

-Specify which release genome to use for annotation containers (`sareksnpeff`, `sarekvep`): `smallGRCh37`, `GRCh37`, `GRCh38`, `GRCm38` or `CanFam3.1`.
-Default:`smallGRCh37`
+- `sareksnpeff`

-### Singularity
+Adjust arguments for `GENOME` version and snpEff `CACHE_VERSION`

-To specify where to build singularity image, use the Nextflow ENV variable `NXF_SINGULARITY_CACHEDIR`, ie:
+```Bash
+docker build -t nfcore/sareksnpeff:<VERSION>.<GENOME> containers/snpeff/. --build-arg GENOME=<GENOME> --build-arg CACHE_VERSION=<CACHE_VERSION>
+```

-```bash
-NXF_SINGULARITY_CACHEDIR=/data/singularity ./scripts/download_image.sh -n singularity -t ALL -T dev -g GRCh38
+- `sarekvep`
+
+Adjust arguments for `GENOME` version, `SPECIES` name and VEP `VEP_VERSION`
+
+```Bash
+docker build -t nfcore/sarekvep:<VERSION>.<GENOME> containers/vep/. --build-arg GENOME=<GENOME> --build-arg SPECIES=<SPECIES> --build-arg VEP_VERSION=<VEP_VERSION>
 ```
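To make the templated command above concrete, here is what the `sarekvep` build could look like for one genome (a sketch only; the values, release `2.5.2`, `GRCh38`, `homo_sapiens` and VEP `95.2`, are taken from elsewhere in this diff and may need adjusting):

```bash
# Hypothetical concrete instance of the templated sarekvep build command
docker build -t nfcore/sarekvep:2.5.2.GRCh38 containers/vep/. \
    --build-arg GENOME=GRCh38 \
    --build-arg SPECIES=homo_sapiens \
    --build-arg VEP_VERSION=95.2
```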
-That will build the main container, plus the annotation containers (`sareksnpeff`, `sarekvep`) for `GRCh38`, in the `/data/singularity` folder.
+### Pull with Docker

-## Building your own
+- `sarek`

-Our containers are designed using [Conda](https://conda.io/).
-The `environment.yml` file can easilly be modified if particular versions of tools are more suited to your needs.
+```Bash
+docker pull nfcore/sarek:<VERSION>
+```
+
+- `sareksnpeff`
+
+Adjust arguments for `GENOME` version
+
+```Bash
+docker pull nfcore/sareksnpeff:<VERSION>.<GENOME>
+```
+
+- `sarekvep`
+
+Adjust arguments for `GENOME` version
+
+```Bash
+docker pull nfcore/sarekvep:<VERSION>.<GENOME>
+```
+
+### Pull with Singularity
+
+You can directly pull the singularity images into the path used by the Nextflow ENV variable `NXF_SINGULARITY_CACHEDIR`, ie:
+
+```Bash
+cd $NXF_SINGULARITY_CACHEDIR
+singularity build ...
+```
+
+- `sarek`
+
+```Bash
+singularity build nfcore-sarek-<VERSION>.img docker://nfcore/sarek:<VERSION>
+```
+
+- `sareksnpeff`
+
+Adjust arguments for `GENOME` version
+
+```Bash
+singularity build nfcore-sareksnpeff-<VERSION>.<GENOME>.img docker://nfcore/sareksnpeff:<VERSION>.<GENOME>
+```
+
+- `sarekvep`
+
+Adjust arguments for `GENOME` version
+
+```Bash
+singularity build nfcore-sarekvep-<VERSION>.<GENOME>.img docker://nfcore/sarekvep:<VERSION>.<GENOME>
+```
diff --git a/docs/images/CAW_logo.png b/docs/images/CAW_logo.png
deleted file mode 100644
index 285d727408..0000000000
Binary files a/docs/images/CAW_logo.png and /dev/null differ
diff --git a/docs/images/CAW_logo.svg b/docs/images/CAW_logo.svg
deleted file mode 100644
index 9b25dec5f6..0000000000
--- a/docs/images/CAW_logo.svg
+++ /dev/null
@@ -1,649 +0,0 @@
[SVG source removed: the old CAW logo; 649 lines of XML markup omitted]
diff --git a/docs/images/sarek_workflow.png b/docs/images/sarek_workflow.png
new file mode 100644
index 0000000000..4ff6f6721e
Binary files /dev/null and b/docs/images/sarek_workflow.png differ
diff --git a/docs/images/sarek_workflow.svg b/docs/images/sarek_workflow.svg
new file mode 100644
index 0000000000..ecd3792288
--- /dev/null
+++ b/docs/images/sarek_workflow.svg
@@ -0,0 +1,3723 @@
[SVG source added: the Sarek 2.5.2 workflow figure, showing FASTQ/BAM/VCF flowing through Preprocessing (based on GATK Best Practices), Variant Calling (HaplotypeCaller, mpileup, Strelka2, Manta, TIDDIT, FreeBayes, Mutect2, ASCAT, Control-FREEC), Annotation (snpEff, VEP) and Reports; 3723 lines of XML markup omitted]
diff --git a/docs/input.md b/docs/input.md
index 18e6695855..7507344cd9 100644
--- a/docs/input.md
+++ b/docs/input.md
@@ -131,6 +131,8 @@ G15511 XX 1 D0ENMT pathToFiles/G15511.D0ENMT.md.recal.bam pathToF
 Input files for Sarek can be specified using the path to a VCF directory given to the `--input` command only with the `annotate` step.
 Multiple VCF files can be specified if the path is enclosed in quotes.
+As Sarek will use `bgzip` and `tabix` to compress and index the annotated VCF files, it expects the input VCF files to be sorted.
+
 ```bash
 nextflow run nf-core/sarek --step annotate --input "results/VariantCalling/*/.vcf.gz" ...
 ```
diff --git a/docs/output.md b/docs/output.md
index 58f9af63a2..4908f032ad 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -1,72 +1,97 @@
-# nf-core/sarek: Output
+# nf-core/sarek: Output <!-- omit in toc -->

This document describes the output produced by the pipeline.

-## Pipeline overview
+## Pipeline overview <!-- omit in toc -->

The pipeline processes data using the following steps:

-1. [**Preprocessing**](#Preprocessing) _(based on [GATK best practices](https://software.broadinstitute.org/gatk/best-practices/))_
-    * Map reads to Reference
-        * `BWA mem`
-    * Mark Duplicates
-        * `GATK MarkDuplicates`
-    * Base (Quality Score) Recalibration
-        * `GATK BaseRecalibrator`
-        * `GATK GatherBQSRReports`
-        * `GATK ApplyBQSR`
-2. [**Variant calling**](#Variant-Calling)
-    * SNVs and small indels
-        * [`FreeBayes`](#FreeBayes)
-        * [`GATK HaplotypeCaller`](#HaplotypeCaller)
-        * [`GATK GenotypeGVCFs`](#GenotypeGVCFs)
-        * [`GATK Mutect2`](#Mutect2)
-        * [`Strelka2`](#Strelka2)
-    * Structural variants
-        * [`Manta`](#Manta)
-        * [`TIDDIT`](#TIDDIT)
-    * Sample heterogeneity, ploidy and CNVs
-        * `alleleCounter`
-        * [`ConvertAlleleCounts`](#ConvertAlleleCounts)
-        * [`ASCAT`](#ASCAT)
-        * [`samtools mpileup`](#mpileup)
-        * [`Control-FREEC`](#Control-FREEC)
-3. [**Annotation**](#Annotation)
-    * Variant annotation
-        * [`snpEff`](#snpEff)
-        * [`VEP` (Variant Effect Predictor)](#VEP)
-4. [**QC and Reporting**](#QC-and-reporting)
-    * QC
-        * [`FastQC`](#FastQC)
-        * [`Qualimap bamqc`](#bamQC)
-        * [`GATK MarkDuplicates`](#MarkDuplicates-reports)
-        * [`samtools stats`](#Samtools-stats)
-        * [`bcftools stats`](#bcftools-stats)
-        * [`VCFtools`](#VCFtools)
-        * [`snpeff`](#snpEff-reports)
-        * [`VEP`](#snpEff-reports)
-    * Reporting
-        * [`MultiQC`](#MultiQC)
+- [Preprocessing](#preprocessing)
+  - [Map to Reference](#map-to-reference)
+    - [BWA mem](#bwa-mem)
+  - [Mark Duplicates](#mark-duplicates)
+    - [GATK MarkDuplicates](#gatk-markduplicates)
+  - [Base (Quality Score) Recalibration](#base-quality-score-recalibration)
+    - [GATK BaseRecalibrator](#gatk-baserecalibrator)
+    - [GATK ApplyBQSR](#gatk-applybqsr)
+  - [TSV files](#tsv-files)
+- [Variant Calling](#variant-calling)
+  - [SNVs and small indels](#snvs-and-small-indels)
+    - [FreeBayes](#freebayes)
+    - [GATK HaplotypeCaller](#gatk-haplotypecaller)
+    - [GATK GenotypeGVCFs](#gatk-genotypegvcfs)
+    - [GATK Mutect2](#gatk-mutect2)
+    - [samtools mpileup](#samtools-mpileup)
+    - [Strelka2](#strelka2)
+    - [Sentieon DNAseq](#sentieon-dnaseq)
+    - [Sentieon DNAscope](#sentieon-dnascope)
+    - [Sentieon TNscope](#sentieon-tnscope)
+  - [Structural Variants](#structural-variants)
+    - [Manta](#manta)
+    - [TIDDIT](#tiddit)
+    - [Sentieon DNAscope SV](#sentieon-dnascope-sv)
+  - [Sample heterogeneity, ploidy and CNVs](#sample-heterogeneity-ploidy-and-cnvs)
+    - [ConvertAlleleCounts](#convertallelecounts)
+    - [ASCAT](#ascat)
+    - [Control-FREEC](#control-freec)
+- [Variant annotation](#variant-annotation)
+  - [snpEff](#snpeff)
+  - [VEP](#vep)
+- [QC and reporting](#qc-and-reporting)
+  - [QC](#qc)
+    - [FastQC](#fastqc)
+    - [bamQC](#bamqc)
+    - [MarkDuplicates reports](#markduplicates-reports)
+    - [samtools stats](#samtools-stats)
+    - [bcftools stats](#bcftools-stats)
+    - [VCFtools](#vcftools)
+    - [snpEff reports](#snpeff-reports)
+    - [VEP reports](#vep-reports)
+  - [Reporting](#reporting)
+    - [MultiQC](#multiqc)

## Preprocessing

Sarek preprocesses raw FastQ files or unmapped BAM files, based on [GATK best practices](https://software.broadinstitute.org/gatk/best-practices/).
-BAM files with Recalibration tables can also be used as an input to start with the recalibration of said BAM files, for more information see [TSV files output information](#TSV-files)
+BAM files with Recalibration tables can also be used as an input to start with the recalibration of said BAM files; for more information, see [TSV files output information](#tsv-files).

-### Duplicate Marked BAM file(s) with Recalibration Table(s)
+### Map to Reference

-This directory is the location for the BAM files delivered to users. Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.
+#### BWA mem
+
+[BWA mem](http://bio-bwa.sourceforge.net/) is a software package for mapping low-divergent sequences against a large reference genome.
+
+Such files are intermediate and not kept in the final files delivered to users.
+
+### Mark Duplicates
+
+#### GATK MarkDuplicates
+
+[GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+
+This directory is the location for the BAM files delivered to users.
+Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.

For further reading and documentation see the [data pre-processing workflow from the GATK best practices](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165).

For all samples:
**Output directory: `results/Preprocessing/[SAMPLE]/DuplicateMarked`**

-* `[SAMPLE].md.bam`, `[SAMPLE].md.bai` and `[SAMPLE].recal.table`
-  * BAM file and index with Recalibration Table
+- `[SAMPLE].md.bam`, `[SAMPLE].md.bai` and `[SAMPLE].recal.table`
+  - BAM file and index with Recalibration Table
+
+### Base (Quality Score) Recalibration
+
+#### GATK BaseRecalibrator
+
+[GATK BaseRecalibrator](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php) generates a recalibration table based on various covariates.
+
+Such files are intermediate and not kept in the final files delivered to users.

-### Recalibrated BAM file(s)
+#### GATK ApplyBQSR
+
+[GATK ApplyBQSR](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php) recalibrates the base qualities of the input reads based on the recalibration table produced by the [`BaseRecalibrator`](#gatk-baserecalibrator) tool.

This directory is usually empty; it is the location for the final recalibrated BAM files.
Recalibrated BAM files are usually 2-3 times larger than the duplicate marked BAM files.

@@ -77,30 +102,42 @@ For further reading and documentation see the [data pre-processing workflow from

For all samples:
**Output directory: `results/Preprocessing/[SAMPLE]/Recalibrated`**

-* `[SAMPLE].recal.bam` and `[SAMPLE].recal.bai`
-  * BAM file and index
+- `[SAMPLE].recal.bam` and `[SAMPLE].recal.bam.bai`
+  - BAM file and index

### TSV files

-The TSV files are autogenerated and can be used by Sarek for further processing and/or variant calling.
+The TSV files are auto-generated and can be used by Sarek for further processing and/or variant calling. For further reading and documentation see the [input documentation](https://github.com/nf-core/sarek/blob/master/docs/input.md). For all samples: **Output directory: `results/Preprocessing/TSV`** -* `duplicateMarked.tsv` and `recalibrated.tsv` - * TSV files to start Sarek from `recalibration` or `variantcalling` steps. -* `duplicateMarked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv` - * TSV files to start Sarek from `recalibration` or `variantcalling` steps for a specific sample. +- `duplicateMarked.tsv` and `recalibrated.tsv` + - TSV files to start Sarek from `recalibration` or `variantcalling` steps. +- `duplicateMarked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv` + - TSV files to start Sarek from `recalibration` or `variantcalling` steps for a specific sample. + +> :warning: Only with [`--sentieon`](usage.md#--sentieon) + +For all samples: +**Output directory: `results/Preprocessing/TSV`** + +- `recalibrated_sentieon.tsv` + - TSV files to start Sarek from `variantcalling` step. +- `recalibrated_sentieon_[SAMPLE].tsv` + - TSV files to start Sarek from `variantcalling` step for a specific sample. ## Variant Calling -All the results regarding variant-calling are collected in this directory. +All the results regarding Variant Calling are collected in this directory. + +Recalibrated BAM files can also be used as an input to start the Variant Calling, for more information see [TSV files output information](#tsv-files) -Recalibrated BAM files can also be used as an input to start the Variant Calling, for more information see [TSV files output information](#TSV-files) +### SNVs and small indels -### FreeBayes +#### FreeBayes [FreeBayes](https://github.com/ekg/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment.. @@ -109,10 +146,10 @@ For further reading and documentation see the [FreeBayes manual](https://github. For a Tumor/Normal pair only: **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/FreeBayes`** -* `FreeBayes_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `FreeBayes_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi` - * VCF with Tabix index +- `FreeBayes_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `FreeBayes_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi` + - VCF with Tabix index -### HaplotypeCaller +#### GATK HaplotypeCaller [GATK HaplotypeCaller](https://github.com/broadinstitute/gatk) calls germline SNPs and indels via local re-assembly of haplotypes. @@ -123,10 +160,10 @@ For further reading and documentation see the [HaplotypeCaller manual](https://s For all samples: **Output directory: `results/VariantCalling/[SAMPLE]/HaploTypeCaller`** -* `HaplotypeCaller_[SAMPLE].vcf.gz` and `HaplotypeCaller_[SAMPLE].vcf.gz.tbi` - * VCF with Tabix index +- `HaplotypeCaller_[SAMPLE].vcf.gz` and `HaplotypeCaller_[SAMPLE].vcf.gz.tbi` + - VCF with Tabix index -### GenotypeGVCFs +#### GATK GenotypeGVCFs [GATK GenotypeGVCFs](https://github.com/broadinstitute/gatk) performs joint genotyping on one or more samples pre-called with HaplotypeCaller. 
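As an illustration of the two-step flow Sarek wraps here, a minimal sketch in plain `GATK4` syntax (hypothetical file names; not Sarek's exact invocation):

```bash
# 1. Per-sample calling in GVCF mode with HaplotypeCaller
gatk HaplotypeCaller -R genome.fasta -I sample.recal.bam -O sample.g.vcf.gz -ERC GVCF
# 2. Joint genotyping of the resulting GVCF with GenotypeGVCFs
gatk GenotypeGVCFs -R genome.fasta -V sample.g.vcf.gz -O sample.vcf.gz
```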
@@ -137,10 +174,10 @@ For further reading and documentation see the [GenotypeGVCFs manual](https://sof
For all samples:
**Output directory: `results/VariantCalling/[SAMPLE]/HaplotypeCallerGVCF`**

-* `HaplotypeCaller_[SAMPLE].g.vcf.gz` and `HaplotypeCaller_[SAMPLE].g.vcf.gz.tbi`
-  * VCF with Tabix index
+- `HaplotypeCaller_[SAMPLE].g.vcf.gz` and `HaplotypeCaller_[SAMPLE].g.vcf.gz.tbi`
+  - VCF with Tabix index

-### Mutect2
+#### GATK Mutect2

[GATK Mutect2](https://github.com/broadinstitute/gatk) calls somatic SNVs and indels via local assembly of haplotypes.

@@ -152,41 +189,28 @@ For a Tumor/Normal pair only:
Files created:

-* `unfiltered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `unfiltered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
-  * unfiltered (raw) Mutect2 calls VCF with Tabix index
-* `filtered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `filtered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
-  * filtered Mutect2 calls VCF with Tabix index: these entries has a PASS filter, you can get these when supplying a panel of normals using the `--pon` option
-* `[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.stats`
-  * a stats file generated during calling raw variants (needed for filtering)
-* `[TUMORSAMPLE]_contamination.table`
-  * a text file exported when panel-of-normals provided about sample contamination
+- `unfiltered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `unfiltered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
+  - unfiltered (raw) Mutect2 calls VCF with Tabix index
+- `filtered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `filtered_Mutect2_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
+  - filtered Mutect2 calls VCF with Tabix index: these entries have a PASS filter; you get these when supplying a panel of normals using the `--pon` option
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.stats`
+  - a stats file generated during calling of raw variants (needed for filtering)
+- `[TUMORSAMPLE]_contamination.table`
+  - a text file about sample contamination, exported when a panel of normals is provided

-### TIDDIT
+#### samtools mpileup

-[TIDDIT](https://github.com/SciLifeLab/TIDDIT) identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions.
-
-Germline calls are provided for all samples, to able comparison of both tumor and normal for possible mixup.
-Low quality calls are removed internally, to simplify processing of variant calls but they are saved by Sarek.
+[samtools mpileup](https://www.htslib.org/doc/samtools.html) generates a pileup for a BAM file.

-For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md).
+For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS).

For all samples:
-**Output directory: `results/VariantCalling/[SAMPLE]/TIDDIT`**
+**Output directory: `results/VariantCalling/[SAMPLE]/mpileup`**
+
+- `[SAMPLE].pileup.gz`
+  - The pileup format is a text-based format for summarizing the base calls of aligned reads to a reference sequence. Alignment records are grouped by sample (SM) identifiers in @RG header lines.
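+As a quick sanity check, the compressed pileup can be inspected directly; a minimal sketch, assuming a sample named `SAMPLE`:
+
+```bash
+# Columns: chromosome, position, reference base, depth, read bases, base qualities
+zcat SAMPLE.pileup.gz | head -n 5
+```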
-* `TIDDIT_[SAMPLE].vcf.gz` and `TIDDIT_[SAMPLE].vcf.gz.tbi` - * VCF with Tabix index -* `TIDDIT_[SAMPLE].signals.tab` - * tab file describing coverage across the genome, binned per 50 bp -* `TIDDIT_[SAMPLE].ploidy.tab` - * tab file describing the estimated ploïdy and coverage across each contig -* `TIDDIT_[SAMPLE].old.vcf` - * VCF including the low qualiy calls -* `TIDDIT_[SAMPLE].wig` - * wiggle file containing coverage across the genome, binned per 50 bp -* `TIDDIT_[SAMPLE].gc.wig` - * wiggle file containing fraction of gc content, binned per 50 bp - -### Strelka2 +#### Strelka2 [Strelka2](https://github.com/Illumina/strelka) is a fast and accurate small variant caller optimized for analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs. @@ -195,28 +219,72 @@ For further reading and documentation see the [Strelka2 user guide](https://gith For all samples: **Output directory: `results/VariantCalling/[SAMPLE]/Strelka`** -* `Strelka_Sample_genome.vcf.gz` and `Strelka_Sample_genome.vcf.gz.tbi` - * VCF with Tabix index -* `Strelka_Sample_variants.vcf.gz` and `Strelka_Sample_variants.vcf.gz.tbi` - * VCF with Tabix index +- `Strelka_Sample_genome.vcf.gz` and `Strelka_Sample_genome.vcf.gz.tbi` + - VCF with Tabix index +- `Strelka_Sample_variants.vcf.gz` and `Strelka_Sample_variants.vcf.gz.tbi` + - VCF with Tabix index For a Tumor/Normal pair: **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Strelka`** -* `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi` - * VCF with Tabix index -* `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi` - * VCF with Tabix index +- `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi` + - VCF with Tabix index +- `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi` + - VCF with Tabix index Using [Strelka Best Practices](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md#somatic-configuration-example) with the `candidateSmallIndels` from `Manta`: **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Strelka`** -* `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi` - * VCF with Tabix index -* `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi` - * VCF with Tabix index +- `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi` + - VCF with Tabix index +- `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi` + - VCF with Tabix index + +#### Sentieon DNAseq + +> :warning: Only with [`--sentieon`](usage.md#--sentieon) + +[Sentieon DNAseq](https://www.sentieon.com/products/#dnaseq) implements the same mathematics used in the Broad Institute’s BWA-GATK HaplotypeCaller 3.3-4.1 Best Practices Workflow pipeline. + +For further reading and documentation see the [Sentieon DNAseq user guide](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/). 
+
+For all samples:
+**Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAseq`**
+
+- `DNAseq_Sample.vcf.gz` and `DNAseq_Sample.vcf.gz.tbi`
+  - VCF with Tabix index
+
+#### Sentieon DNAscope
+
+> :warning: Only with [`--sentieon`](usage.md#--sentieon)
+
+[Sentieon DNAscope](https://www.sentieon.com/products) calls SNPs and small indels.
+
+For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).
+
+For all samples:
+**Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAscope`**
+
+- `DNAscope_Sample.vcf.gz` and `DNAscope_Sample.vcf.gz.tbi`
+  - VCF with Tabix index
+
+#### Sentieon TNscope
+
+> :warning: Only with [`--sentieon`](usage.md#--sentieon)
+
+[Sentieon TNscope](https://www.sentieon.com/products/#tnscope) calls SNPs and small indels on a Tumor/Normal pair.
+
+For further reading and documentation see the [Sentieon TNscope user guide](https://support.sentieon.com/manual/TNscope_usage/tnscope/).

-### Manta

+For a Tumor/Normal pair:
+**Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/SentieonTNscope`**
+
+- `TNscope_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `TNscope_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
+  - VCF with Tabix index
+
+### Structural Variants
+
+#### Manta

[Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads.
It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs.

@@ -227,46 +295,87 @@ For further reading and documentation see the [Manta user guide](https://github.
For all samples:
**Output directory: `results/VariantCalling/[SAMPLE]/Manta`**

-* `Manta_[SAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[SAMPLE].candidateSmallIndels.vcf.gz.tbi`
-  * VCF with Tabix index
-* `Manta_[SAMPLE].candidateSV.vcf.gz` and `Manta_[SAMPLE].candidateSV.vcf.gz.tbi`
-  * VCF with Tabix index
+- `Manta_[SAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[SAMPLE].candidateSmallIndels.vcf.gz.tbi`
+  - VCF with Tabix index
+- `Manta_[SAMPLE].candidateSV.vcf.gz` and `Manta_[SAMPLE].candidateSV.vcf.gz.tbi`
+  - VCF with Tabix index

For Normal sample only:

-* `Manta_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
-  * VCF with Tabix index
+- `Manta_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
+  - VCF with Tabix index

For a Tumor sample only:

-* `Manta_[TUMORSAMPLE].tumorSV.vcf.gz` and `Manta_[TUMORSAMPLE].tumorSV.vcf.gz.tbi`
-  * VCF with Tabix index
+- `Manta_[TUMORSAMPLE].tumorSV.vcf.gz` and `Manta_[TUMORSAMPLE].tumorSV.vcf.gz.tbi`
+  - VCF with Tabix index

For a Tumor/Normal pair only:
**Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Manta`**

-* `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz.tbi`
-  * VCF with Tabix index
-* `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz.tbi`
-  * VCF with Tabix index
-* `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
-  * VCF with Tabix index
-* `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz.tbi`
-  * VCF with Tabix index
+- `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz.tbi`
+  - VCF with Tabix index
+- `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz.tbi`
+  - VCF with Tabix index
+- `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
+  - VCF with Tabix index
+- `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz.tbi`
+  - VCF with Tabix index
+
+#### TIDDIT
+
+[TIDDIT](https://github.com/SciLifeLab/TIDDIT) identifies intra- and inter-chromosomal translocations, deletions, tandem-duplications and inversions.
+
+Germline calls are provided for all samples, to enable comparison of both tumor and normal for possible mix-ups.
+Low-quality calls are removed internally to simplify processing of variant calls, but they are saved by Sarek.
+
+For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md).
+
+For all samples:
+**Output directory: `results/VariantCalling/[SAMPLE]/TIDDIT`**
+
+- `TIDDIT_[SAMPLE].vcf.gz` and `TIDDIT_[SAMPLE].vcf.gz.tbi`
+  - VCF with Tabix index
+- `TIDDIT_[SAMPLE].signals.tab`
+  - tab file describing coverage across the genome, binned per 50 bp
+- `TIDDIT_[SAMPLE].ploidy.tab`
+  - tab file describing the estimated ploidy and coverage across each contig
+- `TIDDIT_[SAMPLE].old.vcf`
+  - VCF including the low-quality calls
+- `TIDDIT_[SAMPLE].wig`
+  - wiggle file containing coverage across the genome, binned per 50 bp
+- `TIDDIT_[SAMPLE].gc.wig`
+  - wiggle file containing the fraction of GC content, binned per 50 bp
+
+#### Sentieon DNAscope SV
+
+> :warning: Only with [`--sentieon`](usage.md#--sentieon)
+
+[Sentieon DNAscope](https://www.sentieon.com/products) can perform structural variant calling in addition to calling SNPs and small indels.
+
+For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).

-### ConvertAlleleCounts

+For all samples:
+**Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAscope`**
+
+- `DNAscope_SV_Sample.vcf.gz` and `DNAscope_SV_Sample.vcf.gz.tbi`
+  - VCF with Tabix index
+
+### Sample heterogeneity, ploidy and CNVs
+
+#### ConvertAlleleCounts

[ConvertAlleleCounts](https://github.com/nf-core/sarek/blob/master/bin/convertAlleleCounts.r) is an R script for converting output from AlleleCount to BAF and LogR values.

For a Tumor/Normal pair only:
**Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ASCAT`**

-* `[TUMORSAMPLE].BAF` and `[NORMALSAMPLE].BAF`
-  * file with beta allele frequencies
-* `[TUMORSAMPLE].LogR` and `[NORMALSAMPLE].LogR`
-  * file with total copy number on a logarithmic scale
+- `[TUMORSAMPLE].BAF` and `[NORMALSAMPLE].BAF`
+  - file with beta allele frequencies
+- `[TUMORSAMPLE].LogR` and `[NORMALSAMPLE].LogR`
+  - file with total copy number on a logarithmic scale

-### ASCAT
+#### ASCAT

[ASCAT](https://github.com/Crick-CancerGenomics/ascat) is a method to derive copy number profiles of tumor cells, accounting for normal cell admixture and tumor aneuploidy.
ASCAT infers tumor purity and ploidy and calculates whole-genome allele-specific copy number profiles.
@@ -276,38 +385,26 @@ For further reading and documentation see [the Sarek documentation about ASCAT](
For a Tumor/Normal pair only:
**Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ASCAT`**

-* `[TUMORSAMPLE].aberrationreliability.png`
-  * Image with information about aberration reliability
-* `[TUMORSAMPLE].ASCATprofile.png`
-  * Image with information about ASCAT profile
-* `[TUMORSAMPLE].ASPCF.png`
-  * Image with information about ASPCF
-* `[TUMORSAMPLE].rawprofile.png`
-  * Image with information about raw profile
-* `[TUMORSAMPLE].sunrise.png`
-  * Image with information about sunrise
-* `[TUMORSAMPLE].tumour.png`
-  * Image with information about tumor
-* `[TUMORSAMPLE].cnvs.txt`
-  * file with information about CNVS
-* `[TUMORSAMPLE].LogR.PCFed.txt`
-  * file with information about LogR
-* `[TUMORSAMPLE].purityploidy.txt`
-  * file with information about purity ploidy
-
-### mpileup
-
-[samtools mpileup](https://www.htslib.org/doc/samtools.html) generate pileup for a BAM file.
-
-For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS).
-
-For all samples:
-**Output directory: `results/VariantCalling/[SAMPLE]/mpileup`**
-
-* `[SAMPLE].pileup.gz`
-  * The pileup format is a text-based format for summarizing the base calls of aligned reads to a reference sequence. Alignment records are grouped by sample (SM) identifiers in @RG header lines.
-
-### Control-FREEC
+- `[TUMORSAMPLE].aberrationreliability.png`
+  - Image with information about aberration reliability
+- `[TUMORSAMPLE].ASCATprofile.png`
+  - Image with information about ASCAT profile
+- `[TUMORSAMPLE].ASPCF.png`
+  - Image with information about ASPCF
+- `[TUMORSAMPLE].rawprofile.png`
+  - Image with information about raw profile
+- `[TUMORSAMPLE].sunrise.png`
+  - Image with information about sunrise
+- `[TUMORSAMPLE].tumour.png`
+  - Image with information about tumor
+- `[TUMORSAMPLE].cnvs.txt`
+  - file with information about CNVs
+- `[TUMORSAMPLE].LogR.PCFed.txt`
+  - file with information about LogR
+- `[TUMORSAMPLE].purityploidy.txt`
+  - file with information about purity and ploidy
+
+#### Control-FREEC

[Control-FREEC](https://github.com/BoevaLab/FREEC) is a tool for detection of copy-number changes and allelic imbalances (including LOH) using deep-sequencing data.
Control-FREEC automatically computes, normalizes, segments copy number and beta allele frequency profiles, then calls copy number alterations and LOH.
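+Because the configuration file used for the run is kept with the results (see the output list below), the analysis can be reproduced or tweaked outside of Sarek; a minimal sketch, assuming the Control-FREEC binary is available as `freec` and using an illustrative file name:
+
+```bash
+# Re-run Control-FREEC with the configuration file emitted by Sarek
+freec -conf TUMORSAMPLE_vs_NORMALSAMPLE.config.txt
+```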
@@ -318,16 +415,16 @@ For further reading and documentation see the [Control-FREEC manual](http://boev
For a Tumor/Normal pair only:
**Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ControlFREEC`**

-* `[TUMORSAMPLE]_vs_[NORMALSAMPLE].config.txt`
-  * Configuration file used to run Control-FREEC
-* `[TUMORSAMPLE].pileup.gz_CNVs` and `[TUMORSAMPLE].pileup.gz_normal_CNVs`
-  * file with coordinates of predicted copy number alterations
-* `[TUMORSAMPLE].pileup.gz_ratio.txt` and `[TUMORSAMPLE].pileup.gz_normal_ratio.txt`
-  * file with ratios and predicted copy number alterations for each window
-* `[TUMORSAMPLE].pileup.gz_BAF.txt` and `[NORMALSAMPLE].pileup.gz_BAF.txt`
-  * file with beta allele frequencies for each possibly heterozygous SNP position
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE].config.txt`
+  - Configuration file used to run Control-FREEC
+- `[TUMORSAMPLE].pileup.gz_CNVs` and `[TUMORSAMPLE].pileup.gz_normal_CNVs`
+  - file with coordinates of predicted copy number alterations
+- `[TUMORSAMPLE].pileup.gz_ratio.txt` and `[TUMORSAMPLE].pileup.gz_normal_ratio.txt`
+  - file with ratios and predicted copy number alterations for each window
+- `[TUMORSAMPLE].pileup.gz_BAF.txt` and `[NORMALSAMPLE].pileup.gz_BAF.txt`
+  - file with beta allele frequencies for each possibly heterozygous SNP position

-## Annotation
+## Variant annotation

This directory contains results from the final annotation steps: two tools are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html).
Only a subset of the VCF files are annotated, and only variants that have a PASS filter.

@@ -345,8 +442,8 @@ For further reading and documentation see the [snpEff manual](http://snpeff.sour
For all samples:
**Output directory: `results/Annotation/[SAMPLE]/snpEff`**

-* `VariantCaller_Sample_snpEff.ann.vcf.gz` and `VariantCaller_Sample_snpEff.ann.vcf.gz.tbi`
-  * VCF with Tabix index
+- `VariantCaller_Sample_snpEff.ann.vcf.gz` and `VariantCaller_Sample_snpEff.ann.vcf.gz.tbi`
+  - VCF with Tabix index

### VEP

@@ -355,29 +452,31 @@ The generated VCF header contains the software version, also the version numbers
The format of the [consequence annotations](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) is also in the VCF header describing the INFO field.
At the moment it contains:

-* Consequence: impact of the variation, if there is any
-* Codons: the codon change, i.e. cGt/cAt
-* Amino_acids: change in amino acids, i.e. R/H if there is any
-* Gene: ENSEMBL gene name
-* SYMBOL: gene symbol
-* Feature: actual transcript name
-* EXON: affected exon
-* PolyPhen: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/)
-* SIFT: prediction by [SIFT](http://sift.bii.a-star.edu.sg/)
-* Protein_position: Relative position of amino acid in protein
-* BIOTYPE: Biotype of transcript or regulatory feature
+- Consequence: impact of the variation, if there is any
+- Codons: the codon change, e.g. cGt/cAt
+- Amino_acids: change in amino acids, e.g.
R/H if there is any
+- Gene: ENSEMBL gene name
+- SYMBOL: gene symbol
+- Feature: actual transcript name
+- EXON: affected exon
+- PolyPhen: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/)
+- SIFT: prediction by [SIFT](http://sift.bii.a-star.edu.sg/)
+- Protein_position: Relative position of amino acid in protein
+- BIOTYPE: Biotype of transcript or regulatory feature

For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html)

For all samples:
**Output directory: `results/Annotation/[SAMPLE]/VEP`**

-* `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi`
-  * VCF with Tabix index
+- `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi`
+  - VCF with Tabix index

## QC and reporting

-### FastQC
+### QC
+
+#### FastQC

[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads.
It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C).

@@ -388,28 +487,28 @@ For further reading and documentation see the [FastQC help](http://www.bioinform
For all samples:
**Output directory: `results/Reports/[SAMPLE]/fastqc`**

-* `sample_R1_XXX_fastqc.html` and `sample_R2_XXX_fastqc.html`
-  * FastQC report, containing quality metrics for each pair of the raw fastq files
-* `sample_R1_XXX_fastqc.zip` and `sample_R2_XXX_fastqc.zip`
-  * zip file containing the FastQC reports, tab-delimited data files and plot images
+- `sample_R1_XXX_fastqc.html` and `sample_R2_XXX_fastqc.html`
+  - FastQC report, containing quality metrics for each pair of the raw fastq files
+- `sample_R1_XXX_fastqc.zip` and `sample_R2_XXX_fastqc.zip`
+  - zip file containing the FastQC reports, tab-delimited data files and plot images

-### bamQC
+#### bamQC

[Qualimap bamqc](http://qualimap.bioinfo.cipf.es/) reports information for the evaluation of the quality of the provided alignment data.
In short, the basic statistics of the alignment (number of reads, coverage, GC-content, etc.) are summarized and a number of useful graphs are produced.

Plot will show:

-* Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
+- Basic alignment statistics: number of reads, coverage, GC-content, etc.

For all samples:
**Output directory: `results/Reports/[SAMPLE]/bamQC`**

-* `VariantCaller_[SAMPLE].bcf.tools.stats.out`
-  * RAW statistics used by MultiQC
+- `VariantCaller_[SAMPLE].bcf.tools.stats.out`
+  - RAW statistics used by MultiQC

For more information about how to use Qualimap bamqc reports, see [Qualimap bamqc manual](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#id7)

-### MarkDuplicates reports
+#### MarkDuplicates reports

[GATK MarkDuplicates](https://github.com/broadinstitute/gatk) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
Duplicates can arise during sample preparation e.g.

@@ -420,99 +519,101 @@ These duplication artifacts are referred to as optical duplicates.
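+The `[SAMPLE].bam.metrics` file listed below is the standard MarkDuplicates metrics output; the underlying command looks roughly like this sketch (illustrative file names, not necessarily Sarek's exact invocation):
+
+```bash
+# Tag duplicates and write the duplication metrics consumed by MultiQC
+gatk MarkDuplicates \
+    -I SAMPLE.bam \
+    -O SAMPLE.md.bam \
+    -M SAMPLE.bam.metrics
+```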
For all samples:
**Output directory: `results/Reports/[SAMPLE]/MarkDuplicates`**

-* `[SAMPLE].bam.metrics`
-  * RAW statistics used by MultiQC
+- `[SAMPLE].bam.metrics`
+  - RAW statistics used by MultiQC

For further reading and documentation see the [MarkDuplicates manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/picard_sam_markduplicates_MarkDuplicates.php).

-### samtools stats
+#### samtools stats

[samtools stats](https://www.htslib.org/doc/samtools.html) collects statistics from BAM files and outputs them in a text format.

Plots will show:

-* Alignment metrics.
+- Alignment metrics.

For all samples:
**Output directory: `results/Reports/[SAMPLE]/SamToolsStats`**

-* `[SAMPLE].bam.samtools.stats.out`
-  * RAW statistics used by MultiQC
+- `[SAMPLE].bam.samtools.stats.out`
+  - RAW statistics used by MultiQC

For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS)

-### bcftools stats
+#### bcftools stats

[bcftools](https://samtools.github.io/bcftools/) is a program for variant calling and manipulating files in the Variant Call Format.

Plot will show:

-* Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
+- Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.

For all samples:
**Output directory: `results/Reports/[SAMPLE]/BCFToolsStats`**

-* `VariantCaller_[SAMPLE].bcf.tools.stats.out`
-  * RAW statistics used by MultiQC
+- `VariantCaller_[SAMPLE].bcf.tools.stats.out`
+  - RAW statistics used by MultiQC

For further reading and documentation see the [bcftools stats manual](https://samtools.github.io/bcftools/bcftools.html#stats)

-### VCFtools
+#### VCFtools

[VCFtools](https://vcftools.github.io/) is a program package designed for working with VCF files.

Plots will show:

-* the summary counts of each type of transition to transversion ratio for each FILTER category.
-* the transition to transversion ratio as a function of alternative allele count (using only bi-allelic SNPs).
-* the transition to transversion ratio as a function of SNP quality threshold (using only bi-allelic SNPs).
+- the summary counts of each type of transition to transversion ratio for each FILTER category.
+- the transition to transversion ratio as a function of alternative allele count (using only bi-allelic SNPs).
+- the transition to transversion ratio as a function of SNP quality threshold (using only bi-allelic SNPs).

For all samples:
**Output directory: `results/Reports/[SAMPLE]/VCFTools`**

-* `VariantCaller_[SAMPLE].FILTER.summary`
-  * RAW statistics used by MultiQC
-* `VariantCaller_[SAMPLE].TsTv.count`
-  * RAW statistics used by MultiQC
-* `VariantCaller_[SAMPLE].TsTv.qual`
-  * RAW statistics used by MultiQC
+- `VariantCaller_[SAMPLE].FILTER.summary`
+  - RAW statistics used by MultiQC
+- `VariantCaller_[SAMPLE].TsTv.count`
+  - RAW statistics used by MultiQC
+- `VariantCaller_[SAMPLE].TsTv.qual`
+  - RAW statistics used by MultiQC

For further reading and documentation see the [VCFtools manual](https://vcftools.github.io/man_latest.html#OUTPUT%20OPTIONS)

-### snpEff reports
+#### snpEff reports

[snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox.
It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations.
Plots will show:

-* locations of detected variants in the genome and the number of variants for each location.
-* the putative impact of detected variants and the number of variants for each impact.
-* the effect of variants at protein level and the number of variants for each effect type.
-* the quantity as function of the variant quality score.
+- locations of detected variants in the genome and the number of variants for each location.
+- the putative impact of detected variants and the number of variants for each impact.
+- the effect of variants at protein level and the number of variants for each effect type.
+- the quantity as a function of the variant quality score.

For all samples:
**Output directory: `results/Reports/[SAMPLE]/snpEff`**

-* `VariantCaller_Sample_snpEff.csv`
-  * RAW statistics used by MultiQC
-* `VariantCaller_Sample_snpEff.html`
-  * Statistics to be visualised with a web browser
-* `VariantCaller_Sample_snpEff.txt`
-  * TXT (tab separated) summary counts for variants affecting each transcript and gene
+- `VariantCaller_Sample_snpEff.csv`
+  - RAW statistics used by MultiQC
+- `VariantCaller_Sample_snpEff.html`
+  - Statistics to be visualised with a web browser
+- `VariantCaller_Sample_snpEff.txt`
+  - TXT (tab separated) summary counts for variants affecting each transcript and gene

For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary)

-### VEP reports
+#### VEP reports

[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on Ensembl, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants and CNVs.

For all samples:
**Output directory: `results/Reports/[SAMPLE]/VEP`**

-* `VariantCaller_Sample_VEP.summary.html`
-  * Summary of the VEP run to be visualised with a web browser
+- `VariantCaller_Sample_VEP.summary.html`
+  - Summary of the VEP run to be visualised with a web browser

For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html)

-### MultiQC
+### Reporting
+
+#### MultiQC

[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project.
Most of the pipeline QC results are visualised in the report and further statistics are available within the report data directory.

@@ -522,9 +623,9 @@ The pipeline has special steps which allow the software versions used to be repo
For the whole Sarek run:
**Output directory: `results/Reports/MultiQC`**

-* `multiqc_report.html`
-  * MultiQC report - a standalone HTML file that can be viewed in your web browser
-* `multiqc_data/`
-  * Directory containing parsed statistics from the different tools used in the pipeline
+- `multiqc_report.html`
+  - MultiQC report - a standalone HTML file that can be viewed in your web browser
+- `multiqc_data/`
+  - Directory containing parsed statistics from the different tools used in the pipeline

For further reading and documentation see the [MultiQC website](http://multiqc.info)

diff --git a/docs/reference.md b/docs/reference.md
index 9e69e2dcdd..741c61bd1b 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -4,7 +4,7 @@

Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitates storing and sharing references.
Sarek currently uses `GRCh38` by default.
-Both `GRCh37` and `GRCh38` are available with `--genome GRCh37` or `--genome GRCh38` respectively with any profile using the `conf/igenomes.config` file, or you can specify it with `-c conf/igenomes.config`.
+`GRCh37`, `GRCh38` and `GRCm38` are available with `--genome GRCh37`, `--genome GRCh38` or `--genome GRCm38` respectively with any profile using the `conf/igenomes.config` file, or you can specify it with `-c conf/igenomes.config`.
Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37.
Settings in `igenomes.config` can be tailored to your needs.

@@ -12,8 +12,8 @@ Settings in `igenomes.config` can be tailored to your needs.

To speed up some preprocessing and variant calling processes, the reference is chopped into smaller pieces.
The intervals are chromosomes cut at their centromeres (so each chromosome arm is processed separately), plus additional unassigned contigs.
-We are ignoring the hs37d5 contig that contains concatenated decoy sequences.
-Parts of preprocessing and variant calling are done by this intervals, and the different resulting files are then merged.
+We are ignoring the `hs37d5` contig that contains concatenated decoy sequences.
+Parts of preprocessing and variant calling are done per interval, and the different resulting files are then merged.
This can parallelize processes, and push down wall clock time significantly.

The calling intervals can be defined using a `.list` or a `.bed` file.

@@ -36,3 +36,27 @@ First, when there are multiple consecutive intervals in the file that take littl
Second, the jobs with largest processing time are started first, which reduces wall-clock time.
If no runtime is given, a time of 1000 nucleotides per second is assumed.
Actual figures vary from 2 nucleotides/second to 30000 nucleotides/second.
+
+If no intervals file is specified, one will be automatically generated as follows:
+
+```bash
+awk -v FS='\t' -v OFS='\t' '{ print $1, "0", $2 }' .fasta.fai > .bed
+```
+
+To disable this feature, please use [`--no_intervals`](usage.md#--no_intervals)
+
+### Working with whole exome (WES) or panel data
+
+The `--targetBED` parameter does _not_ imply that the workflow is running alignment or variant calling only for the supplied targets.
+Instead, we are aligning for the whole genome, and selecting variants only at the very end by intersecting with the provided target file.
+Adding every exon as an interval in case of WES can generate more than 200K processes or jobs, many more forks, and a similar number of directories in the Nextflow work directory.
+Furthermore, primers and/or baits are not 100% specific (certainly not for MHC and KIR, etc.), so quite likely there are going to be reads mapping to multiple locations.
+If you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is overkill, it is better to change the reference itself.
+
+### Working with other genomes
+
+> :warning: This is a new feature, in active development, so usage could change.
+
+Sarek can also do limited preprocessing of any genome, provided a `fasta` file is given as a reference genome, followed by limited variant calling using `mpileup`, `Manta` and `Strelka`.
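+A minimal sketch of such a run, using only parameters documented in this repository (`sample.tsv` and `mygenome.fasta` are placeholders):
+
+```bash
+nextflow run nf-core/sarek --input sample.tsv \
+    --igenomesIgnore \
+    --fasta mygenome.fasta \
+    --tools mpileup,Manta,Strelka
+```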
+ +Limited support for `TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`. diff --git a/docs/usage.md b/docs/usage.md index f1532af20b..1d74294890 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,74 +1,73 @@ -# nf-core/sarek: Usage - -## Table of contents - - - -* [Table of contents](#table-of-contents) -* [Introduction](#introduction) -* [Running the pipeline](#running-the-pipeline) - * [Updating the pipeline](#updating-the-pipeline) - * [Reproducibility](#reproducibility) -* [Main arguments](#main-arguments) - * [`-profile`](#-profile) - * [`--input`](#--input) - * [`--sample`](#--sample) - * [`--sampleDir`](#--sampledir) - * [`--annotateVCF`](#--annotatevcf) - * [`--noGVCF`](#--nogvcf) - * [`--skipQC`](#--skipqc) - * [`--noReports`](#--noreports) - * [`--nucleotidesPerSecond`](#--nucleotidespersecond) - * [`--step`](#--step) - * [`--tools`](#--tools) - * [`--noStrelkaBP`](#--nostrelkabp) - * [`--targetBED`](#--targetbed) -* [Reference genomes](#reference-genomes) - * [`--genome` (using iGenomes)](#--genome-using-igenomes) - * [`--acLoci`](#--acloci) - * [`--acLociGC`](#--aclocigc) - * [`--bwaIndex`](#--bwaindex) - * [`--chrDir`](#--chrdir) - * [`--chrLength`](#--chrlength) - * [`--dbsnp`](#--dbsnp) - * [`--dbsnpIndex`](#--dbsnpindex) - * [`--dict`](#--dict) - * [`--fasta`](#--fasta) - * [`--fastaFai`](#--fastafai) - * [`--genomeDict`](#--genomedict) - * [`--genomeFile`](#--genomefile) - * [`--genomeIndex`](#--genomeindex) - * [`--germlineResource`](#--germlineresource) - * [`--germlineResourceIndex`](#--germlineresourceindex) - * [`--intervals`](#--intervals) - * [`--knownIndels`](#--knownindels) - * [`--knownIndelsIndex`](#--knownindelsindex) - * [`--pon`](#--pon) - * [`--snpeffDb`](#--snpeffdb) - * [`--vepCacheVersion`](#--vepcacheversion) - * [`--igenomesIgnore`](#--igenomesignore) -* [Job resources](#job-resources) - * [Automatic resubmission](#automatic-resubmission) - * [Custom resource requests](#custom-resource-requests) -* [AWS Batch specific parameters](#aws-batch-specific-parameters) - * [`--awsqueue`](#--awsqueue) - * [`--awsregion`](#--awsregion) -* [Other command line parameters](#other-command-line-parameters) - * [`--outdir`](#--outdir) - * [`--sequencing_center`](#--sequencing_center) - * [`--email`](#--email) - * [`-name`](#-name) - * [`-resume`](#-resume) - * [`-c`](#-c) - * [`--custom_config_version`](#--custom_config_version) - * [`--custom_config_base`](#--custom_config_base) - * [`--max_memory`](#--max_memory) - * [`--max_time`](#--max_time) - * [`--max_cpus`](#--max_cpus) - * [`--plaintext_email`](#--plaintext_email) - * [`--monochrome_logs`](#--monochrome_logs) - * [`--multiqc_config`](#--multiqc_config) - +# nf-core/sarek: Usage + +- [Introduction](#introduction) +- [Running the pipeline](#running-the-pipeline) + - [Updating the pipeline](#updating-the-pipeline) + - [Reproducibility](#reproducibility) +- [Main arguments](#main-arguments) + - [-profile](#-profile) + - [--input](#--input) + - [--split_fastq](#--split_fastq) + - [--sample](#--sample) + - [--sampleDir](#--sampledir) + - [--annotateVCF](#--annotatevcf) + - [--noGVCF](#--nogvcf) + - [--skipQC](#--skipqc) + - [--noReports](#--noreports) + - [--nucleotidesPerSecond](#--nucleotidespersecond) + - [--step](#--step) + 
- [--tools](#--tools) + - [--sentieon](#--sentieon) + - [--noStrelkaBP](#--nostrelkabp) + - [--no_intervals](#--no_intervals) + - [--targetBED](#--targetbed) +- [Reference genomes](#reference-genomes) + - [--genome (using iGenomes)](#--genome-using-igenomes) + - [--acLoci](#--acloci) + - [--acLociGC](#--aclocigc) + - [--bwaIndex](#--bwaindex) + - [--chrDir](#--chrdir) + - [--chrLength](#--chrlength) + - [--dbsnp](#--dbsnp) + - [--dbsnpIndex](#--dbsnpindex) + - [--dict](#--dict) + - [--fasta](#--fasta) + - [--fastaFai](#--fastafai) + - [--genomeDict](#--genomedict) + - [--genomeFile](#--genomefile) + - [--genomeIndex](#--genomeindex) + - [--germlineResource](#--germlineresource) + - [--germlineResourceIndex](#--germlineresourceindex) + - [--intervals](#--intervals) + - [--knownIndels](#--knownindels) + - [--knownIndelsIndex](#--knownindelsindex) + - [--pon](#--pon) + - [--pon_index](#--pon_index) + - [--snpeffDb](#--snpeffdb) + - [--vepCacheVersion](#--vepcacheversion) + - [--igenomesIgnore](#--igenomesignore) + - [--species](#--species) +- [Job resources](#job-resources) + - [Automatic resubmission](#automatic-resubmission) + - [Custom resource requests](#custom-resource-requests) +- [AWS Batch specific parameters](#aws-batch-specific-parameters) + - [--awsqueue](#--awsqueue) + - [--awsregion](#--awsregion) +- [Other command line parameters](#other-command-line-parameters) + - [--outdir](#--outdir) + - [--sequencing_center](#--sequencing_center) + - [--email](#--email) + - [-name](#-name) + - [-resume](#-resume) + - [-c](#-c) + - [--custom_config_version](#--custom_config_version) + - [--custom_config_base](#--custom_config_base) + - [--max_memory](#--max_memory) + - [--max_time](#--max_time) + - [--max_cpus](#--max_cpus) + - [--plaintext_email](#--plaintext_email) + - [--monochrome_logs](#--monochrome_logs) + - [--multiqc_config](#--multiqc_config) ## Introduction @@ -105,8 +104,9 @@ results # Finished results (configurable, see below) ``` The nf-core/sarek pipeline comes with more documentation about running the pipeline, found in the `docs/` directory: - * [Extra Documentation on variant calling](docs/variantcalling.md) - * [Extra Documentation on annotation](docs/annotation.md) + +- [Output and how to interpret the results](output.md) +- [Extra Documentation on annotation](annotation.md) ### Updating the pipeline @@ -131,7 +131,7 @@ This version number will be logged in reports when you run the pipeline, so that ## Main arguments -### `-profile` +### -profile Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. @@ -139,22 +139,22 @@ Note that multiple profiles can be loaded, for example: `-profile docker` - the If `-profile` is not specified at all the pipeline will be run locally and expects all software to be installed and available on the `PATH`. -* `awsbatch` - * A generic configuration profile to be used with AWS Batch. 
-* `conda`
-  * A generic configuration profile to be used with [conda](https://conda.io/docs/)
-  * Pulls most software from [Bioconda](https://bioconda.github.io/)
-* `docker`
-  * A generic configuration profile to be used with [Docker](http://docker.com/)
-  * Pulls software from dockerhub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
-* `singularity`
-  * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/)
-  * Pulls software from DockerHub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
-* `test`
-  * A profile with a complete configuration for automated testing
-  * Includes links to test data so needs no other parameters
-
-### `--input`
+- `awsbatch`
+  - A generic configuration profile to be used with AWS Batch.
+- `conda`
+  - A generic configuration profile to be used with [conda](https://conda.io/docs/)
+  - Pulls most software from [Bioconda](https://bioconda.github.io/)
+- `docker`
+  - A generic configuration profile to be used with [Docker](http://docker.com/)
+  - Pulls software from dockerhub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
+- `singularity`
+  - A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/)
+  - Pulls software from DockerHub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
+- `test`
+  - A profile with a complete configuration for automated testing
+  - Includes links to test data so needs no other parameters
+
+### --input

Use this to specify the location of your input TSV file for the `mapping`, `recalibrate` and `variantcalling` steps.
For example:

@@ -181,7 +190,16 @@ For example:

Multiple VCF files can be specified if the path is enclosed in quotes

-### `--sample`
+### --split_fastq
+
+Use the Nextflow [`splitFastq`](https://www.nextflow.io/docs/latest/operator.html#splitfastq) operator to specify how many reads should be contained in the split fastq file.
+For example:
+
+```bash
+--split_fastq 10000
+```
+
+### --sample

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--input`](#--input)

@@ -211,7 +220,7 @@ For example:

Multiple VCF files can be specified if the path is enclosed in quotes

-### `--sampleDir`
+### --sampleDir

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--input`](#--input)

@@ -223,7 +232,7 @@ For example:

--sampleDir PathToDirectory
```

-### `--annotateVCF`
+### --annotateVCF

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--input`](#--input)

@@ -237,52 +246,65 @@ For example:

Multiple VCF files can be specified if the path is enclosed in quotes

-### `--noGVCF`
+### --noGVCF

Use this to disable g.vcf output from `HaplotypeCaller`.

-### `--skipQC`
+### --skipQC

Use this to disable specific QC and Reporting tools.
Available: `all`, `bamQC`, `BCFtools`, `FastQC`, `MultiQC`, `samtools`, `vcftools`, `versions`
Default: `None`

-### `--noReports`
+### --noReports

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--skipQC`](#--skipQC)

Use this to disable all QC and Reporting tools.

-### `--nucleotidesPerSecond`
+### --nucleotidesPerSecond

Use this to estimate how many seconds it will take to call variants on any interval; the default value of `1000` is used if it's not specified in the `.bed` file.
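+For example, halving the assumed speed, which doubles the estimated runtime per interval:
+
+```bash
+--nucleotidesPerSecond 500
+```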
-### `--step`
+### --step

Use this to specify the starting step:
Default: `mapping`
Available: `mapping`, `recalibrate`, `variantcalling` and `annotate`

-### `--tools`
+### --tools

Use this to specify the tools to run:
-Available: `ASCAT`, `ControlFREEC`, `FreeBayes`, `HaplotypeCaller`, `Manta`, `mpileup`, `MuTect2`, `Strelka`, `TIDDIT`
+Available: `ASCAT`, `ControlFREEC`, `FreeBayes`, `HaplotypeCaller`, `Manta`, `mpileup`, `Mutect2`, `Strelka`, `TIDDIT`
+
+### --sentieon
+
+If [Sentieon](https://www.sentieon.com/) is available, use this to enable it for preprocessing and variant calling.
+Adds the following tools for the [`--tools`](#--tools) option: `DNAseq`, `DNAscope` and `TNscope`.
+
+Please refer to the [nf-core/configs](https://github.com/nf-core/configs#adding-a-new-pipeline-specific-config) repository on how to make a pipeline-specific configuration file based on the [munin-sarek specific configuration file](https://github.com/nf-core/configs/blob/master/conf/pipeline/sarek/munin.config).
+
+Or ask us on the [nf-core Slack](http://nf-co.re/join/slack) on the following channels: [#sarek](https://nfcore.slack.com/channels/sarek) and [#configs](https://nfcore.slack.com/channels/configs).

-### `--noStrelkaBP`
+### --noStrelkaBP

Use this to disable the use of `Manta` `candidateSmallIndels` for `Strelka` as Best Practice.

-### `--targetBED`
+### --no_intervals
+
+Disable usage of an intervals file, and disable the automatic generation of one when none is provided.
+
+### --targetBED

Use this to specify the target BED file for targeted or whole exome sequencing.

## Reference genomes

-The pipeline config files come bundled with paths to the illumina iGenomes reference index files.
+The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.
If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.

-### `--genome` (using iGenomes)
+### --genome (using iGenomes)

There are 2 different species supported by Sarek in the iGenomes references.
To run the pipeline, you must specify which to use with the `--genome` flag.

@@ -290,9 +312,92 @@ You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config).
Genomes that are supported are:

-* Human
-  * `--genome GRCh37`
-  * `--genome GRCh38`
+- Homo sapiens
+  - `--genome GRCh37` (GATK Bundle)
+  - `--genome GRCh38` (GATK Bundle)
+
+- Mus musculus
+  - `--genome GRCm38` (Ensembl)
+
+Limited support for:
+
+- Arabidopsis thaliana
+  - `--genome TAIR10` (Ensembl)
+
+- Bacillus subtilis 168
+  - `--genome EB2` (Ensembl)
+
+- Bos taurus
+  - `--genome UMD3.1` (Ensembl)
+  - `--genome bosTau8` (UCSC)
+
+- Caenorhabditis elegans
+  - `--genome WBcel235` (Ensembl)
+  - `--genome ce10` (UCSC)
+
+- Canis familiaris
+  - `--genome CanFam3.1` (Ensembl)
+  - `--genome canFam3` (UCSC)
+
+- Danio rerio
+  - `--genome GRCz10` (Ensembl)
+  - `--genome danRer10` (UCSC)
+
+- Drosophila melanogaster
+  - `--genome BDGP6` (Ensembl)
+  - `--genome dm6` (UCSC)
+
+- Equus caballus
+  - `--genome EquCab2` (Ensembl)
+  - `--genome equCab2` (UCSC)
+
+- Escherichia coli K 12 DH10B
+  - `--genome EB1` (Ensembl)
+
+- Gallus gallus
+  - `--genome Galgal4` (Ensembl)
+  - `--genome galGal4` (UCSC)
+
+- Glycine max
+  - `--genome Gm01` (Ensembl)
+
+- Homo sapiens
+  - `--genome hg19` (UCSC)
+  - `--genome hg38` (UCSC)
+
+- Macaca mulatta
+  - `--genome Mmul_1` (Ensembl)
+
+- Mus musculus
+  - `--genome mm10` (UCSC)
+
+- Oryza sativa japonica
+  - `--genome IRGSP-1.0` (Ensembl)
+
+- Pan troglodytes
+  - `--genome CHIMP2.1.4` (Ensembl)
+  - `--genome panTro4` (UCSC)
+
+- Rattus norvegicus
+  - `--genome Rnor_6.0` (Ensembl)
+  - `--genome rn6` (UCSC)
+
+- Saccharomyces cerevisiae
+  - `--genome R64-1-1` (Ensembl)
+  - `--genome sacCer3` (UCSC)
+
+- Schizosaccharomyces pombe
+  - `--genome EF2` (Ensembl)
+
+- Sorghum bicolor
+  - `--genome Sbi1` (Ensembl)
+
+- Sus scrofa
+  - `--genome Sscrofa10.2` (Ensembl)
+  - `--genome susScr3` (UCSC)
+
+- Zea mays
+  - `--genome AGPv3` (Ensembl)

Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource.
See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.
@@ -322,7 +427,7 @@ params {
}
```

-### `--acLoci`
+### --acLoci

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -330,7 +435,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--acLoci '[path to the acLoci file]'
```

-### `--acLociGC`
+### --acLociGC

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -338,7 +443,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--acLociGC '[path to the acLociGC file]'
```

-### `--bwaIndex`
+### --bwaIndex

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -346,7 +451,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--bwaIndex '[path to the bwa indexes]'
```

-### `--chrDir`
+### --chrDir

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -354,7 +459,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--chrDir '[path to the Chromosomes folder]'
```

-### `--chrLength`
+### --chrLength

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -362,7 +467,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--chrLength '[path to the Chromosomes length file]'
```

-### `--dbsnp`
+### --dbsnp

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -370,7 +475,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--dbsnp '[path to the dbsnp file]'
```

-### `--dbsnpIndex`
+### --dbsnpIndex

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -378,7 +483,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--dbsnpIndex '[path to the dbsnp index]'
```

-### `--dict`
+### --dict

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -386,7 +491,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--dict '[path to the dict file]'
```

-### `--fasta`
+### --fasta

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -394,7 +499,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--fasta '[path to the reference fasta file]'
```

-### `--fastaFai`
+### --fastaFai

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -402,7 +507,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--fastaFai '[path to the reference index]'
```

-### `--genomeDict`
+### --genomeDict

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--dict`](#--dict)

@@ -413,7 +518,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--dict '[path to the dict file]'
```

-### `--genomeFile`
+### --genomeFile

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--fasta`](#--fasta)

@@ -424,7 +529,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--fasta '[path to the reference fasta file]'
```

-### `--genomeIndex`
+### --genomeIndex

> :warning: This parameter is deprecated -- it will be removed in a future release.
> Please check: [`--fastaFai`](#--fastaFai)

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -435,7 +540,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--fastaFai '[path to the reference index]'
```

-### `--germlineResource`
+### --germlineResource

The [germline resource VCF file](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_Mutect2.php#--germline-resource) (bgzipped and tabixed) needed by GATK4 Mutect2 is a collection of calls that are likely present in the sample, with allele frequencies.
The AF info field must be present.

@@ -446,7 +551,7 @@ To add your own germline resource supply
--germlineResource '[path to my resource.vcf.gz]'
```

-### `--germlineResourceIndex`
+### --germlineResourceIndex

Tabix index of the germline resource specified at [`--germlineResource`](#--germlineResource).
To add your own germline resource, supply:

```bash
--germlineResourceIndex '[path to my resource.vcf.gz.idx]'
```

-### `--intervals`
+### --intervals

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -463,7 +568,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--intervals '[path to the intervals file]'
```

-### `--knownIndels`
+### --knownIndels

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -471,7 +576,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--knownIndels '[path to the knownIndels file]'
```

-### `--knownIndelsIndex`
+### --knownIndelsIndex

If you prefer, you can specify the full path to your reference genome when you run the pipeline:

@@ -479,7 +584,7 @@ If you prefer, you can specify the full path to your reference genome when you r
--knownIndelsIndex '[path to the knownIndels index]'
```

-### `--pon`
+### --pon

When a panel of normals [PON](https://gatkforums.broadinstitute.org/gatk/discussion/24057/how-to-call-somatic-mutations-using-gatk4-mutect2#latest) is defined, you will get filtered somatic calls as a result.
Without a PON, there will be no calls with PASS in the FILTER field; only an _unfiltered_ VCF is written.

@@ -493,7 +598,11 @@ Provide your PON by:

If the PON file is bgzipped, there has to be a tabixed index file in the same directory.

-### `--snpeffDb`
+### --pon_index
+
+Tabix index of the panel-of-normals bgzipped VCF file.
+
+### --snpeffDb

If you prefer, you can specify the DB version when you run the pipeline:

```bash
--snpeffDb '[version of the snpEff DB]'
```

-### `--vepCacheVersion`
+### --vepCacheVersion

If you prefer, you can specify the cache version when you run the pipeline:

@@ -509,11 +618,15 @@ If you prefer, you can specify the cache version when you run the pipeline:
--vepCacheVersion '[version of the VEP cache]'
```

-### `--igenomesIgnore`
+### --igenomesIgnore

Do not load `igenomes.config` when running the pipeline.
You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.

+### --species
+
+This specifies the species used for running VEP annotation. For human data, this needs to be set to `homo_sapiens`, for mouse data to `mus_musculus`, as the annotation needs to know where to look for the appropriate annotation references. If you use iGenomes or a local resource with `genomes.conf`, this has already been set for you appropriately.
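+For example, for mouse data:
+
+```bash
+--species mus_musculus
+```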
+
## Job resources

### Automatic resubmission

@@ -538,11 +651,11 @@ If you have any questions or issues please send us a message on [Slack](https://

## AWS Batch specific parameters

Running the pipeline on AWS Batch requires a couple of specific parameters to be set according to your AWS Batch configuration.
Please use the `-awsbatch` profile and then specify all of the following parameters.

-### `--awsqueue`
+### --awsqueue

The JobQueue that you intend to use on AWS Batch.

-### `--awsregion`
+### --awsregion

The AWS region to run your job in.
Default is set to `eu-west-1` but can be adjusted to your needs.

@@ -551,21 +664,21 @@ Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a

## Other command line parameters

-### `--outdir`
+### --outdir

The output directory where the results will be saved.
Default: `results/`

-### `--sequencing_center`
+### --sequencing_center

The sequencing center that will be used in the BAM CN field.

-### `--email`
+### --email

Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits.
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.

-### `-name`
+### -name

Name for the pipeline run.
If not specified, Nextflow will automatically generate a random mnemonic.

@@ -574,7 +687,7 @@ This is used in the MultiQC report (if not default) and in the summary HTML / e-

**NB:** Single hyphen (core Nextflow option)

-### `-resume`
+### -resume

Specify this when restarting a pipeline.
Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously.

@@ -584,7 +697,7 @@ Use the `nextflow log` command to show previous run names.

**NB:** Single hyphen (core Nextflow option)

-### `-c`
+### -c

Specify the path to a specific config file (this is a core Nextflow command).

@@ -592,7 +705,7 @@ Note - you can use this to override pipeline defaults.

-### `--custom_config_version`
+### --custom_config_version

Provide git commit id for custom institutional configs hosted at `nf-core/configs`.
This was implemented for reproducibility purposes.
Default is set to `master`.

```bash
--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96
```

-### `--custom_config_base`
+### --custom_config_base

If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet.

@@ -625,29 +738,29 @@ nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs

> Note that the nf-core/tools helper package has a `download` command to download all required pipeline
> files + singularity containers + institutional configs in one go for you, to make this process easier.

-### `--max_memory`
+### --max_memory

Use to set a top-limit for the default memory requirement for each process.
Should be a string in the format integer-unit, e.g. `--max_memory '8.GB'`

-### `--max_time`
+### --max_time

Use to set a top-limit for the default time requirement for each process.
Should be a string in the format integer-unit, e.g. `--max_time '2.h'`

-### `--max_cpus`
+### --max_cpus

Use to set a top-limit for the default CPU requirement for each process.
Should be a string in the format integer-unit, e.g. `--max_cpus 1`

-### `--plaintext_email`
+### --plaintext_email

Set to receive plain-text e-mails instead of HTML formatted.
-### `--monochrome_logs`
+### --monochrome_logs

 Set to disable colourful command line output and live life in monochrome.

-### `--multiqc_config`
+### --multiqc_config

 Specify a path to a custom MultiQC configuration file.

diff --git a/docs/use_cases.md b/docs/use_cases.md
index 2491ad69e6..f94400735a 100644
--- a/docs/use_cases.md
+++ b/docs/use_cases.md
@@ -5,7 +5,7 @@ Using the `mapping` directive one will have a pair of mapped, deduplicated and r
 This is the usual option you have to give when you are starting from raw FASTQ data:

 ```bash
-nextflow run nf-core/sarek/main.nf --input mysample.tsv --tools
+nextflow run nf-core/sarek --input mysample.tsv --tools
 ```

 `mapping` will start by default, you do not have to give any additional parameters, only the TSV file describing the sample (see below).

@@ -20,7 +20,7 @@ Also, older version are renamed with incremented numbers.

 The workflow should be started in this case with the smallest set of options as written above:

 ```bash
-nextflow run nf-core/sarek/main.nf --input mysample.tsv --tools
+nextflow run nf-core/sarek --input mysample.tsv --tools
 ```

 The TSV file should look like:

@@ -36,7 +36,7 @@ See the [input files documentation](docs/input.md) for more information.

 The `--input` option can also be used to point Sarek to a directory with FASTQ files:

 ```bash
-nextflow run nf-core/sarek/main.nf --input path/to/FASTQ/files --tools
+nextflow run nf-core/sarek --input path/to/FASTQ/files --tools
 ```

 The given directory is searched recursively for FASTQ files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside.

@@ -78,7 +78,7 @@ See the [input files documentation](docs/input.md) for more information.

 ## Starting from recalibration

 ```bash
-nextflow run nf-core/sarek/main.nf --input mysample.tsv --step recalibrate --tools
+nextflow run nf-core/sarek --input mysample.tsv --step recalibrate --tools
 ```

 And the corresponding TSV file should look like:

@@ -97,7 +97,7 @@ See the [input files documentation](docs/input.md) for more information.

 At this step we are assuming that all the required preprocessing is over; we only want to run variant callers or other tools using recalibrated BAM files.

 ```bash
-nextflow run nf-core/sarek/main.nf --step variantcalling --tools
+nextflow run nf-core/sarek --step variantcalling --tools
 ```

 And the corresponding TSV file should look like:

@@ -121,5 +121,5 @@ It is adviced to pad the variant calling regions (exons or the target) to some e

 To add the target BED file, configure the run like:

 ```bash
-nextflow run nf-core/sarek/main.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --input my_panel.tsv
+nextflow run nf-core/sarek --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --input my_panel.tsv
 ```
diff --git a/downloadcache.nf b/downloadcache.nf
index 8e758391d5..1a1048a7e8 100644
--- a/downloadcache.nf
+++ b/downloadcache.nf
@@ -54,7 +54,7 @@ params.build = null
 params.offline = null
 params.cadd_cache = null
 params.cadd_version = 'v1.5'
-params.genome = 'smallGRCh37'
+params.genome = 'GRCh37'
 params.snpEff_cache = null
 params.vep_cache = null

@@ -182,7 +182,7 @@ process BuildCache_VEP {
     when: params.vep_cache && params.download_cache && !params.offline

     script:
-    genome = params.genome == "smallGRCh37" ? "GRCh37" : params.genome
+    genome = params.genome
     species = genome =~ "GRCh3*" ? "homo_sapiens" : genome =~ "GRCm3*" ?
"mus_musculus" : "" """ vep_install \ diff --git a/environment.yml b/environment.yml index c27f23966a..00a6acf06b 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-sarek-2.5.1 +name: nf-core-sarek-2.5.2 channels: - conda-forge - bioconda diff --git a/main.nf b/main.nf index be6bcfa22b..de1c2f690a 100644 --- a/main.nf +++ b/main.nf @@ -35,14 +35,15 @@ def helpMessage() { Works also with the path to a directory on mapping step with a single germline sample only Alternatively, path to VCF input file on annotate step Multiple VCF files can be specified with quotes - - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, awsbatch, test and more. + -profile Configuration profile to use + Can use multiple (comma separated) + Available: conda, docker, singularity, test and more Options: --genome Name of iGenomes reference --noGVCF No g.vcf output from HaplotypeCaller --noStrelkaBP Will not use Manta candidateSmallIndels for Strelka as Best Practice + --no_intervals Disable usage of intervals --nucleotidesPerSecond To estimate interval size Default: 1000.0 --targetBED Target BED file for targeted or whole exome sequencing @@ -50,7 +51,7 @@ def helpMessage() { Available: Mapping, Recalibrate, VariantCalling, Annotate Default: Mapping --tools Specify tools to use for variant calling: - Available: ASCAT, ControlFREEC, FreeBayes, HaplotypeCaller + Available: ASCAT, ControlFREEC, FreeBayes, HaplotypeCaller, GenomeChronicler Manta, mpileup, Mutect2, Strelka, TIDDIT and/or for annotation: snpEff, VEP, merge @@ -61,22 +62,37 @@ def helpMessage() { --annotateTools Specify from which tools Sarek will look for VCF files to annotate, only for step annotate Available: HaplotypeCaller, Manta, Mutect2, Strelka, TIDDIT Default: None + --sentieon If sentieon is available, will enable it for preprocessing, and variant calling + Adds the following tools for --tools: DNAseq, DNAscope and TNscope --annotation_cache Enable the use of cache for annotation, to be used with --snpEff_cache and/or --vep_cache --snpEff_cache Specity the path to snpEff cache, to be used with --annotation_cache --vep_cache Specity the path to VEP cache, to be used with --annotation_cache + --pon panel-of-normals VCF (bgzipped, indexed). See: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php + --pon_index index of pon panel-of-normals VCF References If not specified in the configuration file or you wish to overwrite any of the references. 
--acLoci acLoci file --acLociGC acLoci GC file --bwaIndex bwa indexes + If none provided, will be generated automatically from the fasta reference --dbsnp dbsnp file --dbsnpIndex dbsnp index + If none provided, will be generated automatically if a dbsnp file is provided --dict dict from the fasta reference + If none provided, will be generated automatically from the fasta reference --fasta fasta reference --fastafai reference index + If none provided, will be generated automatically from the fasta reference + --germlineResource Germline Resource File + --germlineResourceIndex Germline Resource Index + If none provided, will be generated automatically if a germlineResource file is provided --intervals intervals + If none provided, will be generated automatically from the fasta reference + Use --no_intervals to disable automatic generation --knownIndels knownIndels file --knownIndelsIndex knownIndels index + If none provided, will be generated automatically if a knownIndels file is provided + --species species for VEP --snpeffDb snpeffDb version --vepCacheVersion VEP Cache version @@ -87,7 +103,6 @@ def helpMessage() { --monochrome_logs Logs will be without colors --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - --pon panel-of-normals VCF (bgzipped, indexed). See: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic AWSBatch options: @@ -138,28 +153,6 @@ annoList = defineAnnoList() annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : [] if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' -// Initialize each params in params.genomes, catch the command line first if it was defined -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null -// The rest can be sorted -params.acLoci = params.genome && 'ascat' in tools ? params.genomes[params.genome].acLoci ?: null : null -params.acLociGC = params.genome && 'ascat' in tools ? params.genomes[params.genome].acLociGC ?: null : null -params.bwaIndex = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwaIndex ?: null : null -params.chrDir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chrDir ?: null : null -params.chrLength = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chrLength ?: null : null -params.dbsnp = params.genome && ('mapping' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools) ? params.genomes[params.genome].dbsnp ?: null : null -params.dbsnpIndex = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnpIndex ?: null : null -params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null -params.fastaFai = params.genome && params.fasta ? params.genomes[params.genome].fastaFai ?: null : null -params.germlineResource = params.genome && 'mutect2' in tools ? 
params.genomes[params.genome].germlineResource ?: null : null -params.germlineResourceIndex = params.genome && params.germlineResource ? params.genomes[params.genome].germlineResourceIndex ?: null : null -params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null -params.knownIndels = params.genome && 'mapping' in step ? params.genomes[params.genome].knownIndels ?: null : null -params.knownIndelsIndex = params.genome && params.knownIndels ? params.genomes[params.genome].knownIndelsIndex ?: null : null -params.snpeffDb = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeffDb ?: null : null -params.vepCacheVersion = params.genome && 'vep' in tools ? params.genomes[params.genome].vepCacheVersion ?: null : null - - // Handle deprecation if (params.noReports) skipQC = skipQClist @@ -193,7 +186,13 @@ if (params.sampleDir) tsvPath = params.sampleDir // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps recalibrate and variantCalling if (!params.input && step != 'mapping' && step != 'annotate') { - tsvPath = step == 'recalibrate' ? "${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv": "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" + if (params.sentieon) { + if (step == 'variantcalling') tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated_sentieon.tsv" + else exit 1, "Not possible to restart from that step" + } + else { + tsvPath = step == 'recalibrate' ? "${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv" : "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" + } } inputSample = Channel.empty() @@ -207,17 +206,19 @@ if (tsvPath) { default: exit 1, "Unknown step ${step}" } } else if (params.input && !hasExtension(params.input, "tsv")) { - println "No TSV file" + log.info "No TSV file" if (step != 'mapping') exit 1, 'No other step than "mapping" support a dir as an input' - println "Reading ${params.input} directory" + log.info "Reading ${params.input} directory" inputSample = extractFastqFromDir(params.input) (inputSample, fastqTMP) = inputSample.into(2) fastqTMP.toList().subscribe onNext: { if (it.size() == 0) exit 1, "No FASTQ files found in --input directory '${params.input}'" } tsvFile = params.input // used in the reports +} else if (tsvPath && step == 'annotate') { + log.info "Annotating ${tsvPath}" } else if (step == 'annotate') { - println "Annotating ${tsvFile}" + log.info "Trying automatic annotation on file in the VariantCalling directory" } else exit 1, 'No sample were defined, see --help' (genderMap, statusMap, inputSample) = extractInfos(inputSample) @@ -228,6 +229,28 @@ if (tsvPath) { ================================================================================ */ +// Initialize each params in params.genomes, catch the command line first if it was defined +// params.fasta has to be the first one +params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null +// The rest can be sorted +params.acLoci = params.genome && 'ascat' in tools ? params.genomes[params.genome].acLoci ?: null : null +params.acLociGC = params.genome && 'ascat' in tools ? params.genomes[params.genome].acLociGC ?: null : null +params.bwaIndex = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwaIndex ?: null : null +params.chrDir = params.genome && 'controlfreec' in tools ? 
params.genomes[params.genome].chrDir ?: null : null +params.chrLength = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chrLength ?: null : null +params.dbsnp = params.genome && ('mapping' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools) ? params.genomes[params.genome].dbsnp ?: null : null +params.dbsnpIndex = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnpIndex ?: null : null +params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null +params.fastaFai = params.genome && params.fasta ? params.genomes[params.genome].fastaFai ?: null : null +params.germlineResource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germlineResource ?: null : null +params.germlineResourceIndex = params.genome && params.germlineResource ? params.genomes[params.genome].germlineResourceIndex ?: null : null +params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null +params.knownIndels = params.genome && 'mapping' in step ? params.genomes[params.genome].knownIndels ?: null : null +params.knownIndelsIndex = params.genome && params.knownIndels ? params.genomes[params.genome].knownIndelsIndex ?: null : null +params.snpeffDb = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeffDb ?: null : null +params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null +params.vepCacheVersion = params.genome && 'vep' in tools ? params.genomes[params.genome].vepCacheVersion ?: null : null + // Initialize channels based on params ch_acLoci = params.acLoci && 'ascat' in tools ? Channel.value(file(params.acLoci)) : "null" ch_acLociGC = params.acLociGC && 'ascat' in tools ? Channel.value(file(params.acLociGC)) : "null" @@ -237,7 +260,7 @@ ch_dbsnp = params.dbsnp && ('mapping' in step || 'controlfreec' in tools || 'hap ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" ch_fastaFai = params.fastaFai && !('annotate' in step) ? Channel.value(file(params.fastaFai)) : "null" ch_germlineResource = params.germlineResource && 'mutect2' in tools ? Channel.value(file(params.germlineResource)) : "null" -ch_intervals = params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" +ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" // knownIndels is currently a list of file for smallGRCh37, so transform it in a channel li_knownIndels = [] @@ -272,14 +295,15 @@ summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" if (params.input) summary['Input'] = params.input if (params.targetBED) summary['Target BED'] = params.targetBED -if (params.step) summary['Step'] = params.step +if (step) summary['Step'] = step if (params.tools) summary['Tools'] = tools.join(', ') if (params.skipQC) summary['QC tools skip'] = skipQC.join(', ') -if ('haplotypecaller' in tools) summary['GVCF'] = params.noGVCF ? 'No' : 'Yes' -if ('strelka' in tools && 'manta' in tools ) summary['Strelka BP'] = params.noStrelkaBP ? 
'No' : 'Yes' -if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center -if (params.pon && 'mutect2' in tools) summary['Panel of normals'] = params.pon +if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' +if ('haplotypecaller' in tools) summary['GVCF'] = params.noGVCF ? 'No' : 'Yes' +if ('strelka' in tools && 'manta' in tools ) summary['Strelka BP'] = params.noStrelkaBP ? 'No' : 'Yes' +if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center +if (params.pon && 'mutect2' in tools) summary['Panel of normals'] = params.pon summary['Save Genome Index'] = params.saveGenomeIndex ? 'Yes' : 'No' summary['Nucleotides/s'] = params.nucleotidesPerSecond @@ -306,7 +330,11 @@ if (params.dbsnpIndex) summary['dbsnpIndex'] = params.dbsn if (params.knownIndels) summary['knownIndels'] = params.knownIndels if (params.knownIndelsIndex) summary['knownIndelsIndex'] = params.knownIndelsIndex if (params.snpeffDb) summary['snpeffDb'] = params.snpeffDb +if (params.species) summary['species'] = params.species if (params.vepCacheVersion) summary['vepCacheVersion'] = params.vepCacheVersion +if (params.species) summary['species'] = params.species +if (params.snpEff_cache) summary['snpEff_cache'] = params.snpEff_cache +if (params.vep_cache) summary['vep_cache'] = params.vep_cache if (workflow.profile == 'awsbatch') { summary['AWS Region'] = params.awsregion @@ -372,135 +400,186 @@ yamlSoftwareVersion = yamlSoftwareVersion.dump(tag:'SOFTWARE VERSIONS') ================================================================================ */ +// And then initialize channels based on params or indexes that were just built + process BuildBWAindexes { - tag {fasta} + tag {fasta} - publishDir params.outdir, mode: params.publishDirMode, - saveAs: {params.saveGenomeIndex ? "reference_genome/BWAIndex/${it}" : null } + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/BWAIndex/${it}" : null } - input: - file(fasta) from ch_fasta + input: + file(fasta) from ch_fasta - output: - file("${fasta}.*") into bwaIndexes + output: + file("${fasta}.*") into bwaIndexes - when: !(params.bwaIndex) && params.fasta && 'mapping' in step + when: !(params.bwaIndex) && params.fasta && 'mapping' in step - script: - """ - bwa index ${fasta} - """ + script: + """ + bwa index ${fasta} + """ } +ch_bwaIndex = params.bwaIndex ? Channel.value(file(params.bwaIndex)) : bwaIndexes + process BuildDict { - tag {fasta} + tag {fasta} - publishDir params.outdir, mode: params.publishDirMode, - saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } - input: - file(fasta) from ch_fasta + input: + file(fasta) from ch_fasta - output: - file("${fasta.baseName}.dict") into dictBuilt + output: + file("${fasta.baseName}.dict") into dictBuilt - when: !(params.dict) && params.fasta && !('annotate' in step) + when: !(params.dict) && params.fasta && !('annotate' in step) - script: - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - CreateSequenceDictionary \ - --REFERENCE ${fasta} \ - --OUTPUT ${fasta.baseName}.dict - """ + script: + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + CreateSequenceDictionary \ + --REFERENCE ${fasta} \ + --OUTPUT ${fasta.baseName}.dict + """ } +ch_dict = params.dict ? 
Channel.value(file(params.dict)) : dictBuilt + process BuildFastaFai { - tag {fasta} + tag {fasta} - publishDir params.outdir, mode: params.publishDirMode, - saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } - input: - file(fasta) from ch_fasta + input: + file(fasta) from ch_fasta - output: - file("${fasta}.fai") into fastaFaiBuilt + output: + file("${fasta}.fai") into fastaFaiBuilt - when: !(params.fastaFai) && params.fasta && !('annotate' in step) + when: !(params.fastaFai) && params.fasta && !('annotate' in step) - script: - """ - samtools faidx ${fasta} - """ + script: + """ + samtools faidx ${fasta} + """ } +ch_fastaFai = params.fastaFai ? Channel.value(file(params.fastaFai)) : fastaFaiBuilt + process BuildDbsnpIndex { - tag {dbsnp} + tag {dbsnp} - publishDir params.outdir, mode: params.publishDirMode, - saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } - input: - file(dbsnp) from ch_dbsnp + input: + file(dbsnp) from ch_dbsnp - output: - file("${dbsnp}.tbi") into dbsnpIndexBuilt + output: + file("${dbsnp}.tbi") into dbsnpIndexBuilt - when: !(params.dbsnpIndex) && params.dbsnp && ('mapping' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools) - script: - """ - tabix -p vcf ${dbsnp} - """ + when: !(params.dbsnpIndex) && params.dbsnp && ('mapping' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools) + + script: + """ + tabix -p vcf ${dbsnp} + """ } +ch_dbsnpIndex = params.dbsnp ? params.dbsnpIndex ? Channel.value(file(params.dbsnpIndex)) : dbsnpIndexBuilt : "null" + process BuildGermlineResourceIndex { - tag {germlineResource} + tag {germlineResource} - publishDir params.outdir, mode: params.publishDirMode, - saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } - input: - file(germlineResource) from ch_germlineResource + input: + file(germlineResource) from ch_germlineResource - output: - file("${germlineResource}.tbi") into germlineResourceIndexBuilt + output: + file("${germlineResource}.tbi") into germlineResourceIndexBuilt - when: !(params.germlineResourceIndex) && params.germlineResource && 'mutect2' in tools + when: !(params.germlineResourceIndex) && params.germlineResource && 'mutect2' in tools - script: - """ - tabix -p vcf ${germlineResource} - """ + script: + """ + tabix -p vcf ${germlineResource} + """ } +ch_germlineResourceIndex = params.germlineResource ? params.germlineResourceIndex ? Channel.value(file(params.germlineResourceIndex)) : germlineResourceIndexBuilt : "null" + process BuildKnownIndelsIndex { - tag {knownIndels} + tag {knownIndels} + + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + + input: + each file(knownIndels) from ch_knownIndels + + output: + file("${knownIndels}.tbi") into knownIndelsIndexBuilt + + when: !(params.knownIndelsIndex) && params.knownIndels && 'mapping' in step + + script: + """ + tabix -p vcf ${knownIndels} + """ +} + +ch_knownIndelsIndex = params.knownIndels ? params.knownIndelsIndex ? 
Channel.value(file(params.knownIndelsIndex)) : knownIndelsIndexBuilt.collect() : "null" + +process BuildPonIndex { + tag {pon} + + publishDir params.outdir, mode: params.publishDirMode, + saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } + + input: + file(pon) from ch_pon + + output: + file("${pon}.tbi") into ponIndexBuilt + + when: !(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools) + + script: + """ + tabix -p vcf ${pon} + """ +} + +ch_ponIndex = params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt + +process BuildIntervals { + tag {fastaFai} publishDir params.outdir, mode: params.publishDirMode, saveAs: {params.saveGenomeIndex ? "reference_genome/${it}" : null } input: - each file(knownIndels) from ch_knownIndels + file(fastaFai) from ch_fastaFai output: - file("${knownIndels}.tbi") into knownIndelsIndexBuilt + file("${fastaFai.baseName}.bed") into intervalBuilt - when: !(params.knownIndelsIndex) && params.knownIndels && 'mapping' in step + when: !(params.intervals) && !('annotate' in step) && !(params.no_intervals) script: """ - tabix -p vcf ${knownIndels} + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed """ } -// Initialize channels based on params or indexes that were just built -ch_bwaIndex = params.bwaIndex ? Channel.value(file(params.bwaIndex)) : bwaIndexes -ch_dbsnpIndex = params.dbsnpIndex ? Channel.value(file(params.dbsnpIndex)) : dbsnpIndexBuilt -ch_dict = params.dict ? Channel.value(file(params.dict)) : dictBuilt -ch_fastaFai = params.fastaFai ? Channel.value(file(params.fastaFai)) : fastaFaiBuilt -ch_germlineResourceIndex = params.germlineResourceIndex ? Channel.value(file(params.germlineResourceIndex)) : germlineResourceIndexBuilt -ch_knownIndelsIndex = params.knownIndelsIndex ? Channel.value(file(params.knownIndelsIndex)) : knownIndelsIndexBuilt.collect() +ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : intervalBuilt

 /*
================================================================================
@@ -519,7 +598,7 @@ process CreateIntervalBeds {
     output:
         file '*.bed' into bedIntervals mode flatten

-    when: step != 'annotate'
+    when: (!params.no_intervals) && step != 'annotate'

     script:
     // If the interval file is BED format, the fifth column is interpreted to
@@ -544,6 +623,13 @@ process CreateIntervalBeds {
             print \$0 > name
         }' ${intervals}
        """
+    else if (hasExtension(intervals, "interval_list"))
+        """
+        grep -v '^@' ${intervals} | awk -vFS="\t" '{
+            name = sprintf("%s_%d-%d", \$1, \$2, \$3);
+            printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
+        }'
+        """
     else
         """
         awk -vFS="[:-]" '{
@@ -572,25 +658,49 @@ bedIntervals = bedIntervals

 bedIntervals = bedIntervals.dump(tag:'bedintervals')

+if (params.no_intervals && step != 'annotate') bedIntervals = Channel.from(file("no_intervals.bed"))
+
 (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intMpileup, bedIntervals) = bedIntervals.into(5)

 // PREPARING CHANNELS FOR PREPROCESSING AND QC

-if (step == 'mapping') (inputReads, inputReadsFastQC) = inputSample.into(2)
-else (inputReads, inputReadsFastQC) = Channel.empty().into(2)
+inputBam = Channel.create()
+inputPairReads = Channel.create()

-inputPairReadsFastQC = Channel.create()
-inputBAMFastQC = Channel.create()
+if (step in ['recalibrate', 'variantcalling', 'annotate']) {
+    inputBam.close()
+    inputPairReads.close()
+} else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 1 : 0}

-inputReadsFastQC.choice(inputPairReadsFastQC, inputBAMFastQC) {hasExtension(it[3], "bam") ? 1 : 0}
+(inputBam, inputBamFastQC) = inputBam.into(2)

 // Removing inputFile2 which is null in case of uBAM
-inputBAMFastQC = inputBAMFastQC.map {
+inputBamFastQC = inputBamFastQC.map {
     idPatient, idSample, idRun, inputFile1, inputFile2 ->
     [idPatient, idSample, idRun, inputFile1]
 }

-inputReads = inputReads.dump(tag:'INPUT')
+if (params.split_fastq){
+    inputPairReads = inputPairReads
+        // newly split fastq files are named based on "split", so the name is easier to catch
+        .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true)
+        .map {idPatient, idSample, idRun, reads1, reads2 ->
+            // The split fastq read1 is the 4th element (indexed 3); its name is split_3
+            // The split fastq read2's name is split_4
+            // It's followed by which split of the mother fastq file it's actually based on
+            // Indices start at 1
+            // Extracting the index to get a new idRun
+            splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz")
+            newIdRun = idRun + "_" + splitIndex
+            // Giving the files a new nice name
+            newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz")
+            newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz")
+            [idPatient, idSample, newIdRun, reads1, reads2]}
+}
+
+inputPairReads = inputPairReads.dump(tag:'INPUT')
+
+(inputPairReads, inputPairReadsFastQC) = inputPairReads.into(2)

 // STEP 0.5: QC ON READS

@@ -598,6 +708,7 @@ inputReads = inputReads.dump(tag:'INPUT')
 // FASTQ and uBAM files are renamed based on the sample name

 process FastQCFQ {
+    label 'FastQC'
     label 'cpus_2'

     tag {idPatient + "-" + idRun}

@@ -610,7 +721,7 @@ process FastQCFQ {
     output:
         file("*.{html,zip}") into fastQCFQReport

-    when: step == 'mapping' && !('fastqc' in skipQC)
+    when: !('fastqc' in skipQC)

     script:
     """
@@ -619,6 +730,7 @@ process FastQCFQ {
 }

 process FastQCBAM {
+    label 'FastQC'
     label 'cpus_2'

     tag {idPatient + "-" + idRun}

@@ -626,12 +738,12 @@ process FastQCBAM {
     publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publishDirMode

     input:
-        set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBAMFastQC
+        set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBamFastQC

     output:
         file("*.{html,zip}") into fastQCBAMReport

-    when: step == 'mapping' && !('fastqc' in skipQC)
+    when: !('fastqc' in skipQC)

     script:
     """
@@ -645,21 +757,30 @@ fastQCReport = fastQCReport.dump(tag:'FastQC')

 // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM

+inputPairReads = inputPairReads.dump(tag:'INPUT')
+
+inputPairReads = inputPairReads.mix(inputBam)
+
+(inputPairReads, inputPairReadsSentieon) = inputPairReads.into(2)
+if (params.sentieon) inputPairReads.close()
+else inputPairReadsSentieon.close()
+
 process MapReads {
     label 'cpus_max'
+    label 'memory_max'
+    echo true

     tag {idPatient + "-" + idRun}

     input:
-        set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputReads
+        set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReads
         file(bwaIndex) from ch_bwaIndex
         file(fasta) from ch_fasta
+        file(fastaFai) from ch_fastaFai

     output:
         set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMapped
-        set idPatient, idSample, file("${idSample}_${idRun}.bam") into bamMappedBamQC
-
-    when: step == 'mapping'
+        set idPatient, val("${idSample}_${idRun}"), file("${idSample}_${idRun}.bam") into bamMappedBamQC

     script:
     // -K is a hidden option, used to fix the number of reads processed by bwa mem
@@ -674,15 +795,19 @@ process MapReads {
     extra = status == 1 ? "-B 3" : ""
     convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : ""
     input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}"
+    // Soft-code the CPU split between the two piped tools, bwa mem | samtools sort:
+    // request only one of the two from the user, the other is implicit (total - defined)
+    bwa_cpus = params.bwa_cpus ? params.bwa_cpus : Math.floor(params.bwa_cpus_fraction * task.cpus) as Integer
+    sort_cpus = params.sort_cpus ? params.sort_cpus : task.cpus - bwa_cpus
     """
-    ${convertToFastq}
-    bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \
-    ${input} | samtools sort - > ${idSample}_${idRun}.bam
+    ${convertToFastq}
+    bwa mem -k 23 -K 100000000 -R \"${readGroup}\" ${extra} -t ${bwa_cpus} -M ${fasta} \
+    ${input} | \
+    samtools sort --threads ${sort_cpus} - > ${idSample}_${idRun}.bam
     """
 }

 bamMapped = bamMapped.dump(tag:'Mapped BAM')
-
 // Sort BAM whether they are standalone or should be merged

 singleBam = Channel.create()
@@ -695,10 +820,64 @@ singleBam = singleBam.map {
 }

 singleBam = singleBam.dump(tag:'Single BAM')

+// STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM
+
+process SentieonMapReads {
+    label 'cpus_max'
+    label 'memory_max'
+    label 'sentieon'
+
+    tag {idPatient + "-" + idRun}
+
+    input:
+        set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReadsSentieon
+        file(bwaIndex) from ch_bwaIndex
+        file(fasta) from ch_fasta
+        file(fastaFai) from ch_fastaFai
+
+    output:
+        set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMappedSentieon
+        set idPatient, idSample, file("${idSample}_${idRun}.bam") into bamMappedSentieonBamQC
+
+    when: params.sentieon
+
+    script:
+    // -K is a hidden option, used to fix the number of reads processed by bwa mem
+    // Chunk size can affect bwa results; if not specified,
+    // the number of threads can change, which can give non-deterministic results.
+    // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md
+    // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29
+    CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ""
+    readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina"
+    // adjust mismatch penalty for tumor samples
+    status = statusMap[idPatient, idSample]
+    extra = status == 1 ? "-B 3" : ""
+    """
+    sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \
+    ${inputFile1} ${inputFile2} | \
+    sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i -
+    """
+}
+
+bamMappedSentieon = bamMappedSentieon.dump(tag:'Sentieon Mapped BAM')
+// Sort BAM whether they are standalone or should be merged
+
+singleBamSentieon = Channel.create()
+multipleBamSentieon = Channel.create()
+bamMappedSentieon.groupTuple(by:[0, 1])
+    .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ?
1 : 0} +singleBamSentieon = singleBamSentieon.map { + idPatient, idSample, idRun, bam -> + [idPatient, idSample, bam] +} +singleBamSentieon = singleBamSentieon.dump(tag:'Single BAM') + // STEP 1.5: MERGING BAM FROM MULTIPLE LANES +multipleBam = multipleBam.mix(multipleBamSentieon) + process MergeBamMapped { - label 'cpus_8' + label 'med_resources' tag {idPatient + "-" + idSample} @@ -708,8 +887,6 @@ process MergeBamMapped { output: set idPatient, idSample, file("${idSample}.bam") into mergedBam - when: step == 'mapping' - script: """ samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} @@ -717,21 +894,69 @@ process MergeBamMapped { } mergedBam = mergedBam.dump(tag:'Merged BAM') -mergedBam = mergedBam.mix(singleBam) + +mergedBam = mergedBam.mix(singleBam,singleBamSentieon) + +(mergedBam, mergedBamForSentieon) = mergedBam.into(2) + +if (!params.sentieon) mergedBamForSentieon.close() +else mergedBam.close() + mergedBam = mergedBam.dump(tag:'BAMs for MD') +mergedBamForSentieon = mergedBamForSentieon.dump(tag:'Sentieon BAMs to Index') + +process IndexBamMergedForSentieon { + label 'med_resources' + + tag {idPatient + "-" + idSample} + + input: + set idPatient, idSample, file(bam) from mergedBamForSentieon + + output: + set idPatient, idSample, file(bam), file("${idSample}.bam.bai") into bamForSentieonDedup + + script: + """ + samtools index ${bam} + """ +} + +(mergedBam, mergedBamToIndex) = mergedBam.into(2) + +process IndexBamFile { + label 'med_resources' + + tag {idPatient + "-" + idSample} + + input: + set idPatient, idSample, file(bam) from mergedBamToIndex + + output: + set idPatient, idSample, file(bam), file("*.bai") into indexedBam + + when: !params.knownIndels + + script: + """ + samtools index ${bam} + mv ${bam}.bai ${bam.baseName}.bai + """ +} // STEP 2: MARKING DUPLICATES -process MarkDuplicatesSpark { +process MarkDuplicates { label 'cpus_max' label 'memory_max' tag {idPatient + "-" + idSample} - echo true + publishDir params.outdir, mode: params.publishDirMode, saveAs: { - if (it == "${idSample}.bam.metrics" && 'markduplicates' in skipQC) "Reports/${idSample}/MarkDuplicates/${it}" + if (it == "${idSample}.bam.metrics" && 'markduplicates' in skipQC) null + else if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" else "Preprocessing/${idSample}/DuplicateMarked/${it}" } @@ -739,22 +964,24 @@ process MarkDuplicatesSpark { set idPatient, idSample, file("${idSample}.bam") from mergedBam output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams + set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bai") into duplicateMarkedBams file ("${idSample}.bam.metrics") into markDuplicatesReport - when: step == 'mapping' + when: params.knownIndels script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" """ - gatk MarkDuplicatesSpark \ - --input ${idSample}.bam \ - --output ${idSample}.md.bam \ - --tmp-dir . \ - --verbosity DEBUG \ - --create-output-bam-index true \ - --spark-runner LOCAL --spark-master local[${task.cpus}] - """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${idSample}.bam \ + --METRICS_FILE ${idSample}.bam.metrics \ + --TMP_DIR . 
\ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${idSample}.md.bam + """ } if ('markduplicates' in skipQC) markDuplicatesReport.close() @@ -763,18 +990,72 @@ duplicateMarkedBams = duplicateMarkedBams.dump(tag:'MD BAM') markDuplicatesReport = markDuplicatesReport.dump(tag:'MD Report') (bamMD, bamMDToJoin) = duplicateMarkedBams.into(2) + bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') -// STEP 3: CREATING RECALIBRATION TABLES +// STEP 2': SENTIEON DEDUP -process BaseRecalibratorSpark { +process SentieonDedup { + label 'cpus_max' label 'memory_max' - label 'cpus_1' + label 'sentieon' - tag {idPatient + "-" + idSample + "-" + intervalBed} - echo true + tag {idPatient + "-" + idSample} + + publishDir params.outdir, mode: params.publishDirMode, + saveAs: { + if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null + else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" + else null + } + + input: + set idPatient, idSample, file(bam), file(bai) from bamForSentieonDedup + file(fasta) from ch_fasta + file(fastaFai) from ch_fastaFai + + output: + set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bamDedupedSentieon + file("${idSample}_*.txt") into bamDedupedSentieonQC + + when: params.sentieon + + script: + """ + sentieon driver \ + -t ${task.cpus} \ + -i ${bam} \ + -r ${fasta} \ + --algo GCBias --summary ${idSample}_gc_summary.txt ${idSample}_gc_metric.txt \ + --algo MeanQualityByCycle ${idSample}_mq_metric.txt \ + --algo QualDistribution ${idSample}_qd_metric.txt \ + --algo InsertSizeMetricAlgo ${idSample}_is_metric.txt \ + --algo AlignmentStat ${idSample}_aln_metric.txt + + sentieon driver \ + -t ${task.cpus} \ + -i ${bam} \ + --algo LocusCollector \ + --fun score_info ${idSample}_score.gz + + sentieon driver \ + -t ${task.cpus} \ + -i ${bam} \ + --algo Dedup \ + --rmdup \ + --score_info ${idSample}_score.gz \ + --metrics ${idSample}_dedup_metric.txt ${idSample}.deduped.bam + """ +} + +// STEP 3: CREATING RECALIBRATION TABLES + +process BaseRecalibrator { + label 'med_resources' + + tag {idPatient + "-" + idSample + "-" + intervalBed.baseName} input: set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator @@ -787,28 +1068,39 @@ process BaseRecalibratorSpark { file(knownIndelsIndex) from ch_knownIndelsIndex output: - set idPatient, idSample, file("${intervalBed.baseName}_${idSample}.recal.table") into tableGatherBQSRReports + set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports + set idPatient, idSample into recalTableTSVnoInt - when: step == 'mapping' + when: params.knownIndels script: - known = knownIndels.collect{"--known-sites ${it}"}.join(' ') + dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" + knownOptions = params.knownIndels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" + prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" + intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" // TODO: --use-original-qualities ??? """ - gatk BaseRecalibratorSpark \ - --input ${bam} \ - --output ${intervalBed.baseName}_${idSample}.recal.table \ - --tmp-dir . 
\ - --reference ${fasta} \ - --intervals ${intervalBed} \ - --known-sites ${dbsnp} \ - ${known} \ - --verbosity DEBUG \ - --spark-runner LOCAL --spark-master local[${task.cpus}] + gatk --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + -I ${bam} \ + -O ${prefix}${idSample}.recal.table \ + --tmp-dir /tmp \ + -R ${fasta} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + ${knownOptions} \ + --verbosity INFO """ } -tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) +if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) + +tableGatherBQSRReports = tableGatherBQSRReports.dump(tag:'BQSR REPORTS') + +if (params.no_intervals) { + (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) + recalTable = tableGatherBQSRReportsNoInt +} else recalTableTSVnoInt.close() // STEP 3.5: MERGING RECALIBRATION TABLES @@ -825,9 +1117,9 @@ process GatherBQSRReports { output: set idPatient, idSample, file("${idSample}.recal.table") into recalTable - set idPatient, idSample, val("${idSample}.md.bam"), val("${idSample}.md.bai"), val("${idSample}.recal.table") into (recalTableTSV, recalTableSampleTSV) + set idPatient, idSample into recalTableTSV - when: step == 'mapping' + when: !(params.no_intervals) script: input = recal.collect{"-I ${it}"}.join(' ') @@ -839,34 +1131,49 @@ process GatherBQSRReports { """ } +recalTable = recalTable.dump(tag:'RECAL TABLE') + +(recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) + // Create TSV files to restart from this step -recalTableTSV.map { idPatient, idSample, bam, bai, recalTable -> +recalTableTSV.map { idPatient, idSample -> status = statusMap[idPatient, idSample] gender = genderMap[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n" + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" }.collectFile( name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) recalTableSampleTSV .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { - idPatient, idSample, bam, bai, recalTable -> + idPatient, idSample -> status = statusMap[idPatient, idSample] gender = genderMap[idPatient] - ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n"] + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" + ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] } bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) if (step == 'recalibrate') bamApplyBQSR = inputSample -bamApplyBQSR = 
bamApplyBQSR.dump(tag:'recal.table') +bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') +// [DUMP: recal.table] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table] bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) +bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') +// [DUMP: BAM + BAI + RECAL TABLE + INT] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table, 1_1-200000.bed] + // STEP 4: RECALIBRATING -process ApplyBQSRSpark { +process ApplyBQSR { + label 'memory_singleCPU_2_task' label 'cpus_2' @@ -879,28 +1186,117 @@ process ApplyBQSRSpark { file(fastaFai) from ch_fastaFai output: - set idPatient, idSample, file("${intervalBed.baseName}_${idSample}.recal.bam") into bamMergeBamRecal + set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bamMergeBamRecal script: + prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" + intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" """ gatk --java-options -Xmx${task.memory.toGiga()}g \ - ApplyBQSRSpark \ - --reference ${fasta} \ + ApplyBQSR \ + -R ${fasta} \ --input ${bam} \ - --output ${intervalBed.baseName}_${idSample}.recal.bam \ - --intervals ${intervalBed} \ - --bqsr-recal-file ${recalibrationReport} \ - --verbosity DEBUG \ - --spark-runner LOCAL --spark-master local[${task.cpus}] &> applyBQSRspark.log.txt + --output ${prefix}${idSample}.recal.bam \ + ${intervalsOptions} \ + --bqsr-recal-file ${recalibrationReport} """ } bamMergeBamRecal = bamMergeBamRecal.groupTuple(by:[0, 1]) +(bamMergeBamRecal, bamMergeBamRecalNoInt) = bamMergeBamRecal.into(2) + +// STEP 4': SENTIEON BQSR + +bamDedupedSentieon = bamDedupedSentieon.dump(tag:'deduped.bam') + +process SentieonBQSR { + label 'cpus_max' + label 'memory_max' + label 'sentieon' + + tag {idPatient + "-" + idSample} + + publishDir params.outdir, mode: params.publishDirMode, + saveAs: { + if (it == "${idSample}_recal_result.csv" && 'sentieon' in skipQC) "Reports/${idSample}/Sentieon/${it}" + else "Preprocessing/${idSample}/RecalSentieon/${it}" + } + + input: + set idPatient, idSample, file(bam), file(bai) from bamDedupedSentieon + file(dbsnp) from ch_dbsnp + file(dbsnpIndex) from ch_dbsnpIndex + file(fasta) from ch_fasta + file(dict) from ch_dict + file(fastaFai) from ch_fastaFai + file(knownIndels) from ch_knownIndels + file(knownIndelsIndex) from ch_knownIndelsIndex + + output: + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecalSentieon + set idPatient, idSample into bamRecalSentieonTSV + file("${idSample}_recal_result.csv") into bamRecalSentieonQC + + when: params.sentieon + + script: + known = knownIndels.collect{"--known-sites ${it}"}.join(' ') + """ + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta} \ + -i ${idSample}.deduped.bam \ + --algo QualCal \ + -k ${dbsnp} \ + ${idSample}.recal.table + + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta} \ + -i ${idSample}.deduped.bam \ + -q ${idSample}.recal.table \ + --algo QualCal \ + -k ${dbsnp} \ + ${idSample}.table.post \ + --algo ReadWriter ${idSample}.recal.bam + + sentieon driver \ + -t ${task.cpus} \ + --algo QualCal \ + --plot \ + --before ${idSample}.recal.table \ + --after ${idSample}.table.post \ + ${idSample}_recal_result.csv + """ +} + +(bamRecalSentieonTSV, bamRecalSentieonSampleTSV) = bamRecalSentieonTSV.into(2) + +// Creating a TSV file to restart from this step +bamRecalSentieonTSV.map { idPatient, idSample -> + gender = genderMap[idPatient] + status = statusMap[idPatient, 
idSample] + bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +}.collectFile( + name: 'recalibrated_sentieon.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +) + +bamRecalSentieonSampleTSV + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { + idPatient, idSample -> + status = statusMap[idPatient, idSample] + gender = genderMap[idPatient] + bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" + ["recalibrated_sentieon_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +} // STEP 4.5.1: MERGING THE RECALIBRATED BAM FILES process MergeBamRecal { - label 'cpus_8' + label 'med_resources' tag {idPatient + "-" + idSample} @@ -910,16 +1306,17 @@ process MergeBamRecal { set idPatient, idSample, file(bam) from bamMergeBamRecal output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into bamRecal - set idPatient, idSample, file("${idSample}.recal.bam") into (bamRecalBamQC, bamRecalSamToolsStats) - set idPatient, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into (bamRecalTSV, bamRecalSampleTSV) + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecal + set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQC + set idPatient, idSample into bamRecalTSV file("${idSample}.recal.bam") into (bamGenomeChronicler, bamGenomeChroniclerToPrint) + when: !(params.no_intervals) + script: """ samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} samtools index ${idSample}.recal.bam - mv ${idSample}.recal.bam.bai ${idSample}.recal.bai """ } bamGenomeChroniclerToPrint.view() @@ -933,9 +1330,12 @@ Channel.fromPath(params.vepFile) // STEP 4.5.2: RUNNING GenomeChronicler FOR THE RECALIBRATED BAM FILES // TODO: Update this when there is a different VEP html report for each bam process RunGenomeChronicler { + + label 'cpus_max' + label 'memory_max' + tag "$bam" publishDir "$params.outdir/GenomeChronicler", mode: 'copy' - echo true input: file(bam) from bamGenomeChronicler @@ -944,6 +1344,8 @@ process RunGenomeChronicler { output: file("results_${bam.simpleName}") into chronicler_results + when: 'genomechronicler' in tools + script: optional_argument = vep.endsWith("no_vepFile.txt") ? 
'' : "--vepFile ${vep}" @@ -957,21 +1359,57 @@ process RunGenomeChronicler { """ } +// STEP 4.5': INDEXING THE RECALIBRATED BAM FILES + +process IndexBamRecal { + label 'med_resources' + + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publishDirMode + + input: + set idPatient, idSample, file("${idSample}.recal.bam") from bamMergeBamRecalNoInt + + output: + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecalNoInt + set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQCnoInt + set idPatient, idSample into bamRecalTSVnoInt + + when: params.no_intervals + + script: + """ + samtools index ${idSample}.recal.bam + """ +} + +bamRecal = bamRecal.mix(bamRecalNoInt) +bamRecalQC = bamRecalQC.mix(bamRecalQCnoInt) +bamRecalTSV = bamRecalTSV.mix(bamRecalTSVnoInt) + +(bamRecalBamQC, bamRecalSamToolsStats) = bamRecalQC.into(2) +(bamRecalTSV, bamRecalSampleTSV) = bamRecalTSV.into(2) + // Creating a TSV file to restart from this step -bamRecalTSV.map { idPatient, idSample, bam, bai -> +bamRecalTSV.map { idPatient, idSample -> gender = genderMap[idPatient] status = statusMap[idPatient, idSample] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bam}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bai}\n" + bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" }.collectFile( name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) bamRecalSampleTSV .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { - idPatient, idSample, bam, bai -> + idPatient, idSample -> status = statusMap[idPatient, idSample] gender = genderMap[idPatient] - ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bam}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bai}\n"] + bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" + ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] } // STEP 5: QC @@ -1003,7 +1441,7 @@ bamBamQC = bamMappedBamQC.mix(bamRecalBamQC) process BamQC { label 'memory_max' - label 'cpus_16' + label 'cpus_max' tag {idPatient + "-" + idSample} @@ -1043,6 +1481,13 @@ bamQCReport = bamQCReport.dump(tag:'BamQC') ================================================================================ */ +// When using sentieon for mapping, Channel bamRecal is bamRecalSentieon +if (params.sentieon && step == 'mapping') bamRecal = bamRecalSentieon + +// When no knownIndels for mapping, Channel bamRecal is indexedBam +bamRecal = (params.knownIndels && step == 'mapping') ? 
bamRecal : indexedBam + +// When starting with variant calling, Channel bamRecal is inputSample if (step == 'variantcalling') bamRecal = inputSample bamRecal = bamRecal.dump(tag:'BAM') @@ -1052,7 +1497,7 @@ bamRecal = bamRecal.dump(tag:'BAM') // Manta will be run in Germline mode, or in Tumor mode depending on status // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples -(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamRecalAll, bamRecalAllTemp) = bamRecal.into(5) +(bamSentieonDNAscope, bamSentieonDNAseq, bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamRecalAll, bamRecalAllTemp) = bamRecal.into(7) // To speed Variant Callers up we are chopping the reference into smaller pieces // Do variant calling by this intervals, and re-merge the VCFs @@ -1062,8 +1507,9 @@ bamHaplotypeCaller = bamRecalAllTemp.combine(intHaplotypeCaller) // STEP GATK HAPLOTYPECALLER.1 process HaplotypeCaller { - label 'memory_singleCPU_task_sq' - label 'cpus_2' + + label 'forks_max' + label 'cpus_1' tag {idSample + "-" + intervalBed.baseName} @@ -1135,6 +1581,94 @@ process GenotypeGVCFs { vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) +// STEP SENTIEON DNAseq + +process SentieonDNAseq { + label 'cpus_max' + label 'memory_max' + label 'sentieon' + + tag {idSample} + + input: + set idPatient, idSample, file(bam), file(bai) from bamSentieonDNAseq + file(dbsnp) from ch_dbsnp + file(dbsnpIndex) from ch_dbsnpIndex + file(fasta) from ch_fasta + file(fastaFai) from ch_fastaFai + + output: + set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into sentieonDNAseqVCF + + when: 'dnaseq' in tools && params.sentieon + + script: + """ + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta} \ + -i ${bam} \ + --algo Genotyper \ + -d ${dbsnp} \ + DNAseq_${idSample}.vcf + """ +} + +sentieonDNAseqVCF = sentieonDNAseqVCF.dump(tag:'sentieon DNAseq') + +// STEP SENTIEON DNAscope + +process SentieonDNAscope { + label 'cpus_max' + label 'memory_max' + label 'sentieon' + + tag {idSample} + + input: + set idPatient, idSample, file(bam), file(bai) from bamSentieonDNAscope + file(dbsnp) from ch_dbsnp + file(dbsnpIndex) from ch_dbsnpIndex + file(fasta) from ch_fasta + file(fastaFai) from ch_fastaFai + + output: + set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into sentieonDNAscopeVCF + set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into sentieonDNAscopeSVVCF + + when: 'dnascope' in tools && params.sentieon + + script: + """ + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta} \ + -i ${bam} \ + --algo DNAscope \ + -d ${dbsnp} \ + DNAscope_${idSample}.vcf + + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta}\ + -i ${bam} \ + --algo DNAscope \ + --var_type bnd \ + -d ${dbsnp} \ + DNAscope_${idSample}.temp.vcf + + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta}\ + --algo SVSolver \ + -v DNAscope_${idSample}.temp.vcf \ + DNAscope_SV_${idSample}.vcf + """ +} + +sentieonDNAscopeVCF = sentieonDNAscopeVCF.dump(tag:'sentieon DNAscope') +sentieonDNAscopeSVVCF = sentieonDNAscopeSVVCF.dump(tag:'sentieon DNAscope SV') + // STEP STRELKA.1 - SINGLE MODE process StrelkaSingle { @@ -1177,7 +1711,7 @@ process StrelkaSingle { Strelka_${idSample}_variants.vcf.gz mv Strelka/results/variants/variants.vcf.gz.tbi \ Strelka_${idSample}_variants.vcf.gz.tbi - """ + """ } vcfStrelkaSingle = vcfStrelkaSingle.dump(tag:'Strelka - Single Mode') @@ -1283,7 +1817,7 @@ vcfTIDDIT = vcfTIDDIT.dump(tag:'TIDDIT') */ // Ascat, Control-FREEC 
-(bamAscat, bamMpileup, bamRecalAll) = bamRecalAll.into(3) +(bamAscat, bamMpileup, bamMpileupNoInt, bamRecalAll) = bamRecalAll.into(4) // separate BAM by status bamNormal = Channel.create() @@ -1301,8 +1835,8 @@ pairBam = bamNormal.cross(bamTumor).map { pairBam = pairBam.dump(tag:'BAM Somatic Pair') -// Manta and Strelka -(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBam) = pairBam.into(6) +// Manta, Strelka, Mutect2 +(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamTNscope, pairBam) = pairBam.into(7) intervalPairBam = pairBam.spread(bedIntervals) @@ -1314,9 +1848,12 @@ bamMpileup = bamMpileup.spread(intMpileup) // STEP FREEBAYES process FreeBayes { - tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} + + label 'forks_max' label 'cpus_1' + tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} + input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes file(fasta) from ch_fasta @@ -1351,8 +1888,11 @@ vcfFreeBayes = vcfFreeBayes.groupTuple(by:[0,1,2]) process Mutect2 { tag {idSampleTumor + "_vs_" + idSampleNormal + "-" + intervalBed.baseName} + + label 'forks_max' label 'cpus_1' + input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 file(dict) from ch_dict @@ -1362,6 +1902,7 @@ process Mutect2 { file(germlineResourceIndex) from ch_germlineResourceIndex file(intervals) from ch_intervals file(pon) from ch_pon + file(ponIndex) from ch_ponIndex output: set val("Mutect2"), @@ -1436,11 +1977,11 @@ process MergeMutect2Stats { // STEP MERGING VCF - FREEBAYES, GATK HAPLOTYPECALLER & GATK MUTECT2 (UNFILTERED) -vcfConcatenateVCFs = mutect2Output.mix( vcfFreeBayes, vcfGenotypeGVCFs, gvcfHaplotypeCaller) +vcfConcatenateVCFs = mutect2Output.mix(vcfFreeBayes, vcfGenotypeGVCFs, gvcfHaplotypeCaller) vcfConcatenateVCFs = vcfConcatenateVCFs.dump(tag:'VCF to merge') process ConcatVCF { - label 'cpus_8' + label 'med_resources' tag {variantCaller + "-" + idSample} @@ -1477,6 +2018,8 @@ vcfConcatenated = vcfConcatenated.dump(tag:'VCF') process PileupSummariesForMutect2 { tag {idSampleTumor + "_vs_" + idSampleNormal + "_" + intervalBed.baseName } + + label 'forks_max' label 'cpus_1' input: @@ -1508,6 +2051,8 @@ pileupSummaries = pileupSummaries.groupTuple(by:[0,1]) // STEP GATK MUTECT2.4 - MERGING PILEUP SUMMARIES process MergePileupSummaries { + + label 'forks_max' label 'cpus_1' tag {idPatient + "_" + idSampleTumor} @@ -1536,6 +2081,8 @@ process MergePileupSummaries { // STEP GATK MUTECT2.5 - CALCULATING CONTAMINATION process CalculateContamination { + + label 'forks_max' label 'cpus_1' tag {idSampleTumor + "_vs_" + idSampleNormal} @@ -1564,6 +2111,8 @@ process CalculateContamination { // STEP GATK MUTECT2.6 - FILTERING CALLS process FilterMutect2Calls { + + label 'forks_max' label 'cpus_1' tag {idSampleTN} @@ -1602,6 +2151,73 @@ process FilterMutect2Calls { """ } +// STEP SENTIEON TNSCOPE + +process SentieonTNscope { + label 'cpus_max' + label 'memory_max' + label 'sentieon' + + tag {idSampleTumor + "_vs_" + idSampleNormal} + + input: + set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamTNscope + file(dict) from ch_dict + file(fasta) from ch_fasta + file(fastaFai) from 
ch_fastaFai + file(dbsnp) from ch_dbsnp + file(dbsnpIndex) from ch_dbsnpIndex + file(pon) from ch_pon + file(ponIndex) from ch_ponIndex + + output: + set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcfTNscope + + when: 'tnscope' in tools && params.sentieon + + script: + PON = params.pon ? "--pon ${pon}" : "" + """ + sentieon driver \ + -t ${task.cpus} \ + -r ${fasta} \ + -i ${bamTumor} \ + -i ${bamNormal} \ + --algo TNscope \ + --tumor_sample ${idSampleTumor} \ + --normal_sample ${idSampleNormal} \ + --dbsnp ${dbsnp} \ + ${PON} \ + TNscope_${idSampleTumor}_vs_${idSampleNormal}.vcf + """ +} + +vcfTNscope = vcfTNscope.dump(tag:'Sentieon TNscope') + +sentieonVCF = sentieonDNAseqVCF.mix(sentieonDNAscopeVCF, sentieonDNAscopeSVVCF, vcfTNscope) + +process CompressSentieonVCF { + tag {"${idSample} - ${vcf}"} + + publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publishDirMode + + input: + set variantCaller, idPatient, idSample, file(vcf) from sentieonVCF + + output: + set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfSentieon + + when: params.sentieon + + script: + """ + bgzip < ${vcf} > ${vcf}.gz + tabix ${vcf}.gz + """ +} + +vcfSentieon = vcfSentieon.dump(tag:'Sentieon VCF indexed') + // STEP STRELKA.2 - SOMATIC PAIR process Strelka { @@ -1646,7 +2262,7 @@ process Strelka { Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ + """ } vcfStrelka = vcfStrelka.dump(tag:'Strelka') @@ -1762,7 +2378,7 @@ process StrelkaBP { StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ + """ } vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') @@ -1868,7 +2484,7 @@ process Ascat { ascatOut.dump(tag:'ASCAT') -// STEP CONTROLFREEC.1 - MPILEUP +// STEP MPILEUP.1 process Mpileup { label 'memory_singleCPU_2_task' @@ -1881,22 +2497,30 @@ process Mpileup { file(fastaFai) from ch_fastaFai output: - set idPatient, idSample, file("${intervalBed.baseName}_${idSample}.pileup.gz") into mpileupMerge + set idPatient, idSample, file("${prefix}${idSample}.pileup.gz") into mpileupMerge when: 'controlfreec' in tools || 'mpileup' in tools script: + prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" + intervalsOptions = params.no_intervals ? 
"" : "-l ${intervalBed}" """ samtools mpileup \ -f ${fasta} ${bam} \ - -l ${intervalBed} \ - | bgzip --threads ${task.cpus} -c > ${intervalBed.baseName}_${idSample}.pileup.gz + ${intervalsOptions} \ + | bgzip --threads ${task.cpus} -c > ${prefix}${idSample}.pileup.gz """ } -mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) +if (!params.no_intervals) { + mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) + mpileupNoInt = Channel.empty() +} else { + (mpileupMerge, mpileupNoInt) = mpileupMerge.into(2) + mpileupMerge.close() +} -// STEP CONTROLFREEC.2 - MERGE MPILEUP +// STEP MPILEUP.2 - MERGE process MergeMpileup { tag {idSample} @@ -1909,7 +2533,7 @@ process MergeMpileup { output: set idPatient, idSample, file("${idSample}.pileup.gz") into mpileupOut - when: 'controlfreec' in tools || 'mpileup' in tools + when: !(params.no_intervals) && 'controlfreec' in tools || 'mpileup' in tools script: """ @@ -1923,6 +2547,7 @@ process MergeMpileup { """ } +mpileupOut = mpileupOut.mix(mpileupNoInt) mpileupOut = mpileupOut.dump(tag:'mpileup') mpileupOutNormal = Channel.create() @@ -1939,7 +2564,7 @@ mpileupOut = mpileupOut.map { [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor] } -// STEP CONTROLFREEC.3 - CONTROLFREEC +// STEP CONTROLFREEC.1 - CONTROLFREEC process ControlFREEC { label 'memory_singleCPU_2_task' @@ -2039,6 +2664,10 @@ controlFreecVizOut.dump(tag:'ControlFreecViz') (vcfMantaSomaticSV, vcfMantaDiploidSV) = vcfManta.into(2) vcfKeep = Channel.empty().mix( + vcfSentieon.map { + variantcaller, idPatient, idSample, vcf, tbi -> + [variantcaller, idSample, vcf] + }, vcfStrelkaSingle.map { variantcaller, idPatient, idSample, vcf, tbi -> [variantcaller, idSample, vcf[1]] @@ -2081,6 +2710,8 @@ vcfKeep = Channel.empty().mix( // STEP VCF.QC process BcftoolsStats { + + label 'forks_max' label 'cpus_1' tag {"${variantCaller} - ${vcf}"} @@ -2104,6 +2735,8 @@ process BcftoolsStats { bcftoolsReport = bcftoolsReport.dump(tag:'BCFTools') process Vcftools { + + label 'forks_max' label 'cpus_1' tag {"${variantCaller} - ${vcf}"} @@ -2152,19 +2785,25 @@ if (step == 'annotate') { if (tsvPath == []) { // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller - // Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT}/*.vcf.gz + // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka // The small snippet `vcf.minus(vcf.fileName)[-2]` catches idSample // This field is used to output final annotated VCFs in the correct directory Channel.empty().mix( Channel.fromPath("${params.outdir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") - .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + .flatten().map{vcf -> ['HaplotypeCaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outdir}/VariantCalling/*/Manta/*[!candidate]SV.vcf.gz") - .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + .flatten().map{vcf -> ['Manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outdir}/VariantCalling/*/Mutect2/*.vcf.gz") - .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + .flatten().map{vcf -> ['Mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + 
Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAseq/*.vcf.gz") + .flatten().map{vcf -> ['SentieonDNAseq', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAscope/*.vcf.gz") + .flatten().map{vcf -> ['SentieonDNAscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonTNscope/*.vcf.gz") + .flatten().map{vcf -> ['SentieonTNscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outdir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") - .flatten().map{vcf -> ['strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, + .flatten().map{vcf -> ['Strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} ).choice(vcfToAnnotate, vcfNoAnnotate) { @@ -2258,7 +2897,8 @@ compressVCFsnpEffOut = compressVCFsnpEffOut.dump(tag:'VCF') process VEP { label 'VEP' - label 'cpus_4' + label 'med_resources' + echo true tag {"${idSample} - ${variantCaller} - ${vcf}"} @@ -2285,6 +2925,7 @@ process VEP { script: reducedVCF = reduceVCF(vcf.fileName) genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome + dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" cadd = (params.cadd_cache && params.cadd_WG_SNVs && params.cadd_InDels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" genesplicer = params.genesplicer ? "--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" @@ -2297,6 +2938,7 @@ process VEP { -i ${vcf} \ -o ${reducedVCF}_VEP.ann.vcf \ --assembly ${genome} \ + --species ${params.species} \ ${cadd} \ ${genesplicer} \ --cache \ @@ -2321,7 +2963,7 @@ vepReport = vepReport.dump(tag:'VEP') process VEPmerge { label 'VEP' - label 'cpus_4' + label 'med_resources' tag {"${idSample} - ${variantCaller} - ${vcf}"} @@ -2358,6 +3000,7 @@ process VEPmerge { -i ${vcf} \ -o ${reducedVCF}_VEP.ann.vcf \ --assembly ${genome} \ + --species ${params.species} \ ${cadd} \ ${genesplicer} \ --cache \ @@ -2411,6 +3054,10 @@ compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') // STEP MULTIQC process MultiQC { + + label 'cpus_max' + label 'memory_max' + publishDir "${params.outdir}/MultiQC", mode: params.publishDirMode input: @@ -2630,7 +3277,7 @@ def checkNumberOfItem(row, number) { // Check parameter existence def checkParameterExistence(it, list) { if (!list.contains(it)) { - println("Unknown parameter: ${it}") + log.warn "Unknown parameter: ${it}" return false } return true @@ -2672,6 +3319,7 @@ def defineSkipQClist() { 'markduplicates', 'multiqc', 'samtools', + 'sentieon', 'vcftools', 'versions' ] @@ -2692,6 +3340,8 @@ def defineToolList() { return [ 'ascat', 'controlfreec', + 'dnascope', + 'dnaseq', 'freebayes', 'haplotypecaller', 'manta', @@ -2701,16 +3351,12 @@ def defineToolList() { 'snpeff', 'strelka', 'tiddit', - 'vep' + 'tnscope', + 'vep', + 'genomechronicler' ] } -// Print deprecation message -def deprecationMessage(oldItem, newItem = null) { - extra = newItem == null ? "": ", please use `${newItem}` instead" - log.warn "The ${oldItem} is deprecated${extra} -- it will be removed in a future release" -} - // Channeling the TSV file containing BAM. 
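To make the relationship between `checkParameterExistence` and `defineToolList` above concrete, here is a hedged sketch of the call site; `checkParameterList` is not shown in this hunk and is assumed to be the obvious list-wide wrapper:

```groovy
// Assumed wrapper: every entry of a user-supplied list must pass the check.
def checkParameterList(list, realList) {
    return list.every { checkParameterExistence(it, realList) }
}

// Typical call site for --tools: normalise the comma-separated value, then
// validate it against the reference list before anything is executed.
tools = params.tools ? params.tools.split(',').collect { it.trim().toLowerCase() } : []
if (!checkParameterList(tools, defineToolList())) exit 1, 'Unknown tool(s), see --help for more information'
```

The TSV channeling helpers below follow the same defensive style.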
// Format is: "subject gender status sample bam bai" def extractBam(tsvFile) { diff --git a/nextflow.config b/nextflow.config index e426c6a2b2..df7843b3fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,10 +14,11 @@ params { input = null // No default input noGVCF = null // g.vcf are produced by HaplotypeCaller noStrelkaBP = null // Strelka will use Manta candidateSmallIndels if available + no_intervals = null // Intervals will be built from the fasta file skipQC = null // All QC tools are used step = 'mapping' // Starts with mapping tools = null // No default Variant Calling or Annotation tools - vepFile = 'https://raw.githubusercontent.com/cgpu/sarek/master/assets/no_vepFile.txt' + vepFile = 's3://lifebit-featured-datasets/pipelines/genomechronicler/no_vepFile.txt' // Workflow settings annotation_cache = null // Annotation cache disabled @@ -25,25 +26,24 @@ params { genesplicer = null // genesplicer disabled markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details nucleotidesPerSecond = 1000.0 // Default interval size + split_fastq = null // Fastq files will not be split by default outdir = './results' publishDirMode = 'copy' // Default PublishDirMode (same as other nf-core pipelines) saveGenomeIndex = null // Built Indexes not saved - sequencing_center = null // No sequencing center to be writen in BAM header in MapReads process + sequencing_center = null // No sequencing center to be written in BAM header in MapReads process + sentieon = null // Not using Sentieon by default // Optional files/directory - cadd_InDels = false // No CADD files - cadd_InDels_tbi = false // No CADD files - cadd_WG_SNVs = false // No CADD files - cadd_WG_SNVs_tbi = false // No CADD files - pon = false // No default PON file for GATK Mutect2 Panel of Normal + cadd_InDels = false // No CADD InDels file + cadd_InDels_tbi = false // No CADD InDels index + cadd_WG_SNVs = false // No CADD SNVs file + cadd_WG_SNVs_tbi = false // No CADD SNVs index + pon = false // No default PON (Panel of Normals) file for GATK Mutect2 / Sentieon TNscope + pon_index = false // No default PON index for GATK Mutect2 / Sentieon TNscope snpEff_cache = null // No directory for snpEff cache targetBED = false // No default TargetBED file for targeted sequencing vep_cache = null // No directory for VEP cache - // AWSBatch - awsqueue = false - awsregion = 'eu-west-1' - // Custom config config_profile_contact = false config_profile_description = false @@ -73,7 +73,12 @@ params { max_cpus = 16 max_memory = 128.GB max_time = 240.h + max_forks = 96 + med_resources_fraction = 0.50 singleCPUMem = 7.GB + bwa_cpus_fraction = 0.6 + bwa_cpus = null + sort_cpus = null // Deprecated params annotateVCF = null @@ -85,74 +90,66 @@ params { sampleDir = null } -// Container slug. Stable releases should specify release tag! 
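The `med_resources_fraction` parameter introduced above drives the `med_resources` process label configured below: labelled processes request a fraction of the `max_*` ceilings rather than fixed amounts. A condensed sketch of the allocation logic, mirroring the `alloc_med_resource` helper defined at the end of this config — the `Math.max` floor is an added assumption here, so a small `max_cpus` can never round down to zero, and the `String` branch exists because `params` overridden on the command line arrive as strings:

```groovy
def allocMedDemo(obj, fraction = 0.50) {
    if (obj instanceof String)                        // e.g. --max_memory '128 GB'
        return (obj as nextflow.util.MemoryUnit) * fraction
    if (obj instanceof Integer)                       // e.g. --max_cpus 16
        return Math.max(1, (obj * fraction) as int)   // 16 -> 8, never 0
    return obj                                        // anything else passes through
}

println allocMedDemo('128 GB')  // 64 GB
println allocMedDemo(16)        // 8
```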
-// Developmental code should specify :dev
-process.container = 'nfcore/sarek:2.5.1'
+// Container slug
+// Stable releases should specify release tag (i.e. `2.5`)
+// Developmental code should specify dev
+process.container = 'nfcore/sarek:2.5.2'
 
 process {
-    withName: MapReads {
-        container = "nfcore/sarek:2.5.1"
-        cpus = 30
-        memory = 210.GB
+    withLabel: forks_max {
+        cpus = {params.max_forks}
+        maxForks = 96
+    }
+    withLabel: cpus_max {
+        cpus = {params.max_cpus}
         maxForks = 2
     }
-
+    withLabel: memory_max {
+        memory = {params.max_memory}
+    }
+
+    withLabel: med_resources {
+        cpus = { alloc_med_resource(params.max_cpus) }
+        //maxForks = { alloc_med_resource(params.max_forks) }
+        memory = { alloc_med_resource(params.max_memory) }
+    }
+
     withName: BaseRecalibratorSpark {
         container = "broadinstitute/gatk:4.1.4.0"
-        cpus = 8
-        memory = 32.GB
-        maxForks = 96
+        maxForks = 64
     }
     withName: MarkDuplicatesSpark {
         container = "broadinstitute/gatk:4.1.3.0"
-        cpus = 28
-        memory = 180.GB
         maxForks = 2
     }
     withName: RunGenomeChronicler {
         container = "lifebitai/genomechronicler:pgp-uk-5513c6f"
-        cpus = 1
-        memory = 4.GB
-        maxForks = 96
     }
     withName: ApplyBQSRSpark {
         container = "broadinstitute/gatk:4.1.4.0"
-        cpus = 4
-        memory = 16.GB
-        maxForks = 64
+        maxForks = 96
     }
     withName: HaplotypeCaller {
         container = "broadinstitute/gatk:4.1.4.0"
-        cpus = 2
-        memory = 8.GB
-        maxForks = 64
+        cpus = 1
+        maxForks = 96
     }
     withName: Mutect2 {
         container = "broadinstitute/gatk:4.1.4.0"
-        cpus = 2
-        memory = 8.GB
-        maxForks = 32
     }
     withName: PileupSummariesForMutect2 {
         container = "broadinstitute/gatk:4.1.4.0"
-        cpus = 1
-        memory = 4.GB
-        maxForks = 96
     }
     withName: MultiQC {
-        container = "nfcore/sarek:2.5.1"
-        cpus = 8
-        memory = 32.GB
-        maxForks = 1
         errorStrategy = 'retry'
         maxRetries = 4
     }
@@ -168,8 +165,14 @@ try {
     System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config")
 }
 
+// Load nf-core/sarek custom profiles from different institutions
+try {
+    includeConfig "${params.custom_config_base}/pipeline/sarek.config"
+} catch (Exception e) {
+    System.err.println("WARNING: Could not load nf-core/config/sarek profiles: ${params.custom_config_base}/pipeline/sarek.config")
+}
+
 profiles {
-    awsbatch { includeConfig 'conf/awsbatch.config' }
     conda {
         docker.enabled = false
         process.conda = "$baseDir/environment.yml"
@@ -190,6 +193,10 @@ profiles {
         singularity.enabled = true
     }
     test { includeConfig 'conf/test.config' }
+    test_annotation { includeConfig 'conf/test_annotation.config' }
+    test_splitfastq { includeConfig 'conf/test_splitfastq.config' }
+    test_targeted { includeConfig 'conf/test_targeted.config' }
+    test_tool { includeConfig 'conf/test_tool.config' }
 }
 
 // Load genomes.config or igenomes.config
@@ -225,8 +232,8 @@ manifest {
     homePage = 'https://github.com/nf-core/sarek'
     description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing'
     mainScript = 'main.nf'
-    nextflowVersion = '>=19.04.0'
-    version = '2.5.1'
+    nextflowVersion = '>=19.10.0'
+    version = '2.5.2'
 }
 
 // Return the minimum between requirements and a maximum limit to ensure that resource requirements don't go over
@@ -238,9 +245,22 @@ def check_resource(obj) {
         return params.max_time as nextflow.util.Duration
     else if (obj.getClass() == java.lang.Integer)
         return Math.min(obj, params.max_cpus as int)
     else
         return obj
     } catch (all) {
        println " ### ERROR ### Max
params max_memory:'${params.max_memory}', max_time:'${params.max_time}' or max_cpus:'${params.max_cpus}' is not valid! Using default value: $obj" } } + +def alloc_med_resource(obj) { + try { + if (obj.getClass() == java.lang.String) + return ( (obj as nextflow.util.MemoryUnit) * (params.med_resources_fraction as float) ) + else if (obj.getClass() == java.lang.Integer) + return ( (obj as int) * (params.med_resources_fraction as float) as int) + else + return obj + } catch (all) { + println " ### ERROR ### Max params max_memory:'${params.max_memory}', or max_cpus:'${params.max_cpus}' or max_cpus:'${params.max_forks}' is not valid! Using default value: $obj" + } +} \ No newline at end of file diff --git a/scripts/download_image.sh b/scripts/download_image.sh deleted file mode 100755 index 730e8fe945..0000000000 --- a/scripts/download_image.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -set -xeuo pipefail - -# This script download and tag image for sarek tests - -usage() { echo "Usage: $0 <-t test|annotation tool> <-n engine> <-S version to pull/build> <-T version to tag> <-g genome>" 1>&2; exit 1; } - -ENGINE=docker -GENOME=smallGRCh37 -NXF_SINGULARITY_CACHEDIR=${NXF_SINGULARITY_CACHEDIR:-work/singularity/.} -TEST=ALL -VERSION=dev -TARGETVERSION=${VERSION} - -while [[ $# -gt 0 ]] -do - key=$1 - case $key in - -g|--genome) - GENOME=$2 - shift # past argument - shift # past value - ;; - -n|--engine) - ENGINE=$2 - shift # past argument - shift # past value - ;; - -T|--target-version) - TARGETVERSION=$2 - shift # past argument - shift # past value - ;; - -S|--source-version) - VERSION=$2 - shift # past argument - shift # past value - ;; - -t|--test|--tool) - TEST=$2 - shift # past argument - shift # past value - ;; - *) # unknown option - usage - shift # past argument - ;; - esac -done - -SOURCEGENOME=${GENOME} - -if [[ smallGRCh37 =~ $SOURCEGENOME ]] -then - SOURCEGENOME=GRCh37 -fi - -get_image(){ - CONTAINER=$1 - SOURCE=$2 - TARGET=$3 - if [[ docker =~ $ENGINE ]] - then - docker pull nfcore/${1}:${2} - docker tag nfcore/${1}:${2} nfcore/${1}:${3} - elif [[ singularity =~ $ENGINE ]] - then - mkdir -p ${NXF_SINGULARITY_CACHEDIR} - singularity build ${NXF_SINGULARITY_CACHEDIR}/nfcore-${1}-${3}.img docker://nfcore/${1}:${2} - fi -} - -if [[ ALL,ANNOTATEBOTH,ANNOTATESNPEFF,SNPEFF =~ $TEST ]] -then - get_image sareksnpeff ${VERSION}.${SOURCEGENOME} ${TARGETVERSION}.${GENOME} -fi - -if [[ ALL,ANNOTATEBOTH,ANNOTATEVEP,VEP =~ $TEST ]] -then - get_image sarekvep ${VERSION}.${SOURCEGENOME} ${TARGETVERSION}.${GENOME} -fi - -get_image sarek ${VERSION} ${TARGETVERSION} diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh deleted file mode 100755 index cb6db88901..0000000000 --- a/scripts/run_tests.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash -set -xeuo pipefail - -# This script run sarek tests -# https://github.com/nf-core/test-datasets/raw/sarek - -usage() { echo "Usage: $0 <-p profile> <-t test> <-c cpus> <-n> <-v> <-m memory>" 1>&2; exit 1; } - -CPUS=2 -LOGS='' -MEMORY='6.GB' -NXF_SINGULARITY_CACHEDIR=${NXF_SINGULARITY_CACHEDIR:-work/singularity/.} -OFFLINE=false -PROFILE=docker -REPORTS='' -TEST=MULTIPLE -TRAVIS=${TRAVIS:-false} -TRAVIS_BUILD_DIR=${TRAVIS_BUILD_DIR:-.} -VERBOSE='' - -while [[ $# -gt 0 ]] -do - key=$1 - case $key in - -c|--cpus) - CPUS=$2 - shift # past value - ;; - -m|--memory) - MEMORY=$2 - shift # past argument - shift # past value - ;; - -n|--no-logs) - LOGS=true - shift # past value - ;; - --no-reports) - REPORTS="--skipQC all" - shift # past value - ;; - --offline) - 
OFFLINE=true - shift # past value - ;; - -p|--profile) - PROFILE=$2 - shift # past argument - shift # past value - ;; - -t|--test) - TEST=$2 - shift # past argument - shift # past value - ;; - -v|--verbose) - VERBOSE="-ansi-log false -dump-channels" - shift # past value - ;; - *) # unknown option - usage - shift # past argument - ;; - esac -done - -function manage_logs() { - if [[ $LOGS ]] - then - rm -rf .nextflow* results/ work/ - fi -} - -function run_sarek() { - nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile test,${PROFILE} ${VERBOSE} --monochrome_logs ${REPORTS} --max_memory ${MEMORY} $@ -} - -if [[ $OFFLINE == false ]] -then - PATHTOSAMPLE="https://github.com/nf-core/test-datasets/raw/sarek/testdata" - SUFFIX="-https" -else - PATHTOSAMPLE="data/testdata" - SUFFIX="" -fi - -OPTIONS="--tools FreeBayes,HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT" - -if [[ $TEST == "GERMLINE" ]] && [[ $OFFLINE == false ]] -then - rm -rf data - git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data -fi - -case $TEST in - ANNOTATEBOTH) - ANNOTATOR="merge,snpEFF,VEP" - TEST=ANNOTATE - ;; - ANNOTATESNPEFF) - ANNOTATOR="snpEFF" - TEST=ANNOTATE - ;; - ANNOTATEVEP) - ANNOTATOR="VEP" - TEST=ANNOTATE - ;; -esac - -case $TEST in - ANNOTATE) - run_sarek --step annotate --tools ${ANNOTATOR} --input ${PATHTOSAMPLE}/vcf/Strelka_1234N_variants.vcf.gz --skipQC all - ;; - GERMLINE) - run_sarek --tools=false --input data/testdata/tiny/normal - run_sarek --tools=false --input results/Preprocessing/TSV/duplicateMarked.tsv --step recalibrate - run_sarek --tools HaplotypeCaller --input results/Preprocessing/TSV/recalibrated.tsv --step variantCalling - ;; - MULTIPLE) - run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,TIDDIT,snpEff,VEP,merge --input ${PATHTOSAMPLE}/tsv/tiny-multiple${SUFFIX}.tsv - ;; - SOMATIC) - run_sarek ${OPTIONS} --input ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv - ;; - TARGETED) - run_sarek ${OPTIONS} --input ${PATHTOSAMPLE}/tsv/tiny-manta${SUFFIX}.tsv --targetBED ${PATHTOSAMPLE}/target.bed - ;; -esac - -if [[ $TEST == "GERMLINE" ]] && [[ $OFFLINE == false ]] -then - rm -rf data -fi
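Returning to the `--no_intervals` handling in the Mpileup/MergeMpileup steps of `main.nf` above: the routing deserves spelling out, because one branch has to be actively silenced. A minimal sketch with hypothetical channel names; the merge guard is restated as a comment with explicit parentheses, which is how the combined condition is meant to group:

```groovy
pileupDemo = Channel.from([
    ['patient1', 'sample1', 'chr1_sample1.pileup.gz'],
    ['patient1', 'sample1', 'chr2_sample1.pileup.gz']])

if (!params.no_intervals) {
    // Intervals: many per-interval pileups per sample -> group them so the
    // merge process receives one item per (patient, sample).
    pileupDemo = pileupDemo.groupTuple(by: [0, 1])
    pileupDemoNoInt = Channel.empty()
} else {
    // No intervals: a single pileup per sample -> bypass the merge entirely
    // and close the merge branch so it emits nothing.
    (pileupDemo, pileupDemoNoInt) = pileupDemo.into(2)
    pileupDemo.close()
}

// Inside the merge process the guard reads, with explicit grouping:
// when: !params.no_intervals && ('controlfreec' in tools || 'mpileup' in tools)

// Downstream, the two routes are unified again (in the pipeline this happens
// after MergeMpileup, via mpileupOut.mix(mpileupNoInt)).
pileupDemo.mix(pileupDemoNoInt).view()
```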
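The `recalibrated.tsv` checkpoint written after STEP 4.5' uses `collectFile`, which is compact enough to be easy to misread. Here is the same pattern in isolation — names are illustrative, and the gender/status columns of the real file are omitted:

```groovy
// One tab-separated line per sample; collectFile gathers the lines, sorts
// them, and writes a single TSV under storeDir. That file is what allows a
// later restart with --step variantcalling --input .../recalibrated_demo.tsv.
Channel
    .from([['patient1', 'sample1'], ['patient1', 'sample2']])
    .map { idPatient, idSample ->
        def bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam"
        "${idPatient}\t${idSample}\t${bam}\t${bam}.bai\n"
    }
    .collectFile(name: 'recalibrated_demo.tsv', sort: true,
                 storeDir: "${params.outdir}/Preprocessing/TSV")
```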
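Finally, the annotate step discovers VCFs on disk and recovers the sample ID from the published directory layout with `vcf.minus(vcf.fileName)[-2]`. How that works, on a hypothetical path:

```groovy
// Published layout: <outdir>/VariantCalling/<idSample>/<caller>/<file>.vcf.gz
vcf = file('results/VariantCalling/sample1/Strelka/Strelka_sample1_variants.vcf.gz')

// minus(fileName) strips the last path component, leaving the parent
// directory results/VariantCalling/sample1/Strelka; indexing with [-2]
// then picks its next-to-last component, i.e. idSample.
idSample = vcf.minus(vcf.fileName)[-2].toString()
assert idSample == 'sample1'
```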