Configs and docs (#12)
* Update CI scripts
* Add option to download Singularity images with the helper script
* Update Docs
* Improve build.nf
* Use label for processes configuration
* Update configuration files
* Disable Docker in singularity profile
* Disable Singularity in docker profile
* Disable Docker and Singularity in conda profile (see the profile sketch below)
* Simplify check_max() function
maxulysse committed Jun 12, 2019
1 parent 61799e2 commit a9cb70e
Showing 23 changed files with 6,963 additions and 312 deletions.
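
The profile changes listed above live in nextflow.config and conf/, which are not rendered in this view. As a sketch only — the engine settings are inferred from the commit-message bullets, not copied from the commit — mutually exclusive container profiles in a Nextflow config generally look like this:

profiles {
  // conda: no container engine at all
  conda {
    process.conda = "$baseDir/environment.yml"
    docker.enabled = false
    singularity.enabled = false
  }
  // docker: explicitly turn Singularity off
  docker {
    docker.enabled = true
    singularity.enabled = false
  }
  // singularity: explicitly turn Docker off
  singularity {
    docker.enabled = false
    singularity.enabled = true
  }
}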
2 changes: 1 addition & 1 deletion .travis.yml
@@ -27,7 +27,7 @@ before_install:
# PRs to master are only ok if coming from dev branch
- '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
# Pull the docker image first so the test doesn't wait for this
- "travis_retry ./scripts/download_docker.sh --test $TEST"
- "travis_retry ./scripts/download_image.sh -n docker --test $TEST"

install:
# Install Nextflow
429 changes: 427 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -8,7 +8,7 @@ pipeline {
stages {
stage('Docker setup') {
steps {
sh "./scripts/download_docker.sh"
sh "./scripts/download_image.sh -n docker -t ALL"
}
}
stage('Build references') {
1 change: 1 addition & 0 deletions README.md
@@ -38,6 +38,7 @@ The nf-core/sarek pipeline comes with documentation about the pipeline, found in
2. Pipeline configuration
* [Local installation](https://nf-co.re/usage/local_installation)
* [Adding your own system config](https://nf-co.re/usage/adding_own_config)
* [Install on a secure cluster](docs/install_bianca.md)
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
* [Extra documentation on reference](docs/reference.md)
3. [Running the pipeline](docs/usage.md)
59 changes: 34 additions & 25 deletions bin/concatenateVCFs.sh
@@ -1,48 +1,59 @@
#!/usr/bin/env bash
# this script concatenates all VCFs that are in the local directory: the
# purpose is to make a single VCF from all the VCFs that were created from different intervals
set -euo pipefail

# This script concatenates all VCF files that are in the local directory,
# that were created from different intervals to make a single final VCF

usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }

while getopts "i:c:o:t:" p; do
case "${p}" in
i)
genomeIndex=${OPTARG}
while [[ $# -gt 0 ]]
do
key=$1
case $key in
-i)
genomeIndex=$2
shift # past argument
shift # past value
;;
c)
cpus=${OPTARG}
-c)
cpus=$2
shift # past argument
shift # past value
;;
o)
outputFile=${OPTARG}
-o)
outputFile=$2
shift # past argument
shift # past value
;;
t)
targetBED=${OPTARG}
-t)
targetBED=$2
shift # past argument
shift # past value
;;
*)
usage
shift # past argument
;;
esac
done
shift $((OPTIND-1))

if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

set -euo pipefail

# first make a header from one of the VCF intervals
# get rid of interval information only from the GATK command-line, but leave the rest
# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
# header in the VCF as it is optional (FreeBayes does not save it, for example)
# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# concatenate VCFs in the correct order
# Concatenate VCFs in the correct order
(
cat header

@@ -72,14 +83,12 @@ tabix rawcalls.vcf.gz

set +u

# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
echo "target is $targetBED"
# Now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
if [ ! -z ${targetBED+x} ]; then
echo "Selecting subset..."
echo "Target is $targetBED - Selecting subset..."
bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
tabix ${outputFile}.gz
else
# simply rename the raw calls as WGS results
# Rename the raw calls as WGS results
for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
fi
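
For context, here is a hedged sketch of how a Nextflow process could drive this helper. The process name ConcatVCF matches the selector in conf/base.config further down, but the channel names, inputs, and output filename are illustrative assumptions, not code from the pipeline:

process ConcatVCF {
    input:
    file(vcfs) from ch_intervalVCFs.collect() // per-interval VCFs (assumed channel)
    file(fastaFai) from ch_fastaFai           // genome index passed to -i (assumed channel)
    file(targetBED) from ch_targetBED         // WES/panel targets for -t (assumed channel)

    output:
    file("${outputFile}.gz*") into ch_mergedVCF // bgzipped VCF plus tabix index

    script:
    outputFile = "merged.vcf" // assumed name; the script writes ${outputFile}.gz
    """
    concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} -t ${targetBED}
    """
}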

54 changes: 25 additions & 29 deletions build.nf
@@ -74,15 +74,16 @@ params.vep_cache = null

ch_referencesFiles = Channel.empty()

if ((params.build) && (params.offline)) ch_referencesFiles = Channel.fromPath("data/reference/*")
if ((params.build) && (!params.offline)) ch_referencesFiles = ch_referencesFiles.mix(
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase1.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/dbsnp_138.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/human_g1k_v37_decoy.small.fasta.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/small.intervals"))
pathToSource = params.offline ? "data/reference/" : "https://github.com/nf-core/test-datasets/raw/sarek/reference"

if (params.build) ch_referencesFiles = ch_referencesFiles.mix(
Channel.fromPath("${pathToSource}/1000G_phase1.indels.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
Channel.fromPath("${pathToSource}/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
Channel.fromPath("${pathToSource}/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/dbsnp_138.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/human_g1k_v37_decoy.small.fasta.gz"),
Channel.fromPath("${pathToSource}/small.intervals"))

ch_referencesFiles = ch_referencesFiles.dump(tag:'Reference Files')

@@ -134,7 +135,8 @@ if (params.email) {
summary['MultiQC maxsize'] = params.maxMultiqcEmailFileSize
}
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "\033[2m----------------------------------------------------\033[0m"
if (params.monochrome_logs) log.info "----------------------------------------------------"
else log.info "\033[2m----------------------------------------------------\033[0m"

// Check the hostnames against configured profiles
checkHostname()
@@ -166,7 +168,7 @@ ch_compressedfiles = Channel.create()
ch_notCompressedfiles = Channel.create()

ch_referencesFiles
.choice(ch_compressedfiles, ch_notCompressedfiles) {it =~ ".(gz|tar.bz2)" ? 0 : 1}
.choice(ch_compressedfiles, ch_notCompressedfiles) {it =~ ".gz" ? 0 : 1}

process DecompressFile {
tag {f_reference}
@@ -178,15 +180,9 @@ process DecompressFile {
file("*.{vcf,fasta,loci}") into ch_decompressedFiles

script:
realReferenceFile="readlink ${f_reference}"
if (f_reference =~ ".gz")
"""
gzip -d -c \$(${realReferenceFile}) > ${f_reference.baseName}
"""
else if (f_reference =~ ".tar.bz2")
"""
tar xvjf \$(${realReferenceFile})
"""
"""
gzip -d -c -f ${f_reference} > ${f_reference.baseName}
"""
}

ch_decompressedFiles = ch_decompressedFiles.dump(tag:'DecompressedFile')
@@ -306,7 +302,7 @@ process BuildCache_snpEff {
output:
file("*")

when: params.snpEff_cache && params.download_cache
when: params.snpEff_cache && params.download_cache && !params.offline

script:
"""
@@ -325,7 +321,7 @@ process BuildCache_VEP {
output:
file("*")

when: params.vep_cache && params.download_cache
when: params.vep_cache && params.download_cache && !params.offline

script:
genome = params.genome == "smallGRCh37" ? "GRCh37" : params.genome
@@ -363,7 +359,7 @@ process DownloadCADD {
output:
set file("*.tsv.gz"), file("*.tsv.gz.tbi")

when: params.cadd_cache && params.download_cache
when: params.cadd_cache && params.download_cache && !params.offline

script:
"""
@@ -391,12 +387,12 @@ def nfcoreHeader(){
${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset}
${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset}
${c_green}`._,._,\'${c_reset}
${c_black} ____ ${c_blue} _____ _ ${c_reset}
${c_black} .' ${c_green}_${c_black} `. ${c_blue} / ____| | | ${c_reset}
${c_black} / ${c_green}|\\${c_white}`-_${c_black} \\ ${c_blue} | (___ ___ _ __ __ | | __ ${c_reset}
${c_black} | ${c_green}| \\ ${c_white}`-${c_black}| ${c_blue} \\___ \\/__ \\| ´__/ _\\| |/ / ${c_reset}
${c_black} \\ ${c_green}| \\ ${c_black}/ ${c_blue} ____) | __ | | | __| < ${c_reset}
${c_black} `${c_green}|${c_black}____${c_green}\\${c_black}' ${c_blue} |_____/\\____|_| \\__/|_|\\_\\ ${c_reset}
${c_white} ____ ${c_blue} _____ _ ${c_reset}
${c_white} .' _ `. ${c_blue} / ____| | | ${c_reset}
${c_white} / ${c_green}|\\${c_white}`-_${c_white} \\ ${c_blue} | (___ ___ _ __ __ | | __ ${c_reset}
${c_white} | ${c_green}| \\ ${c_white}`-${c_white}| ${c_blue} \\___ \\/__ \\| ´__/ _\\| |/ / ${c_reset}
${c_white} \\ ${c_green}| \\ ${c_white}/ ${c_blue} ____) | __ | | | __| < ${c_reset}
${c_white} `${c_green}|${c_white}____${c_green}\\${c_white}' ${c_blue} |_____/\\____|_| \\__/|_|\\_\\ ${c_reset}
${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset}
${c_dim}----------------------------------------------------${c_reset}
104 changes: 86 additions & 18 deletions conf/base.config
@@ -9,27 +9,95 @@
* run on the logged in environment.
*/

process {
params {
// Defaults only, expecting to be overwritten
cpus = 10
igenomes_base = 's3://ngi-igenomes/igenomes/'
markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 for details
max_cpus = 16 // Base specifications
max_memory = 128.GB // Base specifications
max_time = 240.h // Base specifications
singleCPUMem = 7.GB // for processes that are using more memory but a single CPU only. Use the 'core' queue for these
}

// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
time = { check_max( 2.h * task.attempt, 'time' ) }
process {
cpus = {check_max(params.cpus * task.attempt)}
memory = {check_max(15.GB * task.attempt)}
time = {check_max(24.h * task.attempt)}
shell = ['/bin/bash', '-euo', 'pipefail']

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
errorStrategy = {task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish'}
maxErrors = '-1'
maxRetries = 1
maxRetries = 3

// Process-specific resource requirements
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
}
withLabel:singleCPUmem_x_task {
memory = {check_max(params.singleCPUMem * task.attempt)}
}
withLabel:singleCPUmem_x2_task {
memory = {check_max(params.singleCPUMem * task.attempt * task.attempt)}
}
withLabel:singleCPUmem_2x_task {
memory = {check_max(params.singleCPUMem * 2 * task.attempt)}
}
withLabel:max_cpus {
cpus = {params.max_cpus}
}
withLabel:max_memory {
memory = {check_max(params.max_memory)}
}

params {
// Defaults only, expecting to be overwritten
igenomes_base = 's3://ngi-igenomes/igenomes/'
markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 for details
max_cpus = 16
max_memory = 128.GB
max_time = 240.h
withName:BamQCmapped {
cpus = {check_max(16)}
}
withName:BamQCrecalibrated {
cpus = {check_max(16)}
}
withName:BaseRecalibrator {
cpus = {check_max(16)}
}
withName:ConcatVCF {
cpus = {check_max(8)}
// For unknown reasons, ConcatVCF sometimes fails with SIGPIPE
// (exit code 141). Rerunning the process will usually work.
errorStrategy = {task.exitStatus == 141 ? 'retry' : 'terminate'}
}
withName:FastQCBAM {
cpus = {check_max(2)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:FastQCFQ {
// FastQC is only capable of running one thread per fastq file.
cpus = {check_max(2)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:GatherBQSRReports {
cpus = {check_max(2)}
}
withName:MapReads {
memory = {check_max(check_max(60.GB * task.attempt))}
}
withName:MarkDuplicates {
cpus = {check_max(16)}
}
withName:MergeBamMapped {
cpus = {check_max(8)}
}
withName:MergeBamRecal {
cpus = {check_max(8)}
}
withName:MultiQC {
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:SamtoolsStats {
cpus = {check_max(2)}
}
withName:Snpeff {
container = {(params.annotation_cache && params.snpEff_cache) ? 'nfcore/sarek:dev' : "nfcore/sareksnpeff:dev.${params.genome}"}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:VEP {
container = {(params.annotation_cache && params.vep_cache) ? 'nfcore/sarek:dev' : "nfcore/sarekvep:dev.${params.genome}"}
cpus = {check_max(4)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
}
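
The simplified check_max() referenced throughout this file is defined in nextflow.config, which is not part of the rendered diff. A sketch of what a single-argument version might look like — dispatching on the value's type instead of taking the old second 'cpus'/'memory'/'time' argument; the actual implementation in the commit may differ:

def check_max(obj) {
    // Cap a requested resource at the configured maximum, based on its type
    try {
        if (obj.getClass() == nextflow.util.MemoryUnit)
            return obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1 ? params.max_memory as nextflow.util.MemoryUnit : obj
        else if (obj.getClass() == nextflow.util.Duration)
            return obj.compareTo(params.max_time as nextflow.util.Duration) == 1 ? params.max_time as nextflow.util.Duration : obj
        else if (obj instanceof Integer)
            return Math.min(obj, params.max_cpus as int)
        else
            return obj // unknown type: pass through unchanged
    } catch (all) {
        println "WARNING: could not check max for '${obj}', returning it unchanged"
        return obj
    }
}

Note also that the withLabel selectors above only take effect for processes that declare the matching label in the pipeline script, e.g. (hypothetical process, label taken from this config):

process Example {
    label 'max_cpus' // picks up cpus = {params.max_cpus} from conf/base.config

    script:
    """
    echo "running with ${task.cpus} cpus"
    """
}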
9 changes: 9 additions & 0 deletions docs/README.md
@@ -6,7 +6,16 @@ The nf-core/sarek documentation is split into the following files:
2. Pipeline configuration
* [Local installation](https://nf-co.re/usage/local_installation)
* [Adding your own system config](https://nf-co.re/usage/adding_own_config)
* [Install on a secure cluster](install_bianca.md)
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
* [Extra documentation on reference](reference.md)
3. [Running the pipeline](usage.md)
* [Examples](use_cases.md)
* [Input files documentation](input.md)
* [Extra documentation on variant calling](variantcalling.md)
* [Documentation about containers](containers.md)
* [Extra documentation for targeted sequencing](targetseq.md)
4. [Output and how to interpret the results](output.md)
* [Complementary information about ASCAT](ascat.md)
* [Extra documentation on annotation](annotation.md)
5. [Troubleshooting](https://nf-co.re/usage/troubleshooting)