Configs and docs (#12)
* Update CI scripts
* Add option to download Singularity images with the helper script
* Update Docs
* Improve build.nf
* Use label for processes configuration
* Update configuration files
* Disable Docker in singularity profile
* Disable Singularity in docker profile
* Disable Docker and Singularity in conda profile (see the profile sketch below)
* Simplify check_max() function
maxulysse committed Jun 12, 2019
1 parent 61799e2 commit a9cb70e
Showing 23 changed files with 6,963 additions and 312 deletions.
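
The profile changes listed above live in nextflow.config and conf/, which are not rendered in this view. As a sketch only — the engine settings are inferred from the commit-message bullets, not copied from the commit — mutually exclusive container profiles in a Nextflow config generally look like this:

profiles {
  // conda: no container engine at all
  conda {
    process.conda = "$baseDir/environment.yml"
    docker.enabled = false
    singularity.enabled = false
  }
  // docker: explicitly turn Singularity off
  docker {
    docker.enabled = true
    singularity.enabled = false
  }
  // singularity: explicitly turn Docker off
  singularity {
    docker.enabled = false
    singularity.enabled = true
  }
}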
2 changes: 1 addition & 1 deletion .travis.yml
@@ -27,7 +27,7 @@ before_install:
# PRs to master are only ok if coming from dev branch
- '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
# Pull the docker image first so the test doesn't wait for this
- "travis_retry ./scripts/download_docker.sh --test $TEST"
- "travis_retry ./scripts/download_image.sh -n docker --test $TEST"

install:
# Install Nextflow
429 changes: 427 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -8,7 +8,7 @@ pipeline {
stages {
stage('Docker setup') {
steps {
sh "./scripts/download_docker.sh"
sh "./scripts/download_image.sh -n docker -t ALL"
}
}
stage('Build references') {
1 change: 1 addition & 0 deletions README.md
@@ -38,6 +38,7 @@ The nf-core/sarek pipeline comes with documentation about the pipeline, found in
2. Pipeline configuration
* [Local installation](https://nf-co.re/usage/local_installation)
* [Adding your own system config](https://nf-co.re/usage/adding_own_config)
* [Install on a secure cluster](docs/install_bianca.md)
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
* [Extra documentation on reference](docs/reference.md)
3. [Running the pipeline](docs/usage.md)
59 changes: 34 additions & 25 deletions bin/concatenateVCFs.sh
@@ -1,48 +1,59 @@
#!/usr/bin/env bash
# this script concatenates all VCFs that are in the local directory: the
# purpose is to make a single VCF from all the VCFs that were created from different intervals
set -euo pipefail

# This script concatenates all VCF files that are in the local directory,
# that were created from different intervals to make a single final VCF

usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }

while getopts "i:c:o:t:" p; do
case "${p}" in
i)
genomeIndex=${OPTARG}
while [[ $# -gt 0 ]]
do
key=$1
case $key in
-i)
genomeIndex=$2
shift # past argument
shift # past value
;;
c)
cpus=${OPTARG}
-c)
cpus=$2
shift # past argument
shift # past value
;;
o)
outputFile=${OPTARG}
-o)
outputFile=$2
shift # past argument
shift # past value
;;
t)
targetBED=${OPTARG}
-t)
targetBED=$2
shift # past argument
shift # past value
;;
*)
usage
shift # past argument
;;
esac
done
shift $((OPTIND-1))

if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

set -euo pipefail

# first make a header from one of the VCF intervals
# get rid of interval information only from the GATK command-line, but leave the rest
# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
# header in the VCF as it is optional (FreeBayes does not save it, for example)
# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# concatenate VCFs in the correct order
# Concatenate VCFs in the correct order
(
cat header

@@ -72,14 +83,12 @@ tabix rawcalls.vcf.gz

set +u

# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
echo "target is $targetBED"
# Now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
if [ ! -z ${targetBED+x} ]; then
echo "Selecting subset..."
echo "Target is $targetBED - Selecting subset..."
bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
tabix ${outputFile}.gz
else
# simply rename the raw calls as WGS results
# Rename the raw calls as WGS results
for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
fi
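
For context, here is a hedged sketch of how a Nextflow process could drive this helper. The process name ConcatVCF matches the selector in conf/base.config further down, but the channel names, inputs, and output filename are illustrative assumptions, not code from the pipeline:

process ConcatVCF {
    input:
    file(vcfs) from ch_intervalVCFs.collect() // per-interval VCFs (assumed channel)
    file(fastaFai) from ch_fastaFai           // genome index passed to -i (assumed channel)
    file(targetBED) from ch_targetBED         // WES/panel targets for -t (assumed channel)

    output:
    file("${outputFile}.gz*") into ch_mergedVCF // bgzipped VCF plus tabix index

    script:
    outputFile = "merged.vcf" // assumed name; the script writes ${outputFile}.gz
    """
    concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} -t ${targetBED}
    """
}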

54 changes: 25 additions & 29 deletions build.nf
@@ -74,15 +74,16 @@ params.vep_cache = null

ch_referencesFiles = Channel.empty()

if ((params.build) && (params.offline)) ch_referencesFiles = Channel.fromPath("data/reference/*")
if ((params.build) && (!params.offline)) ch_referencesFiles = ch_referencesFiles.mix(
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase1.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/dbsnp_138.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/human_g1k_v37_decoy.small.fasta.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/small.intervals"))
pathToSource = params.offline ? "data/reference/" : "https://github.com/nf-core/test-datasets/raw/sarek/reference"

if (params.build) ch_referencesFiles = ch_referencesFiles.mix(
Channel.fromPath("${pathToSource}/1000G_phase1.indels.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
Channel.fromPath("${pathToSource}/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
Channel.fromPath("${pathToSource}/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/dbsnp_138.b37.small.vcf.gz"),
Channel.fromPath("${pathToSource}/human_g1k_v37_decoy.small.fasta.gz"),
Channel.fromPath("${pathToSource}/small.intervals"))

ch_referencesFiles = ch_referencesFiles.dump(tag:'Reference Files')

@@ -134,7 +135,8 @@ if (params.email) {
summary['MultiQC maxsize'] = params.maxMultiqcEmailFileSize
}
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "\033[2m----------------------------------------------------\033[0m"
if (params.monochrome_logs) log.info "----------------------------------------------------"
else log.info "\033[2m----------------------------------------------------\033[0m"

// Check the hostnames against configured profiles
checkHostname()
@@ -166,7 +168,7 @@ ch_compressedfiles = Channel.create()
ch_notCompressedfiles = Channel.create()

ch_referencesFiles
.choice(ch_compressedfiles, ch_notCompressedfiles) {it =~ ".(gz|tar.bz2)" ? 0 : 1}
.choice(ch_compressedfiles, ch_notCompressedfiles) {it =~ ".gz" ? 0 : 1}

process DecompressFile {
tag {f_reference}
@@ -178,15 +180,9 @@ process DecompressFile {
file("*.{vcf,fasta,loci}") into ch_decompressedFiles

script:
realReferenceFile="readlink ${f_reference}"
if (f_reference =~ ".gz")
"""
gzip -d -c \$(${realReferenceFile}) > ${f_reference.baseName}
"""
else if (f_reference =~ ".tar.bz2")
"""
tar xvjf \$(${realReferenceFile})
"""
"""
gzip -d -c -f ${f_reference} > ${f_reference.baseName}
"""
}

ch_decompressedFiles = ch_decompressedFiles.dump(tag:'DecompressedFile')
@@ -306,7 +302,7 @@ process BuildCache_snpEff {
output:
file("*")

when: params.snpEff_cache && params.download_cache
when: params.snpEff_cache && params.download_cache && !params.offline

script:
"""
@@ -325,7 +321,7 @@ process BuildCache_VEP {
output:
file("*")

when: params.vep_cache && params.download_cache
when: params.vep_cache && params.download_cache && !params.offline

script:
genome = params.genome == "smallGRCh37" ? "GRCh37" : params.genome
@@ -363,7 +359,7 @@ process DownloadCADD {
output:
set file("*.tsv.gz"), file("*.tsv.gz.tbi")

when: params.cadd_cache && params.download_cache
when: params.cadd_cache && params.download_cache && !params.offline

script:
"""
@@ -391,12 +387,12 @@ def nfcoreHeader(){
${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset}
${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset}
${c_green}`._,._,\'${c_reset}
${c_black} ____ ${c_blue} _____ _ ${c_reset}
${c_black} .' ${c_green}_${c_black} `. ${c_blue} / ____| | | ${c_reset}
${c_black} / ${c_green}|\\${c_white}`-_${c_black} \\ ${c_blue} | (___ ___ _ __ __ | | __ ${c_reset}
${c_black} | ${c_green}| \\ ${c_white}`-${c_black}| ${c_blue} \\___ \\/__ \\| ´__/ _\\| |/ / ${c_reset}
${c_black} \\ ${c_green}| \\ ${c_black}/ ${c_blue} ____) | __ | | | __| < ${c_reset}
${c_black} `${c_green}|${c_black}____${c_green}\\${c_black}' ${c_blue} |_____/\\____|_| \\__/|_|\\_\\ ${c_reset}
${c_white} ____ ${c_blue} _____ _ ${c_reset}
${c_white} .' _ `. ${c_blue} / ____| | | ${c_reset}
${c_white} / ${c_green}|\\${c_white}`-_${c_white} \\ ${c_blue} | (___ ___ _ __ __ | | __ ${c_reset}
${c_white} | ${c_green}| \\ ${c_white}`-${c_white}| ${c_blue} \\___ \\/__ \\| ´__/ _\\| |/ / ${c_reset}
${c_white} \\ ${c_green}| \\ ${c_white}/ ${c_blue} ____) | __ | | | __| < ${c_reset}
${c_white} `${c_green}|${c_white}____${c_green}\\${c_white}' ${c_blue} |_____/\\____|_| \\__/|_|\\_\\ ${c_reset}
${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset}
${c_dim}----------------------------------------------------${c_reset}
104 changes: 86 additions & 18 deletions conf/base.config
@@ -9,27 +9,95 @@
* run on the logged in environment.
*/

process {
params {
// Defaults only, expecting to be overwritten
cpus = 10
igenomes_base = 's3://ngi-igenomes/igenomes/'
markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 for details
max_cpus = 16 // Base specifications
max_memory = 128.GB // Base specifications
max_time = 240.h // Base specifications
singleCPUMem = 7.GB // for processes that are using more memory but a single CPU only. Use the 'core' queue for these
}

// TODO nf-core: Check the defaults for all processes
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
time = { check_max( 2.h * task.attempt, 'time' ) }
process {
cpus = {check_max(params.cpus * task.attempt)}
memory = {check_max(15.GB * task.attempt)}
time = {check_max(24.h * task.attempt)}
shell = ['/bin/bash', '-euo', 'pipefail']

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
errorStrategy = {task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish'}
maxErrors = '-1'
maxRetries = 1
maxRetries = 3

// Process-specific resource requirements
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
}
withLabel:singleCPUmem_x_task {
memory = {check_max(params.singleCPUMem * task.attempt)}
}
withLabel:singleCPUmem_x2_task {
memory = {check_max(params.singleCPUMem * task.attempt * task.attempt)}
}
withLabel:singleCPUmem_2x_task {
memory = {check_max(params.singleCPUMem * 2 * task.attempt)}
}
withLabel:max_cpus {
cpus = {params.max_cpus}
}
withLabel:max_memory {
memory = {check_max(params.max_memory)}
}

params {
// Defaults only, expecting to be overwritten
igenomes_base = 's3://ngi-igenomes/igenomes/'
markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 for details
max_cpus = 16
max_memory = 128.GB
max_time = 240.h
withName:BamQCmapped {
cpus = {check_max(16)}
}
withName:BamQCrecalibrated {
cpus = {check_max(16)}
}
withName:BaseRecalibrator {
cpus = {check_max(16)}
}
withName:ConcatVCF {
cpus = {check_max(8)}
// For unknown reasons, ConcatVCF sometimes fails with SIGPIPE
// (exit code 141). Rerunning the process will usually work.
errorStrategy = {task.exitStatus == 141 ? 'retry' : 'terminate'}
}
withName:FastQCBAM {
cpus = {check_max(2)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:FastQCFQ {
// FastQC is only capable of running one thread per fastq file.
cpus = {check_max(2)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:GatherBQSRReports {
cpus = {check_max(2)}
}
withName:MapReads {
memory = {check_max(check_max(60.GB * task.attempt))}
}
withName:MarkDuplicates {
cpus = {check_max(16)}
}
withName:MergeBamMapped {
cpus = {check_max(8)}
}
withName:MergeBamRecal {
cpus = {check_max(8)}
}
withName:MultiQC {
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:SamtoolsStats {
cpus = {check_max(2)}
}
withName:Snpeff {
container = {(params.annotation_cache && params.snpEff_cache) ? 'nfcore/sarek:dev' : "nfcore/sareksnpeff:dev.${params.genome}"}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
withName:VEP {
container = {(params.annotation_cache && params.vep_cache) ? 'nfcore/sarek:dev' : "nfcore/sarekvep:dev.${params.genome}"}
cpus = {check_max(4)}
errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'}
}
}
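
The simplified check_max() referenced throughout this file is defined in nextflow.config, which is not part of the rendered diff. A sketch of what a single-argument version might look like — dispatching on the value's type instead of taking the old second 'cpus'/'memory'/'time' argument; the actual implementation in the commit may differ:

def check_max(obj) {
    // Cap a requested resource at the configured maximum, based on its type
    try {
        if (obj.getClass() == nextflow.util.MemoryUnit)
            return obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1 ? params.max_memory as nextflow.util.MemoryUnit : obj
        else if (obj.getClass() == nextflow.util.Duration)
            return obj.compareTo(params.max_time as nextflow.util.Duration) == 1 ? params.max_time as nextflow.util.Duration : obj
        else if (obj instanceof Integer)
            return Math.min(obj, params.max_cpus as int)
        else
            return obj // unknown type: pass through unchanged
    } catch (all) {
        println "WARNING: could not check max for '${obj}', returning it unchanged"
        return obj
    }
}

Note also that the withLabel selectors above only take effect for processes that declare the matching label in the pipeline script, e.g. (hypothetical process, label taken from this config):

process Example {
    label 'max_cpus' // picks up cpus = {params.max_cpus} from conf/base.config

    script:
    """
    echo "running with ${task.cpus} cpus"
    """
}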
9 changes: 9 additions & 0 deletions docs/README.md
@@ -6,7 +6,16 @@ The nf-core/sarek documentation is split into the following files:
2. Pipeline configuration
* [Local installation](https://nf-co.re/usage/local_installation)
* [Adding your own system config](https://nf-co.re/usage/adding_own_config)
* [Install on a secure cluster](install_bianca.md)
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
* [Extra documentation on reference](reference.md)
3. [Running the pipeline](usage.md)
* [Examples](use_cases.md)
* [Input files documentation](input.md)
* [Extra documentation on variant calling](variantcalling.md)
* [Documentation about containers](containers.md)
* [Extra documentation for targeted sequencing](targetseq.md)
4. [Output and how to interpret the results](output.md)
* [Complementary information about ASCAT](ascat.md)
* [Extra documentation on annotation](annotation.md)
5. [Troubleshooting](https://nf-co.re/usage/troubleshooting)