nf-core · maxulysse · Jul 22, 2020 · Mar 4, 2020 · Mar 4, 2020 · Mar 4, 2020
@@ -0,0 +1,6 @@
+// Resource settings applied only to the UMIMapBamFile process (selected by name):
+// 2 CPUs and 8 GB of memory per task.
+process{
+  withName:UMIMapBamFile {
+    cpus = 2
+    memory = 8.GB
+  }
+}
@@ -33,6 +33,9 @@
   - [--no_intervals](#--no_intervals)
   - [--target_bed](#--target_bed)
   - [--targetBED](#--targetbed)
+  - [--umi](#--umi)
+  - [--read_structure1](#--read_structure1)
+  - [--read_structure2](#--read_structure2)
 - [Reference genomes](#reference-genomes)
   - [--genome (using iGenomes)](#--genome-using-igenomes)
   - [--ac_loci](#--ac_loci)
@@ -356,6 +359,23 @@ Use this to specify the target BED file for targeted or whole exome sequencing.
 > :warning: This parameter is deprecated -- it will be removed in a future release.
 > Please check: [`--target_bed`](#--target_bed)
 
+### --umi
+
+If provided, UMI steps will be run to extract and annotate the reads with UMIs and create consensus reads: this part of the pipeline uses *FGBIO* to convert the fastq files into an unmapped BAM, where reads are tagged with the UMIs extracted from the fastq sequences. In order to allow the correct tagging, the UMI sequence must be contained in the read sequence itself, and not in the FASTQ name.
+Following this step, the uBam is aligned and reads are then grouped based on mapping position and UMI tag.
+Finally, reads in the same groups are collapsed to create a consensus read. To create consensus, we have chosen to use the *adjacency method* [ref](https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/).
+In order for the correct tagging to be performed, a read structure needs to be specified as indicated below.
+
+### --read_structure1
+
+When reads contain UMIs, a structure for read 1 should be provided, to allow removal of the UMI sequence from the read and correct annotation of the BAM file. The read structure follows a format adopted by different tools, and is described [here](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures).
+
+
+### --read_structure2
+
+When reads contain UMIs, a structure for read 2 should be provided, to allow removal of the UMI sequence from the read and correct annotation of the BAM file. The read structure follows a format adopted by different tools, and is described [here](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures).
+
+
 ## Reference genomes
 
 The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.

@@ -30,4 +30,6 @@ dependencies:
   - bioconda::trim-galore=0.6.5
   - bioconda::vcfanno=0.3.2
   - bioconda::vcftools=0.1.16
-  - conda-forge::pigz=2.3.4 
+  - conda-forge::pigz=2.3.4
+  - bioconda::fgbio=1.1.0
+  - bioconda::samblaster=0.1.24
@@ -71,6 +71,9 @@ def helpMessage() {
       --pon_index              [file] Index of pon panel-of-normals VCF
       --ascat_ploidy            [int] Use this parameter together with to overwrite default behavior from ASCAT regarding ploidy. Note: Also requires that --ascat_purity is set.
       --ascat_purity            [int] Use this parameter to overwrite default behavior from ASCAT regarding purity. Note: Also requires that --ascat_ploidy is set.
+      --umi                       If provided, UMIs steps will be run to extract and annotate the reads with UMI and create consensus reads
+      --read_structure1           When reads contain UMIs a structure for read 1 should be provided, to allow removal of UMI sequence from the read. See: https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures
+      --read_structure2           When reads contain UMIs a structure for read 2 should be provided, to allow removal of UMI sequence from the read. See: https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures
 
     Trimming:
       --trim_fastq             [bool] Run Trim Galore
@@ -930,7 +933,10 @@ if (params.split_fastq){
 
 inputPairReads = inputPairReads.dump(tag:'INPUT')
 
-(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3)
+// Split the input reads into four copies; the fourth (inputPairReadsUMI) is the input of UMIFastqToBAM.
+// Only one of inputPairReads / inputPairReadsUMI stays open: with --umi the plain copy is closed,
+// otherwise the UMI copy is closed.
+(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC, inputPairReadsUMI) = inputPairReads.into(4)
+if(params.umi) inputPairReads.close()
+else inputPairReadsUMI.close()
+
 
 // STEP 0.5: QC ON READS
 
@@ -1037,14 +1043,171 @@ process TrimGalore {
    trimGaloreReport = Channel.empty()
 }
 
+
+
+
+/*
+================================================================================
+                            UMIs PROCESSING
+================================================================================
+*/
+
+// UMI - STEP 1 - ANNOTATE
+// the process needs to convert fastq to unmapped bam
+// and while doing the conversion, tag the bam field RX with the UMI sequence
+
+process UMIFastqToBAM {
+  // Converts one paired-end FASTQ pair into an unmapped BAM with fgbio FastqToBam,
+  // applying the two user-supplied read structures (--read_structure1 / --read_structure2).
+  // Input:  idPatient, idSample, idRun plus the R1/R2 FASTQ files, staged under fixed names.
+  // Output: idPatient, idSample, idRun plus "${idSample}_umi_converted.bam".
+
+  publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publishDirMode
+
+  input:
+  set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsUMI
+
+  output:
+  tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_converted.bam") into umi_converted_bams_ch
+
+  // Runs only when UMI processing is requested and both read structures are given.
+  when: params.umi && params.read_structure1 && params.read_structure2
+
+  // PWD is escaped (backslash before the dollar sign) so that bash expands it inside the task
+  // work directory, where tmpFolder is created. Unescaped, Nextflow interpolates it itself and
+  // it resolves to the directory the pipeline was launched from.
+  // The tmp folder for fgbio might be solved more elegantly?
+  script:
+  """
+  mkdir tmpFolder
+
+  fgbio --tmp-dir=\${PWD}/tmpFolder \
+  FastqToBam \
+  -i "${idSample}_${idRun}_R1.fastq.gz" "${idSample}_${idRun}_R2.fastq.gz" \
+  -o "${idSample}_umi_converted.bam" \
+  --read-structures $params.read_structure1 $params.read_structure2 \
+  --sample $idSample \
+  --library $idSample
+  """
+
+}
+
+
+// UMI - STEP 2 - MAP THE BAM FILE
+// this is necessary because the UMI groups are created based on
+// mapping position + same UMI tag
+
+
+
+process UMIMapBamFile {
+  // Aligns the UMI-annotated unmapped BAM so that reads can later be grouped by
+  // mapping position and UMI.
+  // Input:  idPatient, idSample, idRun and the converted BAM, plus the BWA index,
+  //         reference fasta and its .fai (bwaIndex and fastaFai are only staged into
+  //         the work directory; the script references ${fasta} alone).
+  // Output: idPatient, idSample, idRun plus "${idSample}_umi_unsorted.bam".
+
+  input:
+  set idPatient, idSample, idRun, file(convertedBam) from umi_converted_bams_ch
+  file(bwaIndex) from ch_bwa
+  file(fasta) from ch_fasta
+  file(fastaFai) from ch_fai
+
+  output:
+  tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi_unsorted.bam") into umi_aligned_bams_ch
+
+  // Runs only when UMI processing is requested and both read structures are given.
+  when: params.umi && params.read_structure1 && params.read_structure2
+
+  // Pipeline: BAM -> FASTQ on stdout (bam2fq, with -T RX to carry the RX tag along)
+  // -> bwa mem reading stdin ("-") -> BAM written without sorting.
+  // The read group ID and SM are both set to idSample.
+  // NOTE(review): -p / -C presumably mean "interleaved pairs" / "append FASTQ comment
+  // to the SAM record" -- confirm against the bwa manual.
+  script:
+  """
+  samtools bam2fq -T RX ${convertedBam} | \
+  bwa mem -p -t ${task.cpus} -C -M -R \"@RG\\tID:${idSample}\\tSM:${idSample}\\tPL:Illumina\" \
+  ${fasta} - | \
+  samtools view -bS - > ${idSample}_umi_unsorted.bam
+  """
+
+}
+
+
+// UMI - STEP 3 - GROUP READS BY UMIs
+// We have chosen the Adjacency method, following the nice paper and blog explanation integrated in both
+// UMItools and FGBIO
+// https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/
+// alternatively we can define this as input for the user to choose from
+
+process GroupReadsByUmi {
+  // Groups aligned reads by UMI with fgbio GroupReadsByUmi (Adjacency strategy).
+  // The aligned BAM is first streamed through samblaster (-M --addMateTags) and written
+  // to "${idSample}_unsorted_tagged.bam", which is the file fgbio then reads.
+  // Input:  idPatient, idSample, idRun and the aligned BAM.
+  // Output: the UMI histogram "${idSample}_umi_histogram.txt" (umi_histogram_ch) and
+  //         idPatient, idSample, idRun plus "${idSample}_umi-grouped.bam".
+
+  publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publishDirMode
+
+  input:
+  set idPatient, idSample, idRun, file(alignedBam) from umi_aligned_bams_ch
+
+  output:
+  file("${idSample}_umi_histogram.txt") into umi_histogram_ch
+  tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-grouped.bam") into umi_grouped_bams_ch
+
+  // Runs only when UMI processing is requested and both read structures are given.
+  when: params.umi && params.read_structure1 && params.read_structure2
+
+  // PWD is escaped (backslash before the dollar sign) so that bash expands it inside the task
+  // work directory, where tmpFolder is created. Unescaped, Nextflow interpolates it itself and
+  // it resolves to the directory the pipeline was launched from.
+  script:
+  """
+  mkdir tmpFolder
+
+  samtools view -h $alignedBam | \
+  samblaster -M --addMateTags | \
+  samtools view -Sb - >${idSample}_unsorted_tagged.bam
+
+  fgbio --tmp-dir=\${PWD}/tmpFolder \
+  GroupReadsByUmi \
+  -s Adjacency \
+  -i ${idSample}_unsorted_tagged.bam \
+  -o ${idSample}_umi-grouped.bam \
+  -f ${idSample}_umi_histogram.txt
+  """
+
+}
+
+// UMI - STEP 4 - CALL MOLECULAR CONSENSUS
+// Now that the reads are organised by UMI groups a molecular consensus will be created
+// the resulting bam file will be again unmapped and therefore can be fed into the
+// existing workflow from the step mapping
+
+process CallMolecularConsensusReads {
+  // Collapses each UMI group into a consensus read with fgbio CallMolecularConsensusReads.
+  // Input:  idPatient, idSample, idRun and the UMI-grouped BAM.
+  // Output: idPatient, idSample, idRun plus "${idSample}_umi-consensus.bam".
+
+  publishDir "${params.outdir}/Reports/${idSample}/UMI/${idSample}_${idRun}", mode: params.publishDirMode
+
+  input:
+  set idPatient, idSample, idRun, file(groupedBamFile) from umi_grouped_bams_ch
+
+  output:
+  tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_umi-consensus.bam") into consensus_bam_ch
+
+  // Runs only when UMI processing is requested and both read structures are given.
+  when: params.umi && params.read_structure1 && params.read_structure2
+
+  // PWD is escaped (backslash before the dollar sign) so that bash expands it inside the task
+  // work directory, where tmpFolder is created. Unescaped, Nextflow interpolates it itself and
+  // it resolves to the directory the pipeline was launched from.
+  // NOTE(review): -M 1 / -S Coordinate are presumably fgbio's --min-reads and --sort-order;
+  // confirm against the fgbio documentation for the pinned version.
+  script:
+  """
+  mkdir tmpFolder
+
+  fgbio --tmp-dir=\${PWD}/tmpFolder \
+  CallMolecularConsensusReads \
+  -i $groupedBamFile \
+  -o ${idSample}_umi-consensus.bam \
+  -M 1 -S Coordinate
+  """
+}
+
+// ################# END OF UMI READS PRE-PROCESSING
+// from this moment on the generated uBam files can feed into the existing tools
+
+
+
+
 // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM
 
-inputPairReads = outputPairReadsTrimGalore.mix(inputBam)
-inputPairReads = inputPairReads.dump(tag:'INPUT')
+// Choose what feeds the mapping step:
+//  - with --umi: the consensus BAMs produced by CallMolecularConsensusReads;
+//  - otherwise:  the TrimGalore output mixed with the input BAMs.
+// In both cases the stream is split in two (regular / Sentieon) and the copy that is
+// not needed for the selected mode is closed.
+if(params.umi){
+  (inputPairReads, inputPairReadsSentieon) = consensus_bam_ch.into(2)
+
+  // here the dump is attached after the split, so only the non-Sentieon copy is dumped
+  inputPairReads = inputPairReads.dump(tag:'INPUT')
+
+  if (params.sentieon) inputPairReads.close()
+  else inputPairReadsSentieon.close()
+}
+else {
+  inputPairReads = outputPairReadsTrimGalore.mix(inputBam)
+  inputPairReads = inputPairReads.dump(tag:'INPUT')
+
+  (inputPairReads, inputPairReadsSentieon) = inputPairReads.into(2)
+  if (params.sentieon) inputPairReads.close()
+  else inputPairReadsSentieon.close()
+}
 
-(inputPairReads, inputPairReadsSentieon) = inputPairReads.into(2)
-if (params.sentieon) inputPairReads.close()
-else inputPairReadsSentieon.close()
 
 process MapReads {
     label 'cpus_max'
@@ -2178,7 +2341,7 @@ process MergeMutect2Stats {
 
     when: 'mutect2' in tools
 
-    script:   
+    script:
                stats = statsFiles.collect{ "-stats ${it} " }.join(' ')
     """
     gatk --java-options "-Xmx${task.memory.toGiga()}g" \
@@ -2309,13 +2472,13 @@ process CalculateContamination {
 
     input:
         set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination
- 
+
      output:
         set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable
 
     when: 'mutect2' in tools
 
-    script:   
+    script:
              """
     # calculate contamination
     gatk --java-options "-Xmx${task.memory.toGiga()}g" \
@@ -2347,7 +2510,7 @@ process FilterMutect2Calls {
         file(germlineResource) from ch_germline_resource
         file(germlineResourceIndex) from ch_germline_resource_tbi
         file(intervals) from ch_intervals
-      
+
                   output:
         set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output
 
@@ -3758,4 +3921,4 @@ def reduceVCF(file) {
 // Validate a status value: only 0 and 1 are accepted.
 // Any other value triggers `exit 1` with a message pointing at the TSV file;
 // an accepted value is returned unchanged.
 def returnStatus(it) {
     if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information"
     return it
-}
+}