From da871df91ceca19ef5ace22365d453db33d71696 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 10:15:08 +0200 Subject: [PATCH 01/27] code polishing --- conf/modules.config | 52 ++++++------------- .../nf-core/annotation/ensemblvep/main.nf | 22 ++++---- .../nf-core/annotation/snpeff/main.nf | 20 +++---- 3 files changed, 36 insertions(+), 58 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a47053d427..e46565f160 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -953,10 +953,8 @@ process{ withName: 'VCFTOOLS_SUMMARY'{ ext.args = "--FILTER-summary" } -} // ANNOTATE -process { withName: 'ENSEMBLVEP' { ext.args = '--everything --filter_common --per_gene --total_length --offline' @@ -979,56 +977,36 @@ process { ] } - withName: 'ANNOTATION_BGZIPTABIX' { + withName: "NFCORE_SAREK:SAREK:ANNOTATE:.*:TABIX_BGZIPTABIX" { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" }, pattern: "*{gz,gz.tbi}" ] } -} -if (params.tools && (params.tools.contains('snpeff') || params.tools.contains('merge'))) { - process { - withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_SNPEFF:ANNOTATION_BGZIPTABIX' { - ext.prefix = {"${meta.id}_snpEff.ann.vcf"} - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" }, - pattern: "*{gz,gz.tbi}", - saveAs: { params.tools.contains('snpeff') ? it : null } - ] - } + withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_SNPEFF:TABIX_BGZIPTABIX' { + ext.prefix = {"${meta.id}_snpEff.ann.vcf"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" }, + pattern: "*{gz,gz.tbi}", + saveAs: { params.tools.contains('snpeff') ?
it : null } + ] } -} -if (params.tools && (params.tools.contains('vep'))) { - process { - withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_ENSEMBLVEP:ANNOTATION_BGZIPTABIX' { - ext.prefix = {"${meta.id}_VEP.ann.vcf"} - } + withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_ENSEMBLVEP:TABIX_BGZIPTABIX' { + ext.prefix = {"${meta.id}_VEP.ann.vcf"} } -} -if (params.tools && (params.tools.contains('merge'))) { - process { - withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_MERGE:ANNOTATION_BGZIPTABIX' { - ext.prefix = {"${meta.id}_snpEff_VEP.ann.vcf"} - } + withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_MERGE:TABIX_BGZIPTABIX' { + ext.prefix = {"${meta.id}_snpEff_VEP.ann.vcf"} } -} -process { + // MULTIQC + withName:'MULTIQC' { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} ext.args = { params.multiqc_config ? "--config $multiqc_custom_config" : "" } } } - -// process { - // withName: CUSTOM_DUMPSOFTWAREVERSIONS { - // publishDir = [ - // mode: params.publish_dir_mode, - // path: { "${params.outdir}/pipeline_info" }, - // pattern: '*_versions.yml' -// } diff --git a/subworkflows/nf-core/annotation/ensemblvep/main.nf b/subworkflows/nf-core/annotation/ensemblvep/main.nf index f1e326e8ba..8a7e0308bb 100644 --- a/subworkflows/nf-core/annotation/ensemblvep/main.nf +++ b/subworkflows/nf-core/annotation/ensemblvep/main.nf @@ -2,29 +2,29 @@ // Run VEP to annotate VCF files // -include { ENSEMBLVEP } from '../../../../modules/nf-core/modules/ensemblvep/main' -include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from '../../../../modules/nf-core/modules/tabix/bgziptabix/main' +include { ENSEMBLVEP } from '../../../../modules/nf-core/modules/ensemblvep/main' +include { TABIX_BGZIPTABIX } from '../../../../modules/nf-core/modules/tabix/bgziptabix/main' workflow ANNOTATION_ENSEMBLVEP { take: vcf // channel: [ val(meta), vcf ] - vep_genome // value: which genome - vep_species // value: which species - vep_cache_version // value: which cache version - vep_cache // 
path: path_to_vep_cache (optionnal) + vep_genome // value: genome to use + vep_species // value: species to use + vep_cache_version // value: cache version to use + vep_cache // path: /path/to/vep/cache (optionnal) main: ch_versions = Channel.empty() ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache) - ANNOTATION_BGZIPTABIX(ENSEMBLVEP.out.vcf) + TABIX_BGZIPTABIX(ENSEMBLVEP.out.vcf) // Gather versions of all tools used ch_versions = ch_versions.mix(ENSEMBLVEP.out.versions.first()) - ch_versions = ch_versions.mix(ANNOTATION_BGZIPTABIX.out.versions.first()) + ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions.first()) emit: - vcf_tbi = ANNOTATION_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] - reports = ENSEMBLVEP.out.report // path: *.html - versions = ch_versions // path: versions.yml + vcf_tbi = TABIX_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] + reports = ENSEMBLVEP.out.report // path: *.html + versions = ch_versions // path: versions.yml } diff --git a/subworkflows/nf-core/annotation/snpeff/main.nf b/subworkflows/nf-core/annotation/snpeff/main.nf index e5a020603e..54bfb9caa2 100644 --- a/subworkflows/nf-core/annotation/snpeff/main.nf +++ b/subworkflows/nf-core/annotation/snpeff/main.nf @@ -2,27 +2,27 @@ // Run SNPEFF to annotate VCF files // -include { SNPEFF } from '../../../../modules/nf-core/modules/snpeff/main' -include { TABIX_BGZIPTABIX as ANNOTATION_BGZIPTABIX } from '../../../../modules/nf-core/modules/tabix/bgziptabix/main' +include { SNPEFF } from '../../../../modules/nf-core/modules/snpeff/main' +include { TABIX_BGZIPTABIX } from '../../../../modules/nf-core/modules/tabix/bgziptabix/main' workflow ANNOTATION_SNPEFF { take: - vcf // channel: [ val(meta), vcf ] - snpeff_db // value: version of db to use - snpeff_cache // path: path_to_snpeff_cache (optionnal) + vcf // channel: [ val(meta), vcf ] + snpeff_db // value: db version to use + snpeff_cache // path: /path/to/snpeff/cache 
(optionnal) main: ch_versions = Channel.empty() SNPEFF(vcf, snpeff_db, snpeff_cache) - ANNOTATION_BGZIPTABIX(SNPEFF.out.vcf) + TABIX_BGZIPTABIX(SNPEFF.out.vcf) // Gather versions of all tools used ch_versions = ch_versions.mix(SNPEFF.out.versions.first()) - ch_versions = ch_versions.mix(ANNOTATION_BGZIPTABIX.out.versions.first()) + ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions.first()) emit: - vcf_tbi = ANNOTATION_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] - reports = SNPEFF.out.report // path: *.html - versions = ch_versions // path: versions.yml + vcf_tbi = TABIX_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] + reports = SNPEFF.out.report // path: *.html + versions = ch_versions // path: versions.yml } From 4148c1116714a43918c4ebfee018fccb37105612 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 13:54:45 +0200 Subject: [PATCH 02/27] add CADD + GeneSplicer modules --- bin/CADD.pm | 157 +++++++++++++++++++ bin/GeneSplicer.pm | 369 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 526 insertions(+) create mode 100644 bin/CADD.pm create mode 100644 bin/GeneSplicer.pm diff --git a/bin/CADD.pm b/bin/CADD.pm new file mode 100644 index 0000000000..8098384b9b --- /dev/null +++ b/bin/CADD.pm @@ -0,0 +1,157 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2021] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl + +=cut + +=head1 NAME + + CADD + +=head1 SYNOPSIS + + mv CADD.pm ~/.vep/Plugins + ./vep -i variations.vcf --plugin CADD,/FULL_PATH_TO_CADD_FILE/whole_genome_SNVs.tsv.gz,/FULL_PATH_TO_CADD_FILE/InDels.tsv.gz + +=head1 DESCRIPTION + + A VEP plugin that retrieves CADD scores for variants from one or more + tabix-indexed CADD data files. + + Please cite the CADD publication alongside the VEP if you use this resource: + https://www.ncbi.nlm.nih.gov/pubmed/24487276 + + The tabix utility must be installed in your path to use this plugin. The CADD + data files can be downloaded from + http://cadd.gs.washington.edu/download + + The plugin works with all versions of available CADD files. The plugin only + reports scores and does not consider any additional annotations from a CADD + file. It is therefore sufficient to use CADD files without the additional + annotations. 
+ +=cut + +package CADD; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); + +use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + $self->expand_left(0); + $self->expand_right(0); + + $self->get_user_params(); + + return $self; +} + +sub feature_types { + return ['Feature','Intergenic']; +} + +sub get_header_info { + my $self = shift; + return { + CADD_PHRED => 'PHRED-like scaled CADD score', + CADD_RAW => 'Raw CADD score' + } +} + +sub run { + my ($self, $tva) = @_; + + my $vf = $tva->variation_feature; + + # get allele + my $allele = $tva->variation_feature_seq; + + return {} unless $allele =~ /^[ACGT-]+$/; + + my @data = @{$self->get_data($vf->{chr}, $vf->{start} - 2, $vf->{end})}; + + foreach (@data) { + my $matches = get_matched_variant_alleles( + { + ref => $vf->ref_allele_string, + alts => [$allele], + pos => $vf->{start}, + strand => $vf->strand + }, + { + ref => $_->{ref}, + alts => [$_->{alt}], + pos => $_->{start}, + } + ); + return $_->{result} if (@$matches); + } + return {}; +} + +sub parse_data { + my ($self, $line) = @_; + my ($c, $s, $ref, $alt, $raw, $phred) = split /\t/, $line; + + # do VCF-like coord adjustment for mismatched subs + my $e = ($s + length($ref)) - 1; + if(length($alt) != length($ref)) { + my $first_ref = substr($ref, 0, 1); + my $first_alt = substr($alt, 0, 1); + if ($first_ref eq $first_alt) { + $s++; + $ref = substr($ref, 1); + $alt = substr($alt, 1); + $ref ||= '-'; + $alt ||= '-'; + } + } + return { + ref => $ref, + alt => $alt, + start => $s, + end => $e, + result => { + CADD_RAW => $raw, + CADD_PHRED => $phred + } + }; +} + +sub get_start { + return $_[1]->{start}; +} + +sub get_end { + return $_[1]->{end}; +} + +1; diff --git a/bin/GeneSplicer.pm b/bin/GeneSplicer.pm 
new file mode 100644 index 0000000000..3801a16cb9 --- /dev/null +++ b/bin/GeneSplicer.pm @@ -0,0 +1,369 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2021] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl + +=cut + +=head1 NAME + + GeneSplicer + +=head1 SYNOPSIS + + mv GeneSplicer.pm ~/.vep/Plugins + ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value] + +=head1 DESCRIPTION + + This is a plugin for the Ensembl Variant Effect Predictor (VEP) that + runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get + splice site predictions. + + It evaluates a tract of sequence either side of and including the + variant, both in reference and alternate states. The amount of + sequence included either side defaults to 100bp, but can be modified + by passing e.g. "context=50" as a parameter to the plugin. + + Any predicted splicing regions that overlap the variant are reported + in the output with one of four states: no_change, diff, gain, loss + + There follows a "/"-separated string consisting of the following data: + + 1) type (donor, acceptor) + 2) coordinates (start-end) + 3) confidence (Low, Medium, High) + 4) score + + Example: loss/acceptor/727006-727007/High/16.231924 + + If multiple sites are predicted, their reports are separated by ",". 
+ + For diff, the confidence and score for both the reference and alternate + sequences is reported as REF-ALT. + + Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368 + + Several parameters can be modified by passing them to the plugin string: + + context : change the amount of sequence added either side of + the variant (default: 100bp) + tmpdir : change the temporary directory used (default: /tmp) + cache_size : change how many sequences' scores are cached in memory + (default: 50) + + Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp + + On some systems the binaries provided will not execute, but can be compiled from source: + + cd $GS/sources + make + cd - + ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human + + On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification: + + cd $GS/sources + perl -pi -e "s/^main /int main /" genesplicer.cpp + make + + +=cut + +package GeneSplicer; + +use strict; +use warnings; + +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); + +use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +our %DEFAULTS = ( + context => 100, + tmpdir => '/tmp', + cache_size => 50, +); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + # we need sequence, so no offline mode unless we have FASTA + die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; + + my $params = $self->params; + + my $bin = shift @$params; + die("ERROR: genesplicer binary not specified\n") unless $bin; + die("ERROR: genesplicer binary not found\n") unless -e $bin; + my $test = `$bin 2>&1`; + die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/; + $self->{_bin} = $bin; + + my $training_dir 
= shift @$params; + die("ERROR: training directory not specified\n") unless $training_dir; + die("ERROR: training directory not found\n") unless -d $training_dir; + $self->{_training_dir} = $training_dir; + + # defaults + $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS; + + # REST API passes 1 as first param + shift @$params if $params->[0] && $params->[0] eq '1'; + + # set/override with user params + foreach my $param(@$params) { + my ($key, $val) = split('=', $param); + die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val); + + $self->{'_param_'.$key} = $val; + } + + return $self; +} + +sub feature_types { + return ['Transcript']; +} + +sub get_header_info { + return { + GeneSplicer => "GeneSplicer predictions" + }; +} + +sub run { + my ($self, $tva) = @_; + + my $vf = $tva->variation_feature; + + # get up and downstream sequences + my $up_seq = $vf->{slice}->sub_Slice( + $vf->{start} - $self->{'_param_context'}, + $vf->{start} - 1, + $vf->strand + )->seq; + + my $down_seq = $vf->{slice}->sub_Slice( + $vf->{end} + 1, + $vf->{end} + $self->{'_param_context'}, + $vf->strand + )->seq; + + # create ref seq by grabbing reference TVA + my $ref_seq = join("", + $up_seq, + $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq, + $down_seq + ); + + return {} unless $ref_seq =~ /^[ACGT]+$/; + + # create alt seq + my $alt_allele = $tva->variation_feature_seq; + $alt_allele =~ s/\-//g; + my $alt_seq = $up_seq.$alt_allele.$down_seq; + + + return {} unless $alt_seq =~ /^[ACGT]+$/; + + # reverse comp if strands differ + if($tva->transcript->strand != $vf->strand) { + reverse_comp(\$ref_seq); + reverse_comp(\$alt_seq); + } + + # get results + my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq); + my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq); + + # compare results both ways + my $diff_ref_to_alt = 
$self->compare_results($ref_results, $alt_results); + my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results); + + # get VF pos relative to tested sequence + my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1)); + + # get overlapping losses and gains + # and map to chromosome coords + my @losses = + map {$_->{gl} = 'loss'; $_} + @{$diff_ref_to_alt->{lost}}; + + my @gains = + map {$_->{gl} = 'gain'; $_} + @{$diff_alt_to_ref->{lost}}; + + my @diffs = + map {$_->{gl} = 'diff'; $_} + @{$diff_ref_to_alt->{diff}}; + + my $return = join(',', + map { + join('/', + $_->[0]->{gl}, + $_->[0]->{type}, + $_->[1]->{end5}.'-'.$_->[1]->{end3}, + $_->[0]->{confidence}, + $_->[0]->{score} + ) + } + map {[$_, $self->map_ss_coords($_, $vf)]} + grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} + (@losses, @gains, @diffs) + ); + + # probably of interest to report splice sites were found + # but no difference between ref and alt + if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) { + $return = join(',', + map { + join('/', + 'no_change', + $_->[0]->{type}, + $_->[1]->{end5}.'-'.$_->[1]->{end3}, + $_->[0]->{confidence}, + $_->[0]->{score} + ) + } + map {[$_, $self->map_ss_coords($_, $vf)]} + grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results + ); + } + + return $return ? 
{ GeneSplicer => $return } : {}; +} + +sub results_from_seq { + my $self = shift; + my $seq = shift; + + # write seqs to file + my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa"; + open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n"); + print SEQ ">SEQ\n$seq\n"; + close SEQ; + + my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results"; + + my $cmd = sprintf( + '%s %s %s -f %s', + $self->{'_bin'}, + $seq_file, + $self->{'_training_dir'}, + $result_file + ); + + my $output = `$cmd 2>&1`; + unlink($seq_file); + + return [] unless -e $result_file; + + open RES, $result_file; + my @results; + + while(<RES>) { + chomp; + my ($end5, $end3, $score, $confidence, $type) = split; + + push @results, { + end5 => $end5, + end3 => $end3, + score => $score, + confidence => $confidence, + type => $type + }; + } + close RES; + + unlink($result_file); + + push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results}; + shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size}; + + return \@results; +} + +sub results_from_cache { + my $self = shift; + my $seq = shift; + + my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []}; + + return $results; +} + +sub compare_results { + my $self = shift; + my $a = shift; + my $b = shift; + + my (@diff, @lost); + + foreach my $res_a(@$a) { + my @match = grep { + $_->{end5} == $res_a->{end5} && + $_->{end3} == $res_a->{end3} && + $_->{type} eq $res_a->{type} + } @$b; + + # result not found in b + if(!@match) { + push @lost, $res_a; + } + + # >1 result found + elsif(scalar @match > 1) { + warn("WARNING: Found two matches?\n"); + } + + # 1 match + elsif($match[0]->{score} != $res_a->{score}) { + my %diff = %$res_a; + $diff{score} .= '-'.$match[0]->{score}; + $diff{confidence} .= '-'.$match[0]->{confidence}; + push @diff, \%diff; + } + } + + return { diff => \@diff, lost => \@lost}; +} + +sub map_ss_coords { + my $self
= shift; + my $res = shift; + my $vf = shift; + + my $return = {}; + + foreach my $coord(qw(end5 end3)) { + $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1; + } + + return $return; +} + +1; + From 478fcf1a7fbf44c47b84bb17ed43de4fa41b04af Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:25:18 +0200 Subject: [PATCH 03/27] add CADD citation --- CITATIONS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 5cedcdd425..e5e3937fb8 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -71,8 +71,13 @@ > Danecek P, Auton A, Abecasis G, et al.: The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8. doi: 10.1093/bioinformatics/btr330. Epub 2011 Jun 7. PubMed PMID: 21653522; PubMed Central PMCID: PMC3137218. - [VEP](https://pubmed.ncbi.nlm.nih.gov/27268795/) + > McLaren W, Gil L, Hunt SE, et al.: The Ensembl Variant Effect Predictor. Genome Biol. 2016 Jun 6;17(1):122. doi: 10.1186/s13059-016-0974-4. PubMed PMID: 27268795; PubMed Central PMCID: PMC4893825. +- [CADD](https://pubmed.ncbi.nlm.nih.gov/24487276/) + + > Kircher M, et al.: A general framework for estimating the relative pathogenicity of human genetic variants. Nat Genet. 2014 Mar;46(3):310-5. doi: 10.1038/ng.2892. PubMed PMID: 24487276; PubMed Central PMCID: PMC3992975. + ## R packages - [R](https://www.R-project.org/) @@ -88,6 +93,7 @@ > Trevor L Davis (2018). optparse: Command Line Option Parser. - [RColorBrewer](https://CRAN.R-project.org/package=RColorBrewer) + > Erich Neuwirth (2014). RColorBrewer: ColorBrewer Palettes. ## Software packaging/containerisation tools @@ -107,4 +113,5 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. 
PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. From 19d06ad101c0c7dd17bb8f4a089a1affee9ef3a7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:35:10 +0200 Subject: [PATCH 04/27] update modules --- modules.json | 4 ++-- modules/nf-core/modules/ensemblvep/Dockerfile | 11 ++++++----- modules/nf-core/modules/ensemblvep/build.sh | 5 +++-- modules/nf-core/modules/ensemblvep/main.nf | 1 + modules/nf-core/modules/ensemblvep/meta.yml | 15 ++++----------- modules/nf-core/modules/snpeff/Dockerfile | 9 +++++---- modules/nf-core/modules/snpeff/build.sh | 5 +++-- modules/nf-core/modules/snpeff/meta.yml | 12 ------------ 8 files changed, 24 insertions(+), 38 deletions(-) diff --git a/modules.json b/modules.json index 651c22ec8a..1334e99aca 100644 --- a/modules.json +++ b/modules.json @@ -61,7 +61,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "ensemblvep": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "40dd662fd26c3eb3160b7c8cbbe9bff80bbe2c30" }, "fastqc": { "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" @@ -202,7 +202,7 @@ "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "snpeff": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "40dd662fd26c3eb3160b7c8cbbe9bff80bbe2c30" }, "strelka/germline": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" diff --git a/modules/nf-core/modules/ensemblvep/Dockerfile b/modules/nf-core/modules/ensemblvep/Dockerfile index ac1b469117..b4a1c66471 100644 --- a/modules/nf-core/modules/ensemblvep/Dockerfile +++ b/modules/nf-core/modules/ensemblvep/Dockerfile @@ -8,13 +8,14 @@ LABEL \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -# Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-vep-104.3/bin:$PATH - # Setup default ARG variables ARG GENOME=GRCh38 ARG SPECIES=homo_sapiens -ARG VEP_VERSION=99 +ARG VEP_VERSION=104 +ARG VEP_TAG=104.3 + +# Add conda 
installation dir to PATH (instead of doing 'conda activate') +ENV PATH /opt/conda/envs/nf-core-vep-${VEP_TAG}/bin:$PATH # Download Genome RUN vep_install \ @@ -27,4 +28,4 @@ RUN vep_install \ --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-vep-104.3 > nf-core-vep-104.3.yml +RUN conda env export --name nf-core-vep-${VEP_TAG} > nf-core-vep-${VEP_TAG}.yml diff --git a/modules/nf-core/modules/ensemblvep/build.sh b/modules/nf-core/modules/ensemblvep/build.sh index 5fcb91dfe7..650c8704e5 100644 --- a/modules/nf-core/modules/ensemblvep/build.sh +++ b/modules/nf-core/modules/ensemblvep/build.sh @@ -10,11 +10,12 @@ build_push() { VEP_TAG=$4 docker build \ + . \ -t nfcore/vep:${VEP_TAG}.${GENOME} \ - software/vep/. \ --build-arg GENOME=${GENOME} \ --build-arg SPECIES=${SPECIES} \ - --build-arg VEP_VERSION=${VEP_VERSION} + --build-arg VEP_VERSION=${VEP_VERSION} \ + --build-arg VEP_TAG=${VEP_TAG} docker push nfcore/vep:${VEP_TAG}.${GENOME} } diff --git a/modules/nf-core/modules/ensemblvep/main.nf b/modules/nf-core/modules/ensemblvep/main.nf index c2bd055fa2..a5a9b1abcc 100644 --- a/modules/nf-core/modules/ensemblvep/main.nf +++ b/modules/nf-core/modules/ensemblvep/main.nf @@ -13,6 +13,7 @@ process ENSEMBLVEP { val species val cache_version path cache + path extra_files output: tuple val(meta), path("*.ann.vcf"), emit: vcf diff --git a/modules/nf-core/modules/ensemblvep/meta.yml b/modules/nf-core/modules/ensemblvep/meta.yml index cd9c89054a..418bb970d9 100644 --- a/modules/nf-core/modules/ensemblvep/meta.yml +++ b/modules/nf-core/modules/ensemblvep/meta.yml @@ -10,17 +10,6 @@ tools: homepage: https://www.ensembl.org/info/docs/tools/vep/index.html documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html licence: ["Apache-2.0"] -params: - - use_cache: - type: boolean - description: | - Enable the usage of containers with cache - Does not work with conda - - 
vep_tag: - type: value - description: | - Specify the tag for the container - https://hub.docker.com/r/nfcore/vep/tags input: - meta: type: map @@ -47,6 +36,10 @@ input: type: file description: | path to VEP cache (optional) + - extra_files: + type: tuple + description: | + path to file(s) needed for plugins (optional) output: - vcf: type: file diff --git a/modules/nf-core/modules/snpeff/Dockerfile b/modules/nf-core/modules/snpeff/Dockerfile index 608716a4de..d0e347573c 100644 --- a/modules/nf-core/modules/snpeff/Dockerfile +++ b/modules/nf-core/modules/snpeff/Dockerfile @@ -8,15 +8,16 @@ LABEL \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -# Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-snpeff-5.0/bin:$PATH - # Setup default ARG variables ARG GENOME=GRCh38 ARG SNPEFF_CACHE_VERSION=99 +ARG SNPEFF_TAG=99 + +# Add conda installation dir to PATH (instead of doing 'conda activate') +ENV PATH /opt/conda/envs/nf-core-snpeff-${SNPEFF_TAG}/bin:$PATH # Download Genome RUN snpEff download -v ${GENOME}.${SNPEFF_CACHE_VERSION} # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-snpeff-5.0 > nf-core-snpeff-5.0.yml +RUN conda env export --name nf-core-snpeff-${SNPEFF_TAG} > nf-core-snpeff-${SNPEFF_TAG}.yml diff --git a/modules/nf-core/modules/snpeff/build.sh b/modules/nf-core/modules/snpeff/build.sh index b94ffd6905..2fccf9a8b4 100644 --- a/modules/nf-core/modules/snpeff/build.sh +++ b/modules/nf-core/modules/snpeff/build.sh @@ -9,10 +9,11 @@ build_push() { SNPEFF_TAG=$3 docker build \ + . \ -t nfcore/snpeff:${SNPEFF_TAG}.${GENOME} \ - software/snpeff/. 
\ --build-arg GENOME=${GENOME} \ - --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION} + --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION} \ + --build-arg SNPEFF_TAG=${SNPEFF_TAG} docker push nfcore/snpeff:${SNPEFF_TAG}.${GENOME} } diff --git a/modules/nf-core/modules/snpeff/meta.yml b/modules/nf-core/modules/snpeff/meta.yml index c191b9acd8..2f0d866eeb 100644 --- a/modules/nf-core/modules/snpeff/meta.yml +++ b/modules/nf-core/modules/snpeff/meta.yml @@ -10,18 +10,6 @@ tools: homepage: https://pcingola.github.io/SnpEff/ documentation: https://pcingola.github.io/SnpEff/se_introduction/ licence: ["MIT"] -params: - - use_cache: - type: boolean - description: | - boolean to enable the usage of containers with cache - Enable the usage of containers with cache - Does not work with conda - - snpeff_tag: - type: value - description: | - Specify the tag for the container - https://hub.docker.com/r/nfcore/snpeff/tags input: - meta: type: map From 7283ec5d2c1a98ea0eac05868aa6ca014a65ca8c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:36:22 +0200 Subject: [PATCH 05/27] update subworkflows and workflows --- subworkflows/local/annotate.nf | 5 +++-- subworkflows/nf-core/annotation/ensemblvep/main.nf | 3 ++- workflows/sarek.nf | 14 +++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index 28ea88d888..a9c386b011 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -16,6 +16,7 @@ workflow ANNOTATE { vep_species vep_cache_version vep_cache + vep_extra_files main: ch_reports = Channel.empty() @@ -32,7 +33,7 @@ workflow ANNOTATE { if (tools.contains('merge')) { vcf_ann_for_merge = ANNOTATION_SNPEFF.out.vcf_tbi.map{ meta, vcf, tbi -> [meta, vcf] } - ANNOTATION_MERGE(vcf_ann_for_merge, vep_genome, vep_species, vep_cache_version, vep_cache) + ANNOTATION_MERGE(vcf_ann_for_merge, vep_genome, vep_species, vep_cache_version, vep_cache, 
vep_extra_files) ch_reports = ch_reports.mix(ANNOTATION_MERGE.out.reports) ch_vcf_ann = ch_vcf_ann.mix(ANNOTATION_MERGE.out.vcf_tbi) @@ -40,7 +41,7 @@ workflow ANNOTATE { } if (tools.contains('vep')) { - ANNOTATION_ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache) + ANNOTATION_ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache, vep_extra_files) ch_reports = ch_reports.mix(ANNOTATION_ENSEMBLVEP.out.reports) ch_vcf_ann = ch_vcf_ann.mix(ANNOTATION_ENSEMBLVEP.out.vcf_tbi) diff --git a/subworkflows/nf-core/annotation/ensemblvep/main.nf b/subworkflows/nf-core/annotation/ensemblvep/main.nf index 8a7e0308bb..4c7d0e3e69 100644 --- a/subworkflows/nf-core/annotation/ensemblvep/main.nf +++ b/subworkflows/nf-core/annotation/ensemblvep/main.nf @@ -12,11 +12,12 @@ workflow ANNOTATION_ENSEMBLVEP { vep_species // value: species to use vep_cache_version // value: cache version to use vep_cache // path: /path/to/vep/cache (optionnal) + vep_extra_files // channel: [ file1, file2...] (optionnal) main: ch_versions = Channel.empty() - ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache) + ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache, vep_extra_files) TABIX_BGZIPTABIX(ENSEMBLVEP.out.vcf) // Gather versions of all tools used diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 755565c931..0b00be8b1d 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -106,6 +106,17 @@ snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_ //target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : [] vep_cache = params.vep_cache ? 
Channel.fromPath(params.vep_cache).collect() : [] +if (params.cadd_wg_snvs && params.cadd_wg_snvs_tbi && params.cadd_indels && params.cadd_indels_tbi) { + vep_extra_files = Channel.empty().mix( + Channel.fromPath(params.cadd_wg_snvs), + Channel.fromPath(params.cadd_wg_snvs_tbi), + Channel.fromPath(params.cadd_indels), + Channel.fromPath(params.cadd_indels_tbi) + ).collect() +} else { + vep_extra_files = [] +} + // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope umi_read_structure = params.umi_read_structure ? "${params.umi_read_structure} ${params.umi_read_structure}" : Channel.empty() @@ -756,7 +767,8 @@ workflow SAREK { vep_genome, vep_species, vep_cache_version, - vep_cache) + vep_cache, + vep_extra_files) // Gather used softwares versions ch_versions = ch_versions.mix(ANNOTATE.out.versions) From 7fbbba94ad9f2cdc2e719614d1471f0d87067236 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:37:07 +0200 Subject: [PATCH 06/27] add params.snpeff_genome --- conf/genomes.config | 1 + conf/igenomes.config | 4 ++++ conf/modules.config | 9 ++++++--- nextflow_schema.json | 6 ++++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/conf/genomes.config b/conf/genomes.config index 7317f16f0f..0d3dfc4a74 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -21,6 +21,7 @@ params { intervals = "${params.genomes_base}/small.intervals" known_indels = "${params.genomes_base}/Mills_1000G_gold_standard_and_1000G_phase1.indels.b37.small.vcf.gz" snpeff_db = 'GRCh37.75' + snpeff_genome = 'GRCh37' vep_genome = 'GRCh37' vep_species = 'homo_sapiens' vep_cache_version = '104' diff --git a/conf/igenomes.config b/conf/igenomes.config index df2ed7cd09..c79370b8f6 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -28,6 +28,7 @@ params { known_indels_tbi =
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem" snpeff_db = 'GRCh37.75' + snpeff_genome = 'GRCh37' vep_cache_version = '104' vep_genome = 'GRCh37' vep_species = 'homo_sapiens' @@ -51,6 +52,7 @@ params { known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem" snpeff_db = 'GRCh38.99' + snpeff_genome = 'GRCh38' vep_cache_version = '104' vep_genome = 'GRCh38' vep_species = 'homo_sapiens' @@ -78,6 +80,7 @@ params { mappability = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Control-FREEC/GRCm38_68_mm10.gem" readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" snpeff_db = 'GRCm38.99' + snpeff_genome = 'GRCm38' vep_cache_version = '102' vep_genome = 'GRCm38' vep_species = 'mus_musculus' @@ -101,6 +104,7 @@ params { bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" snpeff_db = 'WBcel235.99' + snpeff_genome = 'WBcel235' vep_cache_version = '104' vep_genome = 'WBcel235' vep_species = 'caenorhabditis_elegans' diff --git a/conf/modules.config b/conf/modules.config index e46565f160..5b33763310 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -957,8 +957,11 @@ process{ // ANNOTATE withName: 'ENSEMBLVEP' { - ext.args = '--everything --filter_common --per_gene --total_length --offline' - container = { "nfcore/vep:104.3.${params.genome}" } + ext.args = [ + '--everything --filter_common --per_gene --total_length --offline', + 
(params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? '--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz' : '', + ].join(' ').trim() + if (!params.vep_cache) container = { params.vep_genome ? "nfcore/vep:104.3.${params.vep_genome}" : "nfcore/vep:104.3.${params.genome}" } publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/reports/EnsemblVEP/${meta.id}/${meta.variantcaller}" }, @@ -968,7 +971,7 @@ process{ withName: 'SNPEFF' { ext.args = '-nodownload -canon -v' - container = { "nfcore/snpeff:5.0.${params.genome}" } + if (!params.snpeff_cache) container = { params.snpeff_genome ? "nfcore/snpeff:5.0.${params.snpeff_genome}" : "nfcore/snpeff:5.0.${params.genome}" } publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/reports/SnpEff/${meta.id}/${meta.variantcaller}" }, diff --git a/nextflow_schema.json b/nextflow_schema.json index d9b208744d..d65f37644c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -509,6 +509,12 @@ "fa_icon": "fas fa-database", "description": "snpEff DB version" }, + "snpeff_genome": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "snpeff genome", + "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." 
+ }, "vep_genome": { "type": "string", "fa_icon": "fas fa-microscope", From 11b69f2f1bb4cd631de4ba05efdbe26e5198c6b9 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:37:26 +0200 Subject: [PATCH 07/27] add params.snpeff_genome --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 7858f4c5d3..2d5788f04d 100644 --- a/main.nf +++ b/main.nf @@ -45,6 +45,7 @@ params.known_indels = WorkflowMain.getGenomeAttribute(params, 'known_in params.known_indels_tbi = WorkflowMain.getGenomeAttribute(params, 'known_indels_tbi') params.mappability = WorkflowMain.getGenomeAttribute(params, 'mappability') params.snpeff_db = WorkflowMain.getGenomeAttribute(params, 'snpeff_db') +params.snpeff_genome = WorkflowMain.getGenomeAttribute(params, 'snpeff_genome') params.vep_cache_version = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version') params.vep_genome = WorkflowMain.getGenomeAttribute(params, 'vep_genome') params.vep_species = WorkflowMain.getGenomeAttribute(params, 'vep_species') From 1d0edb2c5a6691265cf6380c3c8160c32b01c8c4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 14:50:44 +0200 Subject: [PATCH 08/27] code polish --- nextflow_schema.json | 136 +++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d65f37644c..873e367cc9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -17,7 +17,7 @@ "default": "mapping", "fa_icon": "fas fa-play", "description": "Starting step.", - "help_text": "Only one step", + "help_text": "Only one step.", "enum": [ "mapping", "prepare_recalibration", @@ -44,32 +44,32 @@ "fa_icon": "fas fa-folder-open" } }, - "help_text": "" + "help_text": ."" }, "main_options": { "title": "Main options", "type": "object", - "description": "Option used for most of the pipeline", + "description": "Option used for most of the pipeline.", "default": "", "properties": { "tools": { "type": 
"string", "fa_icon": "fas fa-toolbox", "description": "Tools to use for variant calling and/or for annotation.", - "help_text": "Multiple separated with commas.\n\nGermline variant calling can currently only be performed with the following variant callers:\n- FreeBayes, HaplotypeCaller, Manta, mpileup, Strelka, TIDDIT\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- ASCAT, Control-FREEC, FreeBayes, Manta, MSIsensorpro, Mutect2, Strelka\n\nTumor-only somatic variant calling can currently only be performed with the following variant callers:\n- Control-FREEC, Manta, mpileup, Mutect2, TIDDIT\n\nAnnotation is done using snpEff, VEP, or even both consecutively.\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted.\n\n\n\n`DNAseq`, `DNAscope` and `TNscope` are only available with `--sentieon`\n\n> **NB** tools can be specified with no concern for case.\n", + "help_text": "Multiple separated with commas.\n\nGermline variant calling can currently only be performed with the following variant callers:\n- FreeBayes, HaplotypeCaller, Manta, mpileup, Strelka, TIDDIT\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- ASCAT, Control-FREEC, FreeBayes, Manta, MSIsensorpro, Mutect2, Strelka\n\nTumor-only somatic variant calling can currently only be performed with the following variant callers:\n- Control-FREEC, Manta, mpileup, Mutect2, TIDDIT\n\nAnnotation is done using snpEff, VEP, or even both consecutively.\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted.\n\n\n\n`DNAseq`, `DNAscope` and `TNscope` are only available with `--sentieon`\n\n> **NB** tools can be specified with no concern for case.", "pattern": 
"^((ascat|cnvkit|controlfreec|deepvariant|dnascope|dnaseq|freebayes|haplotypecaller|manta|merge|mpileup|msisensorpro|mutect2|snpeff|strelka|tiddit|tnscope|vep)*,?)*$" }, "no_intervals": { "type": "boolean", "fa_icon": "fas fa-ban", "description": "Disable usage of intervals.", - "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling" + "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling." }, "nucleotides_per_second": { "type": "number", "fa_icon": "fas fa-clock", "description": "Estimate interval size.", - "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling", + "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling.", "default": 1000 }, "sentieon": { @@ -96,7 +96,7 @@ "wes": { "type": "boolean", "fa_icon": "fas fa-dna", - "description": "Enable when exome or panel data is provided" + "description": "Enable when exome or panel data is provided." 
} }, "fa_icon": "fas fa-user-cog" @@ -112,22 +112,22 @@ "type": "boolean", "fa_icon": "fas fa-cut", "description": "Run Trim Galore.", - "hidden": true, - "help_text": "Use this to perform adapter trimming with Trim Galore.\ncf [Trim Galore User Guide](https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md)" + "help_text": "Use this to perform adapter trimming with Trim Galore.\ncf [Trim Galore User Guide](https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md).", + "hidden": true }, "clip_r1": { "type": "integer", "default": 0, "fa_icon": "fas fa-cut", "description": "Remove bp from the 5' end of read 1.", - "help_text": "This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end.\n", + "help_text": "This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end.", "hidden": true }, "clip_r2": { "type": "integer", "default": 0, "description": "Remove bp from the 5' end of read 2.", - "help_text": "This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end.\n", + "help_text": "This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end.", "fa_icon": "fas fa-cut", "hidden": true }, @@ -136,7 +136,7 @@ "default": 0, "fa_icon": "fas fa-cut", "description": "Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed.", - "help_text": "This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality.\n", + "help_text": "This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality.", "hidden": true }, "three_prime_clip_r2": { @@ -144,7 +144,7 @@ "default": 0, "fa_icon": "fas fa-cut", "description": "Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been 
performed.", - "help_text": "This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality.\n", + "help_text": "This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality.", "hidden": true }, "trim_nextseq": { @@ -158,21 +158,21 @@ "save_trimmed": { "type": "boolean", "fa_icon": "fas fa-save", - "description": "Save trimmed FastQ file intermediates", + "description": "Save trimmed FastQ file intermediates.", "hidden": true }, "split_fastq": { "type": "integer", "default": 0, "fa_icon": "fas fa-cut", - "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn of splitting at all", - "help_text": "Use the the tools seqkit/split2 to split FASTQ file by number of reads", + "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn of splitting at all.", + "help_text": "Use the the tools seqkit/split2 to split FASTQ file by number of reads.", "hidden": true }, "save_split_fastqs": { "type": "boolean", "fa_icon": "fas fa-vial", - "description": "If set, publishes split fastq files. Intended for testing purposes.", + "description": "If set, publishes split FASTQ files. 
Intended for testing purposes.", "hidden": true }, "umi_read_structure": { @@ -200,20 +200,20 @@ "fa_icon": "fas fa-puzzle-piece", "enum": ["bwa-mem", "bwa-mem2", "dragmap"], "description": "Specify aligner to be used to map reads to reference genome.", - "help_text": "> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2` and `dragmap`.\n> Use `--bwa=false` to have `Sarek` build them automatically.\n\n> **WARNING** BWA-mem2 is in active development\n> Sarek might not be able to require the right amount of resources for it at the moment\n> We recommend to use pre-built indexes", + "help_text": "> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2` and `dragmap`.\n> Use `--bwa=false` to have `Sarek` build them automatically.\n\n> **WARNING** BWA-mem2 is in active development\n> Sarek might not be able to require the right amount of resources for it at the moment\n> We recommend to use pre-built indexes.", "hidden": true }, "use_gatk_spark": { "type": "string", "fa_icon": "fas fa-forward", - "description": "Tools for which to enable usage of GATK Spark implementation", + "description": "Tools for which to enable usage of GATK Spark implementation.", "help_text": "Multiple separated with commas.\n\n GATK4 BQSR tools are currently only available as Beta release. Use with caution!", "pattern": "^((baserecalibrator|markduplicates)*,?)*$" }, "save_bam_mapped": { "type": "boolean", "fa_icon": "fas fa-download", - "description": "Save Mapped BAMs" + "description": "Save Mapped BAMs." 
} } }, @@ -233,14 +233,14 @@ "type": "number", "fa_icon": "fas fa-bacon", "default": 2, - "hidden": true, - "description": "genome ploidy; In case of doubt, you can set different values and Control-FREEC will select the one that explains most observed CNAs" + "description": "genome ploidy; In case of doubt, you can set different values and Control-FREEC will select the one that explains most observed CNAs.", + "hidden": true }, "ascat_purity": { "type": "number", "fa_icon": "fas fa-wrench", - "description": "Overwrite ASCAT purity", - "help_text": "Requires that `--ascat_ploidy` is set" + "description": "Overwrite ASCAT purity.", + "help_text": "Requires that `--ascat_ploidy` is set." }, "cf_coeff": { "type": "number", @@ -267,54 +267,54 @@ "default": 0, "fa_icon": "fas fa-greater-than", "hidden": true, - "description": "Minimal sequencing quality for a position to be considered in BAF analysis" + "description": "Minimal sequencing quality for a position to be considered in BAF analysis." }, "cf_mincov": { "type": "number", "default": 0, "fa_icon": "fas fa-align-center", "hidden": true, - "description": "Minimal read coverage for a position to be considered in BAF analysis" + "description": "Minimal read coverage for a position to be considered in BAF analysis." }, "cf_window": { "type": "number", "fa_icon": "fas fa-wrench", - "description": "Overwrite Control-FREEC window size", - "help_text": "It is recommended to use a window size of 0 for exome data", + "description": "Overwrite Control-FREEC window size.", + "help_text": "It is recommended to use a window size of 0 for exome data.", "hidden": true }, "joint_germline": { "type": "boolean", "fa_icon": "fas fa-align-justify", - "description": "Enables GATK4 joint germline variant calling, if also haplotypecaller is selected" + "description": "Enables GATK4 joint germline variant calling, if also haplotypecaller is selected." 
}, "generate_gvcf": { "type": "boolean", "fa_icon": "fas fa-copy", - "description": "Generate g.vcf output from GATK HaplotypeCaller" + "description": "Generate g.vcf output from GATK HaplotypeCaller." }, "no_strelka_bp": { "type": "boolean", "fa_icon": "fas fa-ban", - "description": "Will not use Manta candidateSmallIndels for Strelka", - "help_text": "Not recommended by Best Practices" + "description": "Will not use Manta candidateSmallIndels for Strelka.", + "help_text": "Not recommended by Best Practice.s" }, "pon": { "type": "string", "fa_icon": "fas fa-file", - "description": "Panel-of-normals VCF (bgzipped) for GATK Mutect2 / Sentieon TNscope", + "description": "Panel-of-normals VCF (bgzipped) for GATK Mutect2 / Sentieon TNscope.", "help_text": "Without PON, there will be no calls with PASS in the INFO field, only an unfiltered VCF is written.\nIt is recommended to make your own PON, as it depends on sequencer and library preparation.\nFor tests in iGenomes there is a dummy PON file in the Annotation/GermlineResource directory, but it should not be used as a real PON file.\n\nSee [PON documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360042479112-CreateSomaticPanelOfNormals-BETA)\n> **NB** PON file should be bgzipped." }, "pon_tbi": { "type": "string", "fa_icon": "fas fa-file", - "description": "Index of PON panel-of-normals VCF", + "description": "Index of PON panel-of-normals VCF.", "help_text": "If none provided, will be generated automatically from the PON bgzipped VCF file." }, "ignore_soft_clipped_bases": { "type": "boolean", "fa_icon": "fas fa-ban", - "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2", + "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2.", "help_text": "use the `--dont-use-soft-clipped-bases` params with GATK." 
} } @@ -329,22 +329,22 @@ "annotate_tools": { "type": "string", "fa_icon": "fas fa-hammer", - "description": "Specify from which tools Sarek should look for VCF files to annotate", - "help_text": "Only for step `annotate`", + "description": "Specify from which tools Sarek should look for VCF files to annotate.", + "help_text": "Only for step `annotate`.", "pattern": "^((haplotypecaller|manta|mutect2|strelka|tiddit)*(,)*)*$", "hidden": true }, "annotation_cache": { "type": "boolean", "fa_icon": "fas fa-database", - "description": "Enable the use of cache for annotation", - "help_text": "And disable usage of Sarek snpeff and vep specific containers for annotation\n\nTo be used with `--snpeff_cache` and/or `--vep_cache`", + "description": "Enable the use of cache for annotation.", + "help_text": "And disable usage of Sarek snpeff and vep specific containers for annotation\n\nTo be used with `--snpeff_cache` and/or `--vep_cache`.", "hidden": true }, - "cadd_cache": { + "cadd": { "type": "boolean", "fa_icon": "fas fa-database", - "description": "Enable CADD cache.", + "description": "Enable the use of the VEP CADD plugin.", "hidden": true }, "cadd_indels": { @@ -380,15 +380,15 @@ "snpeff_cache": { "type": "string", "fa_icon": "fas fa-database", - "description": "Path to snpEff cache", - "help_text": "To be used with `--annotation_cache`", + "description": "Path to snpEff cache.", + "help_text": "To be used with `--annotation_cache`.", "hidden": true }, "vep_cache": { "type": "string", "fa_icon": "fas fa-database", - "description": "Path to VEP cache", - "help_text": "To be used with `--annotation_cache`", + "description": "Path to VEP cache.", + "help_text": "To be used with `--annotation_cache`.", "hidden": true } } @@ -403,7 +403,7 @@ "type": "string", "description": "Name of iGenomes reference.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. 
This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.\n" + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, "ac_loci": { "type": "string", @@ -467,75 +467,75 @@ "fasta_fai": { "type": "string", "fa_icon": "fas fa-file", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference", + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference.", "description": "Path to FASTA reference index." }, "germline_resource": { "type": "string", "fa_icon": "fas fa-file", - "description": "Path to GATK Mutect2 Germline Resource File", + "description": "Path to GATK Mutect2 Germline Resource File.", "help_text": "The germline resource VCF file (bgzipped and tabixed) needed by GATK4 Mutect2 is a collection of calls that are likely present in the sample, with allele frequencies.\nThe AF info field must be present.\nYou can find a smaller, stripped gnomAD VCF file (most of the annotation is removed and only calls signed by PASS are stored) in the AWS iGenomes Annotation/GermlineResource folder." }, "germline_resource_tbi": { "type": "string", "fa_icon": "fas fa-file", - "description": "Path to GATK Mutect2 Germline Resource Index", - "help_text": "> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided" + "description": "Path to GATK Mutect2 Germline Resource Index.", + "help_text": "> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided." 
}, "intervals": { "type": "string", "fa_icon": "fas fa-file-alt", - "help_text": "To speed up some preprocessing and variant calling processes, the reference is chopped into smaller pieces.\nThe intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\nParts of preprocessing and variant calling are done by these intervals, and the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 1000 nucleotides per second is assumed.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to 
disable automatic generation", - "description": "Path to intervals file" + "help_text": "To speed up some preprocessing and variant calling processes, the reference is chopped into smaller pieces.\nThe intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\nParts of preprocessing and variant calling are done by these intervals, and the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 1000 nucleotides per second is assumed.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use 
--no_intervals to disable automatic generation.", + "description": "Path to intervals file." }, "known_indels": { "type": "string", "fa_icon": "fas fa-copy", - "description": "Path to known indels file" + "description": "Path to known indels file." }, "known_indels_tbi": { "type": "string", "fa_icon": "fas fa-copy", - "description": "Path to known indels file index", - "help_text": "> **NB** If none provided, will be generated automatically from the known index file, if provided" + "description": "Path to known indels file index.", + "help_text": "> **NB** If none provided, will be generated automatically from the known index file, if provided." }, "mappability": { "type": "string", "fa_icon": "fas fa-file", - "description": "Path to Control-FREEC mappability file" + "description": "Path to Control-FREEC mappability file." }, "snpeff_db": { "type": "string", "fa_icon": "fas fa-database", - "description": "snpEff DB version" + "description": "snpEff DB version." }, "snpeff_genome": { "type": "string", "fa_icon": "fas fa-microscope", - "description": "snpeff genome", + "description": "snpeff genome.", "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." }, "vep_genome": { "type": "string", "fa_icon": "fas fa-microscope", - "description": "VEP genome", + "description": "VEP genome.", "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." }, "vep_species": { "type": "string", "fa_icon": "fas fa-microscope", - "description": "VEP species", + "description": "VEP species.", "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." }, "vep_cache_version": { "type": "string", "fa_icon": "fas fa-tag", - "description": "VEP cache version" + "description": "VEP cache version." 
}, "save_reference": { "type": "boolean", "fa_icon": "fas fa-download", - "description": "Save built references" + "description": "Save built references." }, "igenomes_base": { "type": "string", @@ -548,7 +548,7 @@ "type": "string", "fa_icon": "fas fa-map-marker-alt", "description": "Directory / URL base for genomes references.", - "help_text": "All files are supposed to be in the same folder" + "help_text": "All files are supposed to be in the same folder." }, "igenomes_ignore": { "type": "boolean", @@ -557,7 +557,7 @@ "help_text": "Do not load `igenomes.config` when running the pipeline.\nYou may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.\nThis option will load the `genomes.config` file instead.\n\n> **NB** You can then specify the genome custom and specify at least a FASTA genome file." } }, - "help_text": "The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.\nThe configuration is set up to use the AWS-iGenomes resource\ncf https://ewels.github.io/AWS-iGenomes/\n" + "help_text": "The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.\nThe configuration is set up to use the AWS-iGenomes resource\ncf https://ewels.github.io/AWS-iGenomes/." }, "institutional_config_options": { "title": "Institutional config options", @@ -616,7 +616,7 @@ "fa_icon": "fas fa-university", "default": "ILLUMINA", "description": "Sequencing platform information to be added to read group (PL field).", - "help_text": "Default: ILLUMINA. Will be used to create a proper header for further GATK4 downstream analysis", + "help_text": "Default: ILLUMINA. Will be used to create a proper header for further GATK4 downstream analysis.", "hidden": true } } @@ -634,7 +634,7 @@ "default": 16, "fa_icon": "fas fa-microchip", "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. 
`--max_cpus 1`" + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`." }, "max_memory": { "type": "string", @@ -643,7 +643,7 @@ "fa_icon": "fas fa-memory", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`." }, "max_time": { "type": "string", @@ -652,7 +652,7 @@ "fa_icon": "far fa-clock", "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`." 
} } }, From 38977580bb143527f5a666cfd5557380e41eb0ff Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 15:10:21 +0200 Subject: [PATCH 09/27] fix typo --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 873e367cc9..1260a79b29 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -44,7 +44,7 @@ "fa_icon": "fas fa-folder-open" } }, - "help_text": ."" + "help_text": "" }, "main_options": { "title": "Main options", From acd8eb93fcb209ebc70d624756cb479a8815d1f1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 5 May 2022 16:47:50 +0200 Subject: [PATCH 10/27] code polish --- conf/modules.config | 2 +- conf/test.config | 11 ++--------- nextflow.config | 4 ++-- workflows/sarek.nf | 3 --- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5b33763310..bb454298b7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -959,7 +959,7 @@ process{ withName: 'ENSEMBLVEP' { ext.args = [ '--everything --filter_common --per_gene --total_length --offline', - (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? '--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz' : '', + (params.cadd && params.cadd_wg_snvs && params.cadd_indels) ? '--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz' : '', ].join(' ').trim() if (!params.vep_cache) container = { params.vep_genome ? 
"nfcore/vep:104.3.${params.vep_genome}" : "nfcore/vep:104.3.${params.genome}" } publishDir = [ diff --git a/conf/test.config b/conf/test.config index 43010f4096..eadfd04b1f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -37,6 +37,8 @@ params { snpeff_db = 'WBcel235.99' vep_species = 'caenorhabditis_elegans' vep_cache_version = '104' + snpeff_genome = 'WBcel235' + vep_genome = 'WBcel235' // Ignore `--input` as otherwise the parameter validation will throw an error schema_ignore_params = 'genomes,input' @@ -44,7 +46,6 @@ params { profiles { annotation { - params.genome = 'WBcel235' params.igenomes_ignore = false params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/tiny-vcf-https.csv' params.step = 'annotate' @@ -83,8 +84,6 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true - params.genome = 'WBcel235' - params.vep_genome = 'WBcel235' //params.vep_cache = } tools_germline { @@ -95,8 +94,6 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true - params.genome = 'WBcel235' - params.vep_genome = 'WBcel235' } tools_tumoronly { params.input = "${baseDir}/tests/csv/3.0/recalibrated_tumoronly.csv" @@ -108,8 +105,6 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true - params.genome = 'WBcel235' - params.vep_genome = 'WBcel235' } tools_somatic { params.input = "${baseDir}/tests/csv/3.0/recalibrated_somatic.csv" @@ -121,8 +116,6 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true - params.genome = 'WBcel235' - params.vep_genome = 'WBcel235' params.chr_dir = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" } trimming { diff --git a/nextflow.config b/nextflow.config index 32c35ad057..86bcc24c12 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,12 +72,12 @@ params { // Annotation annotate_tools = null // Only with --step 
annotate annotation_cache = false // Annotation cache disabled - cadd_cache = null // CADD cache disabled + cadd = null // CADD plugin disabled within VEP cadd_indels = null // No CADD InDels file cadd_indels_tbi = null // No CADD InDels index cadd_wg_snvs = null // No CADD SNVs file cadd_wg_snvs_tbi = null // No CADD SNVs index - genesplicer = null // genesplicer disabled within VEP + genesplicer = null // genesplicer plugin disabled within VEP snpeff_cache = null // No directory for snpEff cache vep_cache = null // No directory for VEP cache diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 0b00be8b1d..26482279f8 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -858,9 +858,6 @@ def extract_csv(csv_file) { def read_group = "\"@RG\\tID:${row.lane}\\t${CN}PU:${row.lane}\\tSM:${row.sample}\\tLB:${row.sample}\\tPL:${params.seq_platform}\"" meta.numLanes = numLanes.toInteger() meta.read_group = read_group.toString() - - println read_group - meta.data_type = "fastq" meta.size = 1 // default number of splitted fastq return [meta, [fastq_1, fastq_2]] From 3cda531e7273b05e2aa86526e7115848824f3903 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 14:42:28 +0200 Subject: [PATCH 11/27] removed un-used files --- bin/CADD.pm | 157 ------------------- bin/GeneSplicer.pm | 369 -------------------------------------------- conf/genomes.config | 37 ----- 3 files changed, 563 deletions(-) delete mode 100644 bin/CADD.pm delete mode 100644 bin/GeneSplicer.pm delete mode 100644 conf/genomes.config diff --git a/bin/CADD.pm b/bin/CADD.pm deleted file mode 100644 index 8098384b9b..0000000000 --- a/bin/CADD.pm +++ /dev/null @@ -1,157 +0,0 @@ -=head1 LICENSE - -Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute -Copyright [2016-2021] EMBL-European Bioinformatics Institute - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -=head1 CONTACT - - Ensembl - -=cut - -=head1 NAME - - CADD - -=head1 SYNOPSIS - - mv CADD.pm ~/.vep/Plugins - ./vep -i variations.vcf --plugin CADD,/FULL_PATH_TO_CADD_FILE/whole_genome_SNVs.tsv.gz,/FULL_PATH_TO_CADD_FILE/InDels.tsv.gz - -=head1 DESCRIPTION - - A VEP plugin that retrieves CADD scores for variants from one or more - tabix-indexed CADD data files. - - Please cite the CADD publication alongside the VEP if you use this resource: - https://www.ncbi.nlm.nih.gov/pubmed/24487276 - - The tabix utility must be installed in your path to use this plugin. The CADD - data files can be downloaded from - http://cadd.gs.washington.edu/download - - The plugin works with all versions of available CADD files. The plugin only - reports scores and does not consider any additional annotations from a CADD - file. It is therefore sufficient to use CADD files without the additional - annotations. 
- -=cut - -package CADD; - -use strict; -use warnings; - -use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); -use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); - -use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; - -use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); - -sub new { - my $class = shift; - - my $self = $class->SUPER::new(@_); - - $self->expand_left(0); - $self->expand_right(0); - - $self->get_user_params(); - - return $self; -} - -sub feature_types { - return ['Feature','Intergenic']; -} - -sub get_header_info { - my $self = shift; - return { - CADD_PHRED => 'PHRED-like scaled CADD score', - CADD_RAW => 'Raw CADD score' - } -} - -sub run { - my ($self, $tva) = @_; - - my $vf = $tva->variation_feature; - - # get allele - my $allele = $tva->variation_feature_seq; - - return {} unless $allele =~ /^[ACGT-]+$/; - - my @data = @{$self->get_data($vf->{chr}, $vf->{start} - 2, $vf->{end})}; - - foreach (@data) { - my $matches = get_matched_variant_alleles( - { - ref => $vf->ref_allele_string, - alts => [$allele], - pos => $vf->{start}, - strand => $vf->strand - }, - { - ref => $_->{ref}, - alts => [$_->{alt}], - pos => $_->{start}, - } - ); - return $_->{result} if (@$matches); - } - return {}; -} - -sub parse_data { - my ($self, $line) = @_; - my ($c, $s, $ref, $alt, $raw, $phred) = split /\t/, $line; - - # do VCF-like coord adjustment for mismatched subs - my $e = ($s + length($ref)) - 1; - if(length($alt) != length($ref)) { - my $first_ref = substr($ref, 0, 1); - my $first_alt = substr($alt, 0, 1); - if ($first_ref eq $first_alt) { - $s++; - $ref = substr($ref, 1); - $alt = substr($alt, 1); - $ref ||= '-'; - $alt ||= '-'; - } - } - return { - ref => $ref, - alt => $alt, - start => $s, - end => $e, - result => { - CADD_RAW => $raw, - CADD_PHRED => $phred - } - }; -} - -sub get_start { - return $_[1]->{start}; -} - -sub get_end { - return $_[1]->{end}; -} - -1; diff --git a/bin/GeneSplicer.pm b/bin/GeneSplicer.pm 
deleted file mode 100644 index 3801a16cb9..0000000000 --- a/bin/GeneSplicer.pm +++ /dev/null @@ -1,369 +0,0 @@ -=head1 LICENSE - -Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute -Copyright [2016-2021] EMBL-European Bioinformatics Institute - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -=head1 CONTACT - - Ensembl - -=cut - -=head1 NAME - - GeneSplicer - -=head1 SYNOPSIS - - mv GeneSplicer.pm ~/.vep/Plugins - ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value] - -=head1 DESCRIPTION - - This is a plugin for the Ensembl Variant Effect Predictor (VEP) that - runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get - splice site predictions. - - It evaluates a tract of sequence either side of and including the - variant, both in reference and alternate states. The amount of - sequence included either side defaults to 100bp, but can be modified - by passing e.g. "context=50" as a parameter to the plugin. - - Any predicted splicing regions that overlap the variant are reported - in the output with one of four states: no_change, diff, gain, loss - - There follows a "/"-separated string consisting of the following data: - - 1) type (donor, acceptor) - 2) coordinates (start-end) - 3) confidence (Low, Medium, High) - 4) score - - Example: loss/acceptor/727006-727007/High/16.231924 - - If multiple sites are predicted, their reports are separated by ",". 
- - For diff, the confidence and score for both the reference and alternate - sequences is reported as REF-ALT. - - Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368 - - Several parameters can be modified by passing them to the plugin string: - - context : change the amount of sequence added either side of - the variant (default: 100bp) - tmpdir : change the temporary directory used (default: /tmp) - cache_size : change how many sequences' scores are cached in memory - (default: 50) - - Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp - - On some systems the binaries provided will not execute, but can be compiled from source: - - cd $GS/sources - make - cd - - ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human - - On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification: - - cd $GS/sources - perl -pi -e "s/^main /int main /" genesplicer.cpp - make - - -=cut - -package GeneSplicer; - -use strict; -use warnings; - -use Digest::MD5 qw(md5_hex); - -use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); -use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); - -use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; -use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); - -our %DEFAULTS = ( - context => 100, - tmpdir => '/tmp', - cache_size => 50, -); - -sub new { - my $class = shift; - - my $self = $class->SUPER::new(@_); - - # we need sequence, so no offline mode unless we have FASTA - die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; - - my $params = $self->params; - - my $bin = shift @$params; - die("ERROR: genesplicer binary not specified\n") unless $bin; - die("ERROR: genesplicer binary not found\n") unless -e $bin; - my $test = `$bin 2>&1`; - die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/; - $self->{_bin} = $bin; - - my $training_dir 
= shift @$params; - die("ERROR: training directory not specified\n") unless $training_dir; - die("ERROR: training directory not found\n") unless -d $training_dir; - $self->{_training_dir} = $training_dir; - - # defaults - $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS; - - # REST API passes 1 as first param - shift @$params if $params->[0] && $params->[0] eq '1'; - - # set/override with user params - foreach my $param(@$params) { - my ($key, $val) = split('=', $param); - die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val); - - $self->{'_param_'.$key} = $val; - } - - return $self; -} - -sub feature_types { - return ['Transcript']; -} - -sub get_header_info { - return { - GeneSplicer => "GeneSplicer predictions" - }; -} - -sub run { - my ($self, $tva) = @_; - - my $vf = $tva->variation_feature; - - # get up and downstream sequences - my $up_seq = $vf->{slice}->sub_Slice( - $vf->{start} - $self->{'_param_context'}, - $vf->{start} - 1, - $vf->strand - )->seq; - - my $down_seq = $vf->{slice}->sub_Slice( - $vf->{end} + 1, - $vf->{end} + $self->{'_param_context'}, - $vf->strand - )->seq; - - # create ref seq by grabbing reference TVA - my $ref_seq = join("", - $up_seq, - $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq, - $down_seq - ); - - return {} unless $ref_seq =~ /^[ACGT]+$/; - - # create alt seq - my $alt_allele = $tva->variation_feature_seq; - $alt_allele =~ s/\-//g; - my $alt_seq = $up_seq.$alt_allele.$down_seq; - - - return {} unless $alt_seq =~ /^[ACGT]+$/; - - # reverse comp if strands differ - if($tva->transcript->strand != $vf->strand) { - reverse_comp(\$ref_seq); - reverse_comp(\$alt_seq); - } - - # get results - my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq); - my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq); - - # compare results both ways - my $diff_ref_to_alt = 
$self->compare_results($ref_results, $alt_results); - my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results); - - # get VF pos relative to tested sequence - my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1)); - - # get overlapping losses and gains - # and map to chromosome coords - my @losses = - map {$_->{gl} = 'loss'; $_} - @{$diff_ref_to_alt->{lost}}; - - my @gains = - map {$_->{gl} = 'gain'; $_} - @{$diff_alt_to_ref->{lost}}; - - my @diffs = - map {$_->{gl} = 'diff'; $_} - @{$diff_ref_to_alt->{diff}}; - - my $return = join(',', - map { - join('/', - $_->[0]->{gl}, - $_->[0]->{type}, - $_->[1]->{end5}.'-'.$_->[1]->{end3}, - $_->[0]->{confidence}, - $_->[0]->{score} - ) - } - map {[$_, $self->map_ss_coords($_, $vf)]} - grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} - (@losses, @gains, @diffs) - ); - - # probably of interest to report splice sites were found - # but no difference between ref and alt - if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) { - $return = join(',', - map { - join('/', - 'no_change', - $_->[0]->{type}, - $_->[1]->{end5}.'-'.$_->[1]->{end3}, - $_->[0]->{confidence}, - $_->[0]->{score} - ) - } - map {[$_, $self->map_ss_coords($_, $vf)]} - grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results - ); - } - - return $return ? 
{ GeneSplicer => $return } : {}; -} - -sub results_from_seq { - my $self = shift; - my $seq = shift; - - # write seqs to file - my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa"; - open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n"); - print SEQ ">SEQ\n$seq\n"; - close SEQ; - - my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results"; - - my $cmd = sprintf( - '%s %s %s -f %s', - $self->{'_bin'}, - $seq_file, - $self->{'_training_dir'}, - $result_file - ); - - my $output = `$cmd 2>&1`; - unlink($seq_file); - - return [] unless -e $result_file; - - open RES, $result_file; - my @results; - - while() { - chomp; - my ($end5, $end3, $score, $confidence, $type) = split; - - push @results, { - end5 => $end5, - end3 => $end3, - score => $score, - confidence => $confidence, - type => $type - }; - } - close RES; - - unlink($result_file); - - push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results}; - shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size}; - - return \@results; -} - -sub results_from_cache { - my $self = shift; - my $seq = shift; - - my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []}; - - return $results; -} - -sub compare_results { - my $self = shift; - my $a = shift; - my $b = shift; - - my (@diff, @lost); - - foreach my $res_a(@$a) { - my @match = grep { - $_->{end5} == $res_a->{end5} && - $_->{end3} == $res_a->{end3} && - $_->{type} eq $res_a->{type} - } @$b; - - # result not found in b - if(!@match) { - push @lost, $res_a; - } - - # >1 result found - elsif(scalar @match > 1) { - warn("WARNING: Found two matches?\n"); - } - - # 1 match - elsif($match[0]->{score} != $res_a->{score}) { - my %diff = %$res_a; - $diff{score} .= '-'.$match[0]->{score}; - $diff{confidence} .= '-'.$match[0]->{confidence}; - push @diff, \%diff; - } - } - - return { diff => \@diff, lost => \@lost}; -} - -sub map_ss_coords { - my $self 
= shift; - my $res = shift; - my $vf = shift; - - my $return = {}; - - foreach my $coord(qw(end5 end3)) { - $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1; - } - - return $return; -} - -1; - diff --git a/conf/genomes.config b/conf/genomes.config deleted file mode 100644 index 0d3dfc4a74..0000000000 --- a/conf/genomes.config +++ /dev/null @@ -1,37 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for reference genome - * ------------------------------------------------- - * Defines reference genomes, without using iGenome paths - * Can be used by any config that customises the base - * path using $params.genomes_base / --genomes_base - * - * CAREFUL: Some o the files might be reuiqred in the CI tests not yet implemented. They should be gradually moved to the test.config. Until then lets keep this file. - */ - -params { - genomes { - 'minimalGRCh37' { - fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" - } - 'smallGRCh37' { - dbsnp = "${params.genomes_base}/dbsnp_138.b37.small.vcf.gz" - fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" - germline_resource = "${params.genomes_base}/gnomAD.r2.1.1.GRCh37.small.PASS.AC.AF.only.vcf.gz" - intervals = "${params.genomes_base}/small.intervals" - known_indels = "${params.genomes_base}/Mills_1000G_gold_standard_and_1000G_phase1.indels.b37.small.vcf.gz" - snpeff_db = 'GRCh37.75' - snpeff_genome = 'GRCh37' - vep_genome = 'GRCh37' - vep_species = 'homo_sapiens' - vep_cache_version = '104' - } - 'smallerGRCh37' { - fasta = "${params.genomes_base}/human_g1k_v37_decoy.small.fasta" - known_indels = "${params.genomes_base}/dbsnp_138.b37.small.vcf.gz" - } - 'custom' { - fasta = null - } - } -} From 8bafcec0758de85a8877898f52accaec84e6d14e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 14:43:07 +0200 Subject: [PATCH 12/27] add new vep_plugins --- conf/modules.config | 5 ++- conf/test.config | 5 ++- nextflow.config 
| 17 ++++++----- nextflow_schema.json | 72 +++++++++++++++++++++++++++++++------------- workflows/sarek.nf | 37 +++++++++++++---------- 5 files changed, 88 insertions(+), 48 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index bb454298b7..c3536b92e4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -959,7 +959,10 @@ process{ withName: 'ENSEMBLVEP' { ext.args = [ '--everything --filter_common --per_gene --total_length --offline', - (params.cadd && params.cadd_wg_snvs && params.cadd_indels) ? '--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz' : '', + (params.vep_dbnsfp && params.dbnsfp) ? '--plugin dbNSFP,dbNSFP.gz,rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF' : '', + (params.vep_loftee) ? '--plugin LoF,loftee_path:.' : '', + (params.vep_spliceai && params.spliceai_snv && params.spliceai_indel) ? '--plugin SpliceAI,snv=spliceai_scores.raw.snv.hg38.vcf.gz,indel=spliceai_scores.raw.indel.hg38.vcf.gz' : '', + (params.vep_spliceregion) ? '--plugin SpliceRegion' : '' ].join(' ').trim() if (!params.vep_cache) container = { params.vep_genome ? 
"nfcore/vep:104.3.${params.vep_genome}" : "nfcore/vep:104.3.${params.genome}" } publishDir = [ diff --git a/conf/test.config b/conf/test.config index eadfd04b1f..dbf86ed777 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,7 +24,7 @@ params { // Small reference genome igenomes_ignore = true - genome = 'small_hg38' + genome = null genomes_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules' dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" @@ -36,7 +36,7 @@ params { snpeff_db = 'WBcel235.99' vep_species = 'caenorhabditis_elegans' - vep_cache_version = '104' + vep_cache_version = 104 snpeff_genome = 'WBcel235' vep_genome = 'WBcel235' @@ -46,7 +46,6 @@ params { profiles { annotation { - params.igenomes_ignore = false params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/tiny-vcf-https.csv' params.step = 'annotate' } diff --git a/nextflow.config b/nextflow.config index 86bcc24c12..c71dc85c4b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -70,14 +70,17 @@ params { joint_germline = false // // Annotation - annotate_tools = null // Only with --step annotate + vep_dbnsfp = null // dbnsfp plugin disabled within VEP + dbnsfp = null // No dbnsfp processed file + dbnsfp_tbi = null // No dbnsfp processed file index + vep_loftee = null // loftee plugin disabled within VEP + vep_spliceai = null // spliceai plugin disabled within VEP + spliceai_snv = null // No spliceai_snv file + spliceai_snv_tbi = null // No spliceai_snv file index + spliceai_indel = null // No spliceai_indel file + spliceai_indel_tbi = null // No spliceai_indel file index + vep_spliceregion = null // spliceregion plugin disabled within VEP annotation_cache = false // Annotation cache disabled - cadd = null // CADD plugin disabled within VEP - cadd_indels = null // No CADD InDels file - cadd_indels_tbi = null // No CADD InDels index - cadd_wg_snvs = null // No CADD SNVs file - cadd_wg_snvs_tbi = 
null // No CADD SNVs index - genesplicer = null // genesplicer plugin disabled within VEP snpeff_cache = null // No directory for snpEff cache vep_cache = null // No directory for VEP cache diff --git a/nextflow_schema.json b/nextflow_schema.json index 1260a79b29..b958d5f7a2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -334,47 +334,77 @@ "pattern": "^((haplotypecaller|manta|mutect2|strelka|tiddit)*(,)*)*$", "hidden": true }, - "annotation_cache": { + "vep_dbnsfp": { "type": "boolean", "fa_icon": "fas fa-database", - "description": "Enable the use of cache for annotation.", - "help_text": "And disable usage of Sarek snpeff and vep specific containers for annotation\n\nTo be used with `--snpeff_cache` and/or `--vep_cache`.", + "description": "Enable the use of the VEP dbNSFP plugin.", "hidden": true }, - "cadd": { + "dbnsfp": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "Path to dbNSFP processed file.", + "help_text": "To be used with `--vep_dbnsfp`.", + "hidden": true + }, + "dbnsfp_tbi": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "Path to dbNSFP tabix indexed file.", + "help_text": "To be used with `--vep_dbnsfp`.", + "hidden": true + }, + "vep_loftee": { + "type": "boolean", + "fa_icon": "fas fa-database", + "description": "Enable the use of the VEP LOFTEE plugin.", + "hidden": true + }, + "vep_spliceai": { "type": "boolean", "fa_icon": "fas fa-database", - "description": "Enable the use of the VEP CADD plugin.", + "description": "Enable the use of the VEP SpliceAI plugin.", "hidden": true }, - "cadd_indels": { + "spliceai_snv": { "type": "string", - "fa_icon": "fas fa-file", - "description": "Path to CADD InDels file.", + "fa_icon": "fas fa-database", + "description": "Path to spliceai raw scores snv file.", + "help_text": "To be used with `--vep_spliceai`.", "hidden": true }, - "cadd_indels_tbi": { + "spliceai_snv_tbi": { "type": "string", - "fa_icon": "fas fa-file", - "description": "Path 
to CADD InDels index.", + "fa_icon": "fas fa-database", + "description": "Path to spliceai raw scores snv tabix indexed file.", + "help_text": "To be used with `--vep_spliceai`.", "hidden": true }, - "cadd_wg_snvs": { + "spliceai_indel": { "type": "string", - "fa_icon": "fas fa-file", - "description": "Path to CADD SNVs file.", + "fa_icon": "fas fa-database", + "description": "Path to spliceai raw scores indel file.", + "help_text": "To be used with `--vep_spliceai`.", "hidden": true }, - "cadd_wg_snvs_tbi": { + "spliceai_indel_tbi": { "type": "string", - "fa_icon": "fas fa-file", - "description": "Path to CADD SNVs index.", + "fa_icon": "fas fa-database", + "description": "Path to spliceai raw scores indel tabix indexed file.", + "help_text": "To be used with `--vep_spliceai`.", + "hidden": true + }, + "vep_spliceregion": { + "type": "boolean", + "fa_icon": "fas fa-database", + "description": "Enable the use of the VEP SpliceRegion plugin.", "hidden": true }, - "genesplicer": { + "annotation_cache": { "type": "boolean", - "fa_icon": "fas fa-gavel", - "description": "Enable the use of the VEP GeneSplicer plugin.", + "fa_icon": "fas fa-database", + "description": "Enable the use of cache for annotation.", + "help_text": "And disable usage of Sarek snpeff and vep specific containers for annotation\n\nTo be used with `--snpeff_cache` and/or `--vep_cache`.", "hidden": true }, "snpeff_cache": { @@ -528,7 +558,7 @@ "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." }, "vep_cache_version": { - "type": "string", + "type": "number", "fa_icon": "fas fa-tag", "description": "VEP cache version." 
}, diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 26482279f8..f7f2debb7d 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -15,11 +15,9 @@ def checkPathParamList = [ params.ac_loci_gc, params.bwa, params.bwamem2, - params.cadd_indels, - params.cadd_indels_tbi, - params.cadd_wg_snvs, - params.cadd_wg_snvs_tbi, params.chr_dir, + params.dbnsfp, + params.dbnsfp_tbi, params.dbsnp, params.dbsnp_tbi, params.dict, @@ -37,6 +35,10 @@ def checkPathParamList = [ params.pon, params.pon_tbi, params.snpeff_cache, + params.spliceai_snv, + params.spliceai_snv, + params.spliceai_snv_tbi, + params.spliceai_snv_tbi, //params.target_bed, params.vep_cache ] @@ -97,24 +99,27 @@ vep_genome = params.vep_genome ?: Channel.empty() vep_species = params.vep_species ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ? Channel.fromPath(params.cadd_indels).collect() : [] -cadd_indels_tbi = params.cadd_indels_tbi ? Channel.fromPath(params.cadd_indels_tbi).collect() : [] -cadd_wg_snvs = params.cadd_wg_snvs ? Channel.fromPath(params.cadd_wg_snvs).collect() : [] -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.fromPath(params.cadd_wg_snvs_tbi).collect() : [] pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.empty() snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] //target_bed = params.target_bed ? Channel.fromPath(params.target_bed).collect() : [] vep_cache = params.vep_cache ? 
Channel.fromPath(params.vep_cache).collect() : [] -if (params.cadd_wg_snvs && params.cadd_wg_snvs_tbi && params.cadd_indels && params.cadd_indels && params.cadd_indels_tbi) { - vep_extra_files = Channel.empty().mix( - Channel.fromPath(params.cadd_wg_snvs), - Channel.fromPath(params.cadd_wg_snvs_tbi), - Channel.fromPath(params.cadd_indels), - Channel.fromPath(params.cadd_indels_tbi) +vep_extra_files = [] + +if (params.dbnsfp && params.dbnsfp_tbi) { + vep_extra_files = vep_extra_files.mix( + Channel.fromPath(params.dbnsfp), + Channel.fromPath(params.dbnsfp_tbi) + ).collect() +} + +if (params.spliceai_snv && params.spliceai_snv_tbi && params.spliceai_indel && params.spliceai_indel_tbi) { + vep_extra_files = vep_extra_files.mix( + Channel.fromPath(params.spliceai_snv), + Channel.fromPath(params.spliceai_snv_tbi), + Channel.fromPath(params.spliceai_snv), + Channel.fromPath(params.spliceai_snv_tbi) ).collect() -} else { - vep_extra_files = [] } // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope From 651621b24a3f015390f536e4d749e7a3d5f0c108 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 14:53:47 +0200 Subject: [PATCH 13/27] update CITATIONS --- CITATIONS.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index e5e3937fb8..1a77668081 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -74,9 +74,19 @@ > McLaren W, Gil L, Hunt SE, et al.: The Ensembl Variant Effect Predictor. Genome Biol. 2016 Jun 6;17(1):122. doi: 10.1186/s13059-016-0974-4. PubMed PMID: 27268795; PubMed Central PMCID: PMC4893825. -- [CADD](https://pubmed.ncbi.nlm.nih.gov/24487276/) +- [dbNSFP](https://pubmed.ncbi.nlm.nih.gov/33261662/) - > Kircher M, et al.: A general framework for estimating the relative pathogenicity of human genetic variants. Nat Genet. 2014 Mar;46(3):310-5. doi: 10.1038/ng.2892. PubMed PMID: 24487276; PubMed Central PMCID: PMC3992975. 
+ > Liu X, et al.: dbNSFP v4: a comprehensive database of transcript-specific functional predictions and annotations for human nonsynonymous and splice-site SNVs. Genome Med. 2020 Dec 2;12(1):103. doi: 10.1186/s13073-020-00803-9. PubMed PMID: 33261662; PubMed Central PMCID: PMC7709417. + +- [LOFTEE](https://pubmed.ncbi.nlm.nih.gov/32461654/) + + > Karczewski KJ, et al.: The mutational constraint spectrum quantified from variation in 141,456 humans. Nature. 2020 May;581(7809):434-443. doi: 10.1038/s41586-020-2308-7. PubMed PMID: 32461654; PubMed Central PMCID: PMC7334197. + +- [SpliceAI](https://pubmed.ncbi.nlm.nih.gov/30661751/) + + > Jaganathan K, et al.: Predicting Splicing from Primary Sequence with Deep Learning. Cell. 2019 Jan 24;176(3):535-548.e24. doi: 10.1016/j.cell.2018.12.015. PubMed PMID: 30661751. + +- [SpliceRegion](https://github.com/Ensembl/VEP_plugins/blob/release/106/SpliceRegion.pm) ## R packages From 3d883cac890d45c62d9d9c8d8f50726a964173f1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 14:56:07 +0200 Subject: [PATCH 14/27] rm --annotate_tools --- nextflow_schema.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b958d5f7a2..ca9b734279 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -326,14 +326,6 @@ "default": "", "fa_icon": "fas fa-toolbox", "properties": { - "annotate_tools": { - "type": "string", - "fa_icon": "fas fa-hammer", - "description": "Specify from which tools Sarek should look for VCF files to annotate.", - "help_text": "Only for step `annotate`.", - "pattern": "^((haplotypecaller|manta|mutect2|strelka|tiddit)*(,)*)*$", - "hidden": true - }, "vep_dbnsfp": { "type": "boolean", "fa_icon": "fas fa-database", From f8ce6042aff53acf91be4e90e5885b7392e7fc08 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 15:42:45 +0200 Subject: [PATCH 15/27] fix collision name --- conf/modules.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/conf/modules.config b/conf/modules.config index c3536b92e4..b34a1a42a1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -972,6 +972,10 @@ process{ ] } + withName: ".*:ANNOTATION_MERGE:ENSEMBLVEP" { + ext.prefix = {"${meta.id}_snpEff"} + } + withName: 'SNPEFF' { ext.args = '-nodownload -canon -v' if (!params.snpeff_cache) container = { params.snpeff_genome ? "nfcore/snpeff:5.0.${params.snpeff_genome}" : "nfcore/snpeff:5.0.${params.genome}" } From 4834215d15a6ef39b80b1996cddae678aa0304cc Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 6 May 2022 17:18:26 +0200 Subject: [PATCH 16/27] fix output for VEP --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index b34a1a42a1..2681b0c132 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -987,7 +987,7 @@ process{ ] } - withName: "NFCORE_SAREK:SAREK:ANNOTATE:*:TABIX_BGZIPTABIX" { + withName: "NFCORE_SAREK:SAREK:ANNOTATE:.*:TABIX_BGZIPTABIX" { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" }, From 352fdd6409a9cb2b36bfbff89be0af88bda44a75 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Sun, 8 May 2022 12:19:58 +0200 Subject: [PATCH 17/27] fully use modules test-datasets + typos + polish --- CHANGELOG.md | 4 ++- conf/test.config | 76 ++++++++++++++++++++------------------------ nextflow.config | 1 - nextflow_schema.json | 41 +++++++++++------------- 4 files changed, 56 insertions(+), 66 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12dde941b8..6e68f28fc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,7 +76,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#485](https://github.com/nf-core/sarek/pull/485) - `--skip_qc`, `--skip_markduplicates` and `--skip_bqsr` is now `--skip_tools` - [#538](https://github.com/nf-core/sarek/pull/538) - `--sequencing_center` is now `--seq_center` -- 
[#538](https://github.com/nf-core/sarek/pull/538) - `--markdup_java_options` has been removed +- [#539](https://github.com/nf-core/sarek/pull/539) - `--annotate_tools` has been removed +- [#539](https://github.com/nf-core/sarek/pull/539) - `--cadd_cache`, `--cadd_indels`, `--cadd_indels_tbi`, `--cadd_wg_snvs`, `--cadd_wg_snvs_tbi` have been removed +- [#539](https://github.com/nf-core/sarek/pull/539) - `--genesplicer` has been removed ## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna diff --git a/conf/test.config b/conf/test.config index dbf86ed777..e71283bd45 100644 --- a/conf/test.config +++ b/conf/test.config @@ -9,6 +9,12 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +try { + includeConfig "https://raw.githubusercontent.com/nf-core/modules/master/tests/config/test_data.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/modules test data config") +} + params { config_profile_name = 'Test profile' @@ -25,13 +31,13 @@ params { // Small reference genome igenomes_ignore = true genome = null - genomes_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules' - dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" - fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.fasta" - germline_resource = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" - intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/genome.interval_list" - known_indels = "${params.genomes_base}/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" + dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'] + fasta = params.test_data['homo_sapiens']['genome']['genome_fasta'] + germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_vcf_gz'] + intervals = 
params.test_data['homo_sapiens']['genome']['genome_interval_list'] + known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_vcf_gz'] + nucleotides_per_second = 20 snpeff_db = 'WBcel235.99' @@ -39,9 +45,6 @@ params { vep_cache_version = 104 snpeff_genome = 'WBcel235' vep_genome = 'WBcel235' - - // Ignore `--input` as otherwise the parameter validation will throw an error - schema_ignore_params = 'genomes,input' } profiles { @@ -70,16 +73,16 @@ profiles { params.save_split_fastqs = true } targeted { - params.intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/multi_intervals.bed" + params.intervals = params.test_data['homo_sapiens']['genome']['genome_multi_interval_bed'] params.wes = true } tools { params.input = "${baseDir}/tests/csv/3.0/recalibrated.csv" - params.dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz" - params.fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" - params.germline_resource = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" - params.intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" - params.pon = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/mills_and_1000G.indels.hg38.vcf.gz" + params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] + params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + params.germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] + params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] + params.pon = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz'] params.step = 'variant_calling' params.joint_germline = true params.wes = true @@ -87,35 +90,36 @@ profiles { } tools_germline 
{ params.input = "${baseDir}/tests/csv/3.0/recalibrated_germline.csv" - params.dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz" - params.fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" - params.intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" + params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] + params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] params.step = 'variant_calling' params.joint_germline = true params.wes = true } tools_tumoronly { params.input = "${baseDir}/tests/csv/3.0/recalibrated_tumoronly.csv" - params.dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz" - params.fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" - params.germline_resource = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" - params.intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" - params.pon = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/mills_and_1000G.indels.hg38.vcf.gz" + params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] + params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + params.germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] + params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] + params.pon = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz'] params.step = 'variant_calling' params.joint_germline = true params.wes = true } tools_somatic 
{ params.input = "${baseDir}/tests/csv/3.0/recalibrated_somatic.csv" - params.dbsnp = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz" - params.fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" - params.germline_resource = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" - params.intervals = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" - params.pon = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/mills_and_1000G.indels.hg38.vcf.gz" + params.chr_dir = params.test_data['homo_sapiens']['genome']['genome_21_chromosomes_dir'] + params.dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz'] + params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + params.germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz'] + params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] + params.pon = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz'] params.step = 'variant_calling' params.joint_germline = true params.wes = true - params.chr_dir = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" + } trimming { params.clip_r1 = 1 @@ -133,19 +137,9 @@ profiles { } variantcalling_channels { params.input = "${baseDir}/tests/csv/3.0/recalibrated.csv" - params.fasta = "${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" + params.fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta'] + params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] params.wes = true params.step = 'variant_calling' - params.intervals = 
"${params.genomes_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" - } -} - -//This is apparently useless as it won't overwrite things in the modules.config -process { - withName:ENSEMBLVEP { - maxForks = 1 - } - withName:SNPEFF { - maxForks = 1 } } diff --git a/nextflow.config b/nextflow.config index c71dc85c4b..55899a203c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,6 @@ params { genome = 'GRCh38' igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - genomes_base = null // Disabled by default save_reference = false // Built references not saved // Main options diff --git a/nextflow_schema.json b/nextflow_schema.json index ca9b734279..1bd042778d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -200,7 +200,7 @@ "fa_icon": "fas fa-puzzle-piece", "enum": ["bwa-mem", "bwa-mem2", "dragmap"], "description": "Specify aligner to be used to map reads to reference genome.", - "help_text": "> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2` and `dragmap`.\n> Use `--bwa=false` to have `Sarek` build them automatically.\n\n> **WARNING** BWA-mem2 is in active development\n> Sarek might not be able to require the right amount of resources for it at the moment\n> We recommend to use pre-built indexes.", + "help_text": "> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2` and `dragmap`.\n> `Sarek` will build them automatically if not provided.\n\n> **WARNING** BWA-mem2 is in active development\n> Sarek might not be able to require the right amount of resources for it at the moment\n> We recommend to use pre-built indexes.", "hidden": true }, "use_gatk_spark": { @@ -297,7 +297,7 @@ "type": "boolean", "fa_icon": "fas fa-ban", "description": "Will not use Manta candidateSmallIndels for Strelka.", - "help_text": "Not recommended by Best Practice.s" + "help_text": "Not recommended by Best Practices." 
}, "pon": { "type": "string", @@ -441,13 +441,13 @@ "type": "string", "fa_icon": "fas fa-copy", "description": "Path to BWA mem indices.", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference." + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs." }, "bwamem2": { "type": "string", "fa_icon": "fas fa-copy", - "description": "Path to bwamem2 mem indices.", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference, if --aligner bwamem-2 is specified." + "description": "Path to bwa-mem2 mem indices.", + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner bwa-mem2` is specified. Combine with `--save_reference` to save for future runs." }, "chr_dir": { "type": "string", @@ -463,19 +463,19 @@ "type": "string", "fa_icon": "fas fa-file", "description": "Path to dbsnp index.", - "help_text": "> **NB** If none provided, will be generated automatically from the dbsnp file." + "help_text": "> **NB** If none provided, will be generated automatically from the dbsnp file. Combine with `--save_reference` to save for future runs." }, "dict": { "type": "string", "fa_icon": "fas fa-file", "description": "Path to FASTA dictionary file.", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference." + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs." 
}, "dragmap": { "type": "string", "fa_icon": "fas fa-copy", "description": "Path to dragmap indices.", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference, if --aligner dragmap is specified" + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner dragmap` is specified. Combine with `--save_reference` to save for future runs." }, "fasta": { "type": "string", @@ -483,13 +483,13 @@ "mimetype": "text/plain", "pattern": "\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified.", "fa_icon": "far fa-file-code" }, "fasta_fai": { "type": "string", "fa_icon": "fas fa-file", - "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference.", + "help_text": "> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", "description": "Path to FASTA reference index." }, "germline_resource": { @@ -502,7 +502,7 @@ "type": "string", "fa_icon": "fas fa-file", "description": "Path to GATK Mutect2 Germline Resource Index.", - "help_text": "> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided." + "help_text": "> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided. Combine with `--save_reference` to save for future runs." 
}, "intervals": { "type": "string", @@ -519,7 +519,7 @@ "type": "string", "fa_icon": "fas fa-copy", "description": "Path to known indels file index.", - "help_text": "> **NB** If none provided, will be generated automatically from the known index file, if provided." + "help_text": "> **NB** If none provided, will be generated automatically from the known index file, if provided. Combine with `--save_reference` to save for future runs." }, "mappability": { "type": "string", @@ -535,24 +535,25 @@ "type": "string", "fa_icon": "fas fa-microscope", "description": "snpeff genome.", - "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." }, "vep_genome": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP genome.", - "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." }, "vep_species": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP species.", - "help_text": "If you use AWS iGenomes or a local resource with genomes.conf, this has already been set for you appropriately." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." }, "vep_cache_version": { "type": "number", "fa_icon": "fas fa-tag", - "description": "VEP cache version." + "description": "VEP cache version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." 
}, "save_reference": { "type": "boolean", @@ -566,17 +567,11 @@ "default": "s3://ngi-igenomes/igenomes", "fa_icon": "fas fa-cloud-download-alt" }, - "genomes_base": { - "type": "string", - "fa_icon": "fas fa-map-marker-alt", - "description": "Directory / URL base for genomes references.", - "help_text": "All files are supposed to be in the same folder." - }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", "fa_icon": "fas fa-ban", - "help_text": "Do not load `igenomes.config` when running the pipeline.\nYou may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.\nThis option will load the `genomes.config` file instead.\n\n> **NB** You can then specify the genome custom and specify at least a FASTA genome file." + "help_text": "Do not load `igenomes.config` when running the pipeline.\nYou may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.\n\n> **NB** You can then run `Sarek` by specifying at least a FASTA genome file." } }, "help_text": "The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.\nThe configuration is set up to use the AWS-iGenomes resource\ncf https://ewels.github.io/AWS-iGenomes/." 
From f6a12018e4931bcb2b06fc3ff479425679b4d0fb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 9 May 2022 10:24:47 +0200 Subject: [PATCH 18/27] get rid of unexpected parameters WARN --- conf/test.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/test.config b/conf/test.config index e71283bd45..0b5109912d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -45,6 +45,8 @@ params { vep_cache_version = 104 snpeff_genome = 'WBcel235' vep_genome = 'WBcel235' + + schema_ignore_params = "genomes,test_data" } profiles { From 27749b641e08cef17db24f85845034e24ea1bc5b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 9 May 2022 17:08:12 +0200 Subject: [PATCH 19/27] improve tests --- conf/igenomes.config | 8 ++++---- conf/test.config | 23 ++++++++++++++--------- nextflow.config | 2 +- nextflow_schema.json | 3 ++- tests/csv/3.0/vcf_single.csv | 2 ++ tests/test_annotation.yml | 22 +++++++++++----------- 6 files changed, 34 insertions(+), 26 deletions(-) create mode 100644 tests/csv/3.0/vcf_single.csv diff --git a/conf/igenomes.config b/conf/igenomes.config index c79370b8f6..fbd9881bba 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -29,7 +29,7 @@ params { mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem" snpeff_db = 'GRCh37.75' snpeff_genome = 'GRCh37' - vep_cache_version = '104' + vep_cache_version = 104 vep_genome = 'GRCh37' vep_species = 'homo_sapiens' } @@ -53,7 +53,7 @@ params { mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem" snpeff_db = 'GRCh38.99' snpeff_genome = 'GRCh38' - vep_cache_version = '104' + vep_cache_version = 104 vep_genome = 'GRCh38' vep_species = 'homo_sapiens' } @@ -81,7 +81,7 @@ params { readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" snpeff_db = 'GRCm38.99' snpeff_genome = 'GRCm38' - vep_cache_version = '102' + vep_cache_version = 102 vep_genome = 'GRCm38' 
vep_species = 'mus_musculus' } @@ -105,7 +105,7 @@ params { fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" snpeff_db = 'WBcel235.99' snpeff_genome = 'WBcel235' - vep_cache_version = '104' + vep_cache_version = 104 vep_genome = 'WBcel235' vep_species = 'caenorhabditis_elegans' } diff --git a/conf/test.config b/conf/test.config index 0b5109912d..93e48a02f9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,29 +29,26 @@ params { input = "${baseDir}/tests/csv/3.0/fastq_single.csv" // Small reference genome - igenomes_ignore = true genome = null - + igenomes_ignore = true dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'] fasta = params.test_data['homo_sapiens']['genome']['genome_fasta'] germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_vcf_gz'] intervals = params.test_data['homo_sapiens']['genome']['genome_interval_list'] known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_vcf_gz'] - - nucleotides_per_second = 20 - snpeff_db = 'WBcel235.99' - vep_species = 'caenorhabditis_elegans' - vep_cache_version = 104 snpeff_genome = 'WBcel235' + vep_cache_version = 104 vep_genome = 'WBcel235' + vep_species = 'caenorhabditis_elegans' + // Ignore `--input` as otherwise the parameter validation will throw an error schema_ignore_params = "genomes,test_data" } profiles { annotation { - params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/tiny-vcf-https.csv' + params.input = "${baseDir}/tests/csv/3.0/vcf_single.csv" params.step = 'annotate' } no_intervals { @@ -88,7 +85,8 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true - //params.vep_cache = + + params.nucleotides_per_second = 20 } tools_germline { params.input = "${baseDir}/tests/csv/3.0/recalibrated_germline.csv" @@ -98,6 +96,8 @@ profiles { params.step = 'variant_calling' params.joint_germline = 
true params.wes = true + + params.nucleotides_per_second = 20 } tools_tumoronly { params.input = "${baseDir}/tests/csv/3.0/recalibrated_tumoronly.csv" @@ -109,6 +109,8 @@ profiles { params.step = 'variant_calling' params.joint_germline = true params.wes = true + + params.nucleotides_per_second = 20 } tools_somatic { params.input = "${baseDir}/tests/csv/3.0/recalibrated_somatic.csv" @@ -122,6 +124,7 @@ profiles { params.joint_germline = true params.wes = true + params.nucleotides_per_second = 20 } trimming { params.clip_r1 = 1 @@ -143,5 +146,7 @@ profiles { params.intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'] params.wes = true params.step = 'variant_calling' + + params.nucleotides_per_second = 20 } } diff --git a/nextflow.config b/nextflow.config index 55899a203c..5a7bcba4de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ params { step = 'mapping' // Starts with mapping // Genome and references options - genome = 'GRCh38' + genome = 'GATK.GRCh38' igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false save_reference = false // Built references not saved diff --git a/nextflow_schema.json b/nextflow_schema.json index 1bd042778d..f29f197231 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -424,6 +424,7 @@ "genome": { "type": "string", "description": "Name of iGenomes reference.", + "default": "GATK.GRCh38", "fa_icon": "fas fa-book", "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
}, @@ -564,7 +565,7 @@ "type": "string", "format": "directory-path", "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", + "default": "s3://ngi-igenomes/igenomes/", "fa_icon": "fas fa-cloud-download-alt" }, "igenomes_ignore": { diff --git a/tests/csv/3.0/vcf_single.csv b/tests/csv/3.0/vcf_single.csv new file mode 100644 index 0000000000..601e72f60f --- /dev/null +++ b/tests/csv/3.0/vcf_single.csv @@ -0,0 +1,2 @@ +patient,sample,vcf +test,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz diff --git a/tests/test_annotation.yml b/tests/test_annotation.yml index 0fc517804a..c2ef482e7a 100644 --- a/tests/test_annotation.yml +++ b/tests/test_annotation.yml @@ -4,20 +4,20 @@ - annotation - snpeff files: - - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz - - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz.tbi - - path: results/reports/SnpEff/1234N/1234N.csv - # - path: results/multiqc //MultiQC not working (finishes succesfully, but log shows issues between human vcf and annotation) + - path: results/annotation/test/test_snpEff.ann.vcf.gz + - path: results/annotation/test/test_snpEff.ann.vcf.gz.tbi + - path: results/reports/SnpEff/test/test.csv + - path: results/multiqc - name: Run VEP command: nextflow run main.nf -profile test,annotation,docker --tools vep tags: - annotation - vep files: - - path: results/annotation/1234N/1234N_VEP.ann.vcf.gz - - path: results/annotation/1234N/1234N_VEP.ann.vcf.gz.tbi - - path: results/reports/EnsemblVEP/1234N/1234N.summary.html - # - path: results/multiqc //MultiQC not working issues between human vcf and annotation + - path: results/annotation/test/test_VEP.ann.vcf.gz + - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi + - path: results/reports/EnsemblVEP/test/test.summary.html + - path: results/multiqc - name: Run snpEff followed by VEP command: nextflow run main.nf -profile 
test,annotation,docker --tools merge tags: @@ -26,6 +26,6 @@ - snpeff - vep files: - - path: results/annotation/1234N/1234N_snpEff_VEP.ann.vcf.gz - - path: results/annotation/1234N/1234N_snpEff_VEP.ann.vcf.gz.tbi - # - path: results/multiqc //MultiQC not working issues between human vcf and annotation + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz.tbi + - path: results/multiqc From 5a37e531ec4c4fe0c8ce64319f176c8b7af58c20 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 9 May 2022 17:09:00 +0200 Subject: [PATCH 20/27] no multiqc with VEP --- tests/test_annotation.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_annotation.yml b/tests/test_annotation.yml index c2ef482e7a..f02bd80586 100644 --- a/tests/test_annotation.yml +++ b/tests/test_annotation.yml @@ -9,7 +9,7 @@ - path: results/reports/SnpEff/test/test.csv - path: results/multiqc - name: Run VEP - command: nextflow run main.nf -profile test,annotation,docker --tools vep + command: nextflow run main.nf -profile test,annotation,docker --tools vep --skip_tools multiqc tags: - annotation - vep @@ -17,9 +17,8 @@ - path: results/annotation/test/test_VEP.ann.vcf.gz - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi - path: results/reports/EnsemblVEP/test/test.summary.html - - path: results/multiqc - name: Run snpEff followed by VEP - command: nextflow run main.nf -profile test,annotation,docker --tools merge + command: nextflow run main.nf -profile test,annotation,docker --tools merge --skip_tools multiqc tags: - annotation - merge @@ -28,4 +27,3 @@ files: - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz.tbi - - path: results/multiqc From a2a81b17d9cbed02abaa0f55d5e5f7995264635f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 9 May 2022 17:13:28 +0200 Subject: [PATCH 21/27] better comments --- conf/test.config | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 93e48a02f9..58c2117844 100644 --- a/conf/test.config +++ b/conf/test.config @@ -42,7 +42,7 @@ params { vep_genome = 'WBcel235' vep_species = 'caenorhabditis_elegans' - // Ignore `--input` as otherwise the parameter validation will throw an error + // Ignore params that will throw warning through params validation schema_ignore_params = "genomes,test_data" } From d7ffb2f38c818d26c3b775be0d42a835d030f419 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 12:20:37 +0200 Subject: [PATCH 22/27] test out @ewels https://github.com/nf-core/tools/pull/1499 --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77358dee77..558e87d7a5 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -56,7 +56,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install --upgrade --force-reinstall git+https://github.com/ewels/nf-core-tools.git@actions-auth-api - name: Run nf-core lint env: From a4dc6a515d928cced145d7904aea50c558eee01c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 15:18:59 +0200 Subject: [PATCH 23/27] fix path to loftee --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 2681b0c132..bdbc2a5983 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -960,7 +960,7 @@ process{ ext.args = [ '--everything --filter_common --per_gene --total_length --offline', (params.vep_dbnsfp && params.dbnsfp) ? '--plugin dbNSFP,dbNSFP.gz,rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF' : '', - (params.vep_loftee) ? '--plugin LoF,loftee_path:.' : '', + (params.vep_loftee) ? 
'--plugin LoF,loftee_path:/opt/conda/envs/nf-core-vep-104.3/share/ensembl-vep-104.3-0' : '', (params.vep_spliceai && params.spliceai_snv && params.spliceai_indel) ? '--plugin SpliceAI,snv=spliceai_scores.raw.snv.hg38.vcf.gz,indel=spliceai_scores.raw.indel.hg38.vcf.gz' : '', (params.vep_spliceregion) ? '--plugin SpliceRegion' : '' ].join(' ').trim() From ce498325bd113969b5ff4f592077d752f479796e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 15:29:23 +0200 Subject: [PATCH 24/27] fix nf-core lint --- assets/email_template.html | 142 +++++++++++-------------------------- nextflow.config | 2 +- 2 files changed, 43 insertions(+), 101 deletions(-) diff --git a/assets/email_template.html b/assets/email_template.html index 5d4bf62f6e..36a9983efd 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,111 +1,53 @@ - - - - + + + + - - - nf-core/sarek Pipeline Report - - -
- + + nf-core/sarek Pipeline Report + + +
-

nf-core/sarek v${version}

-

Run Name: $runName

+ - <% if (!success){ out << """ -
-

nf-core/sarek execution completed unsuccessfully!

+

nf-core/sarek v${version}

+

Run Name: $runName

+ +<% if (!success){ + out << """ +
+

nf-core/sarek execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

-
${errorReport}
-
- """ } else { out << """ -
+
${errorReport}
+
+ """ +} else { + out << """ +
nf-core/sarek execution completed successfully! -
- """ } %> +
+ """ +} +%> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
-$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> " - - - - - " }.join("\n") %> - -
- $k - -
$v
-
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
-

nf-core/sarek

-

https://github.com/nf-core/sarek

-
- +

nf-core/sarek

+

https://github.com/nf-core/sarek

+ +
+ + diff --git a/nextflow.config b/nextflow.config index 5a7bcba4de..3c98b3ebb8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -222,7 +222,7 @@ trace { } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" } manifest { From 267e6a3e8fabe413ffa488d248fa32e690b056d9 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 15:30:47 +0200 Subject: [PATCH 25/27] back to regular nf-core tools --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 558e87d7a5..77358dee77 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -56,7 +56,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install --upgrade --force-reinstall git+https://github.com/ewels/nf-core-tools.git@actions-auth-api + pip install nf-core - name: Run nf-core lint env: From f6997f671d452c15d34a762aff41657232a75be7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 15:53:59 +0200 Subject: [PATCH 26/27] typo + update CHANGELOG --- CHANGELOG.md | 3 +++ workflows/sarek.nf | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e68f28fc8..71e56c7140 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#511](https://github.com/nf-core/sarek/pull/511) - Sync `TEMPLATE` with `tools` `2.3.2` - [#520](https://github.com/nf-core/sarek/pull/520) - Improve annotation subworkflows - [#537](https://github.com/nf-core/sarek/pull/537) - Update workflow figure +- [#539](https://github.com/nf-core/sarek/pull/539) - Update `CITATIONS.md` ### Fixed @@ -76,9 +77,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - 
[#485](https://github.com/nf-core/sarek/pull/485) - `--skip_qc`, `--skip_markduplicates` and `--skip_bqsr` is now `--skip_tools` - [#538](https://github.com/nf-core/sarek/pull/538) - `--sequencing_center` is now `--seq_center` +- [#538](https://github.com/nf-core/sarek/pull/538) - `--markdup_java_options` has been removed - [#539](https://github.com/nf-core/sarek/pull/539) - `--annotate_tools` has been removed - [#539](https://github.com/nf-core/sarek/pull/539) - `--cadd_cache`, `--cadd_indels`, `--cadd_indels_tbi`, `--cadd_wg_snvs`, `--cadd_wg_snvs_tbi` have been removed - [#539](https://github.com/nf-core/sarek/pull/539) - `--genesplicer` has been removed +- [#539](https://github.com/nf-core/sarek/pull/539) - `conf/genomes.config` and `params.genomes_base` have been removed ## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna diff --git a/workflows/sarek.nf b/workflows/sarek.nf index f7f2debb7d..78e465fae5 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -35,9 +35,9 @@ def checkPathParamList = [ params.pon, params.pon_tbi, params.snpeff_cache, + params.spliceai_indel, + params.spliceai_indel_tbi, params.spliceai_snv, - params.spliceai_snv, - params.spliceai_snv_tbi, params.spliceai_snv_tbi, //params.target_bed, params.vep_cache @@ -115,8 +115,8 @@ if (params.dbnsfp && params.dbnsfp_tbi) { if (params.spliceai_snv && params.spliceai_snv_tbi && params.spliceai_indel && params.spliceai_indel_tbi) { vep_extra_files = vep_extra_files.mix( - Channel.fromPath(params.spliceai_snv), - Channel.fromPath(params.spliceai_snv_tbi), + Channel.fromPath(params.spliceai_indel), + Channel.fromPath(params.spliceai_indel_tbi), Channel.fromPath(params.spliceai_snv), Channel.fromPath(params.spliceai_snv_tbi) ).collect() From b9c2089b05b987f4e2e2e08ca764ebbdf5f907de Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 May 2022 16:08:42 +0200 Subject: [PATCH 27/27] run prettier locally --- CITATIONS.md | 2 +- assets/email_template.html | 144
++++++++++++++++++++++++++----------- 2 files changed, 103 insertions(+), 43 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 1a77668081..401e0057da 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -84,7 +84,7 @@ - [SpliceAI](https://pubmed.ncbi.nlm.nih.gov/30661751/) - > Jaganathan K, et al.: Predicting Splicing from Primary Sequence with Deep Learning. Cell. 2019 Jan 24;176(3):535-548.e24. doi: 10.1016/j.cell.2018.12.015. PubMed PMID: 30661751. + > Jaganathan K, et al.: Predicting Splicing from Primary Sequence with Deep Learning. Cell. 2019 Jan 24;176(3):535-548.e24. doi: 10.1016/j.cell.2018.12.015. PubMed PMID: 30661751. - [SpliceRegion](https://github.com/Ensembl/VEP_plugins/blob/release/106/SpliceRegion.pm) diff --git a/assets/email_template.html b/assets/email_template.html index 36a9983efd..130be84652 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,53 +1,113 @@ - - - - + + + + - - nf-core/sarek Pipeline Report - - -
+ + nf-core/sarek Pipeline Report + + +
+ - +

nf-core/sarek v${version}

+

Run Name: $runName

-

nf-core/sarek v${version}

-

Run Name: $runName

- -<% if (!success){ - out << """ -
-

nf-core/sarek execution completed unsuccessfully!

+ <% if (!success){ out << """ +
+

nf-core/sarek execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

-
${errorReport}
-
- """ -} else { - out << """ -
+
${errorReport}
+
+ """ } else { out << """ +
nf-core/sarek execution completed successfully! -
- """ -} -%> +
+ """ } %> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
+$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> "" }.join("\n") %> - -
$k
$v
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> " + + + + + " }.join("\n") %> + +
+ $k + +
$v
+
-

nf-core/sarek

-

https://github.com/nf-core/sarek

- -
- - +

nf-core/sarek

+

https://github.com/nf-core/sarek

+
+