From 25a80a2170dbccd44eff9869562dc689a80b095b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 22 Oct 2023 06:25:11 +0200 Subject: [PATCH 1/4] Disambiguate same-db name but bracken step kraken from profile merging steps --- conf/modules.config | 2 +- subworkflows/local/profiling.nf | 10 +++++--- .../local/standardisation_profiles.nf | 25 ++++++++++++++----- subworkflows/local/visualization_krona.nf | 9 +++++-- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 36209f3a..90ca6919 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -485,7 +485,7 @@ process { } withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN { - ext.prefix = { "kraken2_${meta.id}_combined_reports" } + ext.prefix = { "kraken2_${meta.db_name}_combined_reports" } publishDir = [ path: { "${params.outdir}/kraken2/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index bfe3398e..c02f7755 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -172,9 +172,13 @@ workflow PROFILING { ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.report - // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use. - // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names. - .map { meta, report -> [meta + [tool: 'kraken2'], report]} + // Rename tool in the meta for the for-bracken files to disambiguate from only-kraken2 results in downstream steps. + // Note may need to rename back to to just bracken in those downstream steps depending on context. + .map { + meta, report -> + def new_tool = meta['tool'] == 'bracken' ? 
'kraken2-bracken' : meta['tool'] + [meta + [tool: new_tool], report] + } ) } diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index c4e36009..d8688d75 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -52,12 +52,19 @@ workflow STANDARDISATION_PROFILES { .map { meta, profile -> def meta_new = [:] - meta_new.id = meta.db_name meta_new.tool = meta.tool == 'malt' ? 'megan6' : meta.tool + meta_new.db_name = meta.db_name [meta_new, profile] } .groupTuple () - .map { [ it[0], it[1].flatten() ] } + .map { + meta, profiles -> + meta = meta + [ + tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, // replace to get the right output-format description + id: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append '-bracken' to disambiguate the Kraken2 step of Bracken from a plain Kraken2 run that uses the same database name + ] + [ meta, profiles.flatten() ] + } ch_taxpasta_tax_dir = params.taxpasta_taxonomy_dir ? Channel.fromPath(params.taxpasta_taxonomy_dir, checkIfExists: true).collect() : [] @@ -85,7 +92,7 @@ workflow STANDARDISATION_PROFILES { centrifuge: it[0]['tool'] == 'centrifuge' ganon: it[0]['tool'] == 'ganon' kmcp: it [0]['tool'] == 'kmcp' - kraken2: it[0]['tool'] == 'kraken2' + kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken' metaphlan: it[0]['tool'] == 'metaphlan' motus: it[0]['tool'] == 'motus' unknown: true @@ -158,11 +165,17 @@ workflow STANDARDISATION_PROFILES { // Have to sort by size to ensure first file actually has hits otherwise // the script fails ch_profiles_for_kraken2 = ch_input_profiles.kraken2 - .map { [it[0]['db_name'], it[1]] } - .groupTuple(sort: {-it.size()} ) .map { - [[id:it[0]], it[1]] + meta, profiles -> + def new_meta = [:] + new_meta.tool = meta.tool == 'kraken2-bracken' ? 
'kraken2' : meta.tool // replace to get the right output-format description + new_meta.id = meta.tool // id is the incoming tool name ('kraken2' or 'kraken2-bracken'), keeping for-Bracken reports apart from plain Kraken2 ones + new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken + [ new_meta, profiles ] } + .dump(tag: 'b4group') + .groupTuple(sort: {-it.size()}) + .dump(tag: 'whatsgoingon') KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf index 7d8e4f01..4cbaaf1a 100644 --- a/subworkflows/local/visualization_krona.nf +++ b/subworkflows/local/visualization_krona.nf @@ -27,7 +27,7 @@ workflow VISUALIZATION_KRONA { ch_input_profiles = profiles .branch { centrifuge: it[0]['tool'] == 'centrifuge' - kraken2: it[0]['tool'] == 'kraken2' + kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken' unknown: true } ch_input_classifications = classifications @@ -41,7 +41,12 @@ workflow VISUALIZATION_KRONA { Convert Kraken2 formatted reports into Krona text files */ ch_kraken_reports = ch_input_profiles.kraken2 - .mix( ch_input_profiles.centrifuge ) + .map { + meta, report -> + def new_tool = meta['tool'] == 'kraken2-bracken' ? 
'bracken' : meta['tool'] + [meta + [tool: new_tool], report] + } + .mix( ch_input_profiles.centrifuge ) KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports ) ch_krona_text = ch_krona_text.mix( KRAKENTOOLS_KREPORT2KRONA.out.txt ) ch_versions = ch_versions.mix( KRAKENTOOLS_KREPORT2KRONA.out.versions.first() ) From f87ed97e393f7d0741b8f1875284d681f197114c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 23 Oct 2023 13:43:49 +0200 Subject: [PATCH 2/4] Add final docs --- docs/output.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/output.md b/docs/output.md index 493eed21..11b07fe5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -360,6 +360,7 @@ The main taxonomic profiling file from Bracken is the `*.tsv` file. This provide - `kraken2/` - `_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`) + - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). - `/` - `_.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample - `_.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample @@ -582,6 +583,7 @@ The resulting HTML files can be loaded into your web browser for exploration. Ea - `_*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`. - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample. - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) are executed. 
+ - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). From 51cf6be7165faa14d61c442ec1deb2ded2dc9528 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 23 Oct 2023 13:48:06 +0200 Subject: [PATCH 3/4] Update changelgo --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0693863f..3f392df1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` +- [#406] Fixed overwriting of bracken-derived kraken2 outputs when the database name is shared between Bracken/Kraken2. (❤️ to @MajoroMask for reporting, fix by @jfy133) + ### `Dependencies` ### `Deprecated` From da8bf5ae26d1d83a83d05ba017977e60969dba89 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 25 Oct 2023 15:08:14 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- subworkflows/local/profiling.nf | 3 +-- subworkflows/local/standardisation_profiles.nf | 2 -- subworkflows/local/visualization_krona.nf | 3 +-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index c02f7755..b4c71efb 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -176,8 +176,7 @@ workflow PROFILING { // Note may need to rename back to to just bracken in those downstream steps depending on context. .map { meta, report -> - def new_tool = meta['tool'] == 'bracken' ? 'kraken2-bracken' : meta['tool'] - [meta + [tool: new_tool], report] + [meta + [tool: meta.tool == 'bracken' ? 
'kraken2-bracken' : meta.tool], report] } ) diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf index d8688d75..4592e9de 100644 --- a/subworkflows/local/standardisation_profiles.nf +++ b/subworkflows/local/standardisation_profiles.nf @@ -173,9 +173,7 @@ workflow STANDARDISATION_PROFILES { new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken [ new_meta, profiles ] } - .dump(tag: 'b4group') .groupTuple(sort: {-it.size()}) - .dump(tag: 'whatsgoingon') KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf index 44ac0b42..77e26a22 100644 --- a/subworkflows/local/visualization_krona.nf +++ b/subworkflows/local/visualization_krona.nf @@ -43,8 +43,7 @@ workflow VISUALIZATION_KRONA { ch_kraken_reports = ch_input_profiles.kraken2 .map { meta, report -> - def new_tool = meta['tool'] == 'kraken2-bracken' ? 'bracken' : meta['tool'] - [meta + [tool: new_tool], report] + [meta + [tool: meta.tool == 'kraken2-bracken' ? 'bracken' : meta.tool], report] } .mix( ch_input_profiles.centrifuge ) KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports )