From 25a80a2170dbccd44eff9869562dc689a80b095b Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Sun, 22 Oct 2023 06:25:11 +0200
Subject: [PATCH 1/4] Disambiguate same-db name but bracken step kraken from
 profile merging steps

---
 conf/modules.config                           |  2 +-
 subworkflows/local/profiling.nf               | 10 +++++---
 .../local/standardisation_profiles.nf         | 25 ++++++++++++++-----
 subworkflows/local/visualization_krona.nf     |  9 +++++--
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 36209f3a..90ca6919 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -485,7 +485,7 @@ process {
     }
 
     withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN {
-        ext.prefix = { "kraken2_${meta.id}_combined_reports" }
+        ext.prefix = { "kraken2_${meta.db_name}_combined_reports" }
         publishDir = [
             path: { "${params.outdir}/kraken2/" },
             mode: params.publish_dir_mode,
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index bfe3398e..c02f7755 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -172,9 +172,13 @@ workflow PROFILING {
         ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment )
         ch_raw_profiles        = ch_raw_profiles.mix(
             KRAKEN2_KRAKEN2.out.report
-                // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use.
-                // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names.
-                .map { meta, report -> [meta + [tool: 'kraken2'], report]}
+                // Rename tool in the meta for the for-bracken files to disambiguate from only-kraken2 results in downstream steps.
+                // Note may need to rename back to to just bracken in those downstream steps depending on context.
+                .map {
+                    meta, report ->
+                        def new_tool = meta['tool'] == 'bracken' ? 'kraken2-bracken' : meta['tool']
+                    [meta + [tool: new_tool], report]
+                }
         )
 
     }
diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index c4e36009..d8688d75 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -52,12 +52,19 @@ workflow STANDARDISATION_PROFILES {
                             .map {
                                     meta, profile ->
                                         def meta_new = [:]
-                                        meta_new.id = meta.db_name
                                         meta_new.tool = meta.tool == 'malt' ? 'megan6' : meta.tool
+                                        meta_new.db_name = meta.db_name
                                         [meta_new, profile]
                             }
                             .groupTuple ()
-                            .map { [ it[0], it[1].flatten() ] }
+                            .map {
+                                meta, profiles ->
+                                    meta = meta + [
+                                        tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, // replace to get the right output-format description
+                                        id: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
+                                    ]
+                                [ meta, profiles.flatten() ]
+                            }
 
     ch_taxpasta_tax_dir = params.taxpasta_taxonomy_dir ? Channel.fromPath(params.taxpasta_taxonomy_dir, checkIfExists: true).collect() : []
 
@@ -85,7 +92,7 @@ workflow STANDARDISATION_PROFILES {
             centrifuge: it[0]['tool'] == 'centrifuge'
             ganon: it[0]['tool'] == 'ganon'
             kmcp: it [0]['tool'] == 'kmcp'
-            kraken2: it[0]['tool'] == 'kraken2'
+            kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken'
             metaphlan: it[0]['tool'] == 'metaphlan'
             motus: it[0]['tool'] == 'motus'
             unknown: true
@@ -158,11 +165,17 @@ workflow STANDARDISATION_PROFILES {
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
     ch_profiles_for_kraken2 = ch_input_profiles.kraken2
-                                .map { [it[0]['db_name'], it[1]] }
-                                .groupTuple(sort: {-it.size()} )
                                 .map {
-                                    [[id:it[0]], it[1]]
+                                    meta, profiles ->
+                                        def new_meta = [:]
+                                        new_meta.tool = meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool // replace to get the right output-format description
+                                        new_meta.id = meta.tool // id carries the incoming tool name as-is (i.e. before the rename on the line above), e.g. 'kraken2-bracken'
+                                        new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
+                                    [ new_meta, profiles ]
                                 }
+                                .dump(tag: 'b4group')
+                                .groupTuple(sort: {-it.size()})
+                                .dump(tag: 'whatsgoingon')
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )
diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf
index 7d8e4f01..4cbaaf1a 100644
--- a/subworkflows/local/visualization_krona.nf
+++ b/subworkflows/local/visualization_krona.nf
@@ -27,7 +27,7 @@ workflow VISUALIZATION_KRONA {
     ch_input_profiles = profiles
         .branch {
             centrifuge: it[0]['tool'] == 'centrifuge'
-            kraken2: it[0]['tool'] == 'kraken2'
+            kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken'
             unknown: true
         }
     ch_input_classifications = classifications
@@ -41,7 +41,12 @@ workflow VISUALIZATION_KRONA {
         Convert Kraken2 formatted reports into Krona text files
     */
     ch_kraken_reports = ch_input_profiles.kraken2
-        .mix( ch_input_profiles.centrifuge )
+            .map {
+                meta, report ->
+                    def new_tool = meta['tool'] == 'kraken2-bracken' ? 'bracken' : meta['tool']
+                [meta + [tool: new_tool], report]
+            }
+            .mix( ch_input_profiles.centrifuge )
     KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports )
     ch_krona_text = ch_krona_text.mix( KRAKENTOOLS_KREPORT2KRONA.out.txt )
     ch_versions = ch_versions.mix( KRAKENTOOLS_KREPORT2KRONA.out.versions.first() )

From f87ed97e393f7d0741b8f1875284d681f197114c Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 23 Oct 2023 13:43:49 +0200
Subject: [PATCH 2/4] Add final docs

---
 docs/output.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/output.md b/docs/output.md
index 493eed21..11b07fe5 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -360,6 +360,7 @@ The main taxonomic profiling file from Bracken is the `*.tsv` file. This provide
 
 - `kraken2/`
   - `<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`)
+    - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2_<mydatabase>-bracken_combined_reports.txt`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_<mydatabase>.tsv`).
   - `<db_name>/`
     - `<sample_id>_<db_name>.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
     - `<sample_id>_<db_name>.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
@@ -582,6 +583,7 @@ The resulting HTML files can be loaded into your web browser for exploration. Ea
   - `<tool>_<database>*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`.
     - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample.
     - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) are executed.
+    - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2_<mydatabase>-bracken.tsv`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_<mydatabase>.tsv`).
 
   </details>
 

From 51cf6be7165faa14d61c442ec1deb2ded2dc9528 Mon Sep 17 00:00:00 2001
From: James Fellows Yates <jfy133@gmail.com>
Date: Mon, 23 Oct 2023 13:48:06 +0200
Subject: [PATCH 3/4] Update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0693863f..3f392df1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
+- [#406] Fixed overwriting of bracken-derived kraken2 outputs when the database name is shared between Bracken/Kraken2. (❤️ to @MajoroMask for reporting, fix by @jfy133)
+
 ### `Dependencies`
 
 ### `Deprecated`

From da8bf5ae26d1d83a83d05ba017977e60969dba89 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates" <jfy133@gmail.com>
Date: Wed, 25 Oct 2023 15:08:14 +0200
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Moritz E. Beber <midnighter@posteo.net>
---
 subworkflows/local/profiling.nf                | 4 ++--
 subworkflows/local/standardisation_profiles.nf | 2 --
 subworkflows/local/visualization_krona.nf      | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index c02f7755..b4c71efb 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -176,8 +176,8 @@ workflow PROFILING {
                 // Note may need to rename back to to just bracken in those downstream steps depending on context.
                 .map {
                     meta, report ->
-                        def new_tool = meta['tool'] == 'bracken' ? 'kraken2-bracken' : meta['tool']
-                    [meta + [tool: new_tool], report]
+                        def new_tool = meta.tool == 'bracken' ? 'kraken2-bracken' : meta.tool // tag Kraken2 reports produced for Bracken
+                    [meta + [tool: new_tool], report]
                 }
         )
 
diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
index d8688d75..4592e9de 100644
--- a/subworkflows/local/standardisation_profiles.nf
+++ b/subworkflows/local/standardisation_profiles.nf
@@ -173,9 +173,7 @@ workflow STANDARDISATION_PROFILES {
                                         new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
                                     [ new_meta, profiles ]
                                 }
-                                .dump(tag: 'b4group')
                                 .groupTuple(sort: {-it.size()})
-                                .dump(tag: 'whatsgoingon')
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )
diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf
index 44ac0b42..77e26a22 100644
--- a/subworkflows/local/visualization_krona.nf
+++ b/subworkflows/local/visualization_krona.nf
@@ -43,8 +43,7 @@ workflow VISUALIZATION_KRONA {
     ch_kraken_reports = ch_input_profiles.kraken2
             .map {
                 meta, report ->
-                    def new_tool = meta['tool'] == 'kraken2-bracken' ? 'bracken' : meta['tool']
-                [meta + [tool: new_tool], report]
+                [meta + [tool: meta.tool == 'kraken2-bracken' ? 'bracken' : meta.tool], report] // map the disambiguated 'kraken2-bracken' tag back to 'bracken'
             }
             .mix( ch_input_profiles.centrifuge )
     KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports )