refactor: Use collect output in filter/add_column and optional report (#139)

* refactor: Integrate concat and filter matches in one process

* refactor: Change image for filter_matches

* fix: Change output module name

* fix: Correct string interpolation

* refactor: Change add column to work with collected output

* fix: Change sed regex

* fix: Change sed regex

* refactor: Make sed expression case-insensitive

* fix: Correct annotation trace size

* feat: Make report creation optional

* feat: Add option to skip profile creation

* feat: Pre-filter RGI outputs

* refactor: Temporarily remove high_memory label from report

* fix: Add missing arg in concatenate_matches subwf

* fix: Correct types in filtering script

* fix: Remove reference to column before creation

* refactor: Make filter matches case insensitive

* refactor: Remove rgi filtering from create_report

* refactor: Make profile optional out

* test: Add report creation to test
jvfe committed Jun 26, 2023
1 parent cf46732 commit 9d16d34
Showing 12 changed files with 153 additions and 123 deletions.
17 changes: 5 additions & 12 deletions bin/create_report.py
@@ -47,6 +47,7 @@ def parse_args(args=None):
         nargs="?",
         const=None,
     )
+    parser.add_argument("--skip_profile", dest="skip_profile", action="store_true")
     return parser.parse_args(args)


@@ -118,17 +119,7 @@ def create_report(ann, diamond_outs, rgi, vfdb_fasta, phispy, mobsuite):
 
     # RGI output
     rgi_df = read_table(rgi)
-    rgi_df.columns = rgi_df.columns.str.replace(" ", "_")
-    rgi_sum = rgi_df[
-        (rgi_df["Best_Identities"] > 80)
-        & (rgi_df["Percentage_Length_of_Reference_Sequence"] > 80)
-    ]
-    rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off", "genome_id"]].rename(
-        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
-    )
-    rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)
-
-    diamond_sums.append(rgi_sum)
+    diamond_sums.append(rgi_df)
 
     # Bakta/Prokka output
     ann_tool = os.path.basename(ann).strip(".txt").lower()
@@ -273,7 +264,9 @@ def main(args=None):
         args.PHISPY,
         args.MOBSUITE,
     )
-    create_feature_profile(ann_report)
+
+    if not args.skip_profile:
+        create_feature_profile(ann_report)
 
 
 if __name__ == "__main__":
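
For reference, a sketch of invoking the updated script by hand. The option names come from the argparse definitions and the CREATE_REPORT module in this commit; every file path here is a placeholder:

    # Hypothetical inputs; only the option names are taken from this commit.
    create_report.py \
        --annotation_out BAKTA.txt \
        --diamond_outs VFDB.txt \
        --rgi_out RGI.txt \
        --vfdb_fasta vfdb_sequences.fa \
        --phispy_out PHISPY.txt \
        --mobsuite_out MOBSUITE.txt \
        --skip_profile  # new: write annotation_report.tsv.gz but skip feature_profile.tsv.gz
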
36 changes: 32 additions & 4 deletions bin/filter_alignment.py
@@ -4,7 +4,7 @@
 import sys
 import errno
 import argparse
-from pandas import read_csv
+from pandas import read_csv, read_table
 
 
 def parse_args(args=None):
@@ -20,9 +20,29 @@ def parse_args(args=None):
     )
     parser.add_argument("QCOVER", help="Minimum coverage of each match for filtering.")
     parser.add_argument("FILE_OUT", help="Output file.")
+    parser.add_argument("--parse_rgi", dest="parse_rgi", action="store_true")
     return parser.parse_args(args)
 
 
+def filter_rgi(file_in, genome_id, min_pident, min_qcover, file_out):
+    rgi_df = read_table(file_in)
+
+    rgi_df.columns = rgi_df.columns.str.replace(" ", "_")
+    min_qcover_scaled = float(min_qcover) * 100
+    rgi_sum = rgi_df[
+        (rgi_df["Best_Identities"] > int(min_pident))
+        & (rgi_df["Percentage_Length_of_Reference_Sequence"] > min_qcover_scaled)
+    ]
+    rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off"]].rename(
+        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
+    )
+    rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)
+
+    rgi_sum["genome_id"] = genome_id
+
+    rgi_sum.to_csv(file_out, sep="\t", index=False)
+
+
 def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
     df = read_csv(file_in, sep="\t", names=header.split(" "))
 
@@ -45,9 +65,17 @@ def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
 
 def main(args=None):
     args = parse_args(args)
-    filter_alignment(
-        args.FILE_IN, args.GENOME, args.HEADER, args.PIDENT, args.QCOVER, args.FILE_OUT
-    )
+    if args.parse_rgi:
+        filter_rgi(args.FILE_IN, args.GENOME, args.PIDENT, args.QCOVER, args.FILE_OUT)
+    else:
+        filter_alignment(
+            args.FILE_IN,
+            args.GENOME,
+            args.HEADER,
+            args.PIDENT,
+            args.QCOVER,
+            args.FILE_OUT,
+        )
 
 
 if __name__ == "__main__":
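
A sketch of the two resulting code paths, with placeholder inputs; the positional order matches the call in the FILTER_AND_CONCAT_MATCHES module below. In --parse_rgi mode the HEADER argument appears to be accepted but unused, and filter_rgi() rescales the 0-1 min_qcover fraction to a percentage before comparing it against Percentage_Length_of_Reference_Sequence:

    # DIAMOND-style table: the header string is split on spaces to name the columns.
    filter_alignment.py sample1_VFDB.txt sample1 'qseqid sseqid pident' 60 0.6 sample1_filtered.txt

    # RGI table: --parse_rgi routes to filter_rgi() instead of filter_alignment().
    filter_alignment.py sample1_rgi.txt sample1 'unused' 60 0.6 sample1_rgi_filtered.txt --parse_rgi
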
16 changes: 8 additions & 8 deletions conf/modules.config
@@ -107,7 +107,7 @@ process {
         ]
     }
 
-    withName: CONCAT_PROKKA {
+    withName: PROKKA_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/prokka/" },
             mode: params.publish_dir_mode,
@@ -123,7 +123,7 @@ process {
         ]
     }
 
-    withName: CONCAT_RGI {
+    withName: RGI_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/rgi" },
             mode: params.publish_dir_mode,
@@ -159,7 +159,7 @@ process {
         ]
     }
 
-    withName: CONCAT_BAKTA {
+    withName: BAKTA_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/bakta/" },
             mode: params.publish_dir_mode,
@@ -177,31 +177,31 @@ process {
         ]
     }
 
-    withName: '.*CAZY_FILTER:CONCAT_OUTPUT' {
+    withName: '.*CAZY_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/cazy/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('CAZY.txt') ? filename : null }
         ]
     }
 
-    withName: '.*VFDB_FILTER:CONCAT_OUTPUT' {
+    withName: '.*VFDB_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/vfdb/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('VFDB.txt') ? filename : null }
         ]
     }
 
-    withName: '.*BACMET_FILTER:CONCAT_OUTPUT' {
+    withName: '.*BACMET_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/bacmet/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('BACMET.txt') ? filename : null }
         ]
     }
 
-    withName: '.*ICEBERG_FILTER:CONCAT_OUTPUT' {
+    withName: '.*ICEBERG_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/iceberg2/" },
             mode: params.publish_dir_mode,
@@ -265,7 +265,7 @@ process {
         ]
     }
 
-    withName: CONCAT_PHISPY {
+    withName: PHISPY_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/phispy/" },
             mode: params.publish_dir_mode,
2 changes: 1 addition & 1 deletion conf/test.config
@@ -24,7 +24,7 @@ params {
     use_prokka = true
     skip_kraken = true
     skip_poppunk = true
-    annotation_tools = 'mobsuite,rgi,vfdb'
+    annotation_tools = 'mobsuite,rgi,vfdb,report'
 }
 
 process {
25 changes: 17 additions & 8 deletions modules/local/add_genome_column.nf
@@ -1,32 +1,41 @@
 process ADD_GENOME_COLUMN {
-    tag "$meta.id"
     label "process_low"
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
+        container "docker://docker.io/biocontainers/seaborn:0.12.2_cv1"
     } else {
-        container "quay.io/biocontainers/pandas:1.4.3"
+        container "docker.io/biocontainers/seaborn:0.12.2_cv1"
     }
 
     input:
-    tuple val(meta), path(tsv)
+    path tsv
     val dbname
     val skip_n_rows
+    val header_line
 
     output:
-    tuple val(meta), path("${prefix}.txt"), emit: txt
+    path("${prefix}.txt"), emit: txt
 
     script:
     def args = task.ext.args ?: ''
-    prefix = task.ext.prefix ?: "${meta.id}_${dbname}"
+    prefix = task.ext.prefix ?: "${dbname}"
 
     """
-    add_column.py $tsv $meta.id $skip_n_rows ${prefix}.txt
+    for table in \$(ls $tsv); do
+        sampleid=\$(basename \$table .tsv | sed 's/_${dbname}.*//Ig')
+        add_column.py \$table "\${sampleid}" $skip_n_rows "\${sampleid}_added.txt"
+    done
+    sed -s '1,${header_line}d' *_added.txt > no_header.txt
+    sed -sn ${header_line}p *_added.txt | uniq > header.txt
+    cat header.txt no_header.txt > ${prefix}.txt
     """
 
     stub:
-    prefix = task.ext.prefix ?: "${meta.id}_${dbname}"
+    prefix = task.ext.prefix ?: "${dbname}"
     """
     touch ${prefix}.txt
     """
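
The sed/cat pipeline above strips the header from each per-sample table, keeps a single copy of the shared header, and concatenates the rest. A toy illustration with header_line = 1, assuming GNU sed (both -s, which treats input files separately, and the I substitution flag are GNU extensions):

    printf 'orf\tgenome_id\na\ts1\n' > s1_added.txt
    printf 'orf\tgenome_id\nb\ts2\n' > s2_added.txt
    sed -s '1,1d' *_added.txt > no_header.txt   # drop line 1 of EACH input file
    sed -sn 1p *_added.txt | uniq > header.txt  # line 1 of each file, duplicates collapsed
    cat header.txt no_header.txt                # one header row, then rows "a s1" and "b s2"
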
10 changes: 6 additions & 4 deletions modules/local/create_report.nf
@@ -1,6 +1,5 @@
 process CREATE_REPORT {
-    label 'process_high'
-    label 'process_high_memory'
+    label 'process_medium'
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
@@ -16,20 +15,23 @@ process CREATE_REPORT {
     path vfdb_fasta
     path phispy_output
     path mobsuite_output
+    val skip_profile
 
     output:
     path("annotation_report.tsv.gz"), emit: report
-    path("feature_profile.tsv.gz"), emit: profile
+    path("feature_profile.tsv.gz"), emit: profile, optional: true
 
     script:
+    def skip = skip_profile ? "--skip_profile" : ""
     """
     create_report.py \\
         --annotation_out $annotation \\
         --diamond_outs $diamond_results \\
         --rgi_out $rgi_output \\
         --vfdb_fasta $vfdb_fasta \\
         --phispy_out $phispy_output \\
-        --mobsuite_out $mobsuite_output
+        --mobsuite_out $mobsuite_output \\
+        $skip
     """
 
     stub:
30 changes: 21 additions & 9 deletions modules/local/filter_matches.nf
@@ -1,34 +1,46 @@
-process FILTER_MATCHES {
-    tag "$meta.id"
+process FILTER_AND_CONCAT_MATCHES {
     label "process_low"
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
+        container "docker://docker.io/biocontainers/seaborn:0.12.2_cv1"
     } else {
-        container "quay.io/biocontainers/pandas:1.4.3"
+        container "docker.io/biocontainers/seaborn:0.12.2_cv1"
     }
 
     input:
-    tuple val(meta), path(aln)
+    path aln
     val dbname
     val header
     val pident
     val qcover
+    val header_line
+    val parse_rgi
 
     output:
-    tuple val(meta), path("*.txt"), emit: txt
+    path("${prefix}.txt"), emit: txt
 
     script:
+    def is_rgi = parse_rgi ? "--parse_rgi" : ""
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
+    prefix = task.ext.prefix ?: "${dbname}"
 
     """
-    filter_alignment.py ${aln} '${meta.id}' '${header}' \\
-        ${pident} ${qcover} ${prefix}.txt
+    for aln in \$(ls $aln); do
+        sampleid=\$(basename \$aln .txt | sed 's/_${dbname}//Ig')
+        filter_alignment.py \$aln "\${sampleid}" '${header}' \\
+            ${pident} ${qcover} "\${sampleid}_filtered.txt" $is_rgi
+    done
+    sed -s '1,${header_line}d' *_filtered.txt > no_header.txt
+    sed -sn ${header_line}p *_filtered.txt | uniq > header.txt
+    cat header.txt no_header.txt > ${prefix}.txt
    """
 
     stub:
+    prefix = task.ext.prefix ?: "${dbname}"
     """
     touch ${prefix}.txt
     """
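
The loop recovers each sample ID by stripping the database suffix from the file name, case-insensitively. A quick check of that expression, again assuming GNU sed (the I flag on s/// is a GNU extension):

    basename SRR123_VFDB.txt .txt | sed 's/_vfdb//Ig'   # -> SRR123
    basename srr123_vfdb.txt .txt | sed 's/_vfdb//Ig'   # -> srr123
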
4 changes: 2 additions & 2 deletions nextflow.config
@@ -34,10 +34,10 @@ params {
     // Annotation parameters
     use_prokka = false
     bakta_db = null
-    annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy'
+    annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
     min_pident = 60
     min_qcover = 0.6
-    skip_phispy = false
+    skip_profile_creation = false
 
     // References
     reference_genome = null
10 changes: 5 additions & 5 deletions nextflow_schema.json
@@ -84,10 +84,10 @@
         "properties": {
             "annotation_tools": {
                 "type": "string",
-                "default": "mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy",
+                "default": "mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report",
                 "fa_icon": "fas fa-tools",
                 "description": "Comma-separated list of annotation tools to run",
-                "pattern": "^((rgi|mobsuite|islandpath|phispy|vfdb|cazy|bacmet|iceberg|integronfinder)?,?)*(?<!,)$"
+                "pattern": "^((rgi|mobsuite|islandpath|phispy|vfdb|cazy|bacmet|iceberg|integronfinder|report)?,?)*(?<!,)$"
             },
             "bakta_db": {
                 "type": "string",
@@ -112,10 +112,10 @@
                 "fa_icon": "fas fa-equals",
                 "description": "Minimum coverage of each match for filtering"
             },
-            "skip_phispy": {
+            "skip_profile_creation": {
                 "type": "boolean",
-                "fa_icon": "fas fa-virus",
-                "description": "Skip PhiSpy phage detection tool"
+                "description": "Skip annotation feature profile creation",
+                "fa_icon": "fas fa-forward"
             }
         }
     },
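
With these schema changes the report becomes an opt-in entry in --annotation_tools and the feature profile a separately skippable step. A hypothetical invocation; the pipeline handle and the --input flag are placeholders, since neither appears in this diff:

    nextflow run <owner/pipeline> -profile docker \
        --input samplesheet.csv \
        --annotation_tools 'mobsuite,rgi,vfdb,report' \
        --skip_profile_creation
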