refactor: Use collect output in filter/add_column and optional report (#139)

* refactor: Integrate concat and filter matches in one process

* refactor: Change image for filter_matches

* fix: Change output module name

* fix: Correct string interpolation

* refactor: Change add column to work with collected output

* fix: Change sed regex

* fix: Change sed regex

* refactor: Make sed expression case-insensitive

* fix: Correct annotation trace size

* feat: Make report creation optional

* feat: Add option to skip profile creation

* feat: Pre-filter RGI outputs

* refactor: Temporarily remove high_memory label from report

* fix: Add missing arg in concatenate_matches subwf

* fix: Correct types in filtering script

* fix: Remove reference to column before creation

* refactor: Make filter matches case insensitive

* refactor: Remove rgi filtering from create_report

* refactor: Make profile optional out

* test: Add report creation to test
jvfe committed Jun 26, 2023
1 parent cf46732 commit 9d16d34
Showing 12 changed files with 153 additions and 123 deletions.
17 changes: 5 additions & 12 deletions bin/create_report.py
@@ -47,6 +47,7 @@ def parse_args(args=None):
         nargs="?",
         const=None,
     )
+    parser.add_argument("--skip_profile", dest="skip_profile", action="store_true")
     return parser.parse_args(args)


@@ -118,17 +119,7 @@ def create_report(ann, diamond_outs, rgi, vfdb_fasta, phispy, mobsuite):
 
     # RGI output
     rgi_df = read_table(rgi)
-    rgi_df.columns = rgi_df.columns.str.replace(" ", "_")
-    rgi_sum = rgi_df[
-        (rgi_df["Best_Identities"] > 80)
-        & (rgi_df["Percentage_Length_of_Reference_Sequence"] > 80)
-    ]
-    rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off", "genome_id"]].rename(
-        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
-    )
-    rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)
-
-    diamond_sums.append(rgi_sum)
+    diamond_sums.append(rgi_df)
 
     # Bakta/Prokka output
     ann_tool = os.path.basename(ann).strip(".txt").lower()
@@ -273,7 +264,9 @@ def main(args=None):
         args.PHISPY,
         args.MOBSUITE,
     )
-    create_feature_profile(ann_report)
+
+    if not args.skip_profile:
+        create_feature_profile(ann_report)
 
 
 if __name__ == "__main__":
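
For reference, a sketch of invoking the updated script by hand. The option names come from the argparse definitions and the CREATE_REPORT module in this commit; every file path here is a placeholder:

    # Hypothetical inputs; only the option names are taken from this commit.
    create_report.py \
        --annotation_out BAKTA.txt \
        --diamond_outs VFDB.txt \
        --rgi_out RGI.txt \
        --vfdb_fasta vfdb_sequences.fa \
        --phispy_out PHISPY.txt \
        --mobsuite_out MOBSUITE.txt \
        --skip_profile  # new: write annotation_report.tsv.gz but skip feature_profile.tsv.gz
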
36 changes: 32 additions & 4 deletions bin/filter_alignment.py
@@ -4,7 +4,7 @@
 import sys
 import errno
 import argparse
-from pandas import read_csv
+from pandas import read_csv, read_table
 
 
 def parse_args(args=None):
@@ -20,9 +20,29 @@ def parse_args(args=None):
     )
     parser.add_argument("QCOVER", help="Minimum coverage of each match for filtering.")
     parser.add_argument("FILE_OUT", help="Output file.")
+    parser.add_argument("--parse_rgi", dest="parse_rgi", action="store_true")
     return parser.parse_args(args)
 
 
+def filter_rgi(file_in, genome_id, min_pident, min_qcover, file_out):
+    rgi_df = read_table(file_in)
+
+    rgi_df.columns = rgi_df.columns.str.replace(" ", "_")
+    min_qcover_scaled = float(min_qcover) * 100
+    rgi_sum = rgi_df[
+        (rgi_df["Best_Identities"] > int(min_pident))
+        & (rgi_df["Percentage_Length_of_Reference_Sequence"] > min_qcover_scaled)
+    ]
+    rgi_sum = rgi_sum[["Contig", "Best_Hit_ARO", "Cut_Off"]].rename(
+        columns={"Contig": "orf", "Best_Hit_ARO": "AMR", "Cut_Off": "rgi_cutoff"}
+    )
+    rgi_sum["orf"] = rgi_sum["orf"].str.rsplit("_", n=1).str.get(0)
+
+    rgi_sum["genome_id"] = genome_id
+
+    rgi_sum.to_csv(file_out, sep="\t", index=False)
+
+
 def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
     df = read_csv(file_in, sep="\t", names=header.split(" "))
 
@@ -45,9 +65,17 @@ def filter_alignment(file_in, genome_id, header, min_pident, min_qcover, file_out):
 
 def main(args=None):
     args = parse_args(args)
-    filter_alignment(
-        args.FILE_IN, args.GENOME, args.HEADER, args.PIDENT, args.QCOVER, args.FILE_OUT
-    )
+    if args.parse_rgi:
+        filter_rgi(args.FILE_IN, args.GENOME, args.PIDENT, args.QCOVER, args.FILE_OUT)
+    else:
+        filter_alignment(
+            args.FILE_IN,
+            args.GENOME,
+            args.HEADER,
+            args.PIDENT,
+            args.QCOVER,
+            args.FILE_OUT,
+        )
 
 
 if __name__ == "__main__":
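
A sketch of the two resulting code paths, with placeholder inputs; the positional order matches the call in the FILTER_AND_CONCAT_MATCHES module below. In --parse_rgi mode the HEADER argument appears to be accepted but unused, and filter_rgi() rescales the 0-1 min_qcover fraction to a percentage before comparing it against Percentage_Length_of_Reference_Sequence:

    # DIAMOND-style table: the header string is split on spaces to name the columns.
    filter_alignment.py sample1_VFDB.txt sample1 'qseqid sseqid pident' 60 0.6 sample1_filtered.txt

    # RGI table: --parse_rgi routes to filter_rgi() instead of filter_alignment().
    filter_alignment.py sample1_rgi.txt sample1 'unused' 60 0.6 sample1_rgi_filtered.txt --parse_rgi
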
16 changes: 8 additions & 8 deletions conf/modules.config
@@ -107,7 +107,7 @@ process {
         ]
     }
 
-    withName: CONCAT_PROKKA {
+    withName: PROKKA_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/prokka/" },
             mode: params.publish_dir_mode,
@@ -123,7 +123,7 @@ process {
         ]
     }
 
-    withName: CONCAT_RGI {
+    withName: RGI_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/rgi" },
             mode: params.publish_dir_mode,
@@ -159,7 +159,7 @@ process {
         ]
     }
 
-    withName: CONCAT_BAKTA {
+    withName: BAKTA_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/bakta/" },
             mode: params.publish_dir_mode,
@@ -177,31 +177,31 @@ process {
         ]
     }
 
-    withName: '.*CAZY_FILTER:CONCAT_OUTPUT' {
+    withName: '.*CAZY_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/cazy/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('CAZY.txt') ? filename : null }
         ]
     }
 
-    withName: '.*VFDB_FILTER:CONCAT_OUTPUT' {
+    withName: '.*VFDB_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/vfdb/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('VFDB.txt') ? filename : null }
         ]
     }
 
-    withName: '.*BACMET_FILTER:CONCAT_OUTPUT' {
+    withName: '.*BACMET_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/bacmet/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('BACMET.txt') ? filename : null }
         ]
     }
 
-    withName: '.*ICEBERG_FILTER:CONCAT_OUTPUT' {
+    withName: '.*ICEBERG_FILTER:FILTER_AND_CONCAT_MATCHES' {
         publishDir = [
             path: { "${params.outdir}/annotation/iceberg2/" },
             mode: params.publish_dir_mode,
@@ -265,7 +265,7 @@ process {
         ]
     }
 
-    withName: CONCAT_PHISPY {
+    withName: PHISPY_ADD_COLUMN {
         publishDir = [
             path: { "${params.outdir}/annotation/phispy/" },
             mode: params.publish_dir_mode,
2 changes: 1 addition & 1 deletion conf/test.config
@@ -24,7 +24,7 @@ params {
     use_prokka = true
     skip_kraken = true
     skip_poppunk = true
-    annotation_tools = 'mobsuite,rgi,vfdb'
+    annotation_tools = 'mobsuite,rgi,vfdb,report'
 }
 
 process {
25 changes: 17 additions & 8 deletions modules/local/add_genome_column.nf
@@ -1,32 +1,41 @@
 process ADD_GENOME_COLUMN {
-    tag "$meta.id"
     label "process_low"
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
+        container "docker://docker.io/biocontainers/seaborn:0.12.2_cv1"
     } else {
-        container "quay.io/biocontainers/pandas:1.4.3"
+        container "docker.io/biocontainers/seaborn:0.12.2_cv1"
     }
 
     input:
-    tuple val(meta), path(tsv)
+    path tsv
     val dbname
     val skip_n_rows
+    val header_line
 
     output:
-    tuple val(meta), path("${prefix}.txt"), emit: txt
+    path("${prefix}.txt"), emit: txt
 
     script:
     def args = task.ext.args ?: ''
-    prefix = task.ext.prefix ?: "${meta.id}_${dbname}"
+    prefix = task.ext.prefix ?: "${dbname}"
 
     """
-    add_column.py $tsv $meta.id $skip_n_rows ${prefix}.txt
+    for table in \$(ls $tsv); do
+        sampleid=\$(basename \$table .tsv | sed 's/_${dbname}.*//Ig')
+        add_column.py \$table "\${sampleid}" $skip_n_rows "\${sampleid}_added.txt"
+    done
+    sed -s '1,${header_line}d' *_added.txt > no_header.txt
+    sed -sn ${header_line}p *_added.txt | uniq > header.txt
+    cat header.txt no_header.txt > ${prefix}.txt
     """
 
     stub:
-    prefix = task.ext.prefix ?: "${meta.id}_${dbname}"
+    prefix = task.ext.prefix ?: "${dbname}"
     """
     touch ${prefix}.txt
     """
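
The sed/cat pipeline above strips the header from each per-sample table, keeps a single copy of the shared header, and concatenates the rest. A toy illustration with header_line = 1, assuming GNU sed (both -s, which treats input files separately, and the I substitution flag are GNU extensions):

    printf 'orf\tgenome_id\na\ts1\n' > s1_added.txt
    printf 'orf\tgenome_id\nb\ts2\n' > s2_added.txt
    sed -s '1,1d' *_added.txt > no_header.txt   # drop line 1 of EACH input file
    sed -sn 1p *_added.txt | uniq > header.txt  # line 1 of each file, duplicates collapsed
    cat header.txt no_header.txt                # one header row, then rows "a s1" and "b s2"
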
10 changes: 6 additions & 4 deletions modules/local/create_report.nf
@@ -1,6 +1,5 @@
 process CREATE_REPORT {
-    label 'process_high'
-    label 'process_high_memory'
+    label 'process_medium'
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
@@ -16,20 +15,23 @@ process CREATE_REPORT {
     path vfdb_fasta
     path phispy_output
     path mobsuite_output
+    val skip_profile
 
     output:
     path("annotation_report.tsv.gz"), emit: report
-    path("feature_profile.tsv.gz"), emit: profile
+    path("feature_profile.tsv.gz"), emit: profile, optional: true
 
     script:
+    def skip = skip_profile ? "--skip_profile" : ""
     """
     create_report.py \\
         --annotation_out $annotation \\
         --diamond_outs $diamond_results \\
         --rgi_out $rgi_output \\
         --vfdb_fasta $vfdb_fasta \\
         --phispy_out $phispy_output \\
-        --mobsuite_out $mobsuite_output
+        --mobsuite_out $mobsuite_output \\
+        $skip
     """
 
     stub:
30 changes: 21 additions & 9 deletions modules/local/filter_matches.nf
@@ -1,34 +1,46 @@
-process FILTER_MATCHES {
-    tag "$meta.id"
+process FILTER_AND_CONCAT_MATCHES {
     label "process_low"
 
     conda (params.enable_conda ? "conda-forge::pandas=1.4.3" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/pandas:1.4.3"
+        container "docker://docker.io/biocontainers/seaborn:0.12.2_cv1"
     } else {
-        container "quay.io/biocontainers/pandas:1.4.3"
+        container "docker.io/biocontainers/seaborn:0.12.2_cv1"
     }
 
     input:
-    tuple val(meta), path(aln)
+    path aln
     val dbname
     val header
     val pident
     val qcover
+    val header_line
+    val parse_rgi
 
     output:
-    tuple val(meta), path("*.txt"), emit: txt
+    path("${prefix}.txt"), emit: txt
 
     script:
+    def is_rgi = parse_rgi ? "--parse_rgi" : ""
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}_${dbname}_filtered"
+    prefix = task.ext.prefix ?: "${dbname}"
 
     """
-    filter_alignment.py ${aln} '${meta.id}' '${header}' \\
-        ${pident} ${qcover} ${prefix}.txt
+    for aln in \$(ls $aln); do
+        sampleid=\$(basename \$aln .txt | sed 's/_${dbname}//Ig')
+        filter_alignment.py \$aln "\${sampleid}" '${header}' \\
+            ${pident} ${qcover} "\${sampleid}_filtered.txt" $is_rgi
+    done
+    sed -s '1,${header_line}d' *_filtered.txt > no_header.txt
+    sed -sn ${header_line}p *_filtered.txt | uniq > header.txt
+    cat header.txt no_header.txt > ${prefix}.txt
    """
 
     stub:
+    prefix = task.ext.prefix ?: "${dbname}"
     """
     touch ${prefix}.txt
     """
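
The loop recovers each sample ID by stripping the database suffix from the file name, case-insensitively. A quick check of that expression, again assuming GNU sed (the I flag on s/// is a GNU extension):

    basename SRR123_VFDB.txt .txt | sed 's/_vfdb//Ig'   # -> SRR123
    basename srr123_vfdb.txt .txt | sed 's/_vfdb//Ig'   # -> srr123
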
4 changes: 2 additions & 2 deletions nextflow.config
@@ -34,10 +34,10 @@ params {
     // Annotation parameters
     use_prokka = false
     bakta_db = null
-    annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy'
+    annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
     min_pident = 60
     min_qcover = 0.6
-    skip_phispy = false
+    skip_profile_creation = false
 
     // References
     reference_genome = null
10 changes: 5 additions & 5 deletions nextflow_schema.json
@@ -84,10 +84,10 @@
         "properties": {
             "annotation_tools": {
                 "type": "string",
-                "default": "mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy",
+                "default": "mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report",
                 "fa_icon": "fas fa-tools",
                 "description": "Comma-separated list of annotation tools to run",
-                "pattern": "^((rgi|mobsuite|islandpath|phispy|vfdb|cazy|bacmet|iceberg|integronfinder)?,?)*(?<!,)$"
+                "pattern": "^((rgi|mobsuite|islandpath|phispy|vfdb|cazy|bacmet|iceberg|integronfinder|report)?,?)*(?<!,)$"
             },
             "bakta_db": {
                 "type": "string",
@@ -112,10 +112,10 @@
                 "fa_icon": "fas fa-equals",
                 "description": "Minimum coverage of each match for filtering"
             },
-            "skip_phispy": {
+            "skip_profile_creation": {
                 "type": "boolean",
-                "fa_icon": "fas fa-virus",
-                "description": "Skip PhiSpy phage detection tool"
+                "description": "Skip annotation feature profile creation",
+                "fa_icon": "fas fa-forward"
             }
         }
     },
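
With these schema changes the report becomes an opt-in entry in --annotation_tools and the feature profile a separately skippable step. A hypothetical invocation; the pipeline handle and the --input flag are placeholders, since neither appears in this diff:

    nextflow run <owner/pipeline> -profile docker \
        --input samplesheet.csv \
        --annotation_tools 'mobsuite,rgi,vfdb,report' \
        --skip_profile_creation
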