Skip to content

Commit

Permalink
Release 3.5.1
Browse files Browse the repository at this point in the history
- Adds DisinFinder output
- Accept Nanopore reads for PointFinder (but prefer contigs)
  • Loading branch information
zwets committed Sep 21, 2022
1 parent 6a9fcbf commit 3f25d79
Show file tree
Hide file tree
Showing 20 changed files with 129 additions and 132 deletions.
10 changes: 5 additions & 5 deletions Dockerfile.patch
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ WORKDIR /usr/src

# CGE Services
#COPY ext/resfinder ext/
COPY ext/resfinder ext/resfinder
RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." && cd .. && rm -rf resfinder
#COPY ext/resfinder ext/resfinder
#RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." && cd .. && rm -rf resfinder
#COPY ext/choleraefinder/choleraefinder.py ext/choleraefinder/

# KCRI BAP package
Expand All @@ -42,13 +42,13 @@ RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." &&
#COPY src/kcri/bap/services.py kcri/bap/
#COPY src/kcri/bap/shims/ResFinder.py kcri/bap/shims/
#COPY src/kcri/bap/shims/PointFinder.py kcri/bap/shims/
#COPY src/kcri/bap/shims/DisinfFinder.py kcri/bap/shims/
#COPY src/kcri/bap/shims/DisinFinder.py kcri/bap/shims/
#COPY src/kcri/bap/shims/CholeraeFinder.py kcri/bap/shims/
#COPY src/kcri/bap/shims/base.py kcri/bap/shims/
#COPY src/kcri/bap/shims/pMLST.py kcri/bap/shims/
#COPY src/kcri/bap/shims/cgMLSTFinder.py kcri/bap/shims/
#COPY src ./
#RUN python3 setup.py install
COPY src ./
RUN python3 setup.py install

# Drop down to the original user and workdir
USER nobody:nogroup
Expand Down
3 changes: 0 additions & 3 deletions TODO
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
TODO

- ResFinder with species if available (uses the pheno table)
- DisinfFinder output to the summary
- Fix Nanopore with DisinfFinder (test-06, fails to find database?)
- Add Medaka so we can do full assembly (but GPU for speed)
- Add Polypolish so we can do hybrid assembly
- Add Nanopore tests
- Add spa-type, SCCmecFinder
- Add SPIFinder
- Add FimTyper, CHTyper, PAst
- Add disinf-finder
- Conda variant
- HTCondor backend
8 changes: 5 additions & 3 deletions src/kcri/bap/BAP.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,13 +305,15 @@ def main():
'species': commasep(b.get_detected_species([])),
'mlst': commasep(b.get_mlsts()),
'amr_cls': commasep(b.get_amr_classes()),
'amr_phe': commasep(b.get_amr_phenotypes()),
'amr_gen': commasep(b.get_amr_genes()),
'amr_res': commasep(b.get_amr_antibiotics()),
'dis_res': commasep(b.get_dis_resistances()),
'vir_gen': commasep(b.get_virulence_genes()),
'plasmid': commasep(b.get_detected_plasmids([])),
'pmlsts': commasep(b.get_pmlsts()),
'cgst': commasep(b.get_cgmlsts()),
'amr_mut': commasep(b.get_amr_mutations())
'amr_gen': commasep(b.get_amr_genes()),
'amr_mut': commasep(b.get_amr_mutations()),
'dis_gen': commasep(b.get_dis_genes())
})
print('#', '\t'.join(d.keys()), file=f_tsv)
print('\t'.join(map(lambda v: str(v) if v else '', d.values())), file=f_tsv)
Expand Down
2 changes: 1 addition & 1 deletion src/kcri/bap/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = [ 'BAP', 'data', 'services', 'shims', 'workflow' ]
__version__ = "3.5.0"
__version__ = "3.5.1"
22 changes: 17 additions & 5 deletions src/kcri/bap/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,24 +210,36 @@ def add_amr_gene(self, gene):
def get_amr_genes(self):
return sorted(self.get('bap/summary/amr_genes', []))

def add_amr_classes(self, classes):
def add_amr_class(self, classes):
self.append_to('bap/summary/amr_classes', classes, True)

def get_amr_classes(self):
return sorted(self.get('bap/summary/amr_classes', []))

def add_amr_phenotype(self, pheno):
self.append_to('bap/summary/amr_phenotypes', pheno, True)
def add_amr_antibiotic(self, pheno):
self.append_to('bap/summary/amr_antibiotics', pheno, True)

def get_amr_phenotypes(self):
return sorted(self.get('bap/summary/amr_phenotypes', []))
def get_amr_antibiotics(self):
return sorted(self.get('bap/summary/amr_antibiotics', []))

def add_amr_mutation(self, mut):
self.append_to('bap/summary/amr_mutations', mut, True)

def get_amr_mutations(self):
return sorted(self.get('bap/summary/amr_mutations', []))

def add_dis_gene(self, gene):
self.append_to('bap/summary/dis_genes', gene, True)

def get_dis_genes(self):
return sorted(self.get('bap/summary/dis_genes', []))

def add_dis_resistance(self, dis):
self.append_to('bap/summary/dis_resistances', dis, True)

def get_dis_resistances(self):
return sorted(self.get('bap/summary/dis_resistances', []))

# cgMLST

def add_cgmlst(self, scheme, st, pct):
Expand Down
Binary file added src/kcri/bap/shims/.DisinFinder.py.swp
Binary file not shown.
Binary file added src/kcri/bap/shims/.PointFinder.py.swp
Binary file not shown.
56 changes: 29 additions & 27 deletions src/kcri/bap/shims/DisinFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,10 @@ def execute(self, sid, xid, blackboard, scheduler):
# Set the thresholds from the RF parameters; we have no params specific to DF.
# We could be fancy and offer separate settings, but then if we migrate to a
# single Resistance 'all-in-one', we can't pass them to ResFinder separately.
# It's no big deal; we mention in BAP help that these are for both Res and Disin.
# No big deal; we point out in BAP help that these are for both Res and Disin.
'-t', execution.get_user_input('rf_i'),
'-l', execution.get_user_input('rf_c'),
'--acq_overlap', execution.get_user_input('rf_o'),
'--threshold_point', execution.get_user_input('rf_i'),
'--min_cov_point', execution.get_user_input('rf_c'),
'-j', 'disinfinder.json',
'-o', '.' ]

Expand All @@ -50,10 +48,12 @@ def execute(self, sid, xid, blackboard, scheduler):
if illufqs:
for f in illufqs:
params.extend(['--inputfastq', f])
elif execution.get_contigs_path(""):
params.extend(['--inputfasta', execution.get_contigs_path()])
elif execution.get_nanofq_path(""):
params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
else:
params.extend(['--inputfasta', execution.get_contigs_path()])
else: # the end is nigh
raise UserException("no input data to analyse")

job_spec = JobSpec('resfinder', params, MAX_CPU, MAX_MEM, MAX_TIM)
execution.store_job_spec(job_spec.as_dict())
Expand Down Expand Up @@ -87,38 +87,40 @@ def collect_output(self, job):

res_out = dict()

# ResFinder JSON output since 4.2 has top-level elements
# 'genes', 'seq_variations', and 'phenotypes'.
# We include these but change them from objects to lists. So this:
# 'genes' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
# becomes:
# 'genes' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
# This is cleaner design (they have list semantics, not object), and
# avoids issues with keys such as "aac(6')-Ib;;..." that are bound
# to create issues down the line as they contain JSON delimiters.

json_out = job.file_path('disinfinder.json')
out_file = job.file_path('disinfinder.json')
try:
with open(json_out, 'r') as f: json_in = json.load(f)
with open(out_file, 'r') as f: json_in = json.load(f)
except Exception as e:
logging.exception(e)
self.fail('failed to open or load JSON from file: %s' % json_out)
self.fail('failed to open or load JSON from file: %s' % out_file)
return

# Append to the result dictionary, converting as documented above.
# ResFinder since 4.2 has standardised JSON with these elements:
# - seq_regions (loci with AMR-causing genes or mutations)
# - seq_variations (mutations keying into seq_regions)
# - phenotypes (antibiotic resistances, keying into above)

# We include these but change them from objects to lists, so this:
# 'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
# becomes:
# 'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
# This is cleaner design (they have list semantics, not object), and
# avoids issues downstream with keys containing JSON delimiters.

for k, v in json_in.items():
if k in ['genes','seq_variations','phenotypes']:
if k in ['seq_regions','seq_variations','phenotypes']:
res_out[k] = [ o for o in v.values() ]
else:
res_out[k] = v

# # Store the genes, classes and phenotypes in the summary
# for g in res_out.get('genes', []):
# self._blackboard.add_amr_gene(g.get('name','unknown'))
# # Store the classes and phenotypes in the summary
# for p in filter(lambda d: d.get('resistant', False), res_out.get('phenotypes', [])):
# self._blackboard.add_amr_classes(p.get('amr_classes',[]))
# self._blackboard.add_amr_phenotype(p.get('resistance','unknown'))
# Helpers to retrieve gene names g for regions r causing phenotype p
r2g = lambda r: json_in.get('seq_regions',{}).get(r,{}).get('name')
p2gs = lambda p: filter(None, map(r2g, p.get('seq_regions', [])))

# Store the resistant phenotypes and causative genes for summary
for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
for g in p2gs(p): self._blackboard.add_dis_gene(g)
self._blackboard.add_dis_resistance(p.get('amr_resistance','?unspecified?'))

# Store the results on the blackboard
self.store_results(res_out)
Expand Down
76 changes: 26 additions & 50 deletions src/kcri/bap/shims/PointFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ def execute(self, sid, xid, blackboard, scheduler):
if illufqs:
for f in illufqs:
params.extend(['--inputfastq', f])
elif execution.get_contigs_path(""):
params.extend(['--inputfasta', execution.get_contigs_path()])
elif execution.get_nanofq_path(""):
params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
else:
params.extend(['--inputfasta', execution.get_contigs_path()])
else: # LPT1 is on fire
raise

# Parse list of user specified genes and check with DB
for g in filter(None, execution.get_user_input('pt_g',"").split(',')):
Expand Down Expand Up @@ -123,69 +125,43 @@ def collect_output(self, job):

res_out = dict()

# The provisional JSON we parse further down does not yet link sequence variations
# to phenotypes, so for now we parse PointFinder_results.txt:
# NOTE 2021-04-12: seems this data is now partly present, but without e.g. PMID
# Mutation Nucleotide change Amino acid change Resistance PMID
# gyrA p.S83L TCG -> TTG S -> L Nalidixic acid,Ciprofloxacin 8891148
# gyrA p.D87N GAC -> AAC D -> N Nalidixic acid,Nalidixic acid,Ciprofloxacin 12654733
# parC p.S80I AGC -> ATC S -> I Nalidixic acid,Ciprofloxacin 8851598
# parE p.S458T TCG -> ACG S -> T Nalidixic acid,Ciprofloxacin 14506034

tab_out = job.file_path('PointFinder_results.txt')
out_file = job.file_path('pointfinder.json')
try:
res_out['findings'] = list()
with open(tab_out, newline='') as f:
reader = csv.DictReader(f, delimiter='\t')
for row in reader:
# Append the mutation to the findings list
res_out['findings'].append(dict({
'mutation': row['Mutation'],
'nt_change': row['Nucleotide change'],
'aa_change': row['Amino acid change'],
'resistance': sorted(list(set(row.get('Resistance','').split(',')))),
'pmid': row.get('PMID')}))
# Append the mutation to the summary info on the blackboard
self._blackboard.add_amr_mutation(row['Mutation'])
with open(out_file, 'r') as f: json_in = json.load(f)
except Exception as e:
logging.exception(e)
self.fail('failed to open or read file: %s' % tab_out)
self.fail('failed to open or load JSON from file: %s' % out_file)
return

# ResFinder and PointFinder have provisional standardised output in
# 'std_format_under_development.json', which has top-level elements
# 'seq_regions', 'seq_variations', and 'phenotypes'.
# We include these but change them from objects to lists. So this:
# 'seq_regions' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
# ResFinder since 4.2 has standardised JSON with these elements:
# - seq_regions (loci with AMR-causing genes or mutations)
# - seq_variations (mutations keying into seq_regions)
# - phenotypes (antibiotic resistances, keying into above)

# We include these but change them from objects to lists, so this:
# 'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
# becomes:
# 'seq_regions' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
# 'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
# This is cleaner design (they have list semantics, not object), and
# avoids issues with keys such as "aac(6')-Ib;;..." that are bound
# to create issues down the line as they contain JSON delimiters.

json_out = job.file_path('pointfinder.json')
try:
with open(json_out, 'r') as f: json_in = json.load(f)
except Exception as e:
logging.exception(e)
self.fail('failed to open or load JSON from file: %s' % json_out)
return
# avoids issues downstream with keys containing JSON delimiters.

# Append to the result dictionary, converting as documented above.
for k, v in json_in.items():
if k in ['seq_regions','seq_variations','phenotypes']:
res_out[k] = [ o for o in v.values() ]
else:
res_out[k] = v

# Store the classes and phenotypes in the summary
# Helpers to retrieve mutation ids m for variant v causing phenotype p
v2m = lambda v: json_in.get('seq_variations',{}).get(v,{}).get('seq_var')
p2ms = lambda p: filter(None, map(v2m, p.get('seq_variations', [])))

# Store the resistant phenotypes and causative mutations for the summary output
# Note that a lot more information is present, including PMID references and notes
for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
self._blackboard.add_amr_classes(p.get('amr_classes',[]))
self._blackboard.add_amr_phenotype(p.get('amr_resistance','unknown'))
# We don't store the seq_variations until we know their phenotype (instead do above)
#for m in res_out.get('seq_variations', []):
# self._blackboard.add_amr_mutation('%s:%s' % (m.get('genes',['?'])[0], m.get('seq_var','?')))
for g in p2ms(p): self._blackboard.add_amr_mutation(g)
for c in p.get('amr_classes',[]): self._blackboard.add_amr_class(c)
self._blackboard.add_amr_antibiotic(p.get('amr_resistance','?unspecified?'))

# Store the results on the blackboard
# Store on the blackboard
self.store_results(res_out)

50 changes: 27 additions & 23 deletions src/kcri/bap/shims/ResFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ def execute(self, sid, xid, blackboard, scheduler):
if illufqs:
for f in illufqs:
params.extend(['--inputfastq', f])
elif execution.get_contigs_path(""):
params.extend(['--inputfasta', execution.get_contigs_path()])
elif execution.get_nanofq_path(""):
params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
else:
params.extend(['--inputfasta', execution.get_contigs_path()])
else: # expect the unexpected
raise UserException("no input data to analyse")

job_spec = JobSpec('resfinder', params, MAX_CPU, MAX_MEM, MAX_TIM)
execution.store_job_spec(job_spec.as_dict())
Expand Down Expand Up @@ -96,19 +98,7 @@ def collect_output(self, job):
'''Collect the job output and put on blackboard.
This method is called by super().report() once job is done.'''

# ResFinder and PointFinder had provisional standardised output in
# 'std_format_under_development.json', now configurable with '-j'.
# It had top-level elements 'genes', 'seq_variations', and 'phenotypes'.
# Since 4.2.1 'genes' is 'seq_regions', and has 'gene': true.
# We include these but change them from objects to lists. So this:
# 'genes' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
# becomes:
# 'genes' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
# This is cleaner design (they have list semantics, not object), and
# avoids issues with keys such as "aac(6')-Ib;;..." that are bound
# to create issues down the line as they contain JSON delimiters.

# TODO: the tables have more info than the JSON, extract or fix json?
res_out = dict()

out_file = job.file_path('resfinder.json')
try:
Expand All @@ -118,21 +108,35 @@ def collect_output(self, job):
self.fail('failed to open or load JSON from file: %s' % out_file)
return

# Produce the result dictionary, converting as documented above.
res_out = dict()
# ResFinder since 4.2 has standardised JSON with these elements:
# - seq_regions (loci with AMR-causing genes or mutations)
# - seq_variations (mutations keying into seq_regions)
# - phenotypes (antibiotic resistances, keying into above)

# We include these but change them from objects to lists, so this:
# 'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
# becomes:
# 'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
# This is cleaner design (they have list semantics, not object), and
# avoids issues downstream with keys containing JSON delimiters.

for k, v in json_in.items():
if k in ['seq_regions','seq_variations','phenotypes']:
res_out[k] = [ o for o in v.values() ]
else:
res_out[k] = v

# Store the genes/seq_regions, classes and phenotypes in the summary
for g in res_out.get('seq_regions', []):
self._blackboard.add_amr_gene(g.get('name','unknown'))
# Helpers to retrieve gene names g for regions r causing phenotype p
r2g = lambda r: json_in.get('seq_regions',{}).get(r,{}).get('name')
p2gs = lambda p: filter(None, map(r2g, p.get('seq_regions', [])))

# Store the resistant phenotypes and causative regions for the summary output
# Note that a lot more information is present, including PMIDs and notes
for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
self._blackboard.add_amr_classes(p.get('amr_classes',[]))
self._blackboard.add_amr_phenotype(p.get('amr_resistance','unknown'))
for g in p2gs(p): self._blackboard.add_amr_gene(g)
for c in p.get('amr_classes',[]): self._blackboard.add_amr_class(c)
self._blackboard.add_amr_antibiotic(p.get('amr_resistance','?unspecified?'))

# Store the results on the blackboard
# Store on the blackboard
self.store_results(res_out)

Loading

0 comments on commit 3f25d79

Please sign in to comment.