Release 3.5.1

- Add DisinFinder output
- Accept Nanopore reads for PointFinder (but prefer contigs)
kcri-tz · Sep 21, 2022 · 3f25d79 · 3f25d79
1 parent 6a9fcbf
commit 3f25d79
Show file tree

Hide file tree

Showing 20 changed files with 129 additions and 132 deletions.
diff --git a/Dockerfile.patch b/Dockerfile.patch
@@ -31,8 +31,8 @@ WORKDIR /usr/src
 
 # CGE Services
 #COPY ext/resfinder ext/
-COPY ext/resfinder ext/resfinder
-RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." && cd .. && rm -rf resfinder
+#COPY ext/resfinder ext/resfinder
+#RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." && cd .. && rm -rf resfinder
 #COPY ext/choleraefinder/choleraefinder.py ext/choleraefinder/
 
 # KCRI BAP package
@@ -42,13 +42,13 @@ RUN cd ext/resfinder && pip3 install --no-color --no-cache-dir --no-deps "." &&
 #COPY src/kcri/bap/services.py kcri/bap/
 #COPY src/kcri/bap/shims/ResFinder.py kcri/bap/shims/
 #COPY src/kcri/bap/shims/PointFinder.py kcri/bap/shims/
-#COPY src/kcri/bap/shims/DisinfFinder.py kcri/bap/shims/
+#COPY src/kcri/bap/shims/DisinFinder.py kcri/bap/shims/
 #COPY src/kcri/bap/shims/CholeraeFinder.py kcri/bap/shims/
 #COPY src/kcri/bap/shims/base.py kcri/bap/shims/
 #COPY src/kcri/bap/shims/pMLST.py kcri/bap/shims/
 #COPY src/kcri/bap/shims/cgMLSTFinder.py kcri/bap/shims/
-#COPY src ./
-#RUN python3 setup.py install
+COPY src ./
+RUN python3 setup.py install
 
 # Drop down to the original user and workdir
 USER nobody:nogroup

diff --git a/TODO b/TODO
@@ -1,14 +1,11 @@
 TODO
 
 - ResFinder with species if available (uses the pheno table)
-- DisinfFinder output to the summary
-- Fix Nanopore with DisinfFinder (test-06, fails to find database?)
 - Add Medaka so we can do full assembly (but GPU for speed)
 - Add Polypolish so we can do hybrid assembly
 - Add Nanopore tests
 - Add spa-type, SCCmecFinder
 - Add SPIFinder
 - Add FimTyper, CHTyper, PAst
-- Add disinf-finder
 - Conda variant
 - HTCondor backend
diff --git a/src/kcri/bap/BAP.py b/src/kcri/bap/BAP.py
@@ -305,13 +305,15 @@ def main():
             'species': commasep(b.get_detected_species([])),
             'mlst': commasep(b.get_mlsts()),
             'amr_cls': commasep(b.get_amr_classes()),
-            'amr_phe': commasep(b.get_amr_phenotypes()),
-            'amr_gen': commasep(b.get_amr_genes()),
+            'amr_res': commasep(b.get_amr_antibiotics()),
+            'dis_res': commasep(b.get_dis_resistances()),
             'vir_gen': commasep(b.get_virulence_genes()),
             'plasmid': commasep(b.get_detected_plasmids([])),
             'pmlsts': commasep(b.get_pmlsts()),
             'cgst': commasep(b.get_cgmlsts()),
-            'amr_mut': commasep(b.get_amr_mutations())
+            'amr_gen': commasep(b.get_amr_genes()),
+            'amr_mut': commasep(b.get_amr_mutations()),
+            'dis_gen': commasep(b.get_dis_genes())
             })
         print('#', '\t'.join(d.keys()), file=f_tsv)
         print('\t'.join(map(lambda v: str(v) if v else '', d.values())), file=f_tsv)

diff --git a/src/kcri/bap/__init__.py b/src/kcri/bap/__init__.py
@@ -1,2 +1,2 @@
 __all__ = [ 'BAP', 'data', 'services', 'shims', 'workflow' ]
-__version__ = "3.5.0"
+__version__ = "3.5.1"
diff --git a/src/kcri/bap/data.py b/src/kcri/bap/data.py
@@ -210,24 +210,36 @@ def add_amr_gene(self, gene):
     def get_amr_genes(self):
         return sorted(self.get('bap/summary/amr_genes', []))
 
-    def add_amr_classes(self, classes):
+    def add_amr_class(self, classes):
         self.append_to('bap/summary/amr_classes', classes, True)
 
     def get_amr_classes(self):
         return sorted(self.get('bap/summary/amr_classes', []))
 
-    def add_amr_phenotype(self, pheno):
-        self.append_to('bap/summary/amr_phenotypes', pheno, True)
+    def add_amr_antibiotic(self, pheno):
+        self.append_to('bap/summary/amr_antibiotics', pheno, True)
 
-    def get_amr_phenotypes(self):
-        return sorted(self.get('bap/summary/amr_phenotypes', []))
+    def get_amr_antibiotics(self):
+        return sorted(self.get('bap/summary/amr_antibiotics', []))
 
     def add_amr_mutation(self, mut):
         self.append_to('bap/summary/amr_mutations', mut, True)
 
     def get_amr_mutations(self):
         return sorted(self.get('bap/summary/amr_mutations', []))
 
+    def add_dis_gene(self, gene):
+        self.append_to('bap/summary/dis_genes', gene, True)
+
+    def get_dis_genes(self):
+        return sorted(self.get('bap/summary/dis_genes', []))
+
+    def add_dis_resistance(self, dis):
+        self.append_to('bap/summary/dis_resistances', dis, True)
+
+    def get_dis_resistances(self):
+        return sorted(self.get('bap/summary/dis_resistances', []))
+
     # cgMLST
 
     def add_cgmlst(self, scheme, st, pct):

diff --git a/src/kcri/bap/shims/.DisinFinder.py.swp b/src/kcri/bap/shims/.DisinFinder.py.swp
diff --git a/src/kcri/bap/shims/.PointFinder.py.swp b/src/kcri/bap/shims/.PointFinder.py.swp
diff --git a/src/kcri/bap/shims/DisinFinder.py b/src/kcri/bap/shims/DisinFinder.py
@@ -36,12 +36,10 @@ def execute(self, sid, xid, blackboard, scheduler):
                 # Set the thresholds from the RF parameters; we have no params specific to DF.
                 # We could be fancy and offer separate settings, but then if we migrate to a
                 # single Resistance 'all-in-one', we can't pass them to ResFinder separately.
-                # It's no big deal; we mention in BAP help that these are for both Res and Disin.
+                # No big deal; we point out in BAP help that these are for both Res and Disin.
                 '-t', execution.get_user_input('rf_i'),
                 '-l', execution.get_user_input('rf_c'),
                 '--acq_overlap', execution.get_user_input('rf_o'),
-                '--threshold_point', execution.get_user_input('rf_i'),
-                '--min_cov_point', execution.get_user_input('rf_c'),
                 '-j', 'disinfinder.json',
                 '-o', '.' ]
 
@@ -50,10 +48,12 @@ def execute(self, sid, xid, blackboard, scheduler):
             if illufqs:
                 for f in illufqs:
                     params.extend(['--inputfastq', f])
+            elif execution.get_contigs_path(""):
+                params.extend(['--inputfasta', execution.get_contigs_path()])
             elif execution.get_nanofq_path(""):
                 params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
-            else:
-                params.extend(['--inputfasta', execution.get_contigs_path()])
+            else: # the end is nigh
+                raise UserException("no input data to analyse")
 
             job_spec = JobSpec('resfinder', params, MAX_CPU, MAX_MEM, MAX_TIM)
             execution.store_job_spec(job_spec.as_dict())
@@ -87,38 +87,40 @@ def collect_output(self, job):
 
         res_out = dict()
 
-        # ResFinder JSON output since 4.2 has top-level elements
-        # 'genes', 'seq_variations', and 'phenotypes'.
-        # We include these but change them from objects to lists.  So this:
-        #    'genes' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
-        # becomes:
-        #    'genes' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
-        # This is cleaner design (they have list semantics, not object), and
-        # avoids issues with keys such as "aac(6')-Ib;;..." that are bound
-        # to create issues down the line as they contain JSON delimiters.
-
-        json_out = job.file_path('disinfinder.json')
+        out_file = job.file_path('disinfinder.json')
         try:
-            with open(json_out, 'r') as f: json_in = json.load(f)
+            with open(out_file, 'r') as f: json_in = json.load(f)
         except Exception as e:
             logging.exception(e)
-            self.fail('failed to open or load JSON from file: %s' % json_out)
+            self.fail('failed to open or load JSON from file: %s' % out_file)
             return
 
-        # Append to the result dictionary, converting as documented above.
+        # ResFinder since 4.2 has standardised JSON with these elements:
+        # - seq_regions (loci with AMR-causing genes or mutations)
+        # - seq_variations (mutations keying into seq_regions)
+        # - phenotypes (antibiotic resistances, keying into above)
+
+        # We include these but change them from objects to lists, so this:
+        #   'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
+        # becomes:
+        #   'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
+        # This is cleaner design (they have list semantics, not object), and
+        # avoids issues downstream with keys containing JSON delimiters.
+
         for k, v in json_in.items():
-            if k in ['genes','seq_variations','phenotypes']:
+            if k in ['seq_regions','seq_variations','phenotypes']:
                 res_out[k] = [ o for o in v.values() ]
             else:
                 res_out[k] = v
 
-#        # Store the genes, classes and phenotypes in the summary
-#        for g in res_out.get('genes', []):
-#            self._blackboard.add_amr_gene(g.get('name','unknown'))
-#        # Store the classes and phenotypes in the summary
-#        for p in filter(lambda d: d.get('resistant', False), res_out.get('phenotypes', [])):
-#            self._blackboard.add_amr_classes(p.get('amr_classes',[]))
-#            self._blackboard.add_amr_phenotype(p.get('resistance','unknown'))
+        # Helpers to retrieve gene names g for regions r causing phenotype p
+        r2g = lambda r: json_in.get('seq_regions',{}).get(r,{}).get('name')
+        p2gs = lambda p: filter(None, map(r2g, p.get('seq_regions', [])))
+
+        # Store the resistant phenotypes and causative genes for summary
+        for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
+            for g in p2gs(p): self._blackboard.add_dis_gene(g)
+            self._blackboard.add_dis_resistance(p.get('amr_resistance','?unspecified?'))
 
         # Store the results on the blackboard
         self.store_results(res_out)

diff --git a/src/kcri/bap/shims/PointFinder.py b/src/kcri/bap/shims/PointFinder.py
@@ -53,10 +53,12 @@ def execute(self, sid, xid, blackboard, scheduler):
             if illufqs:
                 for f in illufqs:
                     params.extend(['--inputfastq', f])
+            elif execution.get_contigs_path(""):
+                params.extend(['--inputfasta', execution.get_contigs_path()])
             elif execution.get_nanofq_path(""):
                 params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
-            else:
-                params.extend(['--inputfasta', execution.get_contigs_path()])
+            else: # LPT1 is on fire
+                raise
 
             # Parse list of user specified genes and check with DB
             for g in filter(None, execution.get_user_input('pt_g',"").split(',')):
@@ -123,69 +125,43 @@ def collect_output(self, job):
 
         res_out = dict()
 
-        # The provisional JSON we parse further down does not yet link sequence variations
-        # to phenotypes, so for now we parse PointFinder_results.txt:
-        # NOTE 2021-04-12: seems this data in now partly present, but without e.g. PMID
-        #    Mutation        Nucleotide change       Amino acid change       Resistance      PMID
-        #    gyrA p.S83L     TCG -> TTG      S -> L  Nalidixic acid,Ciprofloxacin    8891148
-        #    gyrA p.D87N     GAC -> AAC      D -> N  Nalidixic acid,Nalidixic acid,Ciprofloxacin     12654733
-        #    parC p.S80I     AGC -> ATC      S -> I  Nalidixic acid,Ciprofloxacin    8851598
-        #    parE p.S458T    TCG -> ACG      S -> T  Nalidixic acid,Ciprofloxacin    14506034
-
-        tab_out = job.file_path('PointFinder_results.txt')
+        out_file = job.file_path('pointfinder.json')
         try:
-            res_out['findings'] = list()
-            with open(tab_out, newline='') as f:
-                reader = csv.DictReader(f, delimiter='\t')
-                for row in reader:
-                    # Append the mutation to the findings list
-                    res_out['findings'].append(dict({
-                        'mutation': row['Mutation'],
-                        'nt_change': row['Nucleotide change'],
-                        'aa_change': row['Amino acid change'],
-                        'resistance': sorted(list(set(row.get('Resistance','').split(',')))),
-                        'pmid': row.get('PMID')}))
-                    # Append the mutation to the summary info on the blackboard
-                    self._blackboard.add_amr_mutation(row['Mutation'])
+            with open(out_file, 'r') as f: json_in = json.load(f)
         except Exception as e:
             logging.exception(e)
-            self.fail('failed to open or read file: %s' % tab_out)
+            self.fail('failed to open or load JSON from file: %s' % out_file)
             return
 
-        # ResFinder and PointFinder have provisional standardised output in 
-        # 'std_format_under_development.json', which has top-level elements
-        # 'seq_regions', 'seq_variations', and 'phenotypes'.
-        # We include these but change them from objects to lists.  So this:
-        #    'seq_regions' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
+        # ResFinder since 4.2 has standardised JSON with these elements:
+        # - seq_regions (loci with AMR-causing genes or mutations)
+        # - seq_variations (mutations keying into seq_regions)
+        # - phenotypes (antibiotic resistances, keying into above)
+
+        # We include these but change them from objects to lists, so this:
+        #   'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
         # becomes:
-        #    'seq_regions' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
+        #   'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
         # This is cleaner design (they have list semantics, not object), and
-        # avoids issues with keys such as "aac(6')-Ib;;..." that are bound
-        # to create issues down the line as they contain JSON delimiters.
-
-        json_out = job.file_path('pointfinder.json')
-        try:
-            with open(json_out, 'r') as f: json_in = json.load(f)
-        except Exception as e:
-            logging.exception(e)
-            self.fail('failed to open or load JSON from file: %s' % json_out)
-            return
+        # avoids issues downstream with keys containing JSON delimiters.
 
-        # Append to the result dictionary, converting as documented above.
         for k, v in json_in.items():
             if k in ['seq_regions','seq_variations','phenotypes']:
                 res_out[k] = [ o for o in v.values() ]
             else:
                 res_out[k] = v
 
-        # Store the classes and phenotypes in the summary
+        # Helpers to retrieve mutation ids m for variant v causing phenotype p
+        v2m = lambda v: json_in.get('seq_variations',{}).get(v,{}).get('seq_var')
+        p2ms = lambda p: filter(None, map(v2m, p.get('seq_variations', [])))
+
+        # Store the resistant phenotypes and causative mutations for the summary output
+        # Note that a lot more information is present, including PMID references and notes
         for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
-            self._blackboard.add_amr_classes(p.get('amr_classes',[]))
-            self._blackboard.add_amr_phenotype(p.get('amr_resistance','unknown'))
-        # We don't store the seq_variations until we know their phenotype (instead do above)
-        #for m in res_out.get('seq_variations', []):
-        #    self._blackboard.add_amr_mutation('%s:%s' % (m.get('genes',['?'])[0], m.get('seq_var','?')))
+            for g in p2ms(p): self._blackboard.add_amr_mutation(g)
+            for c in p.get('amr_classes',[]): self._blackboard.add_amr_class(c)
+            self._blackboard.add_amr_antibiotic(p.get('amr_resistance','?unspecified?'))
 
-        # Store the results on the blackboard
+        # Store on the blackboard
         self.store_results(res_out)
 
diff --git a/src/kcri/bap/shims/ResFinder.py b/src/kcri/bap/shims/ResFinder.py
@@ -43,10 +43,12 @@ def execute(self, sid, xid, blackboard, scheduler):
             if illufqs:
                 for f in illufqs:
                     params.extend(['--inputfastq', f])
+            elif execution.get_contigs_path(""):
+                params.extend(['--inputfasta', execution.get_contigs_path()])
             elif execution.get_nanofq_path(""):
                 params.extend(['--nanopore', '--inputfastq', execution.get_nanofq_path()])
-            else:
-                params.extend(['--inputfasta', execution.get_contigs_path()])
+            else: # expect the unexpected
+                raise UserException("no input data to analyse")
 
             job_spec = JobSpec('resfinder', params, MAX_CPU, MAX_MEM, MAX_TIM)
             execution.store_job_spec(job_spec.as_dict())
@@ -96,19 +98,7 @@ def collect_output(self, job):
         '''Collect the job output and put on blackboard.
            This method is called by super().report() once job is done.'''
 
-        # ResFinder and PointFinder had provisional standardised output in 
-        # 'std_format_under_development.json', now configurable with '-j'.
-        # It had top-level elements 'genes', 'seq_variations', and 'phenotypes'.
-        # Since 4.2.1 'genes' is 'seq_regions', and has 'gene': true.
-        # We include these but change them from objects to lists.  So this:
-        #    'genes' : { 'aph(6)-Id;;1;;M28829': { ..., 'key' : 'aph(6)-Id;;1;;M28829', ...
-        # becomes:
-        #    'genes' : [ { ..., 'key' : 'aph(6)-Id;;1;;M28829', ... }, ...]
-        # This is cleaner design (they have list semantics, not object), and
-        # avoids issues with keys such as "aac(6')-Ib;;..." that are bound
-        # to create issues down the line as they contain JSON delimiters.
-
-        # TODO: the tables have more info than the JSON, extract or fix json?
+        res_out = dict()
 
         out_file = job.file_path('resfinder.json')
         try:
@@ -118,21 +108,35 @@ def collect_output(self, job):
             self.fail('failed to open or load JSON from file: %s' % out_file)
             return
 
-        # Produce the result dictionary, converting as documented above.
-        res_out = dict()
+        # ResFinder since 4.2 has standardised JSON with these elements:
+        # - seq_regions (loci with AMR-causing genes or mutations)
+        # - seq_variations (mutations keying into seq_regions)
+        # - phenotypes (antibiotic resistances, keying into above)
+
+        # We include these but change them from objects to lists, so this:
+        #   'seq_regions' : { 'XYZ': { ..., 'key' : 'XYZ', ...
+        # becomes:
+        #   'seq_regions' : [ { ..., 'key' : 'XYZ', ... }, ...]
+        # This is cleaner design (they have list semantics, not object), and
+        # avoids issues downstream with keys containing JSON delimiters.
+
         for k, v in json_in.items():
             if k in ['seq_regions','seq_variations','phenotypes']:
                 res_out[k] = [ o for o in v.values() ]
             else:
                 res_out[k] = v
 
-        # Store the genes/seq_regions, classes and phenetypes in the summary
-        for g in res_out.get('seq_regions', []):
-            self._blackboard.add_amr_gene(g.get('name','unknown'))
+        # Helpers to retrieve gene names g for regions r causing phenotype p
+        r2g = lambda r: json_in.get('seq_regions',{}).get(r,{}).get('name')
+        p2gs = lambda p: filter(None, map(r2g, p.get('seq_regions', [])))
+
+        # Store the resistant phenotypes and causative regions for the summary output
+        # Note that a lot more information is present, including PMIDs and notes
         for p in filter(lambda d: d.get('amr_resistant', False), res_out.get('phenotypes', [])):
-            self._blackboard.add_amr_classes(p.get('amr_classes',[]))
-            self._blackboard.add_amr_phenotype(p.get('amr_resistance','unknown'))
+            for g in p2gs(p): self._blackboard.add_amr_gene(g)
+            for c in p.get('amr_classes',[]): self._blackboard.add_amr_class(c)
+            self._blackboard.add_amr_antibiotic(p.get('amr_resistance','?unspecified?'))
 
-        # Store the results on the blackboard
+        # Store on the blackboard
         self.store_results(res_out)