sourmash-bio · bluegenes · Feb 9, 2023 · Feb 7, 2023 · Feb 8, 2023 · Feb 8, 2023
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -661,7 +661,7 @@ sourmash `kreport` columns:
 - `Estimated base pairs contained in taxon`: The cumulative estimated base pairs for this taxon and all descendants.
 - `Estimated base pairs "assigned" (species-level)`: The estimated base pairs assigned at species-level (cumulative count of base pairs assigned to individual genomes in this species).
 - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
-- [blank column]: (`NCBI Taxon ID` is not currently reported).
+- `NCBI Taxon ID`: The NCBI taxonomy ID of the taxon. Reported (v4.7+) only when the taxonomy file provides NCBI taxids via a `taxpath` column; otherwise blank.
 - `Scientific Name`: The scientific name of the taxon.
 
 notes:

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -360,9 +360,9 @@ def grep(args):
 
     # determine if lineage matches.
     def find_pattern(lineage, select_rank):
-        for (rank, name) in lineage:
-            if select_rank is None or rank == select_rank:
-                if pattern.search(name):
+        for lp in lineage:
+            if select_rank is None or lp.rank == select_rank:
+                if pattern.search(lp.name):
                     return True
         return False
 

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -80,7 +80,7 @@ def __post_init__(self):
             self._init_empty()
 
     def __eq__(self, other):
-        if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False
+        if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False
             return False
         return all([self.ranks == other.ranks and self.lineage==other.lineage])
 
@@ -757,6 +757,9 @@ def load(cls, filename, *, delimiter=',', force=False,
             # is "strain" an available rank?
             if "strain" in header:
                 include_strain=True
+            load_taxids=False
+            if 'taxpath' in header:
+                load_taxids=True
 
             # check that all ranks are in header
             ranks = list(lca_utils.taxlist(include_strain=include_strain))
@@ -775,10 +778,15 @@ def load(cls, filename, *, delimiter=',', force=False,
             for n, row in enumerate(r):
                 num_rows += 1
                 lineage = []
+                taxid=None
                 # read row into a lineage pair
-                for rank in lca_utils.taxlist(include_strain=include_strain):
+                if load_taxids:
+                    taxpath = row['taxpath'].split('|')
+                for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)):
                     lin = row[rank]
-                    lineage.append(lca_utils.LineagePair(rank, lin))
+                    if load_taxids:
+                        taxid = taxpath[n]
+                    lineage.append(LineagePair(rank, name=lin, taxid=taxid))
                 ident = row[identifier]
 
                 # fold, spindle, and mutilate ident?
@@ -787,8 +795,8 @@ def load(cls, filename, *, delimiter=',', force=False,
                                   keep_identifier_versions=keep_identifier_versions)
 
                 # clean lineage of null names, replace with 'unassigned'
-                lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ]
+                lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ]
+                lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ]
 
                 # remove end nulls
                 while lineage and lineage[-1].name == 'unassigned':
@@ -942,7 +950,7 @@ def load(cls, location):
 
     def _make_tup(self, row):
         "build a tuple of LineagePairs for this sqlite row"
-        tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
+        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
         return tuple(tup)
 
     def __getitem__(self, ident):

diff --git a/tests/test-data/tax/test.ncbi-taxonomy.csv b/tests/test-data/tax/test.ncbi-taxonomy.csv
@@ -0,0 +1,7 @@
+ident,taxid,superkingdom,phylum,class,order,family,genus,species,strain,taxpath
+GCF_001881345.1,562,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,,2|1224|1236|91347|543|561|562|
+GCF_009494285.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179|
+GCF_013368705.1,821,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Phocaeicola,Phocaeicola vulgatus,,2|976|200643|171549|815|909656|821|
+GCF_003471795.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179|
+GCF_000017325.1,402882,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS185,2|1224|1236|135622|267890|22|62322|402882
+GCF_000021665.1,407976,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS223,2|1224|1236|135622|267890|22|62322|407976
diff --git a/tests/test_tax.py b/tests/test_tax.py
@@ -205,6 +205,46 @@ def test_metagenome_kreport_out(runtmp):
     assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15]
 
 
+def test_metagenome_kreport_ncbi_taxid_out(runtmp):
+    # test NCBI taxid output from kreport
+    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
+    tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
+    csv_base = "out"
+    sum_csv = csv_base + ".kreport.txt"
+    csvout = runtmp.output(sum_csv)
+    outdir = os.path.dirname(csvout)
+
+    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+
+    print(runtmp.last_result.status)
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert runtmp.last_result.status == 0
+    assert os.path.exists(csvout)
+
+    kreport_results = [x.rstrip().split('\t') for x in open(csvout)]
+    assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err
+    print(kreport_results)
+    assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0]
+    assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1]
+    assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2]
+    assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3]
+    assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4]
+    assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5]
+    assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6]
+    assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7]
+    assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8]
+    assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9]
+    assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10]
+    assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11]
+    assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12]
+    assert ['1.56', '192000', '0', 'G', '909656', 'Phocaeicola'] == kreport_results[13]
+    assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14]
+    assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15]
+    assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16]
+
+
 def test_metagenome_kreport_out_lemonade(runtmp):
     # test 'kreport' kraken output format against lemonade output
     g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv')