From 016e17e046acef16e762843eae5d7d920a3097f0 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 21 Jun 2018 18:41:29 -0700 Subject: [PATCH] migrate command (update old indexes) (#494) * migrate cmd * Bump to 2.0.0a8 * Add a test for empty sigs, and fix min_n_below == 0 in update methods --- sourmash/VERSION | 2 +- sourmash/__main__.py | 5 +- sourmash/commands.py | 12 +++ sourmash/logging.py | 12 +++ sourmash/sbt.py | 122 +++++++++++++++++------ sourmash/sbtmh.py | 8 +- tests/test-data/sbt-search-bug/empty.sig | 1 + tests/test_sourmash.py | 52 +++++++++- 8 files changed, 177 insertions(+), 37 deletions(-) create mode 100644 tests/test-data/sbt-search-bug/empty.sig diff --git a/sourmash/VERSION b/sourmash/VERSION index 4b23e04d2f..55a349a6be 100644 --- a/sourmash/VERSION +++ b/sourmash/VERSION @@ -1 +1 @@ -2.0.0a7 +2.0.0a8 diff --git a/sourmash/__main__.py b/sourmash/__main__.py index 60c575e057..72a8a731b2 100644 --- a/sourmash/__main__.py +++ b/sourmash/__main__.py @@ -9,7 +9,7 @@ from .commands import (categorize, compare, compute, dump, import_csv, gather, index, sbt_combine, search, - plot, watch, info, storage) + plot, watch, info, storage, migrate) from .lca import main as lca_main usage=''' @@ -57,7 +57,8 @@ def main(): 'watch': watch, 'sbt_combine': sbt_combine, 'info': info, 'storage': storage, - 'lca': lca_main} + 'lca': lca_main, + 'migrate': migrate} parser = argparse.ArgumentParser( description='work with compressed sequence representations') parser.add_argument('command', nargs='?') diff --git a/sourmash/commands.py b/sourmash/commands.py index b52d3c93f2..ad13a4d30a 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -1160,3 +1160,15 @@ def storage(args): set_quiet(args.quiet) if args.command == 'convert': convert_cmd(args.sbt, args.backend) + + +def migrate(args): + parser = argparse.ArgumentParser() + parser.add_argument('sbt_name', help='name to save SBT into') + + args = parser.parse_args(args) + + tree = load_sbt_index(args.sbt_name, print_version_warning=False) + + notify('saving SBT under "{}".', args.sbt_name) + tree.save(args.sbt_name, structure_only=True) diff --git a/sourmash/logging.py b/sourmash/logging.py index f6a55f8814..98bb5dbe23 100644 --- a/sourmash/logging.py +++ b/sourmash/logging.py @@ -28,6 +28,18 @@ def notify(s, *args, **kwargs): sys.stderr.flush() +def debug(s, *args, **kwargs): + "A debug logging function => stderr." + if _quiet: + return + + print(u'\r\033[K', end=u'', file=sys.stderr) + print(s.format(*args, **kwargs), file=sys.stderr, + end=kwargs.get('end', u'\n')) + if kwargs.get('flush'): + sys.stderr.flush() + + def error(s, *args, **kwargs): "A simple error logging function => stderr." print(u'\r\033[K', end=u'', file=sys.stderr) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 453133f32d..e0c67298d6 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -55,7 +55,7 @@ def search_transcript(node, seq, threshold): import khmer from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage -from .logging import error, notify +from .logging import error, notify, debug STORAGES = { @@ -222,7 +222,9 @@ def _rebuild_node(self, pos=0): if c.pos in self.missing_nodes or isinstance(c.node, Leaf): if c.node is None: self._rebuild_node(c.pos) - self.nodes[c.pos].update(node) + c_node = self.nodes[c.pos] + if c_node is not None: + c_node.update(node) self.missing_nodes.remove(pos) @@ -286,7 +288,7 @@ def child(self, parent, pos): node = self.nodes.get(cd, None) return NodePos(cd, node) - def save(self, path, storage=None, sparseness=0.0): + def save(self, path, storage=None, sparseness=0.0, structure_only=False): """Saves an SBT description locally and node data to a storage. Parameters @@ -300,6 +302,9 @@ def save(self, path, storage=None, sparseness=0.0): How much of the internal nodes should be saved. Defaults to 0.0 (save all internal nodes data), can go up to 1.0 (don't save any internal nodes data) + structure_only: boolean + Write only the index schema and metadata, but not the data. + Defaults to False (save data too) Returns ------- @@ -349,14 +354,22 @@ def save(self, path, storage=None, sparseness=0.0): 'filename': os.path.basename(node.name), 'name': node.name } + + try: + node.metadata.pop('max_n_below') + except (AttributeError, KeyError): + pass + data['metadata'] = node.metadata - # trigger data loading before saving to the new place - node.data + if structure_only is False: + # trigger data loading before saving to the new place + node.data - node.storage = storage + node.storage = storage + + data['filename'] = node.save(data['filename']) - data['filename'] = node.save(data['filename']) structure[i] = data notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r') @@ -369,7 +382,7 @@ def save(self, path, storage=None, sparseness=0.0): return fn @classmethod - def load(cls, location, leaf_loader=None, storage=None): + def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): """Load an SBT description from a file. Parameters @@ -423,10 +436,11 @@ def load(cls, location, leaf_loader=None, storage=None): if version < 3 and storage is None: storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name)) - return loaders[version](jnodes, leaf_loader, dirname, storage) + return loaders[version](jnodes, leaf_loader, dirname, storage, + print_version_warning) @staticmethod - def _load_v1(jnodes, leaf_loader, dirname, storage): + def _load_v1(jnodes, leaf_loader, dirname, storage, print_version_warning=True): if jnodes[0] is None: raise ValueError("Empty tree!") @@ -457,7 +471,7 @@ def _load_v1(jnodes, leaf_loader, dirname, storage): return tree @classmethod - def _load_v2(cls, info, leaf_loader, dirname, storage): + def _load_v2(cls, info, leaf_loader, dirname, storage, print_version_warning=True): nodes = {int(k): v for (k, v) in info['nodes'].items()} if nodes[0] is None: @@ -489,7 +503,7 @@ def _load_v2(cls, info, leaf_loader, dirname, storage): return tree @classmethod - def _load_v3(cls, info, leaf_loader, dirname, storage): + def _load_v3(cls, info, leaf_loader, dirname, storage, print_version_warning=True): nodes = {int(k): v for (k, v) in info['nodes'].items()} if not nodes: @@ -526,12 +540,15 @@ def _load_v3(cls, info, leaf_loader, dirname, storage): # TODO: this might not be true with combine... tree.next_node = max_node + if print_version_warning: + error("WARNING: this is an old index version, please run `sourmash migrate` to update it.") + error("WARNING: proceeding with execution, but it will take longer to finish!") tree._fill_min_n_below() return tree @classmethod - def _load_v4(cls, info, leaf_loader, dirname, storage): + def _load_v4(cls, info, leaf_loader, dirname, storage, print_version_warning=True): nodes = {int(k): v for (k, v) in info['nodes'].items()} if not nodes: @@ -575,25 +592,66 @@ def _fill_min_n_below(self): Propagate the smallest hash size below each node up the tree from the leaves. """ - for i, n in self.nodes.items(): - if isinstance(n, Leaf): - parent = self.parent(i) - if parent.pos not in self.missing_nodes: - min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize) - min_n_below = min(len(n.data.minhash.get_mins()), - min_n_below) - parent.node.metadata['min_n_below'] = min_n_below - - current = parent - parent = self.parent(parent.pos) - while parent and parent.pos not in self.missing_nodes: - min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize) - min_n_below = min(current.node.metadata['min_n_below'], - min_n_below) - parent.node.metadata['min_n_below'] = min_n_below - current = parent - parent = self.parent(parent.pos) + def fill_min_n_below(node, *args, **kwargs): + original_min_n_below = node.metadata.get('min_n_below', sys.maxsize) + min_n_below = original_min_n_below + + children = kwargs['children'] + for child in children: + if child.node is not None: + if isinstance(child.node, Leaf): + min_n_below = min(len(child.node.data.minhash), min_n_below) + else: + child_n = child.node.metadata.get('min_n_below', sys.maxsize) + min_n_below = min(child_n, min_n_below) + + if min_n_below == 0: + min_n_below = 1 + + node.metadata['min_n_below'] = min_n_below + return original_min_n_below != min_n_below + + self._fill_up(fill_min_n_below) + + def _fill_up(self, search_fn, *args, **kwargs): + visited, queue = set(), [i[0] for i in reversed(sorted(self._leaves()))] + debug("started filling up") + processed = 0 + while queue: + node_p = queue.pop(0) + + parent = self.parent(node_p) + if parent is None: + # we are in the root, no more nodes available to search + assert len(queue) == 0 + return + + was_missing = False + if parent.node is None: + if parent.pos in self.missing_nodes: + self._rebuild_node(parent.pos) + parent = self.parent(node_p) + was_missing = True + else: + continue + + siblings = self.children(parent.pos) + + if node_p not in visited: + visited.add(node_p) + for sibling in siblings: + visited.add(sibling.pos) + try: + queue.remove(sibling.pos) + except ValueError: + pass + + if search_fn(parent.node, children=siblings, *args) or was_missing: + queue.append(parent.pos) + processed += 1 + if processed % 100 == 0: + debug("processed {}, in queue {}", processed, len(queue), sep='\r') def print_dot(self): print(""" @@ -747,6 +805,8 @@ def update(self, parent): parent.data.update(self.data) min_n_below = min(parent.metadata.get('min_n_below', sys.maxsize), self.metadata.get('min_n_below')) + if min_n_below == 0: + min_n_below = 1 parent.metadata['min_n_below'] = min_n_below diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 946d0f08fe..69a70d473c 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -8,9 +8,10 @@ from . import signature -def load_sbt_index(filename): +def load_sbt_index(filename, print_version_warning=True): "Load and return an SBT index." - return SBT.load(filename, leaf_loader=SigLeaf.load) + return SBT.load(filename, leaf_loader=SigLeaf.load, + print_version_warning=print_version_warning) def create_sbt_index(bloom_filter_size=1e5, n_children=2): @@ -59,6 +60,9 @@ def update(self, parent): min_n_below = min(len(self.data.minhash.get_mins()), min_n_below) + if min_n_below == 0: + min_n_below = 1 + parent.metadata['min_n_below'] = min_n_below @property diff --git a/tests/test-data/sbt-search-bug/empty.sig b/tests/test-data/sbt-search-bug/empty.sig new file mode 100644 index 0000000000..fbcb09a575 --- /dev/null +++ b/tests/test-data/sbt-search-bug/empty.sig @@ -0,0 +1 @@ +[{"class":"sourmash_signature","email":"","filename":"empty.fa","hash_function":"0.murmur64","license":"CC0","name":"empty sig","signatures":[{"ksize":31,"max_hash":18446744073709552,"md5sum":"c16a5320fa475530d9583c34fd356ef5","mins":[],"molecule":"DNA","num":0,"seed":42}],"version":0.4}] diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 8617535393..de5da3cac4 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -14,7 +14,7 @@ from . import sourmash_tst_utils as utils import sourmash_lib from sourmash_lib import MinHash -from sourmash_lib.sbt import SBT +from sourmash_lib.sbt import SBT, Node from sourmash_lib.sbtmh import SigLeaf, load_sbt_index try: import matplotlib @@ -1274,6 +1274,30 @@ def test_do_sourmash_sbt_search_check_bug(): assert tree.nodes[0].metadata['min_n_below'] == 431 +def test_do_sourmash_sbt_search_empty_sig(): + with utils.TempDirectory() as location: + # mins: 431 + testdata1 = utils.get_test_data('sbt-search-bug/nano.sig') + + # mins: 0 + testdata2 = utils.get_test_data('sbt-search-bug/empty.sig') + + status, out, err = utils.runscript('sourmash', + ['index', 'zzz', '-k', '31', + testdata1, testdata2], + in_directory=location) + + assert os.path.exists(os.path.join(location, 'zzz.sbt.json')) + + status, out, err = utils.runscript('sourmash', + ['search', testdata1, 'zzz'], + in_directory=location) + assert '1 matches:' in out + + tree = load_sbt_index(os.path.join(location, 'zzz.sbt.json')) + assert tree.nodes[0].metadata['min_n_below'] == 1 + + def test_do_sourmash_sbt_move_and_search_output(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -3215,6 +3239,32 @@ def test_storage_convert_fsstorage_newpath(): sorted(identity.nodes.items()))) +def test_migrate(): + with utils.TempDirectory() as location: + testdata = utils.get_test_data('v3.sbt.json') + shutil.copyfile(testdata, os.path.join(location, 'v3.sbt.json')) + shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v3'), + os.path.join(location, '.sbt.v3')) + testsbt = os.path.join(location, 'v3.sbt.json') + + original = SBT.load(testsbt, leaf_loader=SigLeaf.load) + + status, out, err = utils.runscript('sourmash', ['migrate', testsbt], + in_directory=location) + + identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) + + assert len(original.nodes) == len(identity.nodes) + assert all(n1[1].name == n2[1].name + for (n1, n2) in zip(sorted(original.nodes.items()), + sorted(identity.nodes.items()))) + + assert "this is an old index version" not in err + assert all('min_n_below' in node.metadata + for node in identity.nodes.values() + if isinstance(node, Node)) + + def test_license_cc0(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa')