Skip to content

Commit

Permalink
migrate command (update old indexes) (#494)
Browse files Browse the repository at this point in the history
* migrate cmd
* Bump to 2.0.0a8
* Add a test for empty sigs, and fix min_n_below == 0 in update methods
  • Loading branch information
luizirber authored Jun 22, 2018
1 parent a49456c commit 016e17e
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 37 deletions.
2 changes: 1 addition & 1 deletion sourmash/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.0a7
2.0.0a8
5 changes: 3 additions & 2 deletions sourmash/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
plot, watch, info, storage)
plot, watch, info, storage, migrate)
from .lca import main as lca_main

usage='''
Expand Down Expand Up @@ -57,7 +57,8 @@ def main():
'watch': watch,
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main}
'lca': lca_main,
'migrate': migrate}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
Expand Down
12 changes: 12 additions & 0 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,3 +1160,15 @@ def storage(args):
set_quiet(args.quiet)
if args.command == 'convert':
convert_cmd(args.sbt, args.backend)


def migrate(args):
    """Load an existing SBT index and re-save it under the same name.

    `args` is the list of command-line arguments for this subcommand; it
    must contain a single positional value, the SBT name.  The index is
    loaded with the old-version warning turned off and is then written
    back with ``structure_only=True``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    options = cli.parse_args(args)

    sbt_name = options.sbt_name

    # print_version_warning=False: do not show the version warning while
    # loading the index that is about to be re-saved.
    index = load_sbt_index(sbt_name, print_version_warning=False)

    notify('saving SBT under "{}".', sbt_name)
    index.save(sbt_name, structure_only=True)
12 changes: 12 additions & 0 deletions sourmash/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ def notify(s, *args, **kwargs):
sys.stderr.flush()


def debug(s, *args, **kwargs):
    """Write a formatted debug message to stderr unless quiet mode is on.

    The message is ``s.format(*args, **kwargs)``.  Two keyword arguments
    also control the output: ``end`` is the line terminator (a newline by
    default) and a truthy ``flush`` forces stderr to be flushed.  Any other
    keyword argument is only seen by ``str.format``.
    """
    if _quiet:
        return

    # Return to the start of the line and erase it before writing.
    print(u'\r\033[K', end=u'', file=sys.stderr)

    message = s.format(*args, **kwargs)
    terminator = kwargs.get('end', u'\n')
    print(message, file=sys.stderr, end=terminator)

    wants_flush = kwargs.get('flush')
    if wants_flush:
        sys.stderr.flush()


def error(s, *args, **kwargs):
"A simple error logging function => stderr."
print(u'\r\033[K', end=u'', file=sys.stderr)
Expand Down
122 changes: 91 additions & 31 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def search_transcript(node, seq, threshold):
import khmer

from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage
from .logging import error, notify
from .logging import error, notify, debug


STORAGES = {
Expand Down Expand Up @@ -222,7 +222,9 @@ def _rebuild_node(self, pos=0):
if c.pos in self.missing_nodes or isinstance(c.node, Leaf):
if c.node is None:
self._rebuild_node(c.pos)
self.nodes[c.pos].update(node)
c_node = self.nodes[c.pos]
if c_node is not None:
c_node.update(node)
self.missing_nodes.remove(pos)


Expand Down Expand Up @@ -286,7 +288,7 @@ def child(self, parent, pos):
node = self.nodes.get(cd, None)
return NodePos(cd, node)

def save(self, path, storage=None, sparseness=0.0):
def save(self, path, storage=None, sparseness=0.0, structure_only=False):
"""Saves an SBT description locally and node data to a storage.
Parameters
Expand All @@ -300,6 +302,9 @@ def save(self, path, storage=None, sparseness=0.0):
How much of the internal nodes should be saved.
Defaults to 0.0 (save all internal nodes data),
can go up to 1.0 (don't save any internal nodes data)
structure_only: boolean
Write only the index schema and metadata, but not the data.
Defaults to False (save data too)
Returns
-------
Expand Down Expand Up @@ -349,14 +354,22 @@ def save(self, path, storage=None, sparseness=0.0):
'filename': os.path.basename(node.name),
'name': node.name
}

try:
node.metadata.pop('max_n_below')
except (AttributeError, KeyError):
pass

data['metadata'] = node.metadata

# trigger data loading before saving to the new place
node.data
if structure_only is False:
# trigger data loading before saving to the new place
node.data

node.storage = storage
node.storage = storage

data['filename'] = node.save(data['filename'])

data['filename'] = node.save(data['filename'])
structure[i] = data

notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r')
Expand All @@ -369,7 +382,7 @@ def save(self, path, storage=None, sparseness=0.0):
return fn

@classmethod
def load(cls, location, leaf_loader=None, storage=None):
def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
"""Load an SBT description from a file.
Parameters
Expand Down Expand Up @@ -423,10 +436,11 @@ def load(cls, location, leaf_loader=None, storage=None):
if version < 3 and storage is None:
storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name))

return loaders[version](jnodes, leaf_loader, dirname, storage)
return loaders[version](jnodes, leaf_loader, dirname, storage,
print_version_warning)

@staticmethod
def _load_v1(jnodes, leaf_loader, dirname, storage):
def _load_v1(jnodes, leaf_loader, dirname, storage, print_version_warning=True):

if jnodes[0] is None:
raise ValueError("Empty tree!")
Expand Down Expand Up @@ -457,7 +471,7 @@ def _load_v1(jnodes, leaf_loader, dirname, storage):
return tree

@classmethod
def _load_v2(cls, info, leaf_loader, dirname, storage):
def _load_v2(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if nodes[0] is None:
Expand Down Expand Up @@ -489,7 +503,7 @@ def _load_v2(cls, info, leaf_loader, dirname, storage):
return tree

@classmethod
def _load_v3(cls, info, leaf_loader, dirname, storage):
def _load_v3(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if not nodes:
Expand Down Expand Up @@ -526,12 +540,15 @@ def _load_v3(cls, info, leaf_loader, dirname, storage):
# TODO: this might not be true with combine...
tree.next_node = max_node

if print_version_warning:
error("WARNING: this is an old index version, please run `sourmash migrate` to update it.")
error("WARNING: proceeding with execution, but it will take longer to finish!")
tree._fill_min_n_below()

return tree

@classmethod
def _load_v4(cls, info, leaf_loader, dirname, storage):
def _load_v4(cls, info, leaf_loader, dirname, storage, print_version_warning=True):
nodes = {int(k): v for (k, v) in info['nodes'].items()}

if not nodes:
Expand Down Expand Up @@ -575,25 +592,66 @@ def _fill_min_n_below(self):
Propagate the smallest hash size below each node up the tree from
the leaves.
"""
for i, n in self.nodes.items():
if isinstance(n, Leaf):
parent = self.parent(i)
if parent.pos not in self.missing_nodes:
min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(len(n.data.minhash.get_mins()),
min_n_below)
parent.node.metadata['min_n_below'] = min_n_below

current = parent
parent = self.parent(parent.pos)
while parent and parent.pos not in self.missing_nodes:
min_n_below = parent.node.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(current.node.metadata['min_n_below'],
min_n_below)
parent.node.metadata['min_n_below'] = min_n_below
current = parent
parent = self.parent(parent.pos)
def fill_min_n_below(node, *args, **kwargs):
original_min_n_below = node.metadata.get('min_n_below', sys.maxsize)
min_n_below = original_min_n_below

children = kwargs['children']
for child in children:
if child.node is not None:
if isinstance(child.node, Leaf):
min_n_below = min(len(child.node.data.minhash), min_n_below)
else:
child_n = child.node.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(child_n, min_n_below)

if min_n_below == 0:
min_n_below = 1

node.metadata['min_n_below'] = min_n_below
return original_min_n_below != min_n_below

self._fill_up(fill_min_n_below)

def _fill_up(self, search_fn, *args, **kwargs):
    """Apply `search_fn` to parent nodes, working upwards from the leaves.

    Positions are taken from ``self._leaves()`` in reverse sorted order and
    processed through a work queue.  For each position the parent is looked
    up; a missing parent listed in ``self.missing_nodes`` is rebuilt first,
    while a missing parent that is not listed there is skipped.

    Parameters
    ----------
    search_fn: callable
        Called as ``search_fn(parent_node, *args, children=siblings)``.
        A truthy return value puts the parent's position back on the
        queue so that its own parent gets processed as well.
    *args:
        Extra positional arguments forwarded to `search_fn`.
    **kwargs:
        Accepted for interface compatibility; not forwarded to `search_fn`.

    Returns
    -------
    None.  Processing stops when the queue is empty or when a position
    without a parent (the root) is reached.
    """
    # presumably each entry of _leaves() is a (position, leaf) pair; only
    # the first element is used, as a position -- TODO confirm
    visited, queue = set(), [i[0] for i in reversed(sorted(self._leaves()))]
    debug("started filling up")
    processed = 0
    while queue:
        node_p = queue.pop(0)

        parent = self.parent(node_p)
        if parent is None:
            # we are in the root, no more nodes available to search
            assert len(queue) == 0
            return

        was_missing = False
        if parent.node is None:
            if parent.pos in self.missing_nodes:
                self._rebuild_node(parent.pos)
                # look the parent up again now that it has been rebuilt
                parent = self.parent(node_p)
                was_missing = True
            else:
                continue

        siblings = self.children(parent.pos)

        if node_p not in visited:
            visited.add(node_p)
            # One call to search_fn covers every child of this parent, so
            # the siblings do not need to be processed separately.
            for sibling in siblings:
                visited.add(sibling.pos)
                try:
                    queue.remove(sibling.pos)
                except ValueError:
                    pass

            if search_fn(parent.node, children=siblings, *args) or was_missing:
                queue.append(parent.pos)

        processed += 1
        if processed % 100 == 0:
            # debug() honours 'end' (not 'sep'), so the carriage return has
            # to be passed as the line terminator for it to reach stderr.
            debug("processed {}, in queue {}", processed, len(queue), end='\r')

def print_dot(self):
print("""
Expand Down Expand Up @@ -747,6 +805,8 @@ def update(self, parent):
parent.data.update(self.data)
min_n_below = min(parent.metadata.get('min_n_below', sys.maxsize),
self.metadata.get('min_n_below'))
if min_n_below == 0:
min_n_below = 1
parent.metadata['min_n_below'] = min_n_below


Expand Down
8 changes: 6 additions & 2 deletions sourmash/sbtmh.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from . import signature


def load_sbt_index(filename):
def load_sbt_index(filename, print_version_warning=True):
"Load and return an SBT index."
return SBT.load(filename, leaf_loader=SigLeaf.load)
return SBT.load(filename, leaf_loader=SigLeaf.load,
print_version_warning=print_version_warning)


def create_sbt_index(bloom_filter_size=1e5, n_children=2):
Expand Down Expand Up @@ -59,6 +60,9 @@ def update(self, parent):
min_n_below = min(len(self.data.minhash.get_mins()),
min_n_below)

if min_n_below == 0:
min_n_below = 1

parent.metadata['min_n_below'] = min_n_below

@property
Expand Down
1 change: 1 addition & 0 deletions tests/test-data/sbt-search-bug/empty.sig
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"class":"sourmash_signature","email":"","filename":"empty.fa","hash_function":"0.murmur64","license":"CC0","name":"empty sig","signatures":[{"ksize":31,"max_hash":18446744073709552,"md5sum":"c16a5320fa475530d9583c34fd356ef5","mins":[],"molecule":"DNA","num":0,"seed":42}],"version":0.4}]
52 changes: 51 additions & 1 deletion tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from . import sourmash_tst_utils as utils
import sourmash_lib
from sourmash_lib import MinHash
from sourmash_lib.sbt import SBT
from sourmash_lib.sbt import SBT, Node
from sourmash_lib.sbtmh import SigLeaf, load_sbt_index
try:
import matplotlib
Expand Down Expand Up @@ -1274,6 +1274,30 @@ def test_do_sourmash_sbt_search_check_bug():
assert tree.nodes[0].metadata['min_n_below'] == 431


def test_do_sourmash_sbt_search_empty_sig():
    """Index a regular and an empty signature, then search the index.

    Checks that the search still reports one match and that the root
    node's 'min_n_below' metadata is 1.
    """
    with utils.TempDirectory() as location:
        # mins: 431
        nano_sig = utils.get_test_data('sbt-search-bug/nano.sig')

        # mins: 0
        empty_sig = utils.get_test_data('sbt-search-bug/empty.sig')

        index_cmd = ['index', 'zzz', '-k', '31', nano_sig, empty_sig]
        status, out, err = utils.runscript('sourmash', index_cmd,
                                           in_directory=location)

        index_path = os.path.join(location, 'zzz.sbt.json')
        assert os.path.exists(index_path)

        search_cmd = ['search', nano_sig, 'zzz']
        status, out, err = utils.runscript('sourmash', search_cmd,
                                           in_directory=location)
        assert '1 matches:' in out

        tree = load_sbt_index(index_path)
        root = tree.nodes[0]
        assert root.metadata['min_n_below'] == 1


def test_do_sourmash_sbt_move_and_search_output():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -3215,6 +3239,32 @@ def test_storage_convert_fsstorage_newpath():
sorted(identity.nodes.items())))


def test_migrate():
    """Run `sourmash migrate` on a copy of the v3 test index.

    The index is loaded before and after the command runs.  Both trees
    must have the same number of nodes with matching names, the command
    must not print the old-version warning, and every `Node` in the
    reloaded tree must carry 'min_n_below' in its metadata.
    """
    with utils.TempDirectory() as location:
        source_json = utils.get_test_data('v3.sbt.json')
        testsbt = os.path.join(location, 'v3.sbt.json')

        # Copy both the index description and its data directory.
        shutil.copyfile(source_json, testsbt)
        source_dir = os.path.dirname(source_json)
        shutil.copytree(os.path.join(source_dir, '.sbt.v3'),
                        os.path.join(location, '.sbt.v3'))

        original = SBT.load(testsbt, leaf_loader=SigLeaf.load)

        status, out, err = utils.runscript('sourmash', ['migrate', testsbt],
                                           in_directory=location)

        identity = SBT.load(testsbt, leaf_loader=SigLeaf.load)

        assert len(original.nodes) == len(identity.nodes)

        before = sorted(original.nodes.items())
        after = sorted(identity.nodes.items())
        for (_, old_node), (_, new_node) in zip(before, after):
            assert old_node.name == new_node.name

        assert "this is an old index version" not in err

        for node in identity.nodes.values():
            if isinstance(node, Node):
                assert 'min_n_below' in node.metadata


def test_license_cc0():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down

0 comments on commit 016e17e

Please sign in to comment.