Skip to content

Commit

Permalink
allow external block memory mapping
Browse files Browse the repository at this point in the history
when requested (and supported) memory map
external blocks by returning a numpy.memmap
instance of the bytes for the block in the external
file

fixes #1525
  • Loading branch information
braingram committed Jun 6, 2023
1 parent 572feb2 commit a488690
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 5 deletions.
21 changes: 17 additions & 4 deletions asdf/_block/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
"""
import os

import numpy as np

from asdf import generic_io, util


Expand All @@ -21,9 +23,9 @@ class UseInternalType:

class ExternalBlockCache:
def __init__(self):
self._cache = {}
self.clear()

def load(self, base_uri, uri):
def load(self, base_uri, uri, memmap=False, validate_checksums=False):
key = util.get_base_uri(uri)
if key not in self._cache:
resolved_uri = generic_io.resolve_uri(base_uri, uri)
Expand All @@ -32,10 +34,21 @@ def load(self, base_uri, uri):

from asdf import open as asdf_open

with asdf_open(resolved_uri, lazy_load=False, copy_arrays=True) as af:
self._cache[key] = af._blocks.blocks[0].cached_data
with asdf_open(
resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums
) as af:
blk = af._blocks.blocks[0]
if memmap and blk.header["compression"] == b"\0\0\0\0":
file_path = util.patched_urllib_parse.urlparse(resolved_uri).path
arr = np.memmap(file_path, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes)
else:
arr = blk.cached_data
self._cache[key] = arr
return self._cache[key]

def clear(self):
self._cache = {}


def relative_uri_for_index(uri, index):
# get the os-native separated path for this uri
Expand Down
5 changes: 4 additions & 1 deletion asdf/_block/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va
self._memmap = memmap
self._validate_checksums = validate_checksums

def clear_external_cache(self):
self._external_block_cache.clear()

@property
def blocks(self):
"""
Expand Down Expand Up @@ -345,7 +348,7 @@ def read(self, fd, after_magic=False):
)

def _load_external(self, uri):
value = self._external_block_cache.load(self._uri, uri)
value = self._external_block_cache.load(self._uri, uri, self._memmap, self._validate_checksums)
if value is external.UseInternal:
return self.blocks[0].data
return value
Expand Down
29 changes: 29 additions & 0 deletions asdf/_tests/_issues/test_1525.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import numpy as np

import asdf


def test_1525(tmp_path):
"""
External blocks are always lazy loaded and memmapped
https://github.com/asdf-format/asdf/issues/1525
"""

fn = tmp_path / "test.asdf"
arr = np.arange(10)
af = asdf.AsdfFile({"arr": arr})
af.set_array_storage(arr, "external")
af.write_to(fn)

for copy_arrays in (True, False):
with asdf.open(fn, copy_arrays=copy_arrays) as af:
# check that block is external
source = af["arr"]._source
assert isinstance(source, str)

# check if block is memmapped
if copy_arrays:
assert not isinstance(af["arr"].base, np.memmap)
else:
assert isinstance(af["arr"].base, np.memmap)
1 change: 1 addition & 0 deletions asdf/asdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ def close(self):
for external in self._external_asdf_by_uri.values():
external.close()
self._external_asdf_by_uri.clear()
self._blocks.clear_external_cache()

def copy(self):
return self.__class__(
Expand Down

0 comments on commit a488690

Please sign in to comment.