From a4886908d947bdbf0922b33d2c49e3d64084d1a8 Mon Sep 17 00:00:00 2001 From: Brett Date: Tue, 6 Jun 2023 13:57:08 -0400 Subject: [PATCH] allow external block memory mapping when requested (and supported) memory map external blocks by returning a numpy.memmap instance of the bytes for the block in the external file fixes #1525 --- asdf/_block/external.py | 21 +++++++++++++++++---- asdf/_block/manager.py | 5 ++++- asdf/_tests/_issues/test_1525.py | 29 +++++++++++++++++++++++++++++ asdf/asdf.py | 1 + 4 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 asdf/_tests/_issues/test_1525.py diff --git a/asdf/_block/external.py b/asdf/_block/external.py index 229a361f7..d93ef6902 100644 --- a/asdf/_block/external.py +++ b/asdf/_block/external.py @@ -9,6 +9,8 @@ """ import os +import numpy as np + from asdf import generic_io, util @@ -21,9 +23,9 @@ class UseInternalType: class ExternalBlockCache: def __init__(self): - self._cache = {} + self.clear() - def load(self, base_uri, uri): + def load(self, base_uri, uri, memmap=False, validate_checksums=False): key = util.get_base_uri(uri) if key not in self._cache: resolved_uri = generic_io.resolve_uri(base_uri, uri) @@ -32,10 +34,21 @@ def load(self, base_uri, uri): from asdf import open as asdf_open - with asdf_open(resolved_uri, lazy_load=False, copy_arrays=True) as af: - self._cache[key] = af._blocks.blocks[0].cached_data + with asdf_open( + resolved_uri, "r", lazy_load=False, copy_arrays=True, validate_checksums=validate_checksums + ) as af: + blk = af._blocks.blocks[0] + if memmap and blk.header["compression"] == b"\0\0\0\0": + file_path = util.patched_urllib_parse.urlparse(resolved_uri).path + arr = np.memmap(file_path, np.uint8, "r", blk.data_offset, blk.cached_data.nbytes) + else: + arr = blk.cached_data + self._cache[key] = arr return self._cache[key] + def clear(self): + self._cache = {} + def relative_uri_for_index(uri, index): # get the os-native separated path for this uri diff --git a/asdf/_block/manager.py b/asdf/_block/manager.py index 25706644b..f7ee0b113 100644 --- a/asdf/_block/manager.py +++ b/asdf/_block/manager.py @@ -305,6 +305,9 @@ def __init__(self, read_blocks=None, uri=None, lazy_load=False, memmap=False, va self._memmap = memmap self._validate_checksums = validate_checksums + def clear_external_cache(self): + self._external_block_cache.clear() + @property def blocks(self): """ @@ -345,7 +348,7 @@ def read(self, fd, after_magic=False): ) def _load_external(self, uri): - value = self._external_block_cache.load(self._uri, uri) + value = self._external_block_cache.load(self._uri, uri, self._memmap, self._validate_checksums) if value is external.UseInternal: return self.blocks[0].data return value diff --git a/asdf/_tests/_issues/test_1525.py b/asdf/_tests/_issues/test_1525.py new file mode 100644 index 000000000..5a5880bde --- /dev/null +++ b/asdf/_tests/_issues/test_1525.py @@ -0,0 +1,29 @@ +import numpy as np + +import asdf + + +def test_1525(tmp_path): + """ + External blocks are always lazy loaded and memmapped + + https://github.com/asdf-format/asdf/issues/1525 + """ + + fn = tmp_path / "test.asdf" + arr = np.arange(10) + af = asdf.AsdfFile({"arr": arr}) + af.set_array_storage(arr, "external") + af.write_to(fn) + + for copy_arrays in (True, False): + with asdf.open(fn, copy_arrays=copy_arrays) as af: + # check that block is external + source = af["arr"]._source + assert isinstance(source, str) + + # check if block is memmapped + if copy_arrays: + assert not isinstance(af["arr"].base, np.memmap) + else: + assert isinstance(af["arr"].base, np.memmap) diff --git a/asdf/asdf.py b/asdf/asdf.py index 408f0bee1..fef4d9a8c 100644 --- a/asdf/asdf.py +++ b/asdf/asdf.py @@ -453,6 +453,7 @@ def close(self): for external in self._external_asdf_by_uri.values(): external.close() self._external_asdf_by_uri.clear() + self._blocks.clear_external_cache() def copy(self): return self.__class__(