From 1f948492d3fb5d384eb95be2d91c614730210858 Mon Sep 17 00:00:00 2001
From: Drew Leonard
Date: Wed, 4 Sep 2024 12:16:22 +0100
Subject: [PATCH] `Dataset.__repr__` upgrade (#431)

* Replace "pixel dims" with "array dims" in dataset repr, because that's
  what it actually shows

* Add a little more info to dataset repr

* Use pretty correlation matrix instead of plain one

* Tweak some output a little

* Flip the ordering of world array indices to be correct

* Improvements to pretty correlation matrix output

* Include dataset ID

* Add changelog

* Slight generalisation to make the tests pass and hopefully catch any
  weird data

* Slight tweaks

* Minor tweaks

* Tweaks to make doc tests pass (mostly)

* Nope that still needed to be a remote test

* Calculate correct number of files for TiledDatasets

* Slicing TiledDatasets gives back a different kind of WCS

* This needs to be REMOTE_DATA'd as well

* Correct/obfuscate sample data path in dataset repr test

* Don't need to run doctests on old releases

* Fine I'll just skip all of them if you're going to be like that,
  doctest

* Update dkist/dataset/loader.py

* Some reworking for tiled / not tiled englishing

* Tweak repr again and update loader docstring

---------

Co-authored-by: Stuart Mumford
---
 changelog/431.trivial.rst |   1 +
 dkist/dataset/loader.py   |  51 ++++++++--------
 dkist/dataset/utils.py    | 119 ++++++++++++++++++++++++--------------
 docs/whatsnew/1.0.rst     |  14 ++---
 4 files changed, 111 insertions(+), 74 deletions(-)
 create mode 100644 changelog/431.trivial.rst

diff --git a/changelog/431.trivial.rst b/changelog/431.trivial.rst
new file mode 100644
index 000000000..b9cd4d0a1
--- /dev/null
+++ b/changelog/431.trivial.rst
@@ -0,0 +1 @@
+Update Dataset representation for better readability.
diff --git a/dkist/dataset/loader.py b/dkist/dataset/loader.py
index 46af93a58..1b1f4ea08 100644
--- a/dkist/dataset/loader.py
+++ b/dkist/dataset/loader.py
@@ -45,45 +45,48 @@ def load_dataset(target):
 
     Examples
     --------
+    >>> import dkist
+
+    >>> dkist.load_dataset("/path/to/VISP_L1_ABCDE.asdf")  # doctest: +SKIP
 
     >>> dkist.load_dataset("/path/to/ABCDE/")  # doctest: +SKIP
 
     >>> dkist.load_dataset(Path("/path/to/ABCDE"))  # doctest: +SKIP
 
-    >>> from sunpy.net import Fido, attrs as a
-    >>> import dkist.net
-    >>> search_results = Fido.search(a.dkist.Dataset("AGLKO"))  # doctest: +REMOTE_DATA
-    >>> files = Fido.fetch(search_results)  # doctest: +REMOTE_DATA
-    >>> dkist.load_dataset(files)  # doctest: +REMOTE_DATA
-
-    This Dataset has 4 pixel and 5 world dimensions
+    >>> from dkist.data.sample import VISP_BKPLX  # doctest: +REMOTE_DATA
+    >>> print(dkist.load_dataset(VISP_BKPLX))  # doctest: +REMOTE_DATA
+    This VISP Dataset BKPLX consists of 1700 frames.
+    Files are stored in ...VISP_BKPLX
+
+    This Dataset has 4 pixel and 5 world dimensions.
-    dask.array<reshape, shape=(4, 1000, 976, 2555), dtype=float64, chunksize=(1, 1, 976, 2555), chunktype=numpy.ndarray>
+    The data are represented by a <class 'dask.array.core.Array'> object:
+    dask.array<reshape, shape=(4, 425, 980, 2554), dtype=float64, chunksize=(1, 1, 980, 2554), chunktype=numpy.ndarray>
 
-    Pixel Dim  Axis Name                Data size  Bounds
+    Array Dim  Axis Name                Data size  Bounds
             0  polarization state               4  None
-            1  raster scan step number       1000  None
-            2  dispersion axis                976  None
-            3  spatial along slit            2555  None
+            1  raster scan step number        425  None
+            2  dispersion axis                980  None
+            3  spatial along slit            2554  None
 
     World Dim  Axis Name                  Physical Type                   Units
-            0  stokes                     phys.polarization.stokes        unknown
-            1  time                       time                            s
+            4  stokes                     phys.polarization.stokes        unknown
+            3  time                       time                            s
             2  helioprojective latitude   custom:pos.helioprojective.lat  arcsec
-            3  wavelength                 em.wl                           nm
-            4  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
+            1  wavelength                 em.wl                           nm
+            0  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
 
     Correlation between pixel and world axes:
 
-               Pixel Dim
-    World Dim    0    1    2    3
-            0  yes   no   no   no
-            1   no  yes   no   no
-            2   no  yes   no  yes
-            3   no   no  yes   no
-            4   no  yes   no  yes
-
+                              |                     PIXEL DIMENSIONS
+                              |   spatial    |  dispersion  | raster scan  | polarization
+             WORLD DIMENSIONS |  along slit  |     axis     |  step number |    state
+    ------------------------- | ------------ | ------------ | ------------ | ------------
+    helioprojective longitude |      x       |              |      x       |
+                   wavelength |              |      x       |              |
+     helioprojective latitude |      x       |              |      x       |
+                         time |              |              |      x       |
+                       stokes |              |              |              |      x
     """
     known_types = _known_types_docs().keys()
     raise TypeError(f"Input type {type(target).__name__} not recognised. It must be one of {', '.join(known_types)}.")
diff --git a/dkist/dataset/utils.py b/dkist/dataset/utils.py
index c2c6be6af..6ab710904 100644
--- a/dkist/dataset/utils.py
+++ b/dkist/dataset/utils.py
@@ -2,6 +2,8 @@
 Helper functions for the Dataset class.
 """
 
+import textwrap
+
 import numpy as np
 
 import gwcs
@@ -9,28 +11,44 @@
 __all__ = ["dataset_info_str"]
 
 
-def dataset_info_str(ds):
+def dataset_info_str(ds_in):
     # Check for an attribute that only appears on TiledDataset
     # Not using isinstance to avoid circular import
-    is_tiled = hasattr(ds, "combined_headers")
-    dstype = type(ds).__name__
+    is_tiled = hasattr(ds_in, "combined_headers")
+    dstype = type(ds_in).__name__
     if is_tiled:
-        tile_shape = ds.shape
-        ds = ds[0, 0]
+        tile_shape = ds_in.shape
+        ds = ds_in[0, 0]
+    else:
+        ds = ds_in
     wcs = ds.wcs.low_level_wcs
 
-    # Pixel dimensions table
+    # Array dimensions table
 
-    instr = ds.inventory.get("instrument", "")
+    instr = ds.inventory.get("instrumentName", "")
     if instr:
         instr += " "
+    dsID = ds.inventory.get("datasetId", "(no DatasetID)")
+    s = f"This {instr}Dataset {dsID} "
     if is_tiled:
-        s = f"This {dstype} consists of an array of {tile_shape} Dataset objects\n\n"
-        s += f"Each {instr}Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions\n\n"
+        s += f"is an array of {tile_shape} Dataset objects "
+        if ds.files:
+            s += "and \n"
+
+
+    if ds.files:
+        nframes = len(ds.files) if not is_tiled else sum([len(tile.files) for tile in ds_in.flat])
+        s += f"consists of {nframes} frames.\n"
+        s += f"Files are stored in {ds.files.basepath}\n"
+
+    if is_tiled:
+        s += "\nEach "
     else:
-        s = f"This {instr}Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions\n\n"
-    s += f"{ds.data}\n\n"
+        s += "\nThis "
+    s += f"Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions.\n\n"
+
+    s += f"The data are represented by a {type(ds.data)} object:\n{ds.data}\n\n"
 
     array_shape = wcs.array_shape or (0,)
     pixel_shape = wcs.pixel_shape or (None,) * wcs.pixel_n_dim
@@ -47,7 +65,7 @@
 
     pixel_nam_width = max(9, max(len(x) for x in pixel_axis_names))
     pixel_siz_width = max(9, len(str(max(array_shape))))
 
-    s += (("{0:" + str(pixel_dim_width) + "s}").format("Pixel Dim") + "  " +
+    s += (("{0:" + str(pixel_dim_width) + "s}").format("Array Dim") + "  " +
           ("{0:" + str(pixel_nam_width) + "s}").format("Axis Name") + "  " +
           ("{0:" + str(pixel_siz_width) + "s}").format("Data size") + "  " +
           "Bounds\n")
@@ -72,11 +90,11 @@
           ("{0:" + str(world_typ_width) + "s}").format("Physical Type") + "  " +
           "Units\n")
 
-    for iwrl in range(wcs.world_n_dim):
+    for iwrl in range(wcs.world_n_dim)[::-1]:
 
-        name = wcs.world_axis_names[::-1][iwrl] or "None"
-        typ = wcs.world_axis_physical_types[::-1][iwrl] or "None"
-        unit = wcs.world_axis_units[::-1][iwrl] or "unknown"
+        name = wcs.world_axis_names[iwrl] or "None"
+        typ = wcs.world_axis_physical_types[iwrl] or "None"
+        unit = wcs.world_axis_units[iwrl] or "unknown"
 
         s += (("{0:" + str(world_dim_width) + "d}").format(iwrl) + "  " +
               ("{0:" + str(world_nam_width) + "s}").format(name) + "  " +
@@ -91,28 +109,51 @@
 
     s += "Correlation between pixel and world axes:\n\n"
 
-    s += (" " * world_dim_width + "  " +
-          ("{0:^" + str(wcs.pixel_n_dim * 5 - 2) + "s}").format("Pixel Dim") +
-          "\n")
+    s += _get_pp_matrix(ds.wcs)
+
+    # Make sure we get rid of the extra whitespace at the end of some lines
+    return "\n".join([line.rstrip() for line in s.splitlines()])
 
-    s += (("{0:" + str(world_dim_width) + "s}").format("World Dim") +
-          "".join(["  " + ("{0:" + str(pixel_dim_width) + "d}").format(ipix)
-                   for ipix in range(wcs.pixel_n_dim)]) +
-          "\n")
 
-    matrix = wcs.axis_correlation_matrix[::-1, ::-1]
-    matrix_str = np.empty(matrix.shape, dtype="U3")
-    matrix_str[matrix] = "yes"
-    matrix_str[~matrix] = "no"
+def _get_pp_matrix(wcs):
+    wcs = wcs.low_level_wcs  # Just in case the dataset has been sliced and returned the wrong kind of wcs
+    slen = np.max([len(line) for line in list(wcs.world_axis_names) + list(wcs.pixel_axis_names)])
+    mstr = wcs.axis_correlation_matrix.astype(f"<U{slen}")
+    mstr[np.where(mstr == "True")] = "x"
+    mstr[np.where(mstr == "False")] = ""
+
+    # Wrap the pixel axis names over multiple lines so the columns stay narrow
+    pixel_names = [textwrap.wrap(name, 12) for name in wcs.pixel_axis_names]
+    nrows = max(len(name) for name in pixel_names)
+    pixel_names = [name + [""] * (nrows - len(name)) for name in pixel_names]
+    headers = [["WORLD DIMENSIONS" if i == nrows - 1 else "", *[name[i] for name in pixel_names]]
+               for i in range(nrows)]
+
+    world = np.array(wcs.world_axis_names).reshape(-1, 1)
+    mstr = np.concatenate([world, mstr], axis=1)
+    mstr = np.concatenate([headers, mstr], axis=0)
+    widths = [np.max([len(a) for a in col]) for col in mstr.T]
+    mstr = np.insert(mstr, nrows, ["-" * wid for wid in widths], axis=0)
+    for i, col in enumerate(mstr.T):
+        mstr[:, i] = np.char.rjust(col, widths[i]) if i == 0 else np.char.center(col, widths[i])
 
-    for iwrl in range(wcs.world_n_dim):
-        s += (("{0:" + str(world_dim_width) + "d}").format(iwrl) +
-              "".join(["  " + ("{0:>" + str(pixel_dim_width) + "s}").format(matrix_str[iwrl, ipix])
-                       for ipix in range(wcs.pixel_n_dim)]) +
-              "\n")
+    mstr = np.array_str(mstr, max_line_width=1000)
+    # Make the matrix string prettier for this context by stripping out the array presentation
+    # Probably a nicer way to do this with regexes but this works fine
+    mstr = mstr.replace("[[", "").replace(" [", "").replace("]", "").replace("' '", " | ").replace("'", "")
 
-    # Make sure we get rid of the extra whitespace at the end of some lines
-    return "\n".join([line.rstrip() for line in s.splitlines()])
+    wid = sum(widths[1:])
+    header = (" "*widths[0]) + " | " + "PIXEL DIMENSIONS".center(wid+(3*(len(wcs.pixel_axis_names)-1))) + "\n"
+
+    return header + mstr
 
 
 def pp_matrix(wcs):
@@ -123,15 +164,7 @@
     ----------
     wcs : `BaseHighLevelWCS` or `BaseLowLevelWCS`
     """
-    slen = np.max([len(line) for line in list(wcs.world_axis_names) + list(wcs.pixel_axis_names)])
-    mstr = wcs.axis_correlation_matrix.astype(f"<U{slen}")
-    mstr[np.where(mstr == "True")] = "yes"
-    mstr[np.where(mstr == "False")] = "no"
-    mstr = np.insert(mstr, 0, wcs.pixel_axis_names, axis=0)
-    world = ["", *list(wcs.world_axis_names)]
-    mstr = np.insert(mstr, 0, world, axis=1)
-
-    print(np.array_str(mstr, max_line_width=1000))
+    print(_get_pp_matrix(wcs))
diff --git a/docs/whatsnew/1.0.rst b/docs/whatsnew/1.0.rst
--- a/docs/whatsnew/1.0.rst
+++ b/docs/whatsnew/1.0.rst
@@ ... @@
     >>> from sunpy.net import Fido, attrs as a
     >>> import dkist.net
-    >>> res = Fido.search(a.Instrument.visp, a.dkist.Embargoed.false)  # doctest: +REMOTE_DATA
+    >>> res = Fido.search(a.Instrument.visp, a.dkist.Embargoed.false)  # doctest: +SKIP
     >>> res  # doctest: +SKIP
     Results from 1 Provider:
 
@@ -44,8 +44,8 @@ Here is a really quick demo of searching for all unembargoed VISP data and downl
 
-    >>> asdf_files = Fido.fetch(res[:, 0])  # doctest: +REMOTE_DATA
-    >>> asdf_files  # doctest: +REMOTE_DATA
+    >>> asdf_files = Fido.fetch(res[:, 0])  # doctest: +SKIP
+    >>> asdf_files  # doctest: +SKIP
     ['...VISP_L1_20220602T175042_BDWQK.asdf']
@@ -60,8 +60,8 @@ Any DKIST level one ASDF file can be loaded with the `dkist.load_dataset` functi
 
     >>> import dkist
-    >>> ds = dkist.load_dataset(asdf_files)  # doctest: +REMOTE_DATA
-    >>> ds  # doctest: +REMOTE_DATA
+    >>> ds = dkist.load_dataset(asdf_files)  # doctest: +SKIP
+    >>> ds  # doctest: +SKIP
     This Dataset has 4 pixel and 5 world dimensions
@@ -133,8 +133,8 @@ This means you can first slice out a smaller dataset, and then only download the
 
 .. code-block:: python
 
-    >>> stokes_I_ds = ds[0]  # doctest: +REMOTE_DATA
-    >>> stokes_I_ds  # doctest: +REMOTE_DATA
+    >>> stokes_I_ds = ds[0]  # doctest: +SKIP
+    >>> stokes_I_ds  # doctest: +SKIP
     This Dataset has 3 pixel and 4 world dimensions
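
To try the new output locally, here is a minimal sketch. All names are taken from this patch (`dkist.load_dataset`, the `VISP_BKPLX` sample dataset used in the new docstring, and `pp_matrix` from `dkist/dataset/utils.py`); note the sample data are fetched over the network on first use.

    # Exercise the upgraded Dataset repr and the pretty correlation matrix.
    import dkist
    from dkist.data.sample import VISP_BKPLX
    from dkist.dataset.utils import pp_matrix

    # Load the VISP sample dataset from its ASDF file
    ds = dkist.load_dataset(VISP_BKPLX)

    # str(ds) now routes through dataset_info_str(): dataset ID, frame count,
    # file basepath, the array/world dimension tables and the correlation matrix
    print(ds)

    # pp_matrix() prints the same pixel/world correlation matrix on its own,
    # for any high-level WCS such as ds.wcs
    pp_matrix(ds.wcs)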