From 1f948492d3fb5d384eb95be2d91c614730210858 Mon Sep 17 00:00:00 2001
From: Drew Leonard
Date: Wed, 4 Sep 2024 12:16:22 +0100
Subject: [PATCH] `Dataset.__repr__` upgrade (#431)

* Replace "pixel dims" with "array dims" in dataset repr, because that's
  what it actually shows

* Add a little more info to dataset repr

* Use pretty correlation matrix instead of plain one

* Tweak some output a little

* Flip the ordering of world array indices to be correct

* Improvements to pretty correlation matrix output

* Include dataset ID

* Add changelog

* Slight generalisation to make the tests pass and hopefully catch any
  weird data

* Slight tweaks

* Minor tweaks

* Tweaks to make doc tests pass (mostly)

* Nope that still needed to be a remote test

* Calculate correct number of files for TiledDatasets

* Slicing TiledDatasets gives back a different kind of WCS

* This needs to be REMOTE_DATA'd as well

* Correct/obfuscate sample data path in dataset repr test

* Don't need to run doctests on old releases

* Fine I'll just skip all of them if you're going to be like that,
  doctest

* Update dkist/dataset/loader.py

* Some reworking for tiled / not tiled englishing

* Tweak repr again and update loader docstring

---------

Co-authored-by: Stuart Mumford
---
 changelog/431.trivial.rst |   1 +
 dkist/dataset/loader.py   |  51 ++++++++--------
 dkist/dataset/utils.py    | 119 ++++++++++++++++++++++++--------------
 docs/whatsnew/1.0.rst     |  14 ++---
 4 files changed, 111 insertions(+), 74 deletions(-)
 create mode 100644 changelog/431.trivial.rst

diff --git a/changelog/431.trivial.rst b/changelog/431.trivial.rst
new file mode 100644
index 000000000..b9cd4d0a1
--- /dev/null
+++ b/changelog/431.trivial.rst
@@ -0,0 +1 @@
+Update Dataset representation for better readability.
diff --git a/dkist/dataset/loader.py b/dkist/dataset/loader.py
index 46af93a58..1b1f4ea08 100644
--- a/dkist/dataset/loader.py
+++ b/dkist/dataset/loader.py
@@ -45,45 +45,48 @@ def load_dataset(target):
 
     Examples
     --------
+    >>> import dkist
+
+    >>> dkist.load_dataset("/path/to/VISP_L1_ABCDE.asdf")  # doctest: +SKIP
 
     >>> dkist.load_dataset("/path/to/ABCDE/")  # doctest: +SKIP
 
     >>> dkist.load_dataset(Path("/path/to/ABCDE"))  # doctest: +SKIP
 
-    >>> from sunpy.net import Fido, attrs as a
-    >>> import dkist.net
-    >>> search_results = Fido.search(a.dkist.Dataset("AGLKO"))  # doctest: +REMOTE_DATA
-    >>> files = Fido.fetch(search_results)  # doctest: +REMOTE_DATA
-    >>> dkist.load_dataset(files)  # doctest: +REMOTE_DATA
-
-    This Dataset has 4 pixel and 5 world dimensions
+    >>> from dkist.data.sample import VISP_BKPLX  # doctest: +REMOTE_DATA
+    >>> print(dkist.load_dataset(VISP_BKPLX))  # doctest: +REMOTE_DATA
+    This VISP Dataset BKPLX consists of 1700 frames.
+    Files are stored in ...VISP_BKPLX
+
+    This Dataset has 4 pixel and 5 world dimensions.
-    dask.array<reshape, shape=(4, 1000, 976, 2555), dtype=float64, chunksize=(1, 1, 976, 2555), chunktype=numpy.ndarray>
+    The data are represented by a <class 'dask.array.core.Array'> object:
+    dask.array<reshape, shape=(4, 425, 980, 2554), dtype=float64, chunksize=(1, 1, 980, 2554), chunktype=numpy.ndarray>
 
-    Pixel Dim  Axis Name                Data size  Bounds
+    Array Dim  Axis Name                Data size  Bounds
             0  polarization state               4  None
-            1  raster scan step number       1000  None
-            2  dispersion axis                976  None
-            3  spatial along slit            2555  None
+            1  raster scan step number        425  None
+            2  dispersion axis                980  None
+            3  spatial along slit            2554  None
 
     World Dim  Axis Name                  Physical Type                   Units
-            0  stokes                     phys.polarization.stokes        unknown
-            1  time                       time                            s
+            4  stokes                     phys.polarization.stokes        unknown
+            3  time                       time                            s
             2  helioprojective latitude   custom:pos.helioprojective.lat  arcsec
-            3  wavelength                 em.wl                           nm
-            4  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
+            1  wavelength                 em.wl                           nm
+            0  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
 
     Correlation between pixel and world axes:
 
-               Pixel Dim
-    World Dim    0    1    2    3
-            0  yes   no   no   no
-            1   no  yes   no   no
-            2   no  yes   no  yes
-            3   no   no  yes   no
-            4   no  yes   no  yes
-
+                              |                     PIXEL DIMENSIONS
+                              |   spatial    |  dispersion  | raster scan  | polarization
+             WORLD DIMENSIONS |  along slit  |     axis     |  step number |    state
+    ------------------------- | ------------ | ------------ | ------------ | ------------
+    helioprojective longitude |      x       |              |      x       |
+                   wavelength |              |      x       |              |
+     helioprojective latitude |      x       |              |      x       |
+                         time |              |              |      x       |
+                       stokes |              |              |              |      x
     """
     known_types = _known_types_docs().keys()
     raise TypeError(f"Input type {type(target).__name__} not recognised. It must be one of {', '.join(known_types)}.")
diff --git a/dkist/dataset/utils.py b/dkist/dataset/utils.py
index c2c6be6af..6ab710904 100644
--- a/dkist/dataset/utils.py
+++ b/dkist/dataset/utils.py
@@ -2,6 +2,8 @@
 Helper functions for the Dataset class.
 """
 
+import textwrap
+
 import numpy as np
 
 import gwcs
@@ -9,28 +11,44 @@
 __all__ = ["dataset_info_str"]
 
 
-def dataset_info_str(ds):
+def dataset_info_str(ds_in):
     # Check for an attribute that only appears on TiledDataset
     # Not using isinstance to avoid circular import
-    is_tiled = hasattr(ds, "combined_headers")
-    dstype = type(ds).__name__
+    is_tiled = hasattr(ds_in, "combined_headers")
+    dstype = type(ds_in).__name__
     if is_tiled:
-        tile_shape = ds.shape
-        ds = ds[0, 0]
+        tile_shape = ds_in.shape
+        ds = ds_in[0, 0]
+    else:
+        ds = ds_in
     wcs = ds.wcs.low_level_wcs
 
-    # Pixel dimensions table
+    # Array dimensions table
 
-    instr = ds.inventory.get("instrument", "")
+    instr = ds.inventory.get("instrumentName", "")
     if instr:
         instr += " "
+    dsID = ds.inventory.get("datasetId", "(no DatasetID)")
+    s = f"This {instr}Dataset {dsID} "
     if is_tiled:
-        s = f"This {dstype} consists of an array of {tile_shape} Dataset objects\n\n"
-        s += f"Each {instr}Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions\n\n"
+        s += f"is an array of {tile_shape} Dataset objects "
+        if ds.files:
+            s += "and \n"
+
+
+    if ds.files:
+        nframes = len(ds.files) if not is_tiled else sum([len(tile.files) for tile in ds_in.flat])
+        s += f"consists of {nframes} frames.\n"
+        s += f"Files are stored in {ds.files.basepath}\n"
+
+    if is_tiled:
+        s += "\nEach "
     else:
-        s = f"This {instr}Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions\n\n"
-    s += f"{ds.data}\n\n"
+        s += "\nThis "
+    s += f"Dataset has {wcs.pixel_n_dim} pixel and {wcs.world_n_dim} world dimensions.\n\n"
+
+    s += f"The data are represented by a {type(ds.data)} object:\n{ds.data}\n\n"
 
     array_shape = wcs.array_shape or (0,)
     pixel_shape = wcs.pixel_shape or (None,) * wcs.pixel_n_dim
@@ -47,7 +65,7 @@
 
     pixel_nam_width = max(9, max(len(x) for x in pixel_axis_names))
     pixel_siz_width = max(9, len(str(max(array_shape))))
 
-    s += (("{0:" + str(pixel_dim_width) + "s}").format("Pixel Dim") + "  " +
+    s += (("{0:" + str(pixel_dim_width) + "s}").format("Array Dim") + "  " +
           ("{0:" + str(pixel_nam_width) + "s}").format("Axis Name") + "  " +
           ("{0:" + str(pixel_siz_width) + "s}").format("Data size") + "  " +
           "Bounds\n")
@@ -72,11 +90,11 @@
           ("{0:" + str(world_typ_width) + "s}").format("Physical Type") + "  " +
           "Units\n")
 
-    for iwrl in range(wcs.world_n_dim):
+    for iwrl in range(wcs.world_n_dim)[::-1]:
 
-        name = wcs.world_axis_names[::-1][iwrl] or "None"
-        typ = wcs.world_axis_physical_types[::-1][iwrl] or "None"
-        unit = wcs.world_axis_units[::-1][iwrl] or "unknown"
+        name = wcs.world_axis_names[iwrl] or "None"
+        typ = wcs.world_axis_physical_types[iwrl] or "None"
+        unit = wcs.world_axis_units[iwrl] or "unknown"
 
         s += (("{0:" + str(world_dim_width) + "d}").format(iwrl) + "  " +
               ("{0:" + str(world_nam_width) + "s}").format(name) + "  " +
@@ -91,28 +109,51 @@
 
     s += "Correlation between pixel and world axes:\n\n"
 
-    s += (" " * world_dim_width + "  " +
-          ("{0:^" + str(wcs.pixel_n_dim * 5 - 2) + "s}").format("Pixel Dim") +
-          "\n")
+    s += _get_pp_matrix(ds.wcs)
+
+    # Make sure we get rid of the extra whitespace at the end of some lines
+    return "\n".join([line.rstrip() for line in s.splitlines()])
 
-    s += (("{0:" + str(world_dim_width) + "s}").format("World Dim") +
-          "".join(["  " + ("{0:" + str(pixel_dim_width) + "d}").format(ipix)
-                   for ipix in range(wcs.pixel_n_dim)]) +
-          "\n")
 
-    matrix = wcs.axis_correlation_matrix[::-1, ::-1]
-    matrix_str = np.empty(matrix.shape, dtype="U3")
-    matrix_str[matrix] = "yes"
-    matrix_str[~matrix] = "no"
+def _get_pp_matrix(wcs):
+    wcs = wcs.low_level_wcs  # Just in case the dataset has been sliced and returned the wrong kind of wcs
+    slen = np.max([len(line) for line in list(wcs.world_axis_names) + list(wcs.pixel_axis_names)])
+    mstr = wcs.axis_correlation_matrix.astype(f"<U{slen}")
+    mstr[np.where(mstr == "True")] = "x"
+    mstr[np.where(mstr == "False")] = ""
+
+    # Wrap the pixel axis names over multiple lines so the columns stay narrow
+    pixel_names = [textwrap.wrap(name, 12) for name in wcs.pixel_axis_names]
+    nrows = max(len(name) for name in pixel_names)
+    pixel_names = [name + [""] * (nrows - len(name)) for name in pixel_names]
+    headers = [["WORLD DIMENSIONS" if i == nrows - 1 else "", *[name[i] for name in pixel_names]]
+               for i in range(nrows)]
+
+    world = np.array(wcs.world_axis_names).reshape(-1, 1)
+    mstr = np.concatenate([world, mstr], axis=1)
+    mstr = np.concatenate([headers, mstr], axis=0)
+    widths = [np.max([len(a) for a in col]) for col in mstr.T]
+    mstr = np.insert(mstr, nrows, ["-" * wid for wid in widths], axis=0)
+    for i, col in enumerate(mstr.T):
+        mstr[:, i] = np.char.rjust(col, widths[i]) if i == 0 else np.char.center(col, widths[i])
 
-    for iwrl in range(wcs.world_n_dim):
-        s += (("{0:" + str(world_dim_width) + "d}").format(iwrl) +
-              "".join(["  " + ("{0:>" + str(pixel_dim_width) + "s}").format(matrix_str[iwrl, ipix])
-                       for ipix in range(wcs.pixel_n_dim)]) +
-              "\n")
+    mstr = np.array_str(mstr, max_line_width=1000)
+    # Make the matrix string prettier for this context by stripping out the array presentation
+    # Probably a nicer way to do this with regexes but this works fine
+    mstr = mstr.replace("[[", "").replace(" [", "").replace("]", "").replace("' '", " | ").replace("'", "")
 
-    # Make sure we get rid of the extra whitespace at the end of some lines
-    return "\n".join([line.rstrip() for line in s.splitlines()])
+    wid = sum(widths[1:])
+    header = (" "*widths[0]) + " | " + "PIXEL DIMENSIONS".center(wid+(3*(len(wcs.pixel_axis_names)-1))) + "\n"
+
+    return header + mstr
 
 
 def pp_matrix(wcs):
@@ -123,15 +164,7 @@
     ----------
     wcs : `BaseHighLevelWCS` or `BaseLowLevelWCS`
     """
-    slen = np.max([len(line) for line in list(wcs.world_axis_names) + list(wcs.pixel_axis_names)])
-    mstr = wcs.axis_correlation_matrix.astype(f"<U{slen}")
-    mstr[np.where(mstr == "True")] = "yes"
-    mstr[np.where(mstr == "False")] = "no"
-    mstr = np.insert(mstr, 0, wcs.pixel_axis_names, axis=0)
-    world = ["", *list(wcs.world_axis_names)]
-    mstr = np.insert(mstr, 0, world, axis=1)
-
-    print(np.array_str(mstr, max_line_width=1000))
+    print(_get_pp_matrix(wcs))
diff --git a/docs/whatsnew/1.0.rst b/docs/whatsnew/1.0.rst
--- a/docs/whatsnew/1.0.rst
+++ b/docs/whatsnew/1.0.rst
@@ ... @@
     >>> from sunpy.net import Fido, attrs as a
     >>> import dkist.net
-    >>> res = Fido.search(a.Instrument.visp, a.dkist.Embargoed.false)  # doctest: +REMOTE_DATA
+    >>> res = Fido.search(a.Instrument.visp, a.dkist.Embargoed.false)  # doctest: +SKIP
     >>> res  # doctest: +SKIP
     Results from 1 Provider:
 
@@ -44,8 +44,8 @@ Here is a really quick demo of searching for all unembargoed VISP data and downl
 
-    >>> asdf_files = Fido.fetch(res[:, 0])  # doctest: +REMOTE_DATA
-    >>> asdf_files  # doctest: +REMOTE_DATA
+    >>> asdf_files = Fido.fetch(res[:, 0])  # doctest: +SKIP
+    >>> asdf_files  # doctest: +SKIP
     ['...VISP_L1_20220602T175042_BDWQK.asdf']
@@ -60,8 +60,8 @@ Any DKIST level one ASDF file can be loaded with the `dkist.load_dataset` functi
 
     >>> import dkist
-    >>> ds = dkist.load_dataset(asdf_files)  # doctest: +REMOTE_DATA
-    >>> ds  # doctest: +REMOTE_DATA
+    >>> ds = dkist.load_dataset(asdf_files)  # doctest: +SKIP
+    >>> ds  # doctest: +SKIP
     This Dataset has 4 pixel and 5 world dimensions
@@ -133,8 +133,8 @@ This means you can first slice out a smaller dataset, and then only download the
 
 .. code-block:: python
 
-    >>> stokes_I_ds = ds[0]  # doctest: +REMOTE_DATA
-    >>> stokes_I_ds  # doctest: +REMOTE_DATA
+    >>> stokes_I_ds = ds[0]  # doctest: +SKIP
+    >>> stokes_I_ds  # doctest: +SKIP
     This Dataset has 3 pixel and 4 world dimensions
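
To try the new output locally, here is a minimal sketch. All names are taken from this patch (`dkist.load_dataset`, the `VISP_BKPLX` sample dataset used in the new docstring, and `pp_matrix` from `dkist/dataset/utils.py`); note the sample data are fetched over the network on first use.

    # Exercise the upgraded Dataset repr and the pretty correlation matrix.
    import dkist
    from dkist.data.sample import VISP_BKPLX
    from dkist.dataset.utils import pp_matrix

    # Load the VISP sample dataset from its ASDF file
    ds = dkist.load_dataset(VISP_BKPLX)

    # str(ds) now routes through dataset_info_str(): dataset ID, frame count,
    # file basepath, the array/world dimension tables and the correlation matrix
    print(ds)

    # pp_matrix() prints the same pixel/world correlation matrix on its own,
    # for any high-level WCS such as ds.wcs
    pp_matrix(ds.wcs)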