Implement new Dataset class using HDF5 #4369

Merged: 13 commits, Jul 25, 2023
Changes from 11 commits
3 changes: 3 additions & 0 deletions .coveragerc
@@ -25,6 +25,9 @@ exclude_lines =
# Ignore things that would have trivial tests
def version

# Ignore overload stubs
@overload


ignore_errors = True
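For context on the new ``@overload`` exclusion above, a minimal sketch (the ``Config`` class and its methods are hypothetical, not from PennyLane):

```python
from typing import overload

class Config:
    # Overload stubs like the ones excluded in .coveragerc have "..."
    # bodies that never execute at runtime, so without the exclusion a
    # coverage tool would report them as permanently missed lines.
    @overload
    def get(self, key: str) -> object: ...
    @overload
    def get(self, key: str, default: object) -> object: ...

    def get(self, key, default=None):
        # Single runtime implementation backing both overload signatures.
        return getattr(self, key, default)

cfg = Config()
cfg.answer = 42
print(cfg.get("answer"))          # 42
print(cfg.get("missing", "n/a"))  # n/a
```

Only the final ``get`` runs; the decorated stubs exist purely for type checkers, which is why excluding ``@overload`` lines from coverage is safe.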

2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
@@ -26,5 +26,5 @@ jobs:
- uses: PennyLaneAI/sphinx-action@master
with:
docs-folder: "doc/"
pre-build-command: "apt install -y graphviz && pip3 install -r doc/requirements.txt && pip3 install . pip3 install openfermionpyscf && pip3 install dill zstd"
pre-build-command: "apt install -y graphviz && pip3 install -r doc/requirements.txt && pip3 install . pip3 install openfermionpyscf && pip3 install aiohttp fsspec h5py"
build-command: "sphinx-build -b html . _build -W --keep-going"
2 changes: 1 addition & 1 deletion .github/workflows/interface-unit-tests.yml
@@ -278,7 +278,7 @@ jobs:
install_pennylane_lightning_master: false
pytest_coverage_flags: ${{ inputs.pytest_coverage_flags }}
pytest_markers: data
additional_pip_packages: zstd dill
additional_pip_packages: h5py


device-tests:
1 change: 1 addition & 0 deletions .gitignore
@@ -24,3 +24,4 @@ benchmark/revisions/
venv
config.toml
.envrc
datasets/*
1 change: 1 addition & 0 deletions codecov.yml
@@ -1,5 +1,6 @@
ignore:
- "pennylane/devices/tests/*"
- "pennylane/data/base/_lazy_modules.py"

codecov:
notify:
20 changes: 8 additions & 12 deletions doc/introduction/data.rst
@@ -11,16 +11,12 @@ where the quantum dataset is a collection of `quantum data` obtained from variou

.. note::

The packages ``zstd`` and ``dill`` are required to use the :mod:`~pennylane.data` module.
These can be installed with ``pip install zstd dill``.

The packages ``aiohttp``, ``fsspec``, and ``h5py`` are required to use the :mod:`~pennylane.data` module.
These can be installed with:

.. code-block:: console

    pip install aiohttp fsspec h5py

.. warning::

    PennyLane datasets use the ``dill`` module to compress, store, and read data. Since ``dill``
    is built on the ``pickle`` module, we reproduce an important warning from the ``pickle``
    module: it is possible to construct malicious pickle data which will execute arbitrary code
    during unpickling. Never unpickle data that could have come from an untrusted source, or
    that could have been tampered with.

Loading Datasets in PennyLane
-----------------------------
@@ -39,7 +35,7 @@ The :func:`~pennylane.data.load` function returns a ``list`` with the desired da
>>> H2data = H2datasets[0]

We can load datasets for multiple parameter values by providing a list of values instead of a single value.
To load all possible values, use the special keyword "full".
To load all possible values, use the special value :const:`~pennylane.data.FULL` or the string 'full':

>>> H2datasets = qml.data.load("qchem", molname="H2", basis="full", bondlength=[0.5, 1.1])
>>> print(H2datasets)
@@ -139,9 +135,9 @@ array([-1.5, -0.5, 0.5, 1.5])
We can then write this :class:`~pennylane.data.Dataset` to storage and read it as follows:


>>> dataset.write("./path/to/dataset.dat")
>>> dataset.write("./path/to/dataset.h5")
>>> read_dataset = qml.data.Dataset()
>>> read_dataset.read("./path/to/dataset.dat")
>>> read_dataset.read("./path/to/dataset.h5")
>>> read_dataset.data_name
"Example"
>>> read_dataset.hamiltonian
2 changes: 1 addition & 1 deletion docker/pennylane.dockerfile
@@ -46,7 +46,7 @@ RUN pip install pytest pytest-cov pytest-mock flaky
RUN pip install -i https://test.pypi.org/simple/ pennylane-lightning --pre --upgrade
# hotfix, remove when pyscf 2.1 is released (currently no wheel for python3.10)
RUN pip install openfermionpyscf || true
RUN pip install dill zstd || true
RUN pip install h5py fsspec aiohttp || true
RUN make test && make coverage

# create Second small build.
3 changes: 2 additions & 1 deletion pennylane/__init__.py
@@ -116,13 +116,14 @@
from pennylane.vqe import ExpvalCost
from pennylane.debugging import snapshots
from pennylane.shadows import ClassicalShadow
import pennylane.data
import pennylane.pulse

import pennylane.gradients # pylint:disable=wrong-import-order
import pennylane.qinfo # pylint:disable=wrong-import-order
from pennylane.interfaces import execute # pylint:disable=wrong-import-order

import pennylane.data

# Look for an existing configuration file
default_config = Configuration("config.toml")

182 changes: 177 additions & 5 deletions pennylane/data/__init__.py
@@ -1,4 +1,4 @@
# Copyright 2018-2022 Xanadu Quantum Technologies Inc.
# Copyright 2018-2023 Xanadu Quantum Technologies Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The data subpackage provides functionality to access, store and manipulate quantum datasets.
"""The data subpackage provides functionality to access, store and manipulate quantum datasets.

Datasets are generally stored and accessed using the :class:`~pennylane.data.Dataset` class.
Pre-computed datasets are available for download and can be accessed using the :func:`~pennylane.data.load` or
@@ -24,7 +23,180 @@
.. autosummary::
:toctree: api

Description
-----------

Datasets
~~~~~~~~
The :class:`Dataset` class provides a portable storage format for information describing a physical
system and its evolution. For example, a dataset for an arbitrary quantum system could have
a Hamiltonian, its ground state, and an efficient state-preparation circuit for that state. Datasets
can contain a range of object types, including:

- ``numpy.ndarray``
- any numeric type
- :class:`~.qchem.Molecule`
- most :class:`~.Operator` types
- ``list`` of any supported type
- ``dict`` of any supported type, as long as the keys are strings


Creating a Dataset
~~~~~~~~~~~~~~~~~~

To create a new dataset in-memory, initialize a new ``Dataset`` with the desired attributes:

>>> hamiltonian = qml.Hamiltonian([1., 1.], [qml.PauliZ(wires=0), qml.PauliZ(wires=1)])
>>> eigvals, eigvecs = np.linalg.eigh(qml.matrix(hamiltonian))
>>> dataset = qml.data.Dataset(
... hamiltonian = hamiltonian,
... eigen = {"eigvals": eigvals, "eigvecs": eigvecs}
... )
>>> dataset.hamiltonian
<Hamiltonian: terms=2, wires=[0, 1]>
>>> dataset.eigen
{'eigvals': array([-2., 0., 0., 2.]),
'eigvecs': array([[0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j],
[0.+0.j, 1.+0.j, 0.+0.j, 0.+0.j],
[0.+0.j, 0.+0.j, 1.+0.j, 0.+0.j],
[1.+0.j, 0.+0.j, 0.+0.j, 0.+0.j]])}

Attributes can also be assigned to the instance after creation:

>>> dataset.ground_state = np.transpose(eigvecs)[np.argmin(eigvals)]
>>> dataset.ground_state
array([0.+0.j, 0.+0.j, 0.+0.j, 1.+0.j])


Reading and Writing Datasets
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Datasets can be saved to disk for later use. Datasets use the HDF5 format for serialization,
which uses the '.h5' file extension.
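To see why HDF5 suits this use, a small illustrative sketch of the format itself using ``h5py`` directly. This shows the general shape of an HDF5 file (nested groups of named, typed arrays plus metadata attributes), not PennyLane's exact on-disk layout for ``Dataset``:

```python
import h5py
import numpy as np

# Write a file with one group holding a named array, plus file-level metadata.
with h5py.File("example.h5", "w") as f:
    eigen = f.create_group("eigen")  # groups act as nested namespaces
    eigen.create_dataset("eigvals", data=np.array([-2.0, 0.0, 0.0, 2.0]))
    f.attrs["data_name"] = "Example"  # small metadata travels as attributes

# Read it back; contents are addressable by path, like a filesystem.
with h5py.File("example.h5", "r") as f:
    print(list(f["eigen"]))          # ['eigvals']
    print(f["eigen/eigvals"][...])   # [-2.  0.  0.  2.]
```

Because arrays are stored natively and read lazily, large attributes need not be loaded until accessed, which is what enables the open-modes behaviour described below.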

To save a dataset, use the :meth:`Dataset.write()` method:

>>> my_dataset = Dataset(...)
>>> my_dataset.write("~/datasets/my_dataset.h5")

To open a dataset from a file, use the :meth:`Dataset.open` class method:

>>> my_dataset = Dataset.open("~/datasets/my_dataset.h5", mode="r")

The ``mode`` argument follows the standard library convention: ``'r'`` for reading, ``'w-'`` and ``'w'`` for create and overwrite,
and ``'a'`` for editing. ``open()`` can be used to create a new dataset directly on disk:

>>> new_dataset = Dataset.open("~/datasets/new_datasets.h5", mode="w")

By default, any changes made to an opened dataset will be committed directly to the file, which will fail
if the file is opened read-only. The ``"copy"`` mode can be used to load the dataset into memory and detach
it from the file:

>>> my_dataset = Dataset.open("~/datasets/my_dataset.h5", mode="copy")
>>> my_dataset.new_attribute = "abc"


Attribute Metadata
~~~~~~~~~~~~~~~~~~

Dataset attributes can also contain additional metadata, such as docstrings. The :func:`qml.data.attribute`
function can be used to attach metadata on assignment or initialization.

>>> hamiltonian = qml.Hamiltonian([1., 1.], [qml.PauliZ(wires=0), qml.PauliZ(wires=1)])
>>> eigvals, eigvecs = np.linalg.eigh(qml.matrix(hamiltonian))
>>> dataset = qml.data.Dataset(hamiltonian = qml.data.attribute(
...     hamiltonian,
...     doc="The hamiltonian of the system"))
>>> dataset.eigen = qml.data.attribute(
...     {"eigvals": eigvals, "eigvecs": eigvecs},
...     doc="Eigenvalues and eigenvectors of the hamiltonian")

This metadata can then be accessed using the :meth:`Dataset.attr_info` mapping:

>>> dataset.attr_info["eigen"]["doc"]
'Eigenvalues and eigenvectors of the hamiltonian'


Declarative API
~~~~~~~~~~~~~~~

When creating datasets to model a physical system, it is common to collect the same data for
a system under different conditions or assumptions. For example, a collection of datasets describing
a quantum oscillator might contain the first 1000 energy levels for different masses and force constants.

The dataset declarative API allows us to create subclasses of ``Dataset`` that define the required attributes,
or 'fields', and their associated type and documentation:

.. code-block:: python

class QuantumOscillator(qml.data.Dataset, data_name="quantum_oscillator", identifiers=["mass", "force_constant"]):
\"""Dataset describing a quantum oscillator.\"""

mass: float = qml.data.field(doc = "The mass of the particle")
force_constant: float = qml.data.field(doc = "The force constant of the oscillator")
hamiltonian: qml.Hamiltonian = qml.data.field(doc = "The hamiltonian of the particle")
energy_levels: np.ndarray = qml.data.field(doc = "The first 1000 energy levels of the system")

The ``data_name`` keyword specifies a category or descriptive name for the dataset type, and the ``identifiers``
keyword specifies the fields that function as parameters, i.e., those that determine the behaviour
of the system.

When a ``QuantumOscillator`` dataset is created, its attributes will have the documentation from the field
definition:

>>> dataset = QuantumOscillator(mass=1, force_constant=0.5, hamiltonian=..., energy_levels=...)
>>> dataset.attr_info["mass"]["doc"]
'The mass of the particle'
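The declarative pattern above can be mimicked with stdlib dataclasses, which likewise turn annotated class attributes into per-field metadata. This is an illustration of the idea only, not PennyLane's implementation of ``field``:

```python
from dataclasses import dataclass, field, fields

@dataclass
class QuantumOscillator:
    # Annotated attributes become required constructor arguments; the
    # metadata mapping plays the role of the field's documentation.
    mass: float = field(metadata={"doc": "The mass of the particle"})
    force_constant: float = field(metadata={"doc": "The force constant of the oscillator"})

osc = QuantumOscillator(mass=1.0, force_constant=0.5)
docs = {f.name: f.metadata["doc"] for f in fields(QuantumOscillator)}
print(docs["mass"])  # The mass of the particle
```

The same class-body-as-schema idea underlies the ``Dataset`` subclassing shown above, with ``attr_info`` playing the role of the ``docs`` mapping here.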

"""

from .dataset import Dataset
from .data_manager import load, load_interactive, list_datasets, list_attributes
from .attributes import (
DatasetArray,
DatasetDict,
DatasetJSON,
DatasetList,
DatasetMolecule,
DatasetNone,
DatasetOperator,
DatasetScalar,
DatasetSparseArray,
DatasetString,
DatasetTuple,
)
from .base import DatasetNotWriteableError
from .base.attribute import AttributeInfo, DatasetAttribute, attribute
from .base.dataset import Dataset, field
from .data_manager import (
DEFAULT,
FULL,
list_attributes,
list_datasets,
load,
load_interactive,
)

__all__ = (
"AttributeInfo",
"attribute",
"field",
"Dataset",
"DatasetAttribute",
"DatasetNotWriteableError",
"DatasetArray",
"DatasetScalar",
"DatasetString",
"DatasetList",
"DatasetDict",
"DatasetOperator",
"DatasetNone",
"DatasetMolecule",
"DatasetSparseArray",
"DatasetJSON",
"DatasetTuple",
"load",
"load_interactive",
"list_attributes",
"list_datasets",
"DEFAULT",
"FULL",
)
40 changes: 40 additions & 0 deletions pennylane/data/attributes/__init__.py
@@ -0,0 +1,40 @@
# Copyright 2018-2023 Xanadu Quantum Technologies Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains DatasetAttribute definitions."""

from .array import DatasetArray
from .dictionary import DatasetDict
from .json import DatasetJSON
from .list import DatasetList
from .molecule import DatasetMolecule
from .none import DatasetNone
from .operator import DatasetOperator
from .scalar import DatasetScalar
from .sparse_array import DatasetSparseArray
from .string import DatasetString
from .tuple import DatasetTuple

__all__ = (
"DatasetArray",
"DatasetScalar",
"DatasetString",
"DatasetDict",
"DatasetList",
"DatasetOperator",
"DatasetSparseArray",
"DatasetMolecule",
"DatasetNone",
"DatasetJSON",
"DatasetTuple",
)