diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 7d764cddaf..20ee2193c2 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -38,6 +38,7 @@
     SimilarCuts,
     ThCovMatSpec,
 )
+from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
 from validphys.loader import (
     Loader,
     LoaderError,
@@ -237,6 +238,52 @@ def parse_fit(self, fit: str):
         except LoadFailedError as e:
             raise ConfigError(str(e), fit, self.loader.available_fits)
 
+    def produce_fitreplicas(self, fit):
+        """Production rule mapping the ``replica`` key to each Monte Carlo
+        fit replica.
+        """
+        num_replicas = num_fitted_replicas(fit)
+        return NSList(range(1, num_replicas + 1), nskey='replica')
+
+    def produce_pdfreplicas(self, fitpdf):
+        """Production rule mapping the ``replica`` key to each postfit
+        replica.
+        """
+        pdf = fitpdf['pdf']
+        replicas = fitted_replica_indexes(pdf)
+        return NSList(replicas, nskey='replica')
+
+    def produce_fitenvironment(self, fit, fitinputcontext):
+        """Like fitcontext, but additionally forcing various other
+        parameters, such as the cuts policy and Monte Carlo seeding, to be
+        the same as those of the fit.
+
+        Notes
+        -----
+        - This production rule is designed to be used as a namespace
+          to collect over, for use with
+          :py:func:`validphys.pseudodata.recreate_fit_pseudodata` and
+          can be added to freely, e.g. by setting trvlseed to be from
+          the fit runcard.
+        """
+        log.warning(f"Using mcseed and trvlseed from fit: {fit}")
+        theoryid = fitinputcontext['theoryid']
+        data_input = fitinputcontext['data_input']
+
+        runcard = fit.as_input()
+        trvlseed = runcard['trvlseed']
+        mcseed = runcard['mcseed']
+        genrep = runcard['genrep']
+
+        return {
+            "dataset_inputs": data_input,
+            "theoryid": theoryid,
+            "use_cuts": CutsPolicy.FROMFIT,
+            "mcseed": mcseed,
+            "trvlseed": trvlseed,
+            "genrep": genrep,
+        }
+
     def produce_fitcontext(self, fitinputcontext, fitpdf):
         """Set PDF, theory ID and data input from the fit config"""
 
diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py
index ca7fed9959..fc631c68b5 100644
--- a/validphys2/src/validphys/n3fit_data.py
+++ b/validphys2/src/validphys/n3fit_data.py
@@ -405,29 +405,77 @@ def validation_pseudodata(pseudodata_table, training_mask):
 replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))
 
 
-def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
+@table
+def replica_training_mask_table(exps_tr_masks, replica, experiments_index):
     """Save the boolean mask used to split data into training and validation
-    for each replica as a pandas DataFrame, indexed by
+    for a given replica as a pandas DataFrame, indexed by
     :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
     the training and validation data used in a fit.
 
     Parameters
     ----------
-    replicas_exps_tr_masks: list[list[list[np.array]]]
-        Result of :py:func:`tr_masks` collected over experiments then replicas,
-        which creates the nested structure. The outer list is len(replicas),
-        the next list is len(group_dataset_inputs_by_experiment) and the
-        inner-most list has an array for each dataset in that particular
-        experiment - as defined by the metadata. The arrays should be 1-D
-        boolean arrays which can be used as masks.
-    replicas: NSlist
-        Namespace list of replica numbers to tabulate masks for, each element
-        of the list should be a `replica`. See example below for more
-        information.
+    exps_tr_masks: list[list[np.array]]
+        Result of :py:func:`tr_masks` collected over experiments, which creates
+        the nested structure. The outer list is
+        len(group_dataset_inputs_by_experiment) and the inner-most list has an
+        array for each dataset in that particular experiment - as defined by the
+        metadata. The arrays should be 1-D boolean arrays which can be used as
+        masks.
+    replica: int
+        The index of the replica.
     experiments_index: pd.MultiIndex
         Index returned by :py:func:`validphys.results.experiments_index`.
 
+    Example
+    -------
+    >>> from validphys.api import API
+    >>> ds_inp = [
+    ...     {'dataset': 'NMC', 'frac': 0.75},
+    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75},
+    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75}
+    ... ]
+    >>> API.replica_training_mask_table(dataset_inputs=ds_inp, replica=1, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False)
+                         replica 1
+    group dataset    id
+    NMC   NMC        0        True
+                     1        True
+                     2       False
+                     3        True
+                     4        True
+    ...                        ...
+    CMS   CMSZDIFF12 45       True
+                     46       True
+                     47       True
+                     48      False
+                     49       True
+
+    [345 rows x 1 columns]
+    """
+    all_masks = np.concatenate([
+        ds_mask
+        for exp_masks in exps_tr_masks
+        for ds_mask in exp_masks
+    ])
+    return pd.DataFrame(
+        all_masks,
+        columns=[f"replica {replica}"],
+        index=experiments_index
+    )
+
+replicas_training_mask_table = collect("replica_training_mask_table", ("replicas",))
+@table
+def training_mask_table(replicas_training_mask_table):
+    """Save the boolean mask used to split data into training and validation
+    for each replica as a pandas DataFrame, indexed by
+    :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
+    the training and validation data used in a fit.
+
+    Parameters
+    ----------
+    replicas_training_mask_table: list[pd.DataFrame]
+        Result of :py:func:`replica_training_mask_table` collected over replicas.
+
     Example
     -------
     >>> from validphys.api import API
@@ -457,26 +505,8 @@ def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
 
     [345 rows x 3 columns]
     """
-    rep_dfs = []
-    for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas):
-        # create flat list with all dataset masks in, then concatenate to single
-        # array.
-        all_masks = np.concatenate([
-            ds_mask
-            for exp_masks in rep_exps_masks
-            for ds_mask in exp_masks
-        ])
-        rep_dfs.append(pd.DataFrame(
-            all_masks,
-            columns=[f"replica {rep}"],
-            index=experiments_index
-        ))
-    return pd.concat(rep_dfs, axis=1)
-
+    return pd.concat(replicas_training_mask_table, axis=1)
 
-@table
-def training_mask_table(training_mask):
-    return training_mask
 
 def fitting_pos_dict(posdataset):
     """Loads a positivity dataset.
For more information see diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 5937f8a35f..f449958898 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -5,31 +5,26 @@ """ from collections import namedtuple import logging -import multiprocessing as mp -import os import pathlib import numpy as np import pandas as pd -from validphys.checks import check_cuts_fromfit, check_darwin_single_process from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect -from validphys.n3fit_data import replica_mcseed, replica_trvlseed -import validphys.n3fit_data_utils as reader +FILE_PREFIX = "datacuts_theory_fitting_" log = logging.getLogger(__name__) DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) -fitted_pseudodata = collect('fitted_pseudodata_internal', ('fitcontext',)) - context_index = collect("groups_index", ("fitcontext",)) +read_fit_pseudodata = collect('read_replica_pseudodata', ('fitreplicas', 'fitenvironment')) +read_pdf_pseudodata = collect('read_replica_pseudodata', ('pdfreplicas', 'fitenvironment')) -@check_cuts_fromfit -def read_fit_pseudodata(fitcontext, context_index): +def read_replica_pseudodata(fit, context_index, replica): """Function to handle the reading of training and validation splits for a fit that has been produced with the ``savepseudodata`` flag set to ``True``. @@ -55,64 +50,49 @@ def read_fit_pseudodata(fitcontext, context_index): Example ------- >>> from validphys.api import API - >>> data_indices_list = API.read_fit_pseudodata(fit="NNPDF31_nnlo_as_0118_DISonly_pseudodata", use_cuts="fromfit") + >>> data_indices_list = API.read_fit_pseudodata(fit="pseudodata_test_fit_n3fit") >>> len(data_indices_list) # Same as nrep - 100 + 10 >>> rep_info = data_indices_list[0] >>> rep_info.pseudodata.loc[rep_info.tr_idx].head() - data - group dataset id - BCDMS BCDMSD 0 0.371510 - 1 0.365659 - 2 0.350234 - 4 0.355560 - 6 0.346234 - data + replica 1 + group dataset id + ATLAS ATLASZPT8TEVMDIST 1 30.665835 + 3 15.795880 + 4 8.769734 + 5 3.117819 + 6 0.771079 """ # List of length 1 due to the collect context_index = context_index[0] # The [0] is because of how pandas handles sorting a MultiIndex sorted_index = context_index.sortlevel(level=range(1,3))[0] - pdf = fitcontext["pdf"] - log.debug(f"Using same pseudodata & training/validation splits as {pdf.name}.") - nrep = len(pdf) - path = pathlib.Path(pdf.infopath) - - data_indices_list = [] - for rep_number in range(1, nrep): - # This is a symlink (usually). - replica = path.with_name(pdf.name + "_" + str(rep_number).zfill(4) + ".dat") - # we resolve the symlink - if replica.parent.is_symlink(): - replica = pathlib.Path(os.path.realpath(replica)) - - training_path = replica.with_name("training.dat") - validation_path = replica.with_name("validation.dat") - - try: - tr = pd.read_csv(training_path, index_col=[0, 1, 2], sep="\t", names=["data"]) - val = pd.read_csv(validation_path, index_col=[0, 1, 2], sep="\t", names=["data"]) - except FileNotFoundError as e: - raise FileNotFoundError( - "Could not find saved training and validation data files. 
" - f"Please ensure {pdf} was generated with the savepseudodata flag set to true" - ) from e - tr["type"], val["type"] = "training", "validation" - - pseudodata = pd.concat((tr, val)) - pseudodata.sort_index(level=range(1,3), inplace=True) - - pseudodata.index = sorted_index - - tr = pseudodata[pseudodata["type"]=="training"] - val = pseudodata[pseudodata["type"]=="validation"] - - data_indices_list.append( - DataTrValSpec(pseudodata.drop("type", axis=1), tr.index, val.index) - ) + log.debug(f"Reading pseudodata & training/validation splits from {fit.name}.") + replica_path = fit.path / "nnfit" / f"replica_{replica}" + + training_path = replica_path / (FILE_PREFIX + "training_pseudodata.csv") + validation_path = replica_path / (FILE_PREFIX + "validation_pseudodata.csv") + + try: + tr = pd.read_csv(training_path, index_col=[0, 1, 2], sep="\t", header=0) + val = pd.read_csv(validation_path, index_col=[0, 1, 2], sep="\t", header=0) + except FileNotFoundError as e: + raise FileNotFoundError( + "Could not find saved training and validation data files. " + f"Please ensure {fit} was generated with the savepseudodata flag set to true" + ) from e + tr["type"], val["type"] = "training", "validation" + + pseudodata = pd.concat((tr, val)) + pseudodata.sort_index(level=range(1,3), inplace=True) + + pseudodata.index = sorted_index - return data_indices_list + tr = pseudodata[pseudodata["type"]=="training"] + val = pseudodata[pseudodata["type"]=="validation"] + + return DataTrValSpec(pseudodata.drop("type", axis=1), tr.index, val.index) def make_replica(dataset_inputs_loaded_cd_with_cuts, replica_mcseed): @@ -238,233 +218,70 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -@check_darwin_single_process -def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): - """A function to obtain information about the pseudodata that went - into an N3FIT fit. - - Parameters - ---------- - fit: :py:class:`validphys.core.FitSpec` - experiments: - List of :py:class:`validphys.core.ExeperimentSpec` - num_nnfit_replicas: ``int`` - Provided for by :py:mod:`validphys.fitdata`. Equal to the number of - pre-postfit replicas. - t0pdfset: :py:class:`validphys.core.PDF` - NPROC: ``int`` - Integer specifying how many cores to run on. Default is - ``mp.cpu_count()`` - - Example - ------- - Create a ``YAML`` file say ``runcard_for_pseudodata.yaml`` - - .. code-block:: YAML - :caption: runcard_for_pseudodata.yaml - - pdf: PN3_DIS_130519 - fit: PN3_DIS_130519 - - experiments: - from_: fit - - theory: - from_: fit - - t0pdfset: - from_: datacuts - - datacuts: - from_: fit - - theoryid: - from_: theory - - use_cuts: fromfit - - Then run - - >>> with open("./runcard_for_pseudodata.yaml", 'r') as stream: - ... from reportengine.compat import yaml - ... runcard = yaml.safe_load(stream) - >>> from validphys.api import API - >>> API.get_pseudodata_internal(**runcard) - - Notes - ----- - - This is a wrapper for the ``fitted_pseudodata`` action - which knows that ``experiments``, *must* come from fit - and similarly ``PDF`` and ``theoryid`` *must* be the same as - that of ``fit`` and so on. - - This function returns the pseudodata for the replicas - pre-postfit. Postfit discards some replicas and rearranges - the order. The correpsondence is done by the - :py:func:`get_pseudodata` - function. - - This code runs in parallel to increase efficiency. 
- """ - if t0pdfset is not None: - t0pdfset = t0pdfset.load_t0() - - # The + 1 coming from the fact that we wish to - # include the last replica - replica = range(1, num_fitted_replicas + 1) - - trvlseed, mcseed, genrep = [ - fit.as_input().get(i) - for i in ["trvlseed", "mcseed", "genrep"] - ] - - # common_data_reader expects None if genrep is False - if genrep: - replicas_mcseed = [ - replica_mcseed(rep, mcseed, genrep) for rep in replica - ] - else: - replicas_mcseed = None - - replicas_trvlseeds = [replica_trvlseed(rep, trvlseed) for rep in replica] - - def task(d, mcseeds, trvlseeds, replicas): - all_exp_infos = [[] for _ in range(len(mcseeds))] - for exp in experiments: - all_exp_dicts = reader.common_data_reader( - exp, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvlseeds - ) - for i, exp_dict in enumerate(all_exp_dicts): - all_exp_infos[i].append(exp_dict) - for i, j in zip(all_exp_infos, replicas): - d[j] = i - - if NPROC == 1: - pseudodata_dicts = dict() - task(pseudodata_dicts, replicas_mcseed, replicas_trvlseeds, replica) - else: - with mp.Manager() as manager: - d = manager.dict() - - if NPROC is None: - NPROC = mp.cpu_count() - log.warning( - f"Using all {NPROC} cores available, this may be dangerous " - "especially for use on a cluster. Consider setting the NPROC " - "variable to something sensible." - ) - processes = [] - - # convert sub arrays back to lists, use tolist to get builtin python - # types. - list_split = lambda lst, n: [ - arr.tolist() for arr in np.array_split(lst, n) - ] - batched_mcseeds = list_split(replicas_mcseed, NPROC) - batched_trvlseeds = list_split(replicas_trvlseeds, NPROC) - batched_replica_num = list_split(replica, NPROC) - for mc_batch, trvl_batch, replica_batch in zip( - batched_mcseeds, batched_trvlseeds, batched_replica_num - ): - p = mp.Process( - target=task, - args=(d, mc_batch, trvl_batch, replica_batch,), - ) - p.start() - processes.append(p) - for p in processes: - p.join() - pseudodata_dicts = dict(d) - return pseudodata_dicts - - -def get_pseudodata(fitted_pseudodata, fitted_replica_indexes): - """Pseudodata used during fitting but correctly accounting for - the postfit reordering. - """ - # By collecting over `fitcontext` we create a list of length - # one. - fitted_pseudodata = fitted_pseudodata[0] - return [fitted_pseudodata[i] for i in fitted_replica_indexes] +_recreate_fit_pseudodata = collect('indexed_make_replica', ('fitreplicas', 'fitenvironment')) +_recreate_pdf_pseudodata = collect('indexed_make_replica', ('pdfreplicas', 'fitenvironment')) +fit_tr_masks = collect('replica_training_mask_table', ('fitreplicas', 'fitenvironment')) +pdf_tr_masks = collect('replica_training_mask_table', ('pdfreplicas', 'fitenvironment')) -def _datasets_mask(experiment_list): - """Function to obtain a per datasets training/validation - mask given the mask for the corresponding experiment. +def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_tr_masks): + """Function used to reconstruct the pseudodata seen by each of the + Monte Carlo fit replicas. Returns ------- - dict: - - tr_mask: training mask for the datasets in the experiment - - vl_mask: validation mask for the datasets in the experiment + res : list[namedtuple] + List of namedtuples, each of which contains a dataframe + containing all the data points, the training indices, and + the validation indices. 
+ + Example + ------- + >>> from validphys.api import API + >>> API.recreate_fit_pseudodata(fit="pseudodata_test_fit_n3fit") + + Notes + ----- + - This function does not account for the postfit reshuffling. + + See Also + -------- + :py:func:`validphys.pseudodata.recreate_pdf_pseudodata` """ - tr_mask = experiment_list["trmask"] - vl_mask = experiment_list["vlmask"] - slices = [] - start = 0 - for i in experiment_list["datasets"]: - ndata = i["ndata"] - slices.append(start + ndata) - start += ndata - - return { - "trmask": np.split(tr_mask, slices[:-1]), - "vlmask": np.split(vl_mask, slices[:-1]), - } - - -def training_validation_pseudodata(get_pseudodata): - """Generator to yield a dictionary of training and validation DataFrame - per replica indexed appropriately using a MultiIndex + res = [] + for pseudodata, mask, rep in zip(_recreate_fit_pseudodata, fit_tr_masks, fitreplicas): + df = pseudodata + df.columns = [f"replica {rep}"] + tr_idx = df.loc[mask.values].index + val_idx = df.loc[~mask.values].index + res.append(DataTrValSpec(pseudodata, tr_idx, val_idx)) + return res + +def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdf_tr_masks, pdfreplicas): + """Like :py:func:`validphys.pseudodata.recreate_fit_pseudodata` + but accounts for the postfit reshuffling of replicas. + + Returns + ------- + res : list[namedtuple] + List of namedtuples, each of which contains a dataframe + containing all the data points, the training indices, and + the validation indices. + + Example + ------- + >>> from validphys.api import API + >>> API.recreate_pdf_pseudodata(fit="pseudodata_test_fit_n3fit") + + See Also + -------- + :py:func:`validphys.pseudodata.recreate_fit_pseudodata` """ - exp_infos = get_pseudodata - columns = ["experiment", "dataset", "id"] - # Loop over all initial replicas - for replica in exp_infos: - tr_records, tr_central_values = [], [] - vl_records, vl_central_values = [], [] - # Loop over experiments in given replica - for experiment in replica: - split_masks = _datasets_mask(experiment) - tr_mask, vl_mask = split_masks["trmask"], split_masks["vlmask"] - # While we're here extend the central_values of the experiment - tr_central_values.extend(np.squeeze(experiment["expdata"])) - vl_central_values.extend(np.squeeze(experiment["expdata_vl"])) - # Loop over datasets in experiment - for i, dataset in enumerate(experiment["datasets"]): - tr_dataset_mask = tr_mask[i] - vl_dataset_mask = vl_mask[i] - tr_indices = np.array((range(dataset["ndata"])))[tr_dataset_mask] - vl_indices = np.array((range(dataset["ndata"])))[vl_dataset_mask] - for tr_idat in tr_indices: - tr_records.append( - dict( - [ - ("experiment", experiment["name"]), - ("dataset", dataset["name"]), - ("id", tr_idat), - ] - ) - ) - for vl_idat in vl_indices: - vl_records.append( - dict( - [ - ("experiment", experiment["name"]), - ("dataset", dataset["name"]), - ("id", vl_idat), - ] - ) - ) - - tr_df = pd.DataFrame(tr_records, columns=columns) - vl_df = pd.DataFrame(vl_records, columns=columns) - - tr_df.set_index(columns, inplace=True) - vl_df.set_index(columns, inplace=True) - - tr_index = tr_df.index - vl_index = vl_df.index - tr_vl_dict = { - "trdata": pd.DataFrame(tr_central_values, index=tr_index, columns=["data"]), - "vldata": pd.DataFrame(vl_central_values, index=vl_index, columns=["data"]), - } - yield tr_vl_dict + res = [] + for pseudodata, mask, rep in zip(_recreate_pdf_pseudodata, pdf_tr_masks, pdfreplicas): + df = pseudodata + df.columns = [f"replica {rep}"] + tr_idx = df.loc[mask.values].index + val_idx = 
df.loc[~mask.values].index + res.append(DataTrValSpec(pseudodata, tr_idx, val_idx)) + return res \ No newline at end of file diff --git a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle b/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle deleted file mode 100644 index e854305bfb..0000000000 Binary files a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle and /dev/null differ diff --git a/validphys2/src/validphys/tests/test_pseudodata.py b/validphys2/src/validphys/tests/test_pseudodata.py index 6643cf97dc..604c3211a2 100644 --- a/validphys2/src/validphys/tests/test_pseudodata.py +++ b/validphys2/src/validphys/tests/test_pseudodata.py @@ -2,111 +2,83 @@ Test to ensure the validphys.pseudodata.get_pseudodata action correctly obtains the appropriate pseudodata for an n3fit fit. -To this end, a 10 replica fit has been uploaded named -`pseudodata_test_fit` obtained using 100 epochs, theoryID 162 and a -subset of DIS datasets. When this fit was performed, the `all_exp_infos` -was pickled and stored in `exp_infos.pickle` which is the benchmark -we use to ensure the action is working appropriately. +A fit has been generated called pseudodata_test_fit_n3fit +which has the pseudodata saved as training and validation splits. +This is used to benchmark the correctness of the pseudodata +recreation. """ -import pickle -from importlib.resources import read_binary - -import numpy as np +import pandas as pd import pytest from validphys.api import API -from validphys.pseudodata import training_validation_pseudodata -import validphys.tests.regressions +from validphys.tests.conftest import FIT -from reportengine.checks import CheckError -from reportengine.compat import yaml -from reportengine.resourcebuilder import ResourceError +PSEUDODATA_FIT = "pseudodata_test_fit_n3fit" -EXAMPLE_RUNCARD = """fit: pseudodata_test_fit -pdf: pseudodata_test_fit -experiments: - from_: fit +def test_read_fit_pseudodata(): + fit_pseudodata = API.read_fit_pseudodata(fit=PSEUDODATA_FIT) -t0pdfset: - from_: datacuts + nrep = API.num_fitted_replicas(fit=PSEUDODATA_FIT) + assert nrep == len(fit_pseudodata) -datacuts: - from_: fit + for data, tr_idx, val_idx in fit_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) -use_cuts: fromfit -""" +def test_read_pdf_pseudodata(): + pdf_pseudodata = API.read_pdf_pseudodata(fit=PSEUDODATA_FIT) -@pytest.fixture( - scope="session", - params=[1, pytest.param(None, marks=pytest.mark.linux)], -) -def setup_dicts(request): - n_process_config = dict(NPROC=request.param) - exp_infos_bytes = read_binary(validphys.tests.regressions, "test_exp_infos.pickle") - ns = yaml.safe_load(EXAMPLE_RUNCARD) - # This is what all the fitted replicas saw - exp_infos = pickle.loads(exp_infos_bytes) + pdf = API.pdf(pdf=PSEUDODATA_FIT) + # -1 because we ignore replica 0 + assert len(pdf) - 1 == len(pdf_pseudodata) - # We now need to convert these to postfit replicas - fitted_indices = API.fitted_replica_indexes(**ns) - fit_postfit_mapping = dict(enumerate(exp_infos, 1)) - exp_infos = [fit_postfit_mapping[i] for i in fitted_indices] + for data, tr_idx, val_idx in pdf_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) - pseudodata_info = API.get_pseudodata(**ns, **n_process_config) - return exp_infos, pseudodata_info +def test_recreate_fit_pseudodata(): + fit_pseudodata = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT) + nrep = 
API.num_fitted_replicas(fit=PSEUDODATA_FIT) + assert nrep == len(fit_pseudodata) -def test_read_fit_pseudodata(): - data_indices_list = API.read_fit_pseudodata( - fit="dummy_pseudodata_read_test_fit", - use_cuts="fromfit" - ) - - # Only bother checking the first ten replicas - for data_indices in data_indices_list[:10]: - data, tr_idx, val_idx = data_indices - # Check the training and validation index are disjoint - assert set(tr_idx).isdisjoint(set(val_idx)) - - - with pytest.raises(FileNotFoundError): - # Check a FileNotFoundError is raised - # if the input fit wasn't generated - # with the savepseudodata flag set to true - bad_gen = API.read_fit_pseudodata( - fit="dummy_pseudodata_read_failure_test_fit", use_cuts="fromfit" - ) - next(bad_gen) - - with pytest.raises(ResourceError) as e_info: - # Check the enforcement of use_cuts being set - # to fromfit is in place - API.read_fit_pseudodata( - fit="dummy_pseudodata_read_test_fit", - use_cuts="nocuts" + for data, tr_idx, val_idx in fit_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) + + +def test_recreate_pdf_pseudodata(): + pdf_pseudodata = API.recreate_pdf_pseudodata(fit=PSEUDODATA_FIT) + + pdf = API.pdf(pdf=PSEUDODATA_FIT) + # -1 because we ignore replica 0 + assert len(pdf) - 1 == len(pdf_pseudodata) + + for data, tr_idx, val_idx in pdf_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) + + +def test_no_savepseudodata(): + for func in (API.read_fit_pseudodata, API.read_pdf_pseudodata): + with pytest.raises(FileNotFoundError): + # Check a FileNotFoundError is raised + # if the input fit wasn't generated + # with the savepseudodata flag set to true + func(fit=FIT) + + +def test_read_matches_recreate(): + reads = API.read_fit_pseudodata(fit=PSEUDODATA_FIT) + recreates = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT) + for read, recreate in zip(reads, recreates): + # We ignore the absolute ordering of the dataframes and just check + # that they contain identical elements. + pd.testing.assert_frame_equal( + read.pseudodata, recreate.pseudodata, check_like=True ) - assert isinstance(e_info.__cause__, CheckError) - - -def test_pseudodata(setup_dicts): - exp_infos, pseudodata_info = setup_dicts - # Loop over replicas - for i, j in zip(exp_infos, pseudodata_info): - # For each replica, loop over experiments - for exp1, exp2 in zip(i, j): - assert np.allclose(exp1["expdata"], exp2["expdata"]) - assert np.allclose(exp1["expdata_vl"], exp2["expdata_vl"]) - - -def test_pseudodata_generator(setup_dicts): - exp_infos, pseudodata_info = setup_dicts - gen = training_validation_pseudodata(pseudodata_info) - for i, j in enumerate(gen): - continue - # There is only one postfit replica in this fit - assert i == 0 - # The training and validation split should be disjoint - assert set(j["trdata"].index).isdisjoint(j["vldata"].index) + pd.testing.assert_index_equal(read.tr_idx, recreate.tr_idx, check_order=False) + pd.testing.assert_index_equal(read.val_idx, recreate.val_idx, check_order=False)
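
For reference, a minimal usage sketch of the actions introduced in this patch (not part of the diff itself). It assumes the fit ``pseudodata_test_fit_n3fit`` used by the tests above is available locally and was generated with the ``savepseudodata`` flag set to true; any fit produced that way should work equally.

.. code-block:: python

    # Sketch: read back the pseudodata saved to disk during the fit and
    # cross-check it against the seed-based recreation, mirroring what
    # test_read_matches_recreate does above.
    from validphys.api import API

    FIT = "pseudodata_test_fit_n3fit"  # assumed to be installed locally

    # Both actions return one DataTrValSpec namedtuple per fit replica,
    # with fields (pseudodata, tr_idx, val_idx).
    saved = API.read_fit_pseudodata(fit=FIT)
    recreated = API.recreate_fit_pseudodata(fit=FIT)

    for rep_saved, rep_recreated in zip(saved, recreated):
        # The training and validation indices partition the data points.
        assert set(rep_saved.tr_idx).isdisjoint(rep_saved.val_idx)
        # Select the training split of the saved pseudodata.
        print(rep_saved.pseudodata.loc[rep_saved.tr_idx].head())

The ``read_*`` actions require the training/validation files written out when ``savepseudodata`` is true, whereas the ``recreate_*`` actions regenerate the pseudodata from the ``mcseed`` and ``trvlseed`` stored in the fit runcard; the ``pdf`` variants additionally account for the postfit reshuffling of replicas.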