NNPDF · siranipour · Sep 15, 2021 · Jul 14, 2021 · Jul 14, 2021 · Jul 19, 2021
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
@@ -38,6 +38,7 @@
     SimilarCuts,
     ThCovMatSpec,
 )
+from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
 from validphys.loader import (
     Loader,
     LoaderError,
@@ -237,6 +238,52 @@ def parse_fit(self, fit: str):
         except LoadFailedError as e:
             raise ConfigError(str(e), fit, self.loader.available_fits)
 
+    def produce_fitreplicas(self, fit):
+        """Production rule mapping the ``replica`` key to each Monte Carlo
+        fit replica.
+        """
+        num_replicas = num_fitted_replicas(fit)
+        return NSList(range(1, num_replicas + 1), nskey='replica')
+
+    def produce_pdfreplicas(self, fitpdf):
+        """Production rule mapping the ``replica`` key to each postfit
+        replica.
+        """
+        pdf = fitpdf['pdf']
+        replicas = fitted_replica_indexes(pdf)
+        return NSList(replicas, nskey='replica')
+
+    def produce_fitenvironment(self, fit, fitinputcontext):
+        """Like fitcontext, but additionally forcing various other
+        parameters, such as the cuts policy and Monte Carlo seeding, to be
+        the same as the fit.
+
+        Notes
+        -----
+            - This production rule is designed to be used as a namespace
+              to collect over, for use with
+              :py:func:`validphys.pseudodata.recreate_fit_pseudodata` and
+              can be added to freely, e.g. by setting trvlseed to be from
+              the fit runcard.
+        """
+        log.warning(f"Using mcseed and trvlseed from fit: {fit}")
+        theoryid = fitinputcontext['theoryid']
+        data_input = fitinputcontext['data_input']
+
+        runcard = fit.as_input()
+        trvlseed = runcard['trvlseed']
+        mcseed = runcard['mcseed']
+        genrep = runcard['genrep']
+
+        return {
+            "dataset_inputs": data_input,
+            "theoryid": theoryid,
+            "use_cuts": CutsPolicy.FROMFIT,
+            "mcseed": mcseed,
+            "trvlseed": trvlseed,
+            "genrep": genrep,
+        }
+
     def produce_fitcontext(self, fitinputcontext, fitpdf):
         """Set PDF, theory ID and data input from the fit config"""
 

diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py
@@ -405,29 +405,77 @@ def validation_pseudodata(pseudodata_table, training_mask):
 replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))
 
 
-def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
+@table
+def replica_training_mask_table(exps_tr_masks, replica, experiments_index):
     """Save the boolean mask used to split data into training and validation
-    for each replica as a pandas DataFrame, indexed by
+    for a given replica as a pandas DataFrame, indexed by
     :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
     the training and validation data used in a fit.
 
     Parameters
     ----------
-    replicas_exps_tr_masks: list[list[list[np.array]]]
-        Result of :py:func:`tr_masks` collected over experiments then replicas,
-        which creates the nested structure. The outer list is len(replicas),
-        the next list is len(group_dataset_inputs_by_experiment) and the
-        inner-most list has an array for each dataset in that particular
-        experiment - as defined by the metadata. The arrays should be 1-D
-        boolean arrays which can be used as masks.
-    replicas: NSlist
-        Namespace list of replica numbers to tabulate masks for, each element
-        of the list should be a `replica`. See example below for more
-        information.
+    exps_tr_masks: list[list[np.array]]
+        Result of :py:func:`tr_masks` collected over experiments, which creates
+        the nested structure. The outer list has length
+        len(group_dataset_inputs_by_experiment) and each inner list has an
+        array for each dataset in that particular experiment - as defined by the
+        metadata. The arrays should be 1-D boolean arrays which can be used as
+        masks.
+    replica: int
+        The index of the replica; used to name the column ``replica <index>``.
     experiments_index: pd.MultiIndex
         Index returned by :py:func:`validphys.results.experiments_index`.
 
 
+    Example
+    -------
+    >>> from validphys.api import API
+    >>> ds_inp = [
+    ...     {'dataset': 'NMC', 'frac': 0.75},
+    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75},
+    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75}
+    ... ]
+    >>> API.replica_training_mask_table(dataset_inputs=ds_inp, replica=1, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False)
+                         replica 1
+    group dataset    id
+    NMC   NMC        0        True
+                     1        True
+                     2       False
+                     3        True
+                     4        True
+    ...                        ...
+    CMS   CMSZDIFF12 45       True
+                     46       True
+                     47       True
+                     48      False
+                     49       True
+
+    [345 rows x 1 columns]
+    """
+    all_masks = np.concatenate([
+        ds_mask
+        for exp_masks in exps_tr_masks
+        for ds_mask in exp_masks
+    ])
+    return pd.DataFrame(
+        all_masks,
+        columns=[f"replica {replica}"],
+        index=experiments_index
+    )
+
+replicas_training_mask_table = collect("replica_training_mask_table", ("replicas",))
+@table
+def training_mask_table(replicas_training_mask_table):
+    """Save the boolean mask used to split data into training and validation
+    for each replica as a pandas DataFrame, indexed by
+    :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
+    the training and validation data used in a fit.
+
+    Parameters
+    ----------
+    replicas_training_mask_table: list[pd.DataFrame]
+        Result of :py:func:`replica_training_mask_table` collected over replicas
+
     Example
     -------
     >>> from validphys.api import API
@@ -457,26 +505,8 @@ def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
     [345 rows x 3 columns]
 
     """
-    rep_dfs = []
-    for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas):
-        # create flat list with all dataset masks in, then concatenate to single
-        # array.
-        all_masks = np.concatenate([
-            ds_mask
-            for exp_masks in rep_exps_masks
-            for ds_mask in exp_masks
-        ])
-        rep_dfs.append(pd.DataFrame(
-            all_masks,
-            columns=[f"replica {rep}"],
-            index=experiments_index
-        ))
-    return pd.concat(rep_dfs, axis=1)
-
+    return pd.concat(replicas_training_mask_table, axis=1)
 
-@table
-def training_mask_table(training_mask):
-    return training_mask
 
 def fitting_pos_dict(posdataset):
     """Loads a positivity dataset. For more information see