
Commit

Merge pull request #1517 from NNPDF/utilize_error_members_instead_of_rawdata

use error_members instead of rawdata
scarlehoff committed Mar 1, 2022
2 parents 9613456 + 420ccc1 commit d3b2639
Showing 19 changed files with 98 additions and 76 deletions.
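
The change running through this diff replaces direct access to rawdata (values for every PDF member, including the central one) with the error_members accessors, so that statistics are taken only over the error members. Note that error_members appears both as an attribute on the result objects (th.error_members) and as a method on Stats containers (grid_values.error_members()). A minimal toy sketch of the distinction, not part of the commit, assuming a Monte-Carlo-style container where member 0 is the central member:

import numpy as np

class ToyMCStats:
    """Toy stand-in for a Monte Carlo Stats container: member 0 is central."""

    def __init__(self, data):
        self.data = np.atleast_2d(data)        # shape (members, points), like rawdata

    def central_value(self):
        return self.data[0]                    # the central member

    def error_members(self):
        return self.data[1:]                   # the members used for uncertainties

    def std_error(self):
        return self.error_members().std(axis=0)

stats = ToyMCStats(np.random.default_rng(0).normal(size=(101, 5)))
print(stats.data.shape)             # (101, 5): all members, central included
print(stats.error_members().shape)  # (100, 5): what the updated code operates on
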
4 changes: 2 additions & 2 deletions n3fit/src/n3fit/tests/test_vpinterface.py
@@ -68,7 +68,7 @@ def test_vpinterface():
res_2 = xplotting_grid(fit_2, 1.6, xgrid)
distances = distance_grids([fit_1, fit_2], [res_1, res_2], 0)
assert len(distances) == 2
- assert distances[0].grid_values.data.shape == (8, 40)
- assert distances[1].grid_values.data.shape == (8, 40)
+ assert distances[0].grid_values.data.shape == (1, 8, 40)
+ assert distances[1].grid_values.data.shape == (1, 8, 40)
np.testing.assert_allclose(distances[0].grid_values.data, 0.0)
assert not np.allclose(distances[1].grid_values.data, 0.0)
2 changes: 1 addition & 1 deletion validphys2/src/validphys/arclength.py
@@ -49,7 +49,7 @@ def arc_lengths(
# Finite diff. step-size, x-grid
eps = (b - a) / npoints
ixgrid = xgrid(a, b, "linear", npoints)
- # PDFs evaluated on grid
+ # PDFs evaluated on grid, use the entire thing, the Stats class will choose later
xfgrid = xplotting_grid(pdf, Q, ixgrid, basis, flavours).grid_values.data * ixgrid[1]
fdiff = np.diff(xfgrid) / eps # Compute forward differences
res += integrate.simps(1 + np.square(fdiff), ixgrid[1][1:])
4 changes: 2 additions & 2 deletions validphys2/src/validphys/calcutils.py
@@ -67,8 +67,8 @@ def calc_chi2(sqrtcov, diffs):
return np.einsum('i...,i...->...', vec,vec)

def all_chi2(results):
"""Return the chi² for all elements in the result. Note that the
interpretation of the result will depend on the PDF error type"""
"""Return the chi² for all elements in the result, regardless of the Stats class
Note that the interpretation of the result will depend on the PDF error type"""
data_result, th_result = results
diffs = th_result.rawdata - data_result.central_value[:,np.newaxis]
return calc_chi2(sqrtcov=data_result.sqrtcovmat, diffs=diffs)
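
The chi² contraction above is applied to one column of differences per member, so all_chi2 returns one chi² per member whatever the error type. A self-contained sketch of the same pattern (toy covariance, not the validphys implementation):

import numpy as np
from scipy.linalg import solve_triangular

def toy_calc_chi2(sqrtcov, diffs):
    """chi² from the Cholesky factor: solve L v = diffs so that v.v = diffs^T C^-1 diffs."""
    vec = solve_triangular(sqrtcov, diffs, lower=True)
    return np.einsum('i...,i...->...', vec, vec)

rng = np.random.default_rng(1)
ndata, nmembers = 4, 10
a = rng.normal(size=(ndata, ndata))
cov = a @ a.T + ndata * np.eye(ndata)        # well-conditioned toy covariance
sqrtcov = np.linalg.cholesky(cov)
diffs = rng.normal(size=(ndata, nmembers))   # theory minus data, one column per member
print(toy_calc_chi2(sqrtcov, diffs).shape)   # (10,): one chi² per member
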
6 changes: 3 additions & 3 deletions validphys2/src/validphys/closuretest/closure_results.py
@@ -122,7 +122,7 @@ def bootstrap_bias_experiment(
"""
dt_ct, th_ct = dataset_inputs_results
((_, th_ul),) = underlying_dataset_inputs_results
- th_ct_boot_cv = bootstrap_values(th_ct.rawdata, bootstrap_samples)
+ th_ct_boot_cv = bootstrap_values(th_ct.error_members, bootstrap_samples)
boot_diffs = th_ct_boot_cv - th_ul.central_value[:, np.newaxis]
boot_bias = calc_chi2(dt_ct.sqrtcovmat, boot_diffs) / len(dt_ct)
return boot_bias
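
The bootstrap now resamples only the error members. A toy version of the resampling step, with assumed semantics rather than validphys's bootstrap_values:

import numpy as np

def toy_bootstrap_values(error_members, nresamples):
    """Resample the member axis with replacement and average each resample."""
    rng = np.random.default_rng(2)
    ndata, nrep = error_members.shape
    idx = rng.integers(0, nrep, size=(nresamples, nrep))
    return error_members[:, idx].mean(axis=-1)    # shape (ndata, nresamples)

th_error_members = np.random.default_rng(3).normal(size=(20, 100))   # (data, members)
boot_cv = toy_bootstrap_values(th_error_members, nresamples=500)
print(boot_cv.shape)   # (20, 500): one bootstrap central value per resample
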
@@ -191,7 +191,7 @@ def variance_dataset(results, fit, use_fitcommondata):
"""
dt, th = results
- diff = th.central_value[:, np.newaxis] - th.rawdata
+ diff = th.central_value[:, np.newaxis] - th.error_members
var_unnorm = calc_chi2(dt.sqrtcovmat, diff).mean()
return VarianceData(var_unnorm, len(th))

@@ -210,7 +210,7 @@ def bootstrap_variance_experiment(dataset_inputs_results, bootstrap_samples=500)
normalised to the number of data in the experiment.
"""
dt_ct, th_ct = dataset_inputs_results
- diff = th_ct.central_value[:, np.newaxis] - th_ct.rawdata
+ diff = th_ct.central_value[:, np.newaxis] - th_ct.error_members
var_unnorm_boot = bootstrap_values(
diff,
bootstrap_samples,
8 changes: 4 additions & 4 deletions validphys2/src/validphys/closuretest/multiclosure.py
@@ -130,7 +130,7 @@ def fits_dataset_bias_variance(
"""
closures_th, law_th, _, sqrtcov = internal_multiclosure_dataset_loader
# The dimensions here are (fit, data point, replica)
- reps = np.asarray([th.rawdata[:, :_internal_max_reps] for th in closures_th])
+ reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
# take mean across replicas - since we might have changed no. of reps
centrals = reps.mean(axis=2)
# place bins on first axis
@@ -226,7 +226,7 @@ def dataset_xi(internal_multiclosure_dataset_loader):
"""
closures_th, law_th, covmat, _ = internal_multiclosure_dataset_loader
- replicas = np.asarray([th.rawdata for th in closures_th])
+ replicas = np.asarray([th.error_members for th in closures_th])
centrals = np.mean(replicas, axis=-1)
underlying = law_th.central_value

@@ -341,7 +341,7 @@ def _bootstrap_multiclosure_fits(
# construct proxy fits theory predictions
for fit_th in fit_boot_th:
rep_boot_index = rng.choice(n_rep_max, size=n_rep, replace=use_repeats)
- boot_ths.append(BootstrappedTheoryResult(fit_th.rawdata[:, rep_boot_index]))
+ boot_ths.append(BootstrappedTheoryResult(fit_th.error_members[:, rep_boot_index]))
return (boot_ths, *input_tuple)


@@ -741,7 +741,7 @@ def dataset_fits_bias_replicas_variance_samples(
"""
closures_th, law_th, _, sqrtcov = internal_multiclosure_dataset_loader
# The dimensions here are (fit, data point, replica)
- reps = np.asarray([th.rawdata[:, :_internal_max_reps] for th in closures_th])
+ reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
# take mean across replicas - since we might have changed no. of reps
centrals = reps.mean(axis=2)
# place bins on first axis
2 changes: 1 addition & 1 deletion validphys2/src/validphys/closuretest/multiclosure_pdf.py
@@ -90,7 +90,7 @@ def xi_grid_values(xi_pdfgrids):
glu_sin_grid, nonsin_grid = xi_pdfgrids
# grid values have shape: replica, flavour, x
# concatenate along flavour
- return np.concatenate((glu_sin_grid.grid_values.data, nonsin_grid.grid_values.data), axis=1)
+ return np.concatenate((glu_sin_grid.grid_values.error_members(), nonsin_grid.grid_values.error_members()), axis=1)


def underlying_xi_grid_values(
@@ -126,8 +126,8 @@ def experiments_closure_pseudodata_estimators_table(
fits_erep_delta_chi2 = []
for i_fit, fit_th in enumerate(closures_th):
# some of these could be done outside of loop, but easier to do here.
- th_replicas = fit_th.rawdata
- th_central = np.mean(th_replicas, axis=-1)
+ th_replicas = fit_th.error_members
+ th_central = fit_th.central_value
dt_replicas = fits_reps_pseudo[i_fit].xs(exp_name, axis=0, level=0).to_numpy()
dt_central = fits_cv.xs(exp_name, axis=0, level=0).iloc[:, i_fit].to_numpy()

20 changes: 9 additions & 11 deletions validphys2/src/validphys/correlations.py
@@ -27,8 +27,8 @@ def _basic_obs_pdf_correlation(pdf_stats, obs_stats):
(nbins x nf x nx), compatible with grid_values, upon
changing replicas->bins.
"""
- x = pdf_stats.data - pdf_stats.central_value()
- y = obs_stats.data - obs_stats.central_value()
+ x = pdf_stats.error_members() - pdf_stats.central_value()
+ y = obs_stats.error_members() - obs_stats.central_value()

#We want to compute:
#sum(x*y)/(norm(x)*norm(y))
@@ -53,26 +53,24 @@ def _basic_obs_obs_correlation(obs1, obs2):
The result is (nbins1 , nbins2), a matrix containing the correlation
coefficients between the two sets.
"""
- x = (obs1.data - obs1.central_value()).T
- y = (obs2.data - obs2.central_value())
+ x = (obs1.error_members() - obs1.central_value()).T
+ y = obs2.error_members() - obs2.central_value()

return x@y/np.outer(la.norm(x,axis=1),la.norm(y,axis=0))

- #TODO: Implement for other error types. Do not use the rawdata.
@check_pdf_is_montecarlo
def obs_pdf_correlations(pdf, results, xplotting_grid):
"""Return the correlations between each point in a dataset and the PDF
values on a grid of (x,f) points in a format similar to `xplotting_grid`."""
- _ , th = results
+ _, th = results
corrs = pdf.stats_class(_basic_obs_pdf_correlation(xplotting_grid.grid_values, th.stats))
return xplotting_grid.copy_grid(grid_values=corrs)


- corrpair_results = collect('results', ['corrpair'])
- corrpair_datasets = collect('dataset', ['corrpair'])
+ corrpair_results = collect("results", ["corrpair"])
+ corrpair_datasets = collect("dataset", ["corrpair"])


@check_pdf_is_montecarlo
def obs_obs_correlations(pdf, corrpair_results):
"""Return the theoretical correlation matrix between a pair of observables."""
- (_,th1), (_,th2) = corrpair_results
+ (_, th1), (_, th2) = corrpair_results
return _basic_obs_obs_correlation(th1.stats, th2.stats)
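
Both helpers centre the error members on the central value and normalise, giving Pearson-like correlation coefficients across members. A self-contained numpy sketch of the observable-observable case (toy arrays, with the mean standing in for the central value):

import numpy as np

rng = np.random.default_rng(4)
reps1 = rng.normal(size=(100, 3))    # (members, bins) for observable 1
reps2 = rng.normal(size=(100, 2))    # (members, bins) for observable 2

x = (reps1 - reps1.mean(axis=0)).T   # (bins1, members)
y = reps2 - reps2.mean(axis=0)       # (members, bins2)
corr = x @ y / np.outer(np.linalg.norm(x, axis=1), np.linalg.norm(y, axis=0))
print(corr.shape)                    # (3, 2), entries between -1 and 1
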
2 changes: 1 addition & 1 deletion validphys2/src/validphys/covmats.py
@@ -531,7 +531,7 @@ def pdferr_plus_covmat(dataset, pdf, covmat_t0_considered):
True
"""
th = ThPredictionsResult.from_convolution(pdf, dataset)
- pdf_cov = np.cov(th.rawdata, rowvar=True)
+ pdf_cov = np.cov(th.error_members, rowvar=True)
return pdf_cov + covmat_t0_considered
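
The PDF contribution to the covariance matrix is now the covariance of the theory predictions over the error members only. A toy-array sketch (not validphys objects):

import numpy as np

rng = np.random.default_rng(5)
ndata, nrep = 6, 100
error_members = rng.normal(size=(ndata, nrep))        # predictions, one column per member
exp_covmat = np.diag(rng.uniform(0.5, 1.0, ndata))    # stand-in for covmat_t0_considered

pdf_cov = np.cov(error_members, rowvar=True)          # (ndata, ndata)
total_cov = pdf_cov + exp_covmat
print(total_cov.shape)                                # (6, 6)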


4 changes: 2 additions & 2 deletions validphys2/src/validphys/dataplots.py
@@ -68,7 +68,7 @@ def _chi2_distribution_plots(chi2_data, stats, pdf, plot_type):
ax.set_xlabel(r"Replica $\chi^2$")

if plot_type == "hist":
- ax.hist(alldata.data, label=label, zorder=100)
+ ax.hist(alldata.error_members(), label=label, zorder=100)
elif plot_type == "kde":
# We need the squeeze here to change shape from (x, 1) to (x,)
ax = plotutils.kde_plot(alldata.data.squeeze(), label=label)
@@ -871,7 +871,7 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):

basis = obs_pdf_correlations.basis

- fullgrid = obs_pdf_correlations.grid_values.data
+ fullgrid = obs_pdf_correlations.grid_values.error_members()

fls = obs_pdf_correlations.flavours
x = obs_pdf_correlations.xgrid
19 changes: 7 additions & 12 deletions validphys2/src/validphys/deltachi2.py
@@ -140,15 +140,12 @@ def pos_neg_xplotting_grids(delta_chi2_hessian, xplotting_grid):
"""
positive_eigenvalue_mask = delta_chi2_hessian >= 0

- # include replica 0 in both new grids
+ # The masks do not include replica 0, add it in both grids
pos_mask = np.append(True, positive_eigenvalue_mask)
neg_mask = np.append(True, ~positive_eigenvalue_mask)

- pos_grid = xplotting_grid.grid_values.data[pos_mask]
- neg_grid = xplotting_grid.grid_values.data[neg_mask]
-
- pos_xplotting_grid = xplotting_grid.copy_grid(grid_values=pos_grid)
- neg_xplotting_grid = xplotting_grid.copy_grid(grid_values=neg_grid)
+ pos_xplotting_grid = xplotting_grid.mask_replicas(pos_mask)
+ neg_xplotting_grid = xplotting_grid.mask_replicas(neg_mask)

return [xplotting_grid, pos_xplotting_grid, neg_xplotting_grid]

@@ -210,12 +207,10 @@ def draw(self, pdf, grid, flstate):
labels = flstate.labels
handles = flstate.handles

- # pick all replicas grid_values for the flavour flindex. stats_class is a method
- # of the PDF class, which returns the stats calculator (object) for the pdf error type.
- # Basically stats is an object which says what is the type class of the replicas:
- # MCStats, HessianStats, SymmHessianStats. In this way validphys use the right methods
- # to compute statistical values.
- stats = pdf.stats_class(grid.grid_values.data[:, flindex, :])
+ # Create a copy of the `Stats` instance of the grid
+ # with only the flavours we are interested in
+ gv = grid.grid_values.data
+ stats = grid(grid_values=gv[:, flindex, :]).grid_values

# Ignore spurious normalization warnings
with warnings.catch_warnings():
2 changes: 1 addition & 1 deletion validphys2/src/validphys/paramfits/plots.py
@@ -778,7 +778,7 @@ def plot_total_as_distribution_dataspecs(
dataspecs_parabolic_as_determination_for_total,
dataspecs_speclabel):
#Remember that *_for_total is a len 1 list, so take the first element.
- kde_plot(dist[0].data, ax=ax, label=label)
+ kde_plot(dist[0].error_members(), ax=ax, label=label)
ax.set_xlabel(r"$\alpha_S$")
ax.legend()
return fig
31 changes: 24 additions & 7 deletions validphys2/src/validphys/pdfgrids.py
@@ -55,6 +55,23 @@ class XPlottingGrid:
grid_values: Stats
scale: str

+ def select_flavour(self, flindex):
+ """Return a new grid for one single flavour"""
+ if isinstance(flindex, str):
+ flstr = flindex
+ flindex = self.flavours.index(flindex)
+ else:
+ flstr = self.flavours[flindex]
+ new_grid = self.grid_values.data[:, flindex]
+ gv = self.grid_values.__class__(new_grid)
+ return dataclasses.replace(self, grid_values=gv, flavours=[flstr])
+
+ def mask_replicas(self, mask):
+ """Return a copy of XPlottingGrid with the mask applied to the replicas"""
+ new_grid = self.grid_values.data[mask]
+ gv = self.grid_values.__class__(new_grid)
+ return dataclasses.replace(self, grid_values=gv)
+
def copy_grid(self, grid_values=None):
"""Create a copy of the grid with potentially a different set of values"""
if not isinstance(grid_values, Stats):
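
In terms of the underlying array, the two new helpers reduce the (members, flavours, x) grid along different axes. A toy numpy sketch of the equivalent operations (illustrative shapes only, not validphys code):

import numpy as np

members, flavours, xpoints = 101, 8, 40
data = np.random.default_rng(6).normal(size=(members, flavours, xpoints))

# select_flavour(i): keep one flavour, dropping the flavour axis
one_flavour = data[:, 3]                      # shape (101, 40)

# mask_replicas(mask): keep a subset of members via a boolean mask on axis 0,
# e.g. re-adding member 0 to a mask over error members as in deltachi2.py above
mask = np.append(True, np.random.default_rng(7).random(members - 1) > 0.5)
subset = data[mask]                           # shape (n_kept, 8, 40)
print(one_flavour.shape, subset.shape)
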
@@ -92,9 +109,9 @@ def xplotting_grid(pdf:PDF, Q:(float,int), xgrid=None, basis:(str, Basis)='flavo
raise TypeError(f"Invalid xgrid {xgrid!r}")
gv = basis.grid_values(pdf, flavours, xgrid, Q)
#Eliminate Q axis
- gv = pdf.stats_class(gv.reshape(gv.shape[:-1]))
+ stats_gv = pdf.stats_class(gv.reshape(gv.shape[:-1]))

- res = XPlottingGrid(Q, basis, flavours, xgrid, gv, scale)
+ res = XPlottingGrid(Q, basis, flavours, xgrid, stats_gv, scale)
return res

xplotting_grids = collect(xplotting_grid, ('pdfs',))
@@ -263,7 +280,7 @@ def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None

if pdf == pdfs[normalize_to]:
# Zero the PDF we are normalizing against
- pdf_zero = pdf.stats_class(np.zeros_like(gr2_stats.data[0]))
+ pdf_zero = pdf.stats_class(np.zeros_like(gr2_stats.data[0:1]))
newgrid = grid.copy_grid(grid_values=pdf_zero)
newgrids.append(newgrid)
continue
@@ -273,8 +290,8 @@ def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None
sg1 = g_stats.std_error()
N1 = pdf.get_members()

- # the distance definition
- distance = pdf.stats_class(np.sqrt((cv1-cv2)**2/(sg1**2/N1+sg2**2/N2)))
+ # Wrap the distance into a Stats (1, flavours, points)
+ distance = Stats([np.sqrt((cv1-cv2)**2/(sg1**2/N1+sg2**2/N2))])

newgrid = grid.copy_grid(grid_values=distance)
newgrids.append(newgrid)
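
The quantity wrapped above is the difference of the central values in units of the combined standard error of the mean. A toy restatement (mean and standard deviation stand in for the Stats accessors, not validphys code):

import numpy as np

rng = np.random.default_rng(8)
reps1 = rng.normal(0.0, 1.0, size=(100, 40))   # (members, x points) for pdf 1
reps2 = rng.normal(0.1, 1.0, size=(120, 40))   # (members, x points) for pdf 2

cv1, cv2 = reps1.mean(axis=0), reps2.mean(axis=0)
sg1, sg2 = reps1.std(axis=0), reps2.std(axis=0)
n1, n2 = len(reps1), len(reps2)

distance = np.sqrt((cv1 - cv2) ** 2 / (sg1 ** 2 / n1 + sg2 ** 2 / n2))
print(distance.shape)                          # (40,)
# Wrapping the result in Stats([...]) adds the leading length-1 member axis,
# which is why the test at the top of this diff now expects shapes of (1, 8, 40).
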
@@ -316,8 +333,8 @@ def variance_distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(No
N1 = pdf.get_members()
s1 = (mo1-(N1-3)/(N1-1)*sg1**4)/N1

- # the distance definition
- variance_distance = pdf.stats_class(np.sqrt((sg1**2-sg2**2)**2/(s1+s2)))
+ # Wrap the distance into a Stats (1, flavours, points)
+ variance_distance = Stats([np.sqrt((sg1**2-sg2**2)**2/(s1+s2))])

newgrid = grid.copy_grid(grid_values=variance_distance)
newgrids.append(newgrid)
17 changes: 10 additions & 7 deletions validphys2/src/validphys/pdfplots.py
@@ -186,10 +186,11 @@ def draw(self, pdf, grid, flstate):
ax = flstate.ax
next_prop = next(ax._get_lines.prop_cycler)
color = next_prop['color']
- gv = grid.grid_values.data[:,flstate.flindex,:]
+ flavour_grid = grid.select_flavour(flstate.flindex)
+ stats = flavour_grid.grid_values
+ gv = stats.data
ax.plot(grid.xgrid, gv.T, alpha=0.2, linewidth=0.5,
color=color, zorder=1)
- stats = pdf.stats_class(gv)
ax.plot(grid.xgrid, stats.central_value(), color=color,
linewidth=2,
label=pdf.label)
@@ -223,8 +224,7 @@ def get_ylabel(self, parton_name):
def draw(self, pdf, grid, flstate):
ax = flstate.ax
flindex = flstate.flindex
- gv = grid.grid_values.data[:,flindex,:]
- stats = pdf.stats_class(gv)
+ stats = grid.select_flavour(flindex).grid_values

res = stats.std_error()

@@ -330,7 +330,10 @@ def draw(self, pdf, grid, flstate):
next_prop = next(pcycler)
color = next_prop['color']

- gv = grid.grid_values.data[flindex,:]
+ # The grid for the distance is (1, flavours, points)
+ # take only the flavour we are interested in
+ gv = grid.select_flavour(flindex).grid_values.data.squeeze()
+
ax.plot(grid.xgrid, gv, color=color, label='$%s$' % flstate.parton_name)

return gv
@@ -406,11 +409,11 @@ def setup_flavour(self, flstate):

def draw(self, pdf, grid, flstate):
ax = flstate.ax
- flindex = flstate.flindex
hatchit = flstate.hatchit
labels = flstate.labels
handles = flstate.handles
- stats = pdf.stats_class(grid.grid_values.data[:,flindex,:])
+ # Take only the flavours we are interested in
+ stats = grid.select_flavour(flstate.flindex).grid_values
pcycler = ax._get_lines.prop_cycler
#This is ugly but can't think of anything better

4 changes: 2 additions & 2 deletions validphys2/src/validphys/replica_selector.py
@@ -169,7 +169,7 @@ def not_outlier_mask(
"""Return a boolean mask with the replicas that are never outside of the given
percentile in the constrained region"""
lim = unconstrained_region_index
- gv = xplotting_grid.grid_values.data[:, :, lim:]
+ gv = xplotting_grid.grid_values.error_members()[:, :, lim:]
delta = nsigma * np.std(gv, axis=0)
mean = np.mean(gv, axis=0)
return ((gv >= mean - delta) & (gv <= mean+delta)).all(axis=2).all(axis=1)
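
A toy version of the mask logic above (illustrative arrays only): a replica is kept only if it stays within nsigma standard deviations of the mean at every flavour and x point of the constrained region:

import numpy as np

rng = np.random.default_rng(9)
gv = rng.normal(size=(100, 8, 30))    # (replicas, flavours, x points in the region)
nsigma = 4

delta = nsigma * np.std(gv, axis=0)
mean = np.mean(gv, axis=0)
keep = ((gv >= mean - delta) & (gv <= mean + delta)).all(axis=2).all(axis=1)
print(keep.shape, keep.sum())         # (100,) and the number of replicas kept
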
@@ -309,7 +309,7 @@ def draw(self, pdf, grid, flstate):
#PDFs, and instead would do something different
next_prop = next(ax._get_lines.prop_cycler)
color = next_prop['color']
- gv = grid.grid_values.data[filt, flstate.flindex, :]
+ gv = grid.grid_values.error_members()[filt, flstate.flindex, :]
ax.plot(
grid.xgrid,
gv.T,