diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index e053d299ab..7f7a499db7 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -7,3 +7,8 @@ fbfd77118d60306d4e0ea84b832088eafb8cb867 # CAll black and isort on the last commit 5d9b9e8da6805abd18910cf09314b29a564728a8 +# running black in the root vp folder, closuretest, plotoptions and theorycovariance +b9084ce695c49b5645b82436b19ec6cc3bd30280 +a5d7385a79ef242e7eb1b1bf7b3746b6ebd6a0d0 +3e8cf70c2d3b36d19fcb764599f6dfa6eb924b77 +27f25d037854caa7e15496e4c6fa9d520f365ac9 diff --git a/doc/sphinx/source/contributing/python-tools.md b/doc/sphinx/source/contributing/python-tools.md index b230f29f3a..c6f52bf585 100644 --- a/doc/sphinx/source/contributing/python-tools.md +++ b/doc/sphinx/source/contributing/python-tools.md @@ -66,6 +66,8 @@ integrated with your editor of choice. - The [`black` code formatter](https://github.com/psf/black) runs almost without configuration and produces typically good results. It is good to run it by default, to avoid spending time on formatting (or arguing about it). + - The [`isort`](https://pycqa.github.io/isort/) library sorts imports + alphabetically, and automatically separates them into sections and by type. ## Debugging diff --git a/doc/sphinx/source/contributing/rules.md b/doc/sphinx/source/contributing/rules.md index 356dd6460e..c0be0f6b04 100644 --- a/doc/sphinx/source/contributing/rules.md +++ b/doc/sphinx/source/contributing/rules.md @@ -1,6 +1,7 @@ ```eval_rst .. _rules: ``` + # Code development Code development is carried out using Github. @@ -103,11 +104,25 @@ requests: configuration](https://github.com/NNPDF/nnpdf/blob/master/.pylintrc) to catch common problems with Python code. * New Python code should come formatted with - [`black` tool](https://github.com/psf/black). + the [`black` tool](https://github.com/psf/black) with [our default + configuration](https://github.com/NNPDF/nnpdf/blob/master/pyproject.toml) + * The imports in Python code should be sorted using the + [`isort`](https://pycqa.github.io/isort/) tool with [our default + configuration](https://github.com/NNPDF/nnpdf/blob/master/pyproject.toml) * Changes in compiled code should be tested in debug mode, with the address sanitizer enabled. This is done with the `-DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON` options in `cmake`. + Some commits corresponding to major cosmetic changes have been collected in + [`.git-blame-ignore-revs`]( + https://docs.github.com/en/repositories/working-with-files/using-files/viewing-a-file#ignore-commits-in-the-blame-view + ). It is possible to configure your local git to ignore these commits when + running `git blame`: + ``` + git config blame.ignoreRevsFile .git-blame-ignore-revs + ``` + + - Regardless of automated tests, always run code with the new changes manually. This gives great insight into possible pitfalls and areas of improvement.
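Both formatters read their settings from the `[tool.black]` and `[tool.isort]` tables in the repository's `pyproject.toml`, so they can be run locally before committing. A minimal sketch of a typical invocation (the target directory is illustrative):
```
# black and isort discover pyproject.toml from the given path upwards and
# apply the [tool.black] / [tool.isort] settings defined there
black validphys2/src/validphys
isort validphys2/src/validphys
```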
diff --git a/validphys2/src/validphys/__init__.py b/validphys2/src/validphys/__init__.py index d6c33482d3..407f181cbc 100644 --- a/validphys2/src/validphys/__init__.py +++ b/validphys2/src/validphys/__init__.py @@ -1,3 +1,4 @@ -#We don't want to import stuff here that could slow down the import times +# We don't want to import stuff here that could slow down the import times from validphys.version import build_version + __version__ = build_version diff --git a/validphys2/src/validphys/api.py b/validphys2/src/validphys/api.py index b71e47c7eb..00063fbd63 100644 --- a/validphys2/src/validphys/api.py +++ b/validphys2/src/validphys/api.py @@ -17,7 +17,6 @@ import logging from reportengine import api - from validphys.app import providers from validphys.config import Config, Environment diff --git a/validphys2/src/validphys/app.py b/validphys2/src/validphys/app.py index 88a45d51a0..19f9f1d41f 100644 --- a/validphys2/src/validphys/app.py +++ b/validphys2/src/validphys/app.py @@ -9,19 +9,16 @@ The entry point of the validphys application is the ``main`` funcion of this module. """ -import sys -import os -import logging import contextlib - +import logging +import os +import sys import lhapdf -from reportengine import app +from reportengine import app +from validphys import mplstyles, uploadutils from validphys.config import Config, Environment -from validphys import uploadutils -from validphys import mplstyles - providers = [ "validphys.results", @@ -37,7 +34,7 @@ "validphys.correlations", "validphys.chi2grids", "validphys.eff_exponents", - "validphys.asy_exponents", + "validphys.asy_exponents", "validphys.paramfits.dataops", "validphys.paramfits.plots", "validphys.theorycovariance.construction", @@ -55,14 +52,13 @@ "validphys.n3fit_data", "validphys.mc2hessian", "reportengine.report", - "validphys.overfit_metric" + "validphys.overfit_metric", ] log = logging.getLogger(__name__) class App(app.App): - environment_class = Environment config_class = Config diff --git a/validphys2/src/validphys/arclength.py b/validphys2/src/validphys/arclength.py index 896fcdffc2..a729394be6 100644 --- a/validphys2/src/validphys/arclength.py +++ b/validphys2/src/validphys/arclength.py @@ -12,16 +12,14 @@ import scipy.integrate as integrate from reportengine import collect +from reportengine.checks import check_positive, make_argcheck from reportengine.figure import figure from reportengine.table import table -from reportengine.checks import check_positive, make_argcheck - +from validphys import plotutils +from validphys.checks import check_pdf_normalize_to +from validphys.core import PDF from validphys.pdfbases import Basis, check_basis from validphys.pdfgrids import xgrid, xplotting_grid -from validphys.core import PDF -from validphys.checks import check_pdf_normalize_to -from validphys import plotutils - ArcLengthGrid = namedtuple("ArcLengthGrid", ("pdf", "basis", "flavours", "stats")) @@ -36,22 +34,22 @@ def arc_lengths( ): """ Compute arc lengths at scale Q - + set up a grid with three segments and compute the arclength for each segment. Note: the variation of the PDF over the grid is computed by computing the forward differences between adjacent grid points. 
- + Parameters ---------- - + pdf : validphys.core.PDF object Q : float scale at which to evaluate PDF - + basis : default = "flavour" flavours : default = None @@ -60,9 +58,9 @@ def arc_lengths( ------- validphys.arclength.ArcLengthGrid object - object that contains the PDF, basis, flavours, and computed + object that contains the PDF, basis, flavours, and computed arc length statistics. - + """ checked = check_basis(basis, flavours) @@ -94,12 +92,8 @@ def arc_length_table(arc_lengths): """Return a table with the descriptive statistics of the arc lengths over members of the PDF.""" arc_length_data = arc_lengths.stats.error_members() - arc_length_columns = [ - f"${arc_lengths.basis.elementlabel(fl)}$" for fl in arc_lengths.flavours - ] - return ( - pd.DataFrame(arc_length_data, columns=arc_length_columns).describe().iloc[1:, :] - ) + arc_length_columns = [f"${arc_lengths.basis.elementlabel(fl)}$" for fl in arc_lengths.flavours] + return pd.DataFrame(arc_length_data, columns=arc_length_columns).describe().iloc[1:, :] @figure @@ -116,27 +110,24 @@ def plot_arc_lengths( for ipdf, arclengths in enumerate(pdfs_arc_lengths): xvalues = np.array(range(len(arclengths.flavours))) - xlabels = [ - "$" + arclengths.basis.elementlabel(fl) + "$" for fl in arclengths.flavours - ] - + xlabels = ["$" + arclengths.basis.elementlabel(fl) + "$" for fl in arclengths.flavours] + ylower, yupper = arclengths.stats.errorbar68() yvalues = (ylower + yupper) * 0.5 yerr = np.abs(yupper - ylower) * 0.5 - if normalize_to is not None: norm_cv = pdfs_arc_lengths[normalize_to].stats.central_value() yvalues = np.divide(yvalues, norm_cv) yerr = np.divide(yerr, norm_cv) - + shift = (ipdf - (len(pdfs_arc_lengths) - 1) / 2.0) / 5.0 ax.errorbar( xvalues + shift, yvalues, yerr=yerr, fmt='', - linestyle = '', + linestyle='', label=arclengths.pdf.label, ) ax.set_xticks(xvalues) @@ -152,8 +143,8 @@ def integrability_number( basis: (str, Basis) = "evolution", flavours: (list, tuple, type(None)) = None, ): - """Return \sum_i |x_i*f(x_i)|, x_i = {1e-9, 1e-8, 1e-7} - for selected flavours + """Return \sum_i |x_i*f(x_i)|, x_i = {1e-9, 1e-8, 1e-7} + for selected flavours """ checked = check_basis(basis, flavours) basis, flavours = checked["basis"], checked["flavours"] diff --git a/validphys2/src/validphys/asy_exponents.py b/validphys2/src/validphys/asy_exponents.py index 832f0996eb..1ad42df180 100644 --- a/validphys2/src/validphys/asy_exponents.py +++ b/validphys2/src/validphys/asy_exponents.py @@ -11,30 +11,32 @@ from reportengine import collect from reportengine.figure import figuregen -from reportengine.floatformatting import format_number +from reportengine.floatformatting import format_number from reportengine.table import table - -from validphys.checks import check_positive, check_pdf_normalize_to, make_argcheck, check_xlimits +from validphys.checks import check_pdf_normalize_to, check_positive, check_xlimits, make_argcheck from validphys.core import PDF -from validphys.pdfbases import check_basis, Basis -from validphys.pdfplots import BandPDFPlotter, PDFPlotter - +from validphys.pdfbases import Basis, check_basis import validphys.pdfgrids as pdfgrids +from validphys.pdfplots import BandPDFPlotter, PDFPlotter log = logging.getLogger(__name__) + @check_positive('Q') @make_argcheck(check_basis) @check_xlimits -def alpha_asy(pdf: PDF, *, - xmin: numbers.Real = 1e-6, - xmax: numbers.Real = 1e-3, - npoints: int = 100, - Q: numbers.Real = 1.65, - basis: (str, Basis), - flavours: (list, tuple, type(None)) = None): +def alpha_asy( + pdf: 
PDF, + *, + xmin: numbers.Real = 1e-6, + xmax: numbers.Real = 1e-3, + npoints: int = 100, + Q: numbers.Real = 1.65, + basis: (str, Basis), + flavours: (list, tuple, type(None)) = None, +): """Returns a list of xplotting_grids containing the value of the asymptotic - exponent alpha, as defined by the first relationship in Eq. (4) of + exponent alpha, as defined by the first relationship in Eq. (4) of [arXiv:1604.00024], at the specified value of Q (in GeV), in the interval [xmin, xmax]. basis: Is one of the bases defined in pdfbases.py. This includes 'flavour' @@ -43,10 +45,10 @@ def alpha_asy(pdf: PDF, *, flavours: A set of elements from the basis. If None, the defaults for that basis will be selected. - npoints: the number of sub-intervals in the range [xmin, xmax] on which the + npoints: the number of sub-intervals in the range [xmin, xmax] on which the derivative is computed. """ - #Loading the filter map of the fit/PDF + # Loading the filter map of the fit/PDF checked = check_basis(basis, flavours) basis = checked['basis'] flavours = checked['flavours'] @@ -56,8 +58,7 @@ def alpha_asy(pdf: PDF, *, else: xGrid = pdfgrids.xgrid(xmin, xmax, 'log', npoints) - pdfGrid = pdfgrids.xplotting_grid( - pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) + pdfGrid = pdfgrids.xplotting_grid(pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) pdfGrid_values = pdfGrid.grid_values.data # NOTE: without this I get "setting an array element with a sequence" xGrid = pdfGrid.xgrid @@ -68,21 +69,25 @@ def alpha_asy(pdf: PDF, *, alphaGrid_values = np.gradient(alphaGrid_values, dx, axis=2, edge_order=2) alphaGrid_values[alphaGrid_values == -np.inf] = np.nan # when PDF_i =0 alphaGrid = pdfGrid.copy_grid(grid_values=pdf.stats_class(alphaGrid_values)) - + return alphaGrid + @check_positive('Q') @make_argcheck(check_basis) @check_xlimits -def beta_asy(pdf, *, - xmin: numbers.Real = 0.6, - xmax: numbers.Real = 0.9, - npoints: int = 100, - Q: numbers.Real = 1.65, - basis: (str, Basis), - flavours: (list, tuple, type(None)) = None): +def beta_asy( + pdf, + *, + xmin: numbers.Real = 0.6, + xmax: numbers.Real = 0.9, + npoints: int = 100, + Q: numbers.Real = 1.65, + basis: (str, Basis), + flavours: (list, tuple, type(None)) = None, +): """Returns a list of xplotting_grids containing the value of the asymptotic - exponent beta, as defined by the second relationship in Eq. (4) of + exponent beta, as defined by the second relationship in Eq. (4) of [arXiv:1604.00024], at the specified value of Q (in GeV), in the interval [xmin, xmax]. basis: Is one of the bases defined in pdfbases.py. This includes 'flavour' @@ -91,7 +96,7 @@ def beta_asy(pdf, *, flavours: A set of elements from the basis. If None, the defaults for that basis will be selected. - npoints: the number of sub-intervals in the range [xmin, xmax] on which the + npoints: the number of sub-intervals in the range [xmin, xmax] on which the derivative is computed. 
""" checked = check_basis(basis, flavours) @@ -103,9 +108,7 @@ def beta_asy(pdf, *, else: xGrid = pdfgrids.xgrid(xmin, xmax, 'linear', npoints) - - pdfGrid = pdfgrids.xplotting_grid( - pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) + pdfGrid = pdfgrids.xplotting_grid(pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) pdfGrid_values = pdfGrid.grid_values.data # NOTE: without this I get "setting an array element with a sequence" xGrid = pdfGrid.xgrid @@ -113,18 +116,19 @@ def beta_asy(pdf, *, warnings.simplefilter('ignore', RuntimeWarning) dx = xGrid[1] - xGrid[0] betaGrid_values = np.log(abs(pdfGrid_values)) - betaGrid_values = (xGrid - 1.) * np.gradient(betaGrid_values, dx, axis=2,edge_order=2) + betaGrid_values = (xGrid - 1.0) * np.gradient(betaGrid_values, dx, axis=2, edge_order=2) betaGrid_values[betaGrid_values == -np.inf] = np.nan # when PDF_i =0 betaGrid = pdfGrid.copy_grid(grid_values=pdf.stats_class(betaGrid_values)) return betaGrid + class AsyExponentBandPlotter(BandPDFPlotter): - """ Class inheriting from BandPDFPlotter, changing title and ylabel to reflect the asymptotic + """Class inheriting from BandPDFPlotter, changing title and ylabel to reflect the asymptotic exponent being plotted. """ - def __init__(self, exponent, *args, **kwargs): + def __init__(self, exponent, *args, **kwargs): self.exponent = exponent super().__init__(*args, **kwargs) @@ -137,15 +141,21 @@ def get_ylabel(self, parton_name): else: return fr"$\{self.exponent}_a$ for ${parton_name}$" + alpha_asy_pdfs = collect('alpha_asy', ('pdfs',)) + @figuregen @check_pdf_normalize_to def plot_alpha_asy( - pdfs, alpha_asy_pdfs, pdfs_alpha_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): - """ Plots the alpha asymptotic exponent """ + pdfs, + alpha_asy_pdfs, + pdfs_alpha_lines, + normalize_to: (int, str, type(None)) = None, + ybottom=None, + ytop=None, +): + """Plots the alpha asymptotic exponent""" yield from AsyExponentBandPlotter( 'alpha', pdfs, @@ -153,17 +163,24 @@ def plot_alpha_asy( 'log', normalize_to, ybottom, - ytop,) - + ytop, + ) + + beta_asy_pdfs = collect('beta_asy', ('pdfs',)) + @figuregen @check_pdf_normalize_to def plot_beta_asy( - pdfs, beta_asy_pdfs, pdfs_beta_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): - """ Plots the beta asymptotic exponent """ + pdfs, + beta_asy_pdfs, + pdfs_beta_lines, + normalize_to: (int, str, type(None)) = None, + ybottom=None, + ytop=None, +): + """Plots the beta asymptotic exponent""" yield from AsyExponentBandPlotter( 'beta', pdfs, @@ -171,7 +188,9 @@ def plot_beta_asy( 'linear', normalize_to, ybottom, - ytop,) + ytop, + ) + @table @make_argcheck(check_basis) @@ -180,11 +199,12 @@ def asymptotic_exponents_table( *, x_alpha: numbers.Real = 1e-6, x_beta: numbers.Real = 0.90, - Q: numbers.Real = 1.65, - basis:(str, Basis), + Q: numbers.Real = 1.65, + basis: (str, Basis), flavours: (list, tuple, type(None)) = None, - npoints=100,): - """ Returns a table with the values of the asymptotic exponents alpha and beta, as defined + npoints=100, +): + """Returns a table with the values of the asymptotic exponents alpha and beta, as defined in Eq. (4) of [arXiv:1604.00024], at the specified value of x and Q. basis: Is one of the bases defined in pdfbases.py. This includes 'flavour' @@ -193,27 +213,17 @@ def asymptotic_exponents_table( flavours: A set of elements from the basis. If None, the defaults for that basis will be selected. 
- npoints: the number of sub-intervals in the range [xmin, xmax] on which the + npoints: the number of sub-intervals in the range [xmin, xmax] on which the derivative is computed. """ alpha_a = alpha_asy( - pdf, - xmin=x_alpha, - xmax=1e-3, - npoints=npoints, - Q=Q, - basis=basis, - flavours=flavours) - + pdf, xmin=x_alpha, xmax=1e-3, npoints=npoints, Q=Q, basis=basis, flavours=flavours + ) + beta_a = beta_asy( - pdf, - xmin=0.60, - xmax=x_beta, - npoints=npoints, - Q=Q, - basis=basis, - flavours=flavours) + pdf, xmin=0.60, xmax=x_beta, npoints=npoints, Q=Q, basis=basis, flavours=flavours + ) alphastats = alpha_a.grid_values betastats = beta_a.grid_values @@ -223,28 +233,35 @@ def asymptotic_exponents_table( alpha_cv = alphastats.central_value() beta_cv = betastats.central_value() - + alpha_er = alphastats.std_error() beta_er = betastats.std_error() alpha_68 = alphastats.errorbar68() beta_68 = betastats.errorbar68() - + flavours_label = [] - asy_exp_mean = [] - asy_exp_err = [] - asy_exp_min = [] - asy_exp_max = [] - - for (j, fl) in enumerate(flavours): - asy_exp_mean.extend((alpha_cv[j,0],beta_cv[j,-1])) - asy_exp_err.extend((alpha_er[j,0],beta_er[j,-1])) - asy_exp_min.extend((alpha_68[0][j,0],beta_68[0][j,-1])) - asy_exp_max.extend((alpha_68[1][j,0],beta_68[1][j,-1])) + asy_exp_mean = [] + asy_exp_err = [] + asy_exp_min = [] + asy_exp_max = [] + + for j, fl in enumerate(flavours): + asy_exp_mean.extend((alpha_cv[j, 0], beta_cv[j, -1])) + asy_exp_err.extend((alpha_er[j, 0], beta_er[j, -1])) + asy_exp_min.extend((alpha_68[0][j, 0], beta_68[0][j, -1])) + asy_exp_max.extend((alpha_68[1][j, 0], beta_68[1][j, -1])) flavours_label.append(f"${basis.elementlabel(fl)}$") - asy_exp_data = {"mean": asy_exp_mean, "std": asy_exp_err, "min(68% CL)": asy_exp_min, "max(68% CL)": asy_exp_max} + asy_exp_data = { + "mean": asy_exp_mean, + "std": asy_exp_err, + "min(68% CL)": asy_exp_min, + "max(68% CL)": asy_exp_max, + } ind = pd.MultiIndex.from_product([flavours_label, [r"$\alpha$", r"$\beta$"]]) - df = pd.DataFrame(asy_exp_data, index=ind, columns=["mean","std","min(68% CL)","max(68% CL)"]) + df = pd.DataFrame( + asy_exp_data, index=ind, columns=["mean", "std", "min(68% CL)", "max(68% CL)"] + ) return df diff --git a/validphys2/src/validphys/calcutils.py b/validphys2/src/validphys/calcutils.py index 22c6123f9a..5c38adf021 100644 --- a/validphys2/src/validphys/calcutils.py +++ b/validphys2/src/validphys/calcutils.py @@ -8,8 +8,8 @@ from typing import Callable import numpy as np -import scipy.linalg as la import pandas as pd +import scipy.linalg as la log = logging.getLogger(__name__) @@ -57,21 +57,23 @@ def calc_chi2(sqrtcov, diffs): 44.64401691354948 """ - #Note la.cho_solve doesn't really improve things here - #NOTE: Do not enable check_finite. The upper triangular part is not - #guaranteed to make any sense. + # Note la.cho_solve doesn't really improve things here + # NOTE: Do not enable check_finite. The upper triangular part is not + # guaranteed to make any sense. vec = la.solve_triangular(sqrtcov, diffs, lower=True, check_finite=False) - #This sums up the result for the chi² for any input shape. - #Sum the squares over the first dimension and leave the others alone - return np.einsum('i...,i...->...', vec,vec) + # This sums up the result for the chi² for any input shape. 
+ # Sum the squares over the first dimension and leave the others alone + return np.einsum('i...,i...->...', vec, vec) + def all_chi2(results): """Return the chi² for all elements in the result, regardless of the Stats class Note that the interpretation of the result will depend on the PDF error type""" data_result, th_result = results - diffs = th_result.rawdata - data_result.central_value[:,np.newaxis] + diffs = th_result.rawdata - data_result.central_value[:, np.newaxis] return calc_chi2(sqrtcov=data_result.sqrtcovmat, diffs=diffs) + def central_chi2(results): """Calculate the chi² from the central value of the theory prediction to the data""" @@ -84,10 +86,11 @@ def all_chi2_theory(results, totcov): """Like all_chi2 but here the chi² are calculated using a covariance matrix that is the sum of the experimental covmat and the theory covmat.""" data_result, th_result = results - diffs = th_result.rawdata - data_result.central_value[:,np.newaxis] + diffs = th_result.rawdata - data_result.central_value[:, np.newaxis] total_covmat = np.array(totcov) return calc_chi2(sqrtcov=la.cholesky(total_covmat, lower=True), diffs=diffs) + def central_chi2_theory(results, totcov): """Like central_chi2 but here the chi² is calculated using a covariance matrix that is the sum of the experimental covmat and the theory covmat.""" @@ -96,6 +99,7 @@ def central_chi2_theory(results, totcov): total_covmat = np.array(totcov) return calc_chi2(la.cholesky(total_covmat, lower=True), central_diff) + def calc_phi(sqrtcov, diffs): """Low level function which calculates phi given a Cholesky decomposed lower triangular part and a vector of differences. Primarily used when phi @@ -105,11 +109,15 @@ def calc_phi(sqrtcov, diffs): axis """ diffs = np.array(diffs) - return np.sqrt((np.mean(calc_chi2(sqrtcov, diffs), axis=0) - - calc_chi2(sqrtcov, diffs.mean(axis=1)))/diffs.shape[0]) + return np.sqrt( + (np.mean(calc_chi2(sqrtcov, diffs), axis=0) - calc_chi2(sqrtcov, diffs.mean(axis=1))) + / diffs.shape[0] + ) + -def bootstrap_values(data, nresamples, *, boot_seed:int=None, - apply_func:Callable=None, args=None): +def bootstrap_values( + data, nresamples, *, boot_seed: int = None, apply_func: Callable = None, args=None +): """General bootstrap sample `data` is the data which is to be sampled, replicas is assumed to @@ -132,13 +140,15 @@ def bootstrap_values(data, nresamples, *, boot_seed:int=None, """ data = np.atleast_2d(data) N_reps = data.shape[-1] - bootstrap_data = data[..., np.random.RandomState(boot_seed).randint(N_reps, - size=(N_reps, nresamples))] + bootstrap_data = data[ + ..., np.random.RandomState(boot_seed).randint(N_reps, size=(N_reps, nresamples)) + ] if apply_func is None: return np.mean(bootstrap_data, axis=-2) else: return apply_func(bootstrap_data, *args) + def get_df_block(matrix: pd.DataFrame, key: str, level): """Given a pandas dataframe whose index and column keys match, and data represents a symmetric matrix returns a diagonal block of this matrix corresponding to `matrix`[key`, key`] as a numpy @@ -148,11 +158,10 @@ def get_df_block(matrix: pd.DataFrame, key: str, level): taken, by default it is set to 1 which corresponds to the dataset level of a theory covariance matrix """ - block = matrix.xs( - key, level=level, axis=0).xs( - key, level=level, axis=1).values + block = matrix.xs(key, level=level, axis=0).xs(key, level=level, axis=1).values return block + def regularize_covmat(covmat: np.array, norm_threshold=4): """Given a covariance matrix, performs a regularization which is equivalent to 
performing `regularize_l2` on the sqrt of `covmat`: the l2 norm of @@ -187,13 +196,14 @@ def regularize_covmat(covmat: np.array, norm_threshold=4): "Negative eigenvalue encountered in correlation matrix: %s. " "Assuming eigenvalue should be zero and is negative due to numerical " "precision.", - e_val[0] + e_val[0], ) - if e_val[0] > 1/sqr_threshold: + if e_val[0] > 1 / sqr_threshold: return covmat - new_e_val = np.clip(e_val, a_min=1/sqr_threshold, a_max=None) + new_e_val = np.clip(e_val, a_min=1 / sqr_threshold, a_max=None) return ((e_vec * new_e_val) @ e_vec.T) * d * d.T + def regularize_l2(sqrtcov, norm_threshold=4): r"""Return a regularized version of `sqrtcov`. @@ -230,10 +240,10 @@ def regularize_l2(sqrtcov, norm_threshold=4): A regularized version of `sqrtcov`. """ - d = np.sqrt(np.sum(sqrtcov ** 2, axis=1))[:, np.newaxis] + d = np.sqrt(np.sum(sqrtcov**2, axis=1))[:, np.newaxis] sqrtcorr = sqrtcov / d u, s, vt = la.svd(sqrtcorr, full_matrices=False) if 1 / s[-1] <= norm_threshold: return sqrtcov - snew = np.clip(s, a_min=1/norm_threshold, a_max=None) + snew = np.clip(s, a_min=1 / norm_threshold, a_max=None) return u * (snew * d) @ vt diff --git a/validphys2/src/validphys/checks.py b/validphys2/src/validphys/checks.py index eb9d25065b..bdd4496003 100644 --- a/validphys2/src/validphys/checks.py +++ b/validphys2/src/validphys/checks.py @@ -5,36 +5,43 @@ @author: Zahari Kassabov """ from collections import Counter +import json +import logging import platform import tempfile -import json - -from matplotlib import scale as mscale import lhapdf +from matplotlib import scale as mscale -from reportengine.checks import (make_check, CheckError, require_one, - check_not_empty, make_argcheck, check_positive, check) - +from reportengine.checks import ( + CheckError, + check, + check_not_empty, + check_positive, + make_argcheck, + make_check, + require_one, +) from validphys import lhaindex from validphys.core import CutsPolicy -import logging log = logging.getLogger(__name__) + @make_check def check_use_t0(ns, **kwargs): """Checks use_t0 is set to true""" if not ns.get("use_t0"): raise CheckError("The flag 'use_t0' needs to be set to 'true' for this action.") + @make_check def check_pdf_is_montecarlo(ns, **kwargs): pdf = ns['pdf'] etype = pdf.error_type if etype != 'replicas': - raise CheckError("Error type of PDF %s must be 'replicas' and not %s" - % (pdf, etype)) + raise CheckError(f"Error type of PDF {pdf} must be 'replicas' and not {etype}") + @make_check def check_know_errors(ns, **kwargs): @@ -54,30 +61,35 @@ def check_can_save_grid(ns, **kwags): try: tempfile.TemporaryFile(dir=write_path) except OSError as e: - raise CheckError("Cannot write to the LHAPDF path %s.\n" - "This is required because the 'installgrid' " - "parameter is set to True:\n%s" % - (write_path, e)) + raise CheckError( + f"""Cannot write to the LHAPDF path {write_path}." 
+This is required because the 'installgrid' parameter is set to True: +{e}""" + ) + @make_argcheck def check_xlimits(xmax, xmin): if not (0 <= xmin < xmax <= 1): - raise CheckError(f'xmin ({xmin}) and xmax ({xmax}) must satisfy \n' - '0 <= xmin < xmax <= 1') + raise CheckError(f'xmin ({xmin}) and xmax ({xmax}) must satisfy \n' '0 <= xmin < xmax <= 1') + @make_check def check_has_fitted_replicas(ns, **kwargs): name, path = ns['fit'] - postfit_path = path/'postfit'/'postfit.log' - old_postfit_path = path/'nnfit'/'postfit.log' + postfit_path = path / 'postfit' / 'postfit.log' + old_postfit_path = path / 'nnfit' / 'postfit.log' if not postfit_path.exists(): if not old_postfit_path.exists(): raise CheckError( f"Fit {name} does not appear to be completed. " - f"Expected to find file {postfit_path}") + f"Expected to find file {postfit_path}" + ) else: - log.info(f"Cannot find postfit log at: {postfit_path}. " - f"Falling back to old location: {old_postfit_path}") + log.info( + f"Cannot find postfit log at: {postfit_path}. " + f"Falling back to old location: {old_postfit_path}" + ) if not lhaindex.isinstalled(name): raise CheckError( @@ -90,6 +102,7 @@ def check_has_fitted_replicas(ns, **kwargs): def check_scale(scalename, allow_none=False): """Check that we have a valid matplotlib scale. With allow_none=True, also None is valid.""" + @make_check def check(ns, *args, **kwargs): val = ns[scalename] @@ -97,12 +110,15 @@ def check(ns, *args, **kwargs): return valid_scales = mscale.get_scale_names() if not val in valid_scales: - e = CheckError("Invalid plotting scale: %s" % scalename, - bad_item=val, - alternatives=valid_scales, - display_alternatives='all') + e = CheckError( + "Invalid plotting scale: %s" % scalename, + bad_item=val, + alternatives=valid_scales, + display_alternatives='all', + ) e.alternatives_header = "No such scale '%s'. The allowed values are:" raise e + return check @@ -114,7 +130,9 @@ def check_cuts_fromfit(use_cuts): @make_argcheck def check_cuts_considered(use_cuts): if use_cuts == CutsPolicy.NOCUTS: - raise CheckError(f"Cuts must be computed for this action, but they are set to {use_cuts.value}") + raise CheckError( + f"Cuts must be computed for this action, but they are set to {use_cuts.value}" + ) @make_argcheck @@ -132,8 +150,7 @@ def check_dataset_cuts_match_theorycovmat(dataset, fitthcovmat): @make_argcheck -def check_data_cuts_match_theorycovmat( - data, fitthcovmat): +def check_data_cuts_match_theorycovmat(data, fitthcovmat): for dataset in data.datasets: if fitthcovmat: ds_index = fitthcovmat.load().index.get_level_values(1) @@ -147,28 +164,26 @@ def check_data_cuts_match_theorycovmat( check(ndata == ncovmat) - @make_argcheck def check_have_two_pdfs(pdfs): - check(len(pdfs) == 2,'Expecting exactly two pdfs.') + check(len(pdfs) == 2, 'Expecting exactly two pdfs.') @make_argcheck def check_at_least_two_replicas(pdf): # The get_members function also includes the central value replica, # therefore we need it to be larger than 3 - check(pdf.get_members() >= 3,'Expecting at least two replicas.') + check(pdf.get_members() >= 3, 'Expecting at least two replicas.') -#The indexing to one instead of zero is so that we can be consistent with -#how plot_fancy works, so normalize_to: 1 would normalize to the first pdf -#for both. +# The indexing to one instead of zero is so that we can be consistent with +# how plot_fancy works, so normalize_to: 1 would normalize to the first pdf +# for both. 
@make_argcheck def check_pdf_normalize_to(pdfs, normalize_to): """Transforn normalize_to into an index.""" - msg = ("normalize_to should be, a pdf id or an index of the " - "pdf (starting from one)") + msg = "normalize_to should be, a pdf id or an index of the pdf (starting from one)" if normalize_to is None: return @@ -176,7 +191,7 @@ def check_pdf_normalize_to(pdfs, normalize_to): names = [pdf.name for pdf in pdfs] if isinstance(normalize_to, int): normalize_to -= 1 - if not normalize_to < len(names) or normalize_to<0: + if not normalize_to < len(names) or normalize_to < 0: raise CheckError(msg) return {'normalize_to': normalize_to} @@ -187,18 +202,22 @@ def check_pdf_normalize_to(pdfs, normalize_to): raise CheckError(msg, normalize_to, alternatives=names) return {'normalize_to': normalize_to} - raise RuntimeError("Should not be here") + @make_argcheck def check_pdfs_noband(pdfs, pdfs_noband): """Allows pdfs_noband to be specified as a list of PDF IDs or a list of PDF indexes (starting from one).""" - msg = ("pdfs_noband should be a list of PDF IDs (strings) or a list of " - "PDF indexes (integers, starting from one)") - msg_range = ("At least one of your pdf_noband indexes is out of range. " - "Note that pdf_noband indexing starts at 1, not 0.") + msg = ( + "pdfs_noband should be a list of PDF IDs (strings) or a list of " + "PDF indexes (integers, starting from one)" + ) + msg_range = ( + "At least one of your pdf_noband indexes is out of range. " + "Note that pdf_noband indexing starts at 1, not 0." + ) if pdfs_noband is None: return @@ -226,7 +245,6 @@ def check_pdfs_noband(pdfs, pdfs_noband): else: raise CheckError(msg) - return {'pdfs_noband': pdfs_noband_combined} @@ -236,10 +254,14 @@ def check_mixband_as_replicas(pdfs, mixband_as_replicas): Allows mixband_as_replicas to be specified as a list of PDF IDs or a list of PDF indexes (starting from one).""" - msg = ("mixband_as_replicas should be a list of PDF IDs (strings) or a list of " - "PDF indexes (integers, starting from one)") - msg_range = ("At least one of the choices in mixband_as_replicas indexes is out of range. " - "Note that pdf_noband indexing starts at 1, not 0.") + msg = ( + "mixband_as_replicas should be a list of PDF IDs (strings) or a list of " + "PDF indexes (integers, starting from one)" + ) + msg_range = ( + "At least one of the choices in mixband_as_replicas indexes is out of range. " + "Note that pdf_noband indexing starts at 1, not 0." 
+ ) if mixband_as_replicas is None: return {'mixband_as_replicas': []} @@ -269,40 +291,46 @@ def check_mixband_as_replicas(pdfs, mixband_as_replicas): return {'mixband_as_replicas': mixband_as_replicas_combined} + def _check_list_different(l, name): strs = [str(item) for item in l] - if not len(set(strs))==len(l): + if not len(set(strs)) == len(l): counter = Counter(strs) duplicates = [k for k, v in counter.items() if v > 1] - raise CheckError(f"{name} must be all different " - f"but there are duplicates: {duplicates}") + raise CheckError(f"{name} must be all different but there are duplicates: {duplicates}") + @make_argcheck def check_fits_different(fits): """Need this check because oterwise the pandas object gets confused""" return _check_list_different(fits, 'fits') + @make_argcheck def check_dataspecs_fits_different(dataspecs_fit): """Need this check because oterwise the pandas object gets confused""" return _check_list_different(dataspecs_fit, 'fits') + @make_argcheck def check_speclabels_different(dataspecs_speclabel): """This is needed for grouping dataframes (and because generally indecated a bug)""" return _check_list_different(dataspecs_speclabel, 'dataspecs_speclabel') + @make_argcheck def check_two_dataspecs(dataspecs): l = len(dataspecs) check(l == 2, f"Expecting exactly 2 dataspecs, not {l}") + @make_argcheck def check_norm_threshold(norm_threshold): """Check norm_threshold is not None""" check(norm_threshold is not None) + @make_argcheck def check_darwin_single_process(NPROC): """Check that if we are on macOS (platform is Darwin), NPROC is equal to 1. @@ -322,6 +350,4 @@ def check_darwin_single_process(NPROC): """ if platform.system() == "Darwin" and NPROC != 1: - raise CheckError( - "NPROC must be set to 1 on OSX, because multithreading is not supported." - ) + raise CheckError("NPROC must be set to 1 on OSX, because multithreading is not supported.") diff --git a/validphys2/src/validphys/chi2grids.py b/validphys2/src/validphys/chi2grids.py index c6126677b3..d121617b57 100644 --- a/validphys2/src/validphys/chi2grids.py +++ b/validphys2/src/validphys/chi2grids.py @@ -5,8 +5,8 @@ between pseudorreplica fluctuations between different fits. This is applied here to parameter determinations such as those of αs. """ -import logging from collections import namedtuple +import logging import numpy as np import pandas as pd diff --git a/validphys2/src/validphys/closuretest/closure_checks.py b/validphys2/src/validphys/closuretest/closure_checks.py index 1a2f1c4ef0..1659f64827 100644 --- a/validphys2/src/validphys/closuretest/closure_checks.py +++ b/validphys2/src/validphys/closuretest/closure_checks.py @@ -4,10 +4,10 @@ Module containing checks specific to the closure tests. """ -import logging from collections import defaultdict +import logging -from reportengine.checks import make_argcheck, CheckError +from reportengine.checks import CheckError, make_argcheck log = logging.getLogger(__name__) @@ -18,9 +18,7 @@ def check_use_fitcommondata(use_fitcommondata): with all actions which require comparison to fitcommondata """ if not use_fitcommondata: - raise CheckError( - "use_fitcommondata must be set to True for closure test estimators" - ) + raise CheckError("use_fitcommondata must be set to True for closure test estimators") @make_argcheck @@ -41,7 +39,7 @@ def check_fit_isclosure(fit): raise CheckError( f"The `fakedata` key is not set to `true` in the `closuretest` namespace of {fit}'s runcard. " f"{fit} is therefore not suitable for closure-test studies." 
- ) + ) @make_argcheck @@ -76,9 +74,8 @@ def check_fits_areclosures(fits): def check_t0pdfset_matches_law(t0pdfset, fit): t0_from_fit = fit.as_input()["closuretest"]["fakepdf"] if not str(t0pdfset) == t0_from_fit: - raise CheckError( - f"Underlying pdf: {t0_from_fit}, does not match t0pdfset: {t0pdfset}" - ) + raise CheckError(f"Underlying pdf: {t0_from_fit}, does not match t0pdfset: {t0pdfset}") + @make_argcheck def check_t0pdfset_matches_multiclosure_law(multiclosure_underlyinglaw, t0set): @@ -86,7 +83,9 @@ def check_t0pdfset_matches_multiclosure_law(multiclosure_underlyinglaw, t0set): Checks t0set instead of t0pdfset since different mechanisms can fill t0set """ if str(t0set) != str(multiclosure_underlyinglaw): - log.warning(f"The underlying pdf {multiclosure_underlyinglaw} does not match t0pdfset: {t0set}") + log.warning( + f"The underlying pdf {multiclosure_underlyinglaw} does not match t0pdfset: {t0set}" + ) @make_argcheck @@ -110,11 +109,9 @@ def check_multifit_replicas(fits_pdf, _internal_max_reps, _internal_min_reps): fit. """ - n_reps = {pdf.get_members()-1 for pdf in fits_pdf} + n_reps = {pdf.get_members() - 1 for pdf in fits_pdf} if len(n_reps) != 1: - raise CheckError( - "all fits for multiclosure actions should have same number of replicas" - ) + raise CheckError("all fits for multiclosure actions should have same number of replicas") n_reps = n_reps.pop() if _internal_max_reps is None: _internal_max_reps = n_reps @@ -155,9 +152,9 @@ def check_fits_different_filterseed(fits): if bad_fits: raise CheckError( - "Multiclosure actions require that fits have different level 1 " - "noise and therefore different filter seeds. The following groups " - f"of fits have the same seed: {bad_fits}." + "Multiclosure actions require that fits have different level 1 " + "noise and therefore different filter seeds. The following groups " + f"of fits have the same seed: {bad_fits}." ) diff --git a/validphys2/src/validphys/closuretest/closure_results.py b/validphys2/src/validphys/closuretest/closure_results.py index cfda6ae555..4f86cdbb48 100644 --- a/validphys2/src/validphys/closuretest/closure_results.py +++ b/validphys2/src/validphys/closuretest/closure_results.py @@ -10,18 +10,16 @@ from reportengine import collect from reportengine.table import table - -from validphys.calcutils import calc_chi2, bootstrap_values +from validphys.calcutils import bootstrap_values, calc_chi2 from validphys.checks import check_pdf_is_montecarlo from validphys.closuretest.closure_checks import ( check_fit_isclosure, - check_use_fitcommondata, check_fits_areclosures, check_fits_same_filterseed, check_fits_underlying_law_match, + check_use_fitcommondata, ) - BiasData = namedtuple("BiasData", ("bias", "ndata")) underlying_results = collect("results", ("fitunderlyinglaw",)) @@ -47,9 +45,7 @@ def bias_dataset(results, underlying_results, fit, use_fitcommondata): return BiasData(bias_out, len(th_ct)) -underlying_dataset_inputs_results = collect( - "dataset_inputs_results", ("fitunderlyinglaw",) -) +underlying_dataset_inputs_results = collect("dataset_inputs_results", ("fitunderlyinglaw",)) @check_fit_isclosure @@ -57,8 +53,7 @@ def bias_dataset(results, underlying_results, fit, use_fitcommondata): def bias_experiment( dataset_inputs_results, underlying_dataset_inputs_results, fit, use_fitcommondata ): - """Like `bias_dataset` but for a whole experiment. 
- """ + """Like `bias_dataset` but for a whole experiment.""" return bias_dataset( dataset_inputs_results, underlying_dataset_inputs_results, @@ -75,9 +70,7 @@ def bias_experiment( @table @check_fits_same_filterseed @check_fits_underlying_law_match -def biases_table( - fits_experiments, fits_experiments_bias, fits, show_total: bool = False -): +def biases_table(fits_experiments, fits_experiments_bias, fits, show_total: bool = False): """Creates a table with fits as the columns and the experiments from both fits as the row index. """ @@ -131,9 +124,7 @@ def bootstrap_bias_experiment( experiments_bootstrap_bias = collect( "bootstrap_bias_experiment", ("group_dataset_inputs_by_experiment",) ) -fits_experiments_bootstrap_bias = collect( - "experiments_bootstrap_bias", ("fits", "fitcontext") -) +fits_experiments_bootstrap_bias = collect("experiments_bootstrap_bias", ("fits", "fitcontext")) @table @@ -226,9 +217,7 @@ def bootstrap_variance_experiment(dataset_inputs_results, bootstrap_samples=500) "bootstrap_variance_experiment", ("group_dataset_inputs_by_experiment",) ) -fits_exps_bootstrap_var = collect( - "experiments_boostrap_variance", ("fits", "fitcontext") -) +fits_exps_bootstrap_var = collect("experiments_boostrap_variance", ("fits", "fitcontext")) @table @@ -286,9 +275,7 @@ def fits_bootstrap_variance_table( fits_exps_bootstrap_chi2_central = collect( "experiments_bootstrap_chi2_central", ("fits", "fitcontext") ) -fits_level_1_noise = collect( - "total_chi2_data", ("fits", "fitinputcontext", "fitunderlyinglaw") -) +fits_level_1_noise = collect("total_chi2_data", ("fits", "fitinputcontext", "fitunderlyinglaw")) @check_use_fitcommondata @@ -352,16 +339,10 @@ def delta_chi2_table( fits_exps_level_1_noise, ): records = [] - for experiment, exp_chi2, level_1_noise in zip( - experiments, exps_chi2, exps_level_1_noise - ): - delta_chi2 = ( - exp_chi2.central_result - level_1_noise.central_result - ) / exp_chi2.ndata + for experiment, exp_chi2, level_1_noise in zip(experiments, exps_chi2, exps_level_1_noise): + delta_chi2 = (exp_chi2.central_result - level_1_noise.central_result) / exp_chi2.ndata npoints = exp_chi2.ndata - records.append( - dict(experiment=str(experiment), npoints=npoints, delta_chi2=delta_chi2) - ) + records.append(dict(experiment=str(experiment), npoints=npoints, delta_chi2=delta_chi2)) df = pd.DataFrame.from_records( records, columns=("experiment", "npoints", "delta_chi2"), diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index b7c5b27ec6..5b545569e2 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -12,24 +12,24 @@ import scipy.special as special from reportengine import collect - -from validphys.results import ThPredictionsResult from validphys.calcutils import calc_chi2 +from validphys.checks import check_use_t0 from validphys.closuretest.closure_checks import ( check_at_least_10_fits, - check_multifit_replicas, - check_fits_underlying_law_match, check_fits_areclosures, check_fits_different_filterseed, - check_t0pdfset_matches_multiclosure_law + check_fits_underlying_law_match, + check_multifit_replicas, + check_t0pdfset_matches_multiclosure_law, ) -from validphys.checks import check_use_t0 +from validphys.results import ThPredictionsResult # bootstrap seed default DEFAULT_SEED = 9689372 # stepsize in fits/replicas to use for finite size bootstraps SAMPLING_INTERVAL = 5 + # TODO: deprecate this at some point 
@check_fits_underlying_law_match @check_fits_areclosures @@ -81,8 +81,7 @@ def internal_multiclosure_dataset_loader( """ fits_dataset_predictions = [ - ThPredictionsResult.from_convolution(pdf, dataset) - for pdf in fits_pdf + ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf ] fits_underlying_predictions = ThPredictionsResult.from_convolution( multiclosure_underlyinglaw, dataset @@ -91,7 +90,12 @@ def internal_multiclosure_dataset_loader( sqrt_covmat = la.cholesky(t0_covmat_from_systematics, lower=True) # TODO: support covmat reg and theory covariance matrix # possibly make this a named tuple - return (fits_dataset_predictions, fits_underlying_predictions, t0_covmat_from_systematics, sqrt_covmat) + return ( + fits_dataset_predictions, + fits_underlying_predictions, + t0_covmat_from_systematics, + sqrt_covmat, + ) @check_fits_underlying_law_match @@ -185,9 +189,7 @@ def fits_total_bias_variance(fits_experiments_bias_variance): return bias_total, variance_total, n_total -datasets_expected_bias_variance = collect( - "expected_dataset_bias_variance", ("data",) -) +datasets_expected_bias_variance = collect("expected_dataset_bias_variance", ("data",)) experiments_expected_bias_variance = collect( @@ -200,8 +202,7 @@ def expected_total_bias_variance(fits_total_bias_variance): return expected_dataset_bias_variance(fits_total_bias_variance) -def dataset_replica_and_central_diff( - internal_multiclosure_dataset_loader, diagonal_basis=True): +def dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, diagonal_basis=True): """For a given dataset calculate sigma, the RMS difference between replica predictions and central predictions, and delta, the difference between the central prediction and the underlying prediction. @@ -229,10 +230,11 @@ def dataset_replica_and_central_diff( var_diff_sqrt = var_diff_sqrt.transpose(2, 1, 0) central_diff = central_diff.T - var_diff = var_diff_sqrt ** 2 + var_diff = var_diff_sqrt**2 sigma = np.sqrt(var_diff.mean(axis=0)) # sigma is always positive return sigma, central_diff + def dataset_xi(dataset_replica_and_central_diff): """Take sigma and delta for a dataset, where sigma is the RMS difference between replica predictions and central predictions, and delta is the @@ -262,11 +264,9 @@ def dataset_xi(dataset_replica_and_central_diff): return in_1_sigma.mean(axis=1) -def data_replica_and_central_diff( - internal_multiclosure_data_loader, diagonal_basis=True): +def data_replica_and_central_diff(internal_multiclosure_data_loader, diagonal_basis=True): """Like ``dataset_replica_and_central_diff`` but for all data""" - return dataset_replica_and_central_diff( - internal_multiclosure_data_loader, diagonal_basis) + return dataset_replica_and_central_diff(internal_multiclosure_data_loader, diagonal_basis) def data_xi(data_replica_and_central_diff): @@ -276,7 +276,9 @@ def data_xi(data_replica_and_central_diff): experiments_xi_measured = collect("data_xi", ("group_dataset_inputs_by_experiment",)) experiments_replica_central_diff = collect( - "data_replica_and_central_diff", ("group_dataset_inputs_by_experiment",)) + "data_replica_and_central_diff", ("group_dataset_inputs_by_experiment",) +) + @check_at_least_10_fits def n_fit_samples(fits): @@ -571,8 +573,6 @@ def xi_resampling_data( ) - - exps_xi_resample = collect("xi_resampling_data", ("group_dataset_inputs_by_experiment",)) @@ -633,9 +633,7 @@ def fits_bootstrap_data_bias_variance( # explicitly pass n_rep to fits_dataset_bias_variance so it uses # full subsample bias, variance, _ = 
expected_dataset_bias_variance( - fits_dataset_bias_variance( - boot_internal_loader, _internal_max_reps, _internal_min_reps - ) + fits_dataset_bias_variance(boot_internal_loader, _internal_max_reps, _internal_min_reps) ) bias_boot.append(bias) variance_boot.append(variance) @@ -702,6 +700,7 @@ def experiments_bootstrap_expected_xi(experiments_bootstrap_sqrt_ratio): estimated_integral = special.erf(n_sigma_in_variance / np.sqrt(2)) return estimated_integral + groups_bootstrap_bias_variance = collect( "fits_bootstrap_data_bias_variance", ("group_dataset_inputs_by_metadata",) ) @@ -753,14 +752,14 @@ def fits_bootstrap_data_xi( _internal_max_reps, True, ) - xi_1sigma_boot.append( - dataset_xi(dataset_replica_and_central_diff(boot_internal_loader)) - ) + xi_1sigma_boot.append(dataset_xi(dataset_replica_and_central_diff(boot_internal_loader))) return xi_1sigma_boot experiments_bootstrap_xi = collect( - "fits_bootstrap_data_xi", ("group_dataset_inputs_by_experiment",)) + "fits_bootstrap_data_xi", ("group_dataset_inputs_by_experiment",) +) + def total_bootstrap_xi(experiments_bootstrap_xi): """Given the bootstrap samples of xi_1sigma for all experiments, @@ -770,8 +769,8 @@ def total_bootstrap_xi(experiments_bootstrap_xi): """ return np.concatenate(experiments_bootstrap_xi, axis=1) -groups_bootstrap_xi = collect( - "fits_bootstrap_data_xi", ("group_dataset_inputs_by_metadata",)) + +groups_bootstrap_xi = collect("fits_bootstrap_data_xi", ("group_dataset_inputs_by_metadata",)) def dataset_fits_bias_replicas_variance_samples( @@ -816,6 +815,7 @@ def dataset_fits_bias_replicas_variance_samples( variances.append(calc_chi2(sqrtcov, diffs)) return biases, np.concatenate(variances), len(law_th) + def dataset_inputs_fits_bias_replicas_variance_samples( internal_multiclosure_data_loader, _internal_max_reps=None, @@ -827,7 +827,7 @@ def dataset_inputs_fits_bias_replicas_variance_samples( _internal_min_reps=20, ) + experiments_fits_bias_replicas_variance_samples = collect( - "dataset_inputs_fits_bias_replicas_variance_samples", - ("group_dataset_inputs_by_experiment",) + "dataset_inputs_fits_bias_replicas_variance_samples", ("group_dataset_inputs_by_experiment",) ) diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py index 02f3bb292b..d7be585cd7 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_output.py +++ b/validphys2/src/validphys/closuretest/multiclosure_output.py @@ -13,9 +13,9 @@ from reportengine.figure import figure, figuregen from reportengine.table import table - from validphys import plotutils + @figure def plot_dataset_fits_bias_variance(fits_dataset_bias_variance, dataset): """For a set of closure fits, calculate the bias and variance across fits @@ -27,11 +27,9 @@ def plot_dataset_fits_bias_variance(fits_dataset_bias_variance, dataset): """ biases, variances, _ = fits_dataset_bias_variance fig, ax = plotutils.subplots() - + ax.plot(biases, "*", label=f"bias, std. dev. = {np.std(biases):.2f}") - ax.axhline( - np.mean(biases), label=f"bias, mean = {np.mean(biases):.2f}", linestyle="-" - ) + ax.axhline(np.mean(biases), label=f"bias, mean = {np.mean(biases):.2f}", linestyle="-") ax.plot(variances, ".", label=f"variance, std. dev. 
= {np.std(variances):.2f}") ax.axhline( np.mean(variances), @@ -89,9 +87,7 @@ def datasets_bias_variance_ratio(datasets_expected_bias_variance, each_dataset): records = [] for ds, (bias, var, ndata) in zip(each_dataset, datasets_expected_bias_variance): records.append(dict(dataset=str(ds), ndata=ndata, ratio=bias / var)) - df = pd.DataFrame.from_records( - records, index="dataset", columns=("dataset", "ndata", "ratio") - ) + df = pd.DataFrame.from_records(records, index="dataset", columns=("dataset", "ndata", "ratio")) df.columns = ["ndata", "bias/variance"] return df @@ -109,15 +105,11 @@ def experiments_bias_variance_ratio( """ # don't reinvent wheel - df_in = datasets_bias_variance_ratio( - experiments_expected_bias_variance, experiments_data - ) + df_in = datasets_bias_variance_ratio(experiments_expected_bias_variance, experiments_data) bias_tot, var_tot, ntotal = expected_total_bias_variance - tot_df = pd.DataFrame( - [[ntotal, bias_tot / var_tot]], index=["Total"], columns=df_in.columns - ) + tot_df = pd.DataFrame([[ntotal, bias_tot / var_tot]], index=["Total"], columns=df_in.columns) df = pd.concat((df_in, tot_df), axis=0) df.index.rename("experiment", inplace=True) # give index appropriate name @@ -138,33 +130,31 @@ def experiments_bias_variance_table( """ records = [] for exp, (bias, var, ndata) in zip( - group_dataset_inputs_by_experiment, - experiments_expected_bias_variance + group_dataset_inputs_by_experiment, experiments_expected_bias_variance ): - records.append(dict( - experiment=exp["group_name"], - ndata=ndata, - bias=bias/ndata, - variance=var/ndata, - sqrt_ratio=np.sqrt(bias/var) - )) + records.append( + dict( + experiment=exp["group_name"], + ndata=ndata, + bias=bias / ndata, + variance=var / ndata, + sqrt_ratio=np.sqrt(bias / var), + ) + ) bias_tot, var_tot, ntotal = expected_total_bias_variance - records.append(dict( - experiment="Total", - ndata=ntotal, - bias=bias_tot/ntotal, - variance=var_tot/ntotal, - sqrt_ratio=np.sqrt(bias_tot/var_tot) - )) + records.append( + dict( + experiment="Total", + ndata=ntotal, + bias=bias_tot / ntotal, + variance=var_tot / ntotal, + sqrt_ratio=np.sqrt(bias_tot / var_tot), + ) + ) df = pd.DataFrame.from_records(records, index="experiment") - df.columns = [ - "ndata", - "bias", - "variance", - "sqrt(bias/variance)" - ] + df.columns = ["ndata", "bias", "variance", "sqrt(bias/variance)"] return df @@ -181,16 +171,12 @@ def sqrt_datasets_bias_variance_ratio(datasets_bias_variance_ratio): df_in = datasets_bias_variance_ratio vals = np.array(df_in.values) # copy just in case vals[:, 1] = np.sqrt(vals[:, 1]) - return pd.DataFrame( - vals, index=df_in.index, columns=["ndata", "sqrt(bias/variance)"] - ) + return pd.DataFrame(vals, index=df_in.index, columns=["ndata", "sqrt(bias/variance)"]) @table def sqrt_experiments_bias_variance_ratio(experiments_bias_variance_ratio): - """Like sqrt_datasets_bias_variance_ratio except for each experiment. 
- - """ + """Like sqrt_datasets_bias_variance_ratio except for each experiment.""" return sqrt_datasets_bias_variance_ratio(experiments_bias_variance_ratio) @@ -210,11 +196,7 @@ def total_bias_variance_ratio( dset_index = pd.MultiIndex.from_arrays( [ - [ - str(experiment) - for experiment in experiments_data - for ds in experiment.datasets - ], + [str(experiment) for experiment in experiments_data for ds in experiment.datasets], datasets_bias_variance_ratio.index.values, ] ) @@ -303,9 +285,7 @@ def compare_measured_expected_xi(fits_measured_xi, expected_xi_from_bias_varianc """ # don't want ndata twice - df = pd.concat( - (fits_measured_xi, expected_xi_from_bias_variance.iloc[:, 1]), axis=1 - ) + df = pd.concat((fits_measured_xi, expected_xi_from_bias_variance.iloc[:, 1]), axis=1) return df @@ -324,12 +304,8 @@ def plot_dataset_xi(dataset_xi, dataset): label=r"$\xi_{1\sigma}$ = " + f"{dataset_xi.mean():.2f}, from multifits", clip_on=False, ) - ax.axhline( - 0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value" - ) - ax.axhline( - 0.95, linestyle=":", color="r", label=r"$\xi_{2\sigma}$ " + "expected value" - ) + ax.axhline(0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value") + ax.axhline(0.95, linestyle=":", color="r", label=r"$\xi_{2\sigma}$ " + "expected value") ax.set_ylim((0, 1)) ax.set_xlabel("eigenvector index (ascending order)") ax.set_title(r"$\xi_{1\sigma}$ for " + str(dataset)) @@ -355,9 +331,7 @@ def plot_dataset_xi_histogram(dataset_xi, dataset): + f"{dataset_xi.std():.2f}" ), ) - ax.axvline( - 0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value" - ) + ax.axvline(0.68, linestyle=":", color="k", label=r"$\xi_{1\sigma}$ " + "expected value") ax.set_xlim((0, 1)) ax.set_xlabel(r"$\xi^{i}_{1\sigma}$") ax.set_title("Histogram of " + r"$\xi^{i}_{1\sigma}$ for " + str(dataset)) @@ -392,15 +366,14 @@ def plot_data_central_diff_histogram(experiments_replica_central_diff): which fall within the 1-sigma confidence interval of the scaled gaussian. """ - scaled_diffs = np.concatenate([ - (central_diff / sigma).flatten() - for sigma, central_diff - in experiments_replica_central_diff - ]) - fig, ax = plotutils.subplots() - ax.hist( - scaled_diffs, bins=50, density=True, label="Central prediction distribution" + scaled_diffs = np.concatenate( + [ + (central_diff / sigma).flatten() + for sigma, central_diff in experiments_replica_central_diff + ] ) + fig, ax = plotutils.subplots() + ax.hist(scaled_diffs, bins=50, density=True, label="Central prediction distribution") xlim = (-5, 5) ax.set_xlim(xlim) @@ -416,7 +389,6 @@ def plot_data_central_diff_histogram(experiments_replica_central_diff): return fig - @table def dataset_ratio_error_finite_effects( bias_variance_resampling_dataset, n_fit_samples, n_replica_samples @@ -471,9 +443,7 @@ def total_ratio_means_finite_effects( @table -def dataset_xi_error_finite_effects( - xi_resampling_dataset, n_fit_samples, n_replica_samples -): +def dataset_xi_error_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples): """For a single dataset vary number of fits and number of replicas used to perform bootstrap sample of xi. 
Take the mean of xi across datapoints (note that points here refers to points in the basis which diagonalises the covmat) and then @@ -487,9 +457,7 @@ def dataset_xi_error_finite_effects( @table -def dataset_xi_means_finite_effects( - xi_resampling_dataset, n_fit_samples, n_replica_samples -): +def dataset_xi_means_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples): """For a single dataset vary number of fits and number of replicas used to perform bootstrap sample of xi. Take the mean of xi across datapoints (note that points here refers to points in the basis which diagonalises the covmat) and then @@ -505,9 +473,7 @@ def dataset_xi_means_finite_effects( # NOTE: This action was written when trying to understand the finite size effects # and is largely redundant. @table -def dataset_std_xi_error_finite_effects( - xi_resampling_dataset, n_fit_samples, n_replica_samples -): +def dataset_std_xi_error_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples): """For a single dataset vary number of fits and number of replicas used to perform bootstrap sample of xi. Take the standard deviation of xi across datapoints (note that points here refers to points in the basis which diagonalises the @@ -522,9 +488,7 @@ def dataset_std_xi_error_finite_effects( @table -def dataset_std_xi_means_finite_effects( - xi_resampling_dataset, n_fit_samples, n_replica_samples -): +def dataset_std_xi_means_finite_effects(xi_resampling_dataset, n_fit_samples, n_replica_samples): """For a single dataset vary number of fits and number of replicas used to perform bootstrap sample of xi. Take the standard deviation of xi across datapoints (note that points here refers to points in the basis which diagonalises the @@ -546,9 +510,7 @@ def total_xi_error_finite_effects(total_xi_resample, n_fit_samples, n_replica_sa tabulate the standard deviation of xi_1sigma across bootstrap samples. """ - return dataset_xi_error_finite_effects( - total_xi_resample, n_fit_samples, n_replica_samples - ) + return dataset_xi_error_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples) @table @@ -559,9 +521,7 @@ def total_xi_means_finite_effects(total_xi_resample, n_fit_samples, n_replica_sa tabulate the standard deviation of xi_1sigma across bootstrap samples. """ - return dataset_xi_means_finite_effects( - total_xi_resample, n_fit_samples, n_replica_samples - ) + return dataset_xi_means_finite_effects(total_xi_resample, n_fit_samples, n_replica_samples) @table @@ -597,9 +557,7 @@ def total_expected_xi_error_finite_effects( @table -def total_std_xi_error_finite_effects( - exps_xi_resample, n_fit_samples, n_replica_samples -): +def total_std_xi_error_finite_effects(exps_xi_resample, n_fit_samples, n_replica_samples): """For all data vary number of fits and number of replicas used to perform bootstrap sample of xi. 
Take the std deviation of xi across datapoints (note that points here refers to points in the basis which diagonalises @@ -608,15 +566,11 @@ def total_std_xi_error_finite_effects( """ xi_total = np.concatenate(exps_xi_resample, axis=-1) - return dataset_std_xi_error_finite_effects( - xi_total, n_fit_samples, n_replica_samples - ) + return dataset_std_xi_error_finite_effects(xi_total, n_fit_samples, n_replica_samples) @table -def total_std_xi_means_finite_effects( - exps_xi_resample, n_fit_samples, n_replica_samples -): +def total_std_xi_means_finite_effects(exps_xi_resample, n_fit_samples, n_replica_samples): """For all data vary number of fits and number of replicas used to perform bootstrap sample of xi. Take the std deviation of xi across datapoints (note that points here refers to points in the basis which diagonalises the @@ -625,15 +579,11 @@ def total_std_xi_means_finite_effects( """ xi_total = np.concatenate(exps_xi_resample, axis=-1) - return dataset_std_xi_means_finite_effects( - xi_total, n_fit_samples, n_replica_samples - ) + return dataset_std_xi_means_finite_effects(xi_total, n_fit_samples, n_replica_samples) @table -def experiments_bootstrap_sqrt_ratio_table( - experiments_bootstrap_sqrt_ratio, experiments_data -): +def experiments_bootstrap_sqrt_ratio_table(experiments_bootstrap_sqrt_ratio, experiments_data): """Given experiments_bootstrap_sqrt_ratio, which a bootstrap resampling of the sqrt(bias/variance) for each experiment and the total across all data, tabulate the mean and standard deviation across bootstrap @@ -662,32 +612,24 @@ def experiments_bootstrap_sqrt_ratio_table( @table -def groups_bootstrap_sqrt_ratio_table( - groups_bootstrap_sqrt_ratio, groups_data -): +def groups_bootstrap_sqrt_ratio_table(groups_bootstrap_sqrt_ratio, groups_data): """Like :py:func:`experiments_bootstrap_sqrt_ratio_table` but for metadata groups. """ - df = experiments_bootstrap_sqrt_ratio_table( - groups_bootstrap_sqrt_ratio, groups_data - ) + df = experiments_bootstrap_sqrt_ratio_table(groups_bootstrap_sqrt_ratio, groups_data) idx = df.index.rename("group") return df.set_index(idx) @table -def experiments_bootstrap_expected_xi_table( - experiments_bootstrap_expected_xi, experiments_data -): +def experiments_bootstrap_expected_xi_table(experiments_bootstrap_expected_xi, experiments_data): """Tabulate the mean and standard deviation across bootstrap samples of the expected xi calculated from the ratio of bias/variance. Returns a table with two columns, for the bootstrap mean and standard deviation and a row for each experiment plus the total across all experiments. """ - df = experiments_bootstrap_sqrt_ratio_table( - experiments_bootstrap_expected_xi, experiments_data - ) + df = experiments_bootstrap_sqrt_ratio_table(experiments_bootstrap_expected_xi, experiments_data) # change the column headers df.columns = [ r"Bootstrap mean expected $\xi_{1\sigma}$ from ratio", @@ -701,15 +643,13 @@ def groups_bootstrap_expected_xi_table(groups_bootstrap_expected_xi, groups_data """Like :py:func:`experiments_bootstrap_expected_xi_table` but for metadata groups. 
""" - df = experiments_bootstrap_expected_xi_table( - groups_bootstrap_expected_xi, groups_data) + df = experiments_bootstrap_expected_xi_table(groups_bootstrap_expected_xi, groups_data) idx = df.index.rename("group") return df.set_index(idx) + @table -def experiments_bootstrap_xi_table( - experiments_bootstrap_xi, experiments_data, total_bootstrap_xi -): +def experiments_bootstrap_xi_table(experiments_bootstrap_xi, experiments_data, total_bootstrap_xi): """Tabulate the mean and standard deviation of xi_1sigma across bootstrap samples. Note that the mean has already be taken across data points (or eigenvectors in the basis which diagonalises the covariance @@ -731,12 +671,9 @@ def experiments_bootstrap_xi_table( @table -def groups_bootstrap_xi_table( - groups_bootstrap_xi, groups_data, total_bootstrap_xi -): +def groups_bootstrap_xi_table(groups_bootstrap_xi, groups_data, total_bootstrap_xi): """Like :py:func:`experiments_bootstrap_xi_table` but for metadata groups.""" - df = experiments_bootstrap_xi_table( - groups_bootstrap_xi, groups_data, total_bootstrap_xi) + df = experiments_bootstrap_xi_table(groups_bootstrap_xi, groups_data, total_bootstrap_xi) idx = df.index.rename("group") return df.set_index(idx) @@ -757,9 +694,7 @@ def experiments_bootstrap_xi_comparison( @table -def groups_bootstrap_xi_comparison( - groups_bootstrap_xi_table, groups_bootstrap_expected_xi_table -): +def groups_bootstrap_xi_comparison(groups_bootstrap_xi_table, groups_bootstrap_expected_xi_table): """Like :py:func:`experiments_bootstrap_xi_comparison` but for metadata groups. """ @@ -818,9 +753,7 @@ def plot_experiments_xi_bootstrap_distribution( # take mean across all data xi_1sigma.append(np.mean(total_bootstrap_xi, axis=1)) # use plotting function from above - xi_plots = plot_experiments_sqrt_ratio_bootstrap_distribution( - xi_1sigma, experiments_data - ) + xi_plots = plot_experiments_sqrt_ratio_bootstrap_distribution(xi_1sigma, experiments_data) # Update the title and x label on each plot to reflect that we're plotting # \xi_1sigma, don't forget Total plot. for fig, exp in zip(xi_plots, experiments_data + ["Total"]): @@ -829,10 +762,10 @@ def plot_experiments_xi_bootstrap_distribution( ax.set_xlabel(r"$\xi_{1\sigma}$") yield fig + @figuregen def plot_bias_variance_distributions( - experiments_fits_bias_replicas_variance_samples, - group_dataset_inputs_by_experiment + experiments_fits_bias_replicas_variance_samples, group_dataset_inputs_by_experiment ): """For each experiment, plot the distribution across fits of bias and the distribution across fits and replicas of @@ -844,34 +777,20 @@ def plot_bias_variance_distributions( """ for (exp_biases, exp_vars, _), group_spec in zip( - experiments_fits_bias_replicas_variance_samples, - group_dataset_inputs_by_experiment - ): + experiments_fits_bias_replicas_variance_samples, group_dataset_inputs_by_experiment + ): fig, ax = plotutils.subplots() labels = [ "fits bias distribution", "replicas variance distribution", ] - ax.hist( - [exp_biases, exp_vars], - density=True, - label=labels - ) + ax.hist([exp_biases, exp_vars], density=True, label=labels) ax.legend() - ax.set_title( - f"Bias and variance distributions for {group_spec['group_name']}." 
- ) + ax.set_title(f"Bias and variance distributions for {group_spec['group_name']}.") yield fig - total_bias, total_var, _ = np.sum( - experiments_fits_bias_replicas_variance_samples, - axis=0 - ) + total_bias, total_var, _ = np.sum(experiments_fits_bias_replicas_variance_samples, axis=0) fig, ax = plotutils.subplots() - ax.hist( - [total_bias, total_var], - density=True, - label=labels - ) + ax.hist([total_bias, total_var], density=True, label=labels) ax.legend() ax.set_title("Total bias and variance distributions.") yield fig diff --git a/validphys2/src/validphys/closuretest/multiclosure_pdf.py b/validphys2/src/validphys/closuretest/multiclosure_pdf.py index 59c661ae47..0720101049 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_pdf.py +++ b/validphys2/src/validphys/closuretest/multiclosure_pdf.py @@ -12,11 +12,10 @@ import scipy.special from reportengine import collect - +from validphys.calcutils import calc_chi2 from validphys.closuretest.multiclosure import DEFAULT_SEED -from validphys.pdfgrids import xplotting_grid from validphys.core import PDF -from validphys.calcutils import calc_chi2 +from validphys.pdfgrids import xplotting_grid # Define the NN31IC basis with the charm PDF excluded. It is excluded because # the exercises carried out with this module are intended to be done in the @@ -48,9 +47,7 @@ def internal_nonsinglet_xgrid(multiclosure_nx=4): return np.linspace(0.1, 0.5, multiclosure_nx) -def xi_pdfgrids( - pdf: PDF, Q: (float, int), internal_singlet_gluon_xgrid, internal_nonsinglet_xgrid -): +def xi_pdfgrids(pdf: PDF, Q: (float, int), internal_singlet_gluon_xgrid, internal_nonsinglet_xgrid): """Generate PDF grids which are required for calculating xi in PDF space in the NN31IC basis, excluding the charm. We want to specify different xgrids for different flavours to avoid sampling PDFs in deep extrapolation regions. 
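As a rough illustration of the xi_1sigma statistic that these multiclosure modules estimate, the toy sketch below uses synthetic Gaussian data and invented names (it is not the validphys implementation): it measures the fraction of central deviations from the underlying law that fall inside the 1-sigma replica band, which should come out near 0.68 for a faithful ensemble.
```
import numpy as np

rng = np.random.default_rng(0)
n_fits, n_replicas, n_points = 25, 40, 10

# Synthetic closure-test output: the underlying law is zero, each fit has a
# central value scattered around it, and replicas scatter around the central.
truth = np.zeros(n_points)
central = rng.normal(0.0, 1.0, size=(n_fits, n_points))
replicas = central[:, None, :] + rng.normal(0.0, 1.0, size=(n_fits, n_replicas, n_points))

sigma = replicas.std(axis=1, ddof=1)  # replica spread per fit and data point
delta = np.abs(central - truth)       # central difference from the underlying law
xi_1sigma = (delta < sigma).mean()    # indicator averaged over fits and points
print(f"measured xi_1sigma = {xi_1sigma:.2f} (expect ~0.68)")
```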
@@ -90,7 +87,9 @@ def xi_grid_values(xi_pdfgrids): glu_sin_grid, nonsin_grid = xi_pdfgrids # grid values have shape: replica, flavour, x # concatenate along flavour - return np.concatenate((glu_sin_grid.grid_values.error_members(), nonsin_grid.grid_values.error_members()), axis=1) + return np.concatenate( + (glu_sin_grid.grid_values.error_members(), nonsin_grid.grid_values.error_members()), axis=1 + ) def underlying_xi_grid_values( @@ -111,9 +110,7 @@ def underlying_xi_grid_values( return xi_grid_values(underlying_grid) -def pdf_central_difference( - xi_grid_values, underlying_xi_grid_values, multiclosure_underlyinglaw -): +def pdf_central_difference(xi_grid_values, underlying_xi_grid_values, multiclosure_underlyinglaw): """Calculate the difference between underlying law and central PDF for, specifically: @@ -161,9 +158,7 @@ def fits_covariance_matrix_by_flavour(fits_replica_difference): """ # diffs should be calculated on the per fit level super_diffs = np.concatenate(fits_replica_difference, axis=0) - covmats = [ - np.cov(super_diffs[:, i, :], rowvar=False) for i in range(len(XI_FLAVOURS)) - ] + covmats = [np.cov(super_diffs[:, i, :], rowvar=False) for i in range(len(XI_FLAVOURS))] return covmats @@ -221,9 +216,7 @@ def fits_covariance_matrix_totalpdf(fits_replica_difference, multiclosure_nx=4): return np.cov(super_diffs, rowvar=False) -fits_indicator_function_totalpdf = collect( - "pdf_indicator_function_totalpdf", ("fits", "fitpdf") -) +fits_indicator_function_totalpdf = collect("pdf_indicator_function_totalpdf", ("fits", "fitpdf")) def replica_and_central_diff_totalpdf( @@ -360,9 +353,7 @@ def bootstrap_pdf_differences( rep_boot_index = rng.choice(fit_xi_grid.shape[0], size=fit_xi_grid.shape[0]) xi_gv = fit_xi_grid[rep_boot_index, ...] boot_central_diff.append( - pdf_central_difference( - xi_gv, underlying_xi_grid_values, multiclosure_underlyinglaw - ) + pdf_central_difference(xi_gv, underlying_xi_grid_values, multiclosure_underlyinglaw) ) boot_rep_diff.append(pdf_replica_difference(xi_gv)) return boot_central_diff, boot_rep_diff @@ -394,9 +385,7 @@ def fits_bootstrap_pdf_ratio( flav_cov = fits_covariance_matrix_by_flavour(boot_rep_diff) flav_sqrt_cov = fits_sqrt_covmat_by_flavour(flav_cov) total_cov = fits_covariance_matrix_totalpdf(boot_rep_diff, multiclosure_nx) - ratio_flav = fits_pdf_flavour_ratio( - flav_sqrt_cov, boot_central_diff, boot_rep_diff - ) + ratio_flav = fits_pdf_flavour_ratio(flav_sqrt_cov, boot_central_diff, boot_rep_diff) ratio_tot = fits_pdf_total_ratio( boot_central_diff, boot_rep_diff, total_cov, multiclosure_nx ) diff --git a/validphys2/src/validphys/closuretest/multiclosure_pdf_output.py b/validphys2/src/validphys/closuretest/multiclosure_pdf_output.py index 09b07cabf8..f54b82fee2 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_pdf_output.py +++ b/validphys2/src/validphys/closuretest/multiclosure_pdf_output.py @@ -5,26 +5,25 @@ PDF space. 
""" +from matplotlib.ticker import MaxNLocator import numpy as np import pandas as pd -from matplotlib.ticker import MaxNLocator import scipy.linalg as la import scipy.special +from reportengine.figure import figure, figuregen from reportengine.table import table -from reportengine.figure import figuregen, figure - +from validphys import plotutils from validphys.closuretest.multiclosure import DEFAULT_SEED from validphys.closuretest.multiclosure_pdf import ( XI_FLAVOURS, bootstrap_pdf_differences, - xi_flavour_x, - replica_and_central_diff_totalpdf, - xi_totalpdf, fits_covariance_matrix_by_flavour, fits_covariance_matrix_totalpdf, + replica_and_central_diff_totalpdf, + xi_flavour_x, + xi_totalpdf, ) -from validphys import plotutils @table @@ -40,9 +39,7 @@ def xi_flavour_table(xi_flavour_x, xi_totalpdf): table of xi by flavour """ - data = np.concatenate((xi_flavour_x.mean(axis=-1), [xi_totalpdf]), axis=0)[ - :, np.newaxis - ] + data = np.concatenate((xi_flavour_x.mean(axis=-1), [xi_totalpdf]), axis=0)[:, np.newaxis] index = pd.Index([f"${XI_FLAVOURS[0]}$", *XI_FLAVOURS[1:], "Total"], name="flavour") return pd.DataFrame(data, columns=[r"measured $\xi_{1\sigma}$"], index=index) @@ -64,9 +61,7 @@ def plot_xi_flavour_x( """ # treat singlet and gluon separately if use_x_basis: - x_for_plot = 2 * [internal_singlet_gluon_xgrid] + 5 * [ - internal_nonsinglet_xgrid - ] + x_for_plot = 2 * [internal_singlet_gluon_xgrid] + 5 * [internal_nonsinglet_xgrid] x_label = "x" else: x_for_plot = 7 * [np.arange(multiclosure_nx)] @@ -104,9 +99,7 @@ def plot_pdf_central_diff_histogram(replica_and_central_diff_totalpdf): sigma, delta = replica_and_central_diff_totalpdf scaled_diffs = (delta / sigma).flatten() fig, ax = plotutils.subplots() - ax.hist( - scaled_diffs, bins=50, density=True, label="Central PDF distribution" - ) + ax.hist(scaled_diffs, bins=50, density=True, label="Central PDF distribution") xlim = (-5, 5) ax.set_xlim(xlim) @@ -135,9 +128,7 @@ def fits_pdf_bias_variance_ratio(fits_pdf_flavour_ratio, fits_pdf_total_ratio): fl = f"${fl}$" records.append(dict(flavour=fl, ratio=fits_pdf_flavour_ratio[i])) records.append(dict(flavour="Total", ratio=fits_pdf_total_ratio)) - df = pd.DataFrame.from_records( - records, index="flavour", columns=["flavour", "ratio"] - ) + df = pd.DataFrame.from_records(records, index="flavour", columns=["flavour", "ratio"]) df.columns = ["bias/variance"] return df @@ -349,9 +340,7 @@ def plot_multiclosure_correlation_matrix(fits_correlation_matrix_totalpdf, multi matrix. 
""" - fig, ax = plot_pdf_matrix( - fits_correlation_matrix_totalpdf, multiclosure_nx, vmin=0, vmax=1 - ) + fig, ax = plot_pdf_matrix(fits_correlation_matrix_totalpdf, multiclosure_nx, vmin=0, vmax=1) ax.set_title("Correlation matrix estimated from multiclosure replicas") return fig diff --git a/validphys2/src/validphys/closuretest/multiclosure_preprocessing.py b/validphys2/src/validphys/closuretest/multiclosure_preprocessing.py index 3141cf5efe..9f8696eff6 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_preprocessing.py +++ b/validphys2/src/validphys/closuretest/multiclosure_preprocessing.py @@ -12,14 +12,13 @@ from reportengine import collect from reportengine.figure import figuregen from reportengine.table import table - +from validphys import plotutils from validphys.closuretest.closure_checks import ( check_fits_areclosures, check_fits_have_same_basis, check_fits_underlying_law_match, ) from validphys.plotutils import plot_horizontal_errorbars -from validphys import plotutils def _next_multiclosure_preprocessing_table(fits_pdf, fits_preprocessing_table): @@ -89,7 +88,9 @@ def next_multiclosure_beta_preprocessing_table( @figuregen def plot_next_multiclosure_alpha_preprocessing( - fits_fitbasis_alpha_lines, fits_pdf, next_multiclosure_alpha_preprocessing_table, + fits_fitbasis_alpha_lines, + fits_pdf, + next_multiclosure_alpha_preprocessing_table, ): """Using the table produced by :py:func:`next_multiclosure_alpha_preprocessing_table`, plot the next @@ -98,12 +99,8 @@ def plot_next_multiclosure_alpha_preprocessing( limits of the first fit. """ - first_prev_ranges = fits_fitbasis_alpha_lines[0].loc[ - :, f"prev ({fits_pdf[0].label})" - ] - flavours = next_multiclosure_alpha_preprocessing_table.columns.get_level_values( - 0 - ).unique() + first_prev_ranges = fits_fitbasis_alpha_lines[0].loc[:, f"prev ({fits_pdf[0].label})"] + flavours = next_multiclosure_alpha_preprocessing_table.columns.get_level_values(0).unique() for flavour in flavours: next_flavour_range = next_multiclosure_alpha_preprocessing_table.loc[:, flavour] next_flavour_range_vals = next_flavour_range.to_numpy() @@ -139,7 +136,9 @@ def plot_next_multiclosure_alpha_preprocessing( @figuregen def plot_next_multiclosure_beta_preprocessing( - fits_fitbasis_beta_lines, fits_pdf, next_multiclosure_beta_preprocessing_table, + fits_fitbasis_beta_lines, + fits_pdf, + next_multiclosure_beta_preprocessing_table, ): """Using the table produced by :py:func:`next_multiclosure_beta_preprocessing_table`, plot the next @@ -149,7 +148,9 @@ def plot_next_multiclosure_beta_preprocessing( """ for fig in plot_next_multiclosure_alpha_preprocessing( - fits_fitbasis_beta_lines, fits_pdf, next_multiclosure_beta_preprocessing_table, + fits_fitbasis_beta_lines, + fits_pdf, + next_multiclosure_beta_preprocessing_table, ): # fixup title. 
ax = fig.gca() @@ -160,7 +161,9 @@ def plot_next_multiclosure_beta_preprocessing( @figuregen def plot_next_multiclosure_alpha_preprocessing_range_width( - fits_fitbasis_alpha_lines, fits_pdf, next_multiclosure_alpha_preprocessing_table, + fits_fitbasis_alpha_lines, + fits_pdf, + next_multiclosure_alpha_preprocessing_table, ): """Using the table produced by :py:func:`next_multiclosure_alpha_preprocessing_table`, plot the next @@ -170,12 +173,8 @@ def plot_next_multiclosure_alpha_preprocessing_range_width( the first fit for reference """ - first_prev_ranges = fits_fitbasis_alpha_lines[0].loc[ - :, f"prev ({fits_pdf[0].label})" - ] - flavours = next_multiclosure_alpha_preprocessing_table.columns.get_level_values( - 0 - ).unique() + first_prev_ranges = fits_fitbasis_alpha_lines[0].loc[:, f"prev ({fits_pdf[0].label})"] + flavours = next_multiclosure_alpha_preprocessing_table.columns.get_level_values(0).unique() for flavour in flavours: next_flavour_range = next_multiclosure_alpha_preprocessing_table.loc[:, flavour] next_flavour_range_vals = next_flavour_range.to_numpy() @@ -189,16 +188,16 @@ def plot_next_multiclosure_alpha_preprocessing_range_width( color="k", label="Previous range width.", ) - ax.set_title( - f"Multiclosure fits {flavour} alpha preprocessing exponents range width." - ) + ax.set_title(f"Multiclosure fits {flavour} alpha preprocessing exponents range width.") ax.legend() yield fig @figuregen def plot_next_multiclosure_beta_preprocessing_range_width( - fits_fitbasis_beta_lines, fits_pdf, next_multiclosure_beta_preprocessing_table, + fits_fitbasis_beta_lines, + fits_pdf, + next_multiclosure_beta_preprocessing_table, ): """Using the table produced by :py:func:`next_multiclosure_beta_preprocessing_table`, plot the next @@ -209,7 +208,9 @@ def plot_next_multiclosure_beta_preprocessing_range_width( """ for fig in plot_next_multiclosure_alpha_preprocessing_range_width( - fits_fitbasis_beta_lines, fits_pdf, next_multiclosure_beta_preprocessing_table, + fits_fitbasis_beta_lines, + fits_pdf, + next_multiclosure_beta_preprocessing_table, ): # fixup title. ax = fig.gca() diff --git a/validphys2/src/validphys/closuretest/multiclosure_pseudodata.py b/validphys2/src/validphys/closuretest/multiclosure_pseudodata.py index 7ab2f53c84..3400c6f82b 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_pseudodata.py +++ b/validphys2/src/validphys/closuretest/multiclosure_pseudodata.py @@ -7,18 +7,18 @@ """ import numpy as np import pandas as pd + from reportengine import collect from reportengine.table import table - from validphys.calcutils import calc_chi2 from validphys.closuretest.closure_checks import check_use_fitcommondata from validphys.core import cut_mask - # NOTE: for some reason the fit doesn't get properly resolved if you try to # collect data over fits fits_dataset = collect("dataset", ("fits",)) + @check_use_fitcommondata def fits_dataset_cvs(fits_dataset): """Internal function for loading the level one data for all fits @@ -37,12 +37,11 @@ def fits_dataset_cvs(fits_dataset): fits_cv.append(cd_df.iloc[cut_mask(ds.cuts), 5].to_numpy()) return fits_cv + data_fits_cv = collect(fits_dataset_cvs, ("data",)) -def expected_data_delta_chi2( - data_fits_cv, - internal_multiclosure_data_loader -): + +def expected_data_delta_chi2(data_fits_cv, internal_multiclosure_data_loader): """For ``data``, calculate the mean of delta chi2 across all fits, returns a tuple of number of data points and unnormalised delta chi2. 
""" @@ -62,7 +61,8 @@ def expected_data_delta_chi2( exps_expected_delta_chi2 = collect( - "expected_data_delta_chi2", ("group_dataset_inputs_by_experiment",)) + "expected_data_delta_chi2", ("group_dataset_inputs_by_experiment",) +) def total_expected_data_delta_chi2(exps_expected_delta_chi2): @@ -75,7 +75,9 @@ def total_expected_data_delta_chi2(exps_expected_delta_chi2): groups_expected_delta_chi2 = collect( - "expected_data_delta_chi2", ("group_dataset_inputs_by_metadata",)) + "expected_data_delta_chi2", ("group_dataset_inputs_by_metadata",) +) + @table def expected_delta_chi2_table( diff --git a/validphys2/src/validphys/commondata.py b/validphys2/src/validphys/commondata.py index 6f6445a466..b67bc3ab4f 100644 --- a/validphys2/src/validphys/commondata.py +++ b/validphys2/src/validphys/commondata.py @@ -7,9 +7,9 @@ """ from reportengine import collect - from validphys.commondataparser import load_commondata + def loaded_commondata_with_cuts(commondata, cuts): """Load the commondata and apply cuts. @@ -28,11 +28,9 @@ def loaded_commondata_with_cuts(commondata, cuts): lcd = load_commondata(commondata) return lcd.with_cuts(cuts) -dataset_inputs_loaded_cd_with_cuts = collect( - "loaded_commondata_with_cuts", ("data_input",) -) + +dataset_inputs_loaded_cd_with_cuts = collect("loaded_commondata_with_cuts", ("data_input",)) groups_dataset_inputs_loaded_cd_with_cuts = collect( "loaded_commondata_with_cuts", ("group_dataset_inputs_by_metadata", "data_input") ) - diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index 10e84674a3..03191a4e99 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -5,8 +5,8 @@ The validphys commondata structure is an instance of :py:class:`validphys.coredata.CommonData` """ import dataclasses -from operator import attrgetter import logging +from operator import attrgetter import pandas as pd @@ -130,8 +130,7 @@ class CommonDataMetadata: def peek_commondata_metadata(commondatafilename): - """Read some of the properties of the commondata object as a CommonData Metadata - """ + """Read some of the properties of the commondata object as a CommonData Metadata""" with open(commondatafilename) as f: try: l = f.readline() diff --git a/validphys2/src/validphys/commondatawriter.py b/validphys2/src/validphys/commondatawriter.py index 07b06b4585..650df84cbc 100644 --- a/validphys2/src/validphys/commondatawriter.py +++ b/validphys2/src/validphys/commondatawriter.py @@ -3,6 +3,7 @@ tables to files """ + def write_commondata_data(commondata, buffer): """ write commondata table to buffer, this can be a memory map, @@ -80,4 +81,4 @@ def write_systype_to_file(commondata, path): write systype table to file """ with open(path, "w") as file: - write_systype_data(commondata, file) \ No newline at end of file + write_systype_data(commondata, file) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index a9ee17a651..b7d2edc3d5 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -518,7 +518,6 @@ def _produce_similarity_cuts(self, commondata): None, "cut_similarity_threshold", write=False ) try: - _, exclusion_list = self.parse_from_(None, "do_not_require_similarity_for", write=False) except configparser.InputNotFoundError: exclusion_list = [] @@ -859,7 +858,6 @@ def produce_matched_datasets_from_dataspecs(self, dataspecs): self._check_dataspecs_type(dataspecs) all_names = [] for spec in dataspecs: - with 
self.set_context(ns=self._curr_ns.new_child(spec)): _, data_input = self.parse_from_(None, "data_input", write=False) @@ -1316,7 +1314,6 @@ def produce_rules( filter_rules=None, default_filter_rules_recorded_spec_=None, ): - """Produce filter rules based on the user defined input and defaults.""" from validphys.filters import Rule, RuleProcessingError, default_filter_rules_input diff --git a/validphys2/src/validphys/convolution.py b/validphys2/src/validphys/convolution.py index 94e4c7fcd6..083eca67fb 100644 --- a/validphys2/src/validphys/convolution.py +++ b/validphys2/src/validphys/convolution.py @@ -36,15 +36,14 @@ level interface which operates with :py:class:`validphys.coredata.FKTableData` objects is also available. """ -import operator import functools +import operator -import pandas as pd import numpy as np +import pandas as pd -from validphys.pdfbases import evolution from validphys.fkparser import load_fktable - +from validphys.pdfbases import evolution FK_FLAVOURS = evolution.to_known_elements( [ @@ -75,11 +74,14 @@ def _asy(a, b): def _smn(a, b, c, d): return (a + b) / (c + d) + def _com(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t): - return (a + b + c + d + e + f + g + h + i + j) / ( k + l + m + n + o + p + q + r + s + t) + return (a + b + c + d + e + f + g + h + i + j) / (k + l + m + n + o + p + q + r + s + t) + def _smt(a, b, c, d, e, f, g, h, i, j): - return (a + b + c + d + e + f + g + h + i + j) + return a + b + c + d + e + f + g + h + i + j + def _id(a): return a @@ -95,7 +97,10 @@ def _id(a): "NULL": _id, } -class PredictionsRequireCutsError(Exception): pass + +class PredictionsRequireCutsError(Exception): + pass + def _predictions(dataset, pdf, fkfunc): """Combine data on all the FKTables in the database according to the @@ -125,7 +130,7 @@ def _predictions(dataset, pdf, fkfunc): def predictions(dataset, pdf): - """"Compute theory predictions for a given PDF and dataset. Information + """ "Compute theory predictions for a given PDF and dataset. Information regading the dataset, on cuts, CFactors and combinations of FKTables is taken into account to construct the predictions. @@ -319,7 +324,7 @@ def appl(df): xx2 = df.index.get_level_values(2) # take the active combinations from the luminosity tensor partial_lumi = luminosity[..., xx1, xx2] - return pd.Series(np.einsum("ijk,kj->i",partial_lumi, df.values)) + return pd.Series(np.einsum("ijk,kj->i", partial_lumi, df.values)) return sigma.groupby(level=0).apply(appl) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index fc903bf969..8d59e44c97 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -7,13 +7,13 @@ """ from __future__ import generator_stop -import re import enum import functools import inspect import json import logging from pathlib import Path +import re import numpy as np @@ -21,23 +21,25 @@ from reportengine.baseexceptions import AsInputError from reportengine.compat import yaml -#TODO: There is a bit of a circular dependency between filters.py and this. -#Maybe move the cuts logic to its own module? -from validphys import lhaindex, filters +# TODO: There is a bit of a circular dependency between filters.py and this. +# Maybe move the cuts logic to its own module? 
+from validphys import filters, lhaindex +from validphys.commondataparser import ( + get_plot_kinlabels, + parse_commondata, + peek_commondata_metadata, +) +from validphys.fkparser import load_fktable, parse_cfactor +from validphys.hyperoptplot import HyperoptTrial +from validphys.lhapdfset import LHAPDFSet from validphys.tableloader import parse_exp_mat from validphys.theorydbutils import fetch_theory -from validphys.hyperoptplot import HyperoptTrial from validphys.utils import experiments_to_dataset_inputs -from validphys.lhapdfset import LHAPDFSet -from validphys.fkparser import load_fktable, parse_cfactor -from validphys.commondataparser import (peek_commondata_metadata, - get_plot_kinlabels, - parse_commondata,) log = logging.getLogger(__name__) -class TupleComp: +class TupleComp: @classmethod def argnames(cls): return list(inspect.signature(cls.__init__).parameters.keys())[1:] @@ -52,15 +54,18 @@ def __hash__(self): return hash(self.comp_tuple) def __repr__(self): - argvals = ', '.join('%s=%r'%vals for vals in zip(self.argnames(), - self.comp_tuple)) - return '%s(%s)'%(self.__class__.__qualname__, argvals) + argvals = ', '.join('%s=%r' % vals for vals in zip(self.argnames(), self.comp_tuple)) + return '%s(%s)' % (self.__class__.__qualname__, argvals) + -class PDFDoesNotExist(Exception): pass +class PDFDoesNotExist(Exception): + pass -class _PDFSETS(): + +class _PDFSETS: """Convenient way to access installed PDFS, by e.g. tab completing - in ipython.""" + in ipython.""" + def __getattr__(self, attr): if lhaindex.isinstalled(attr): return PDF(attr) @@ -72,6 +77,7 @@ def __dir__(self): PDFSETS = _PDFSETS() + class PDF(TupleComp): """Base validphys PDF providing high level access to metadata. @@ -204,8 +210,7 @@ def __len__(self): return self.info["NumMembers"] def get_members(self): - """Return the number of members selected in ``pdf.load().grid_values`` - """ + """Return the number of members selected in ``pdf.load().grid_values``""" return len(self) @@ -214,7 +219,7 @@ def __init__(self, datafile, sysfile, plotfiles, name=None, metadata=None): self.datafile = datafile self.sysfile = sysfile self.plotfiles = tuple(plotfiles) - self._name=name + self._name = name self._metadata = metadata super().__init__(datafile, sysfile, self.plotfiles) @@ -266,9 +271,10 @@ def plot_kinlabels(self): class DataSetInput(TupleComp): """Represents whatever the user enters in the YAML to specify a dataset.""" + def __init__(self, *, name, sys, cfac, frac, weight, custom_group): - self.name=name - self.sys=sys + self.name = name + self.sys = sys self.cfac = cfac self.frac = frac self.weight = weight @@ -278,6 +284,7 @@ def __init__(self, *, name, sys, cfac, frac, weight, custom_group): def __str__(self): return self.name + class ExperimentInput(TupleComp): def __init__(self, *, name, datasets): self.name = name @@ -285,11 +292,12 @@ def __init__(self, *, name, datasets): super().__init__(name, datasets) def as_dict(self): - return {'experiment':self.name, 'datasets':self.datasets} + return {'experiment': self.name, 'datasets': self.datasets} def __str__(self): return self.name + class CutsPolicy(enum.Enum): INTERNAL = "internal" NOCUTS = "nocuts" @@ -317,6 +325,7 @@ def load(self): log.debug("Loading cuts for %s", self.name) return np.atleast_1d(np.loadtxt(self.path, dtype=int)) + class InternalCutsWrapper(TupleComp): def __init__(self, commondata, rules): self.rules = rules @@ -325,9 +334,9 @@ def __init__(self, commondata, rules): def load(self): return np.atleast_1d( - np.asarray( - 
filters.get_cuts_for_dataset(self.commondata, self.rules), - dtype=int)) + np.asarray(filters.get_cuts_for_dataset(self.commondata, self.rules), dtype=int) + ) + class MatchedCuts(TupleComp): def __init__(self, othercuts, ndata): @@ -336,12 +345,13 @@ def __init__(self, othercuts, ndata): super().__init__(self.othercuts, self.ndata) def load(self): - loaded = [c.load() for c in self.othercuts if c] + loaded = [c.load() for c in self.othercuts if c] if loaded: return functools.reduce(np.intersect1d, loaded) self._full = True return np.arange(self.ndata) + class SimilarCuts(TupleComp): def __init__(self, inputs, threshold): if len(inputs) != 2: @@ -356,8 +366,8 @@ def __init__(self, inputs, threshold): @functools.lru_cache() def load(self): # TODO: Update this when a suitable interace becomes available - from validphys.convolution import central_predictions from validphys.commondataparser import load_commondata + from validphys.convolution import central_predictions from validphys.covmats import covmat_from_systematics first, second = self.inputs @@ -366,15 +376,13 @@ def load(self): np.diag( covmat_from_systematics( load_commondata(first_ds.commondata).with_cuts(first_ds.cuts), - first_ds, # DataSetSpec has weight attr - use_weights_in_covmat=False, # Don't weight covmat + first_ds, # DataSetSpec has weight attr + use_weights_in_covmat=False, # Don't weight covmat ) ) ) # Compute matched predictions - delta = np.abs( - (central_predictions(*first) - central_predictions(*second)).squeeze(axis=1) - ) + delta = np.abs((central_predictions(*first) - central_predictions(*second)).squeeze(axis=1)) ratio = delta / exp_err passed = ratio < self.threshold return passed[passed].index @@ -386,10 +394,9 @@ def cut_mask(cuts): return slice(None) return cuts.load() -class DataSetSpec(TupleComp): - def __init__(self, *, name, commondata, fkspecs, thspec, cuts, - frac=1, op=None, weight=1): +class DataSetSpec(TupleComp): + def __init__(self, *, name, commondata, fkspecs, thspec, cuts, frac=1, op=None, weight=1): self.name = name self.commondata = commondata @@ -402,15 +409,14 @@ def __init__(self, *, name, commondata, fkspecs, thspec, cuts, self.cuts = cuts self.frac = frac - #Do this way (instead of setting op='NULL' in the signature) - #so we don't have to know the default everywhere + # Do this way (instead of setting op='NULL' in the signature) + # so we don't have to know the default everywhere if op is None: op = 'NULL' self.op = op self.weight = weight - super().__init__(name, commondata, fkspecs, thspec, cuts, - frac, op, weight) + super().__init__(name, commondata, fkspecs, thspec, cuts, frac, op, weight) @functools.lru_cache() def load_commondata(self): @@ -439,6 +445,7 @@ def to_unweighted(self): def __str__(self): return self.name + class FKTableSpec(TupleComp): """ Each FKTable is formed by a number of sub-fktables to be concatenated @@ -459,7 +466,7 @@ def __init__(self, fkpath, cfactors, metadata=None): # NOTE: The legacy interface is currently used by fkparser to decide # whether to read an FKTable using the old parser or the pineappl parser - # this attribute (and the difference between both) might be removed in future + # this attribute (and the difference between both) might be removed in future # releases of NNPDF so please don't write code that relies on it if not isinstance(fkpath, (tuple, list)): self.legacy = True @@ -527,14 +534,14 @@ class PositivitySetSpec(LagrangeSetSpec): class IntegrabilitySetSpec(LagrangeSetSpec): pass -#We allow to expand the experiment as a list of datasets 
-class DataGroupSpec(TupleComp, namespaces.NSList): +# We allow to expand the experiment as a list of datasets +class DataGroupSpec(TupleComp, namespaces.NSList): def __init__(self, name, datasets, dsinputs=None): - #This needs to be hashable + # This needs to be hashable datasets = tuple(datasets) - #TODO: Find a better way for interactive usage. + # TODO: Find a better way for interactive usage. if dsinputs is not None: dsinputs = tuple(dsinputs) @@ -542,10 +549,10 @@ def __init__(self, name, datasets, dsinputs=None): self.datasets = datasets self.dsinputs = dsinputs - #TODO: Add dsinputs to comp tuple? + # TODO: Add dsinputs to comp tuple? super().__init__(name, datasets) - #TODO: Can we do better cooperative inherece trick than this? + # TODO: Can we do better cooperative inherece trick than this? namespaces.NSList.__init__(self, dsinputs, nskey='dataset_input') @functools.lru_cache(maxsize=32) @@ -568,13 +575,13 @@ def load_commondata_instance(self): @property def thspec(self): - #TODO: Is this good enough? Should we explicitly pass the theory + # TODO: Is this good enough? Should we explicitly pass the theory return self.datasets[0].thspec def __str__(self): return self.name - #Need this so that it doesn't try to iterte over itself. + # Need this so that it doesn't try to iterte over itself. @property def as_markdown(self): return str(self) @@ -602,8 +609,8 @@ def __iter__(self): @functools.lru_cache() def as_input(self): - p = self.path/'filter.yml' - log.debug('Reading input from fit configuration %s' , p) + p = self.path / 'filter.yml' + log.debug('Reading input from fit configuration %s', p) try: with p.open() as f: d = yaml.safe_load(f) @@ -616,7 +623,7 @@ def as_input(self): dataset_inputs = experiments_to_dataset_inputs(d['experiments']) d['dataset_inputs'] = dataset_inputs - #BCH + # BCH # backwards compatibility hack for runcards with the 'fitting' namespace # if a variable already exists outside 'fitting' it takes precedence fitting = d.get("fitting") @@ -630,7 +637,7 @@ def as_input(self): def __str__(self): return self.label - __slots__ = ('label','name', 'path') + __slots__ = ('label', 'name', 'path') class HyperscanSpec(FitSpec): @@ -692,7 +699,7 @@ def sample_trials(self, n=None, base_params=None, sigma=4.0): log.warning("Asked for %d trials, only %d valid trials found", n, len(all_trials)) # Compute weights proportionally to the reward (goes from 0 (worst) to 1 (best, loss=1)) rewards = np.array([i.weighted_reward for i in all_trials]) - weight_raw = np.exp(sigma * rewards ** 2) + weight_raw = np.exp(sigma * rewards**2) total = np.sum(weight_raw) weights = weight_raw / total return np.random.choice(all_trials, replace=False, size=n, p=weights) @@ -708,7 +715,7 @@ def __iter__(self): yield self.path def get_description(self): - dbpath = self.path.parent/'theory.db' + dbpath = self.path.parent / 'theory.db' return fetch_theory(dbpath, self.id) __slots__ = ('id', 'path') @@ -727,6 +734,7 @@ def is_pineappl(self): """Check whether this theory is a pineappl-based theory""" return self.yamldb_path.exists() + class ThCovMatSpec: def __init__(self, path): self.path = path @@ -739,9 +747,9 @@ def load(self): def __str__(self): return str(self.path) -#TODO: Decide if we want methods or properties -class Stats: +# TODO: Decide if we want methods or properties +class Stats: def __init__(self, data): """`data `should be N_pdf*N_bins""" self.data = np.atleast_2d(data) @@ -768,27 +776,27 @@ def errorbar68(self): raise NotImplementedError() def errorbarstd(self): - return 
(self.central_value() - self.std_error(), - self.central_value() + self.std_error()) + return (self.central_value() - self.std_error(), self.central_value() + self.std_error()) - #TODO... + # TODO... ... class MCStats(Stats): """Result obtained from a Monte Carlo sample""" + def std_error(self): # ddof == 1 to match legacy libNNPDF behaviour return np.std(self.error_members(), ddof=1, axis=0) def moment(self, order): - return np.mean(np.power(self.error_members()-self.central_value(),order), axis=0) + return np.mean(np.power(self.error_members() - self.central_value(), order), axis=0) def errorbar68(self): - #Use nanpercentile here because we can have e.g. 0/0==nan normalization - #somewhere. + # Use nanpercentile here because we can have e.g. 0/0==nan normalization + # somewhere. down = np.nanpercentile(self.error_members(), 15.87, axis=0) - up = np.nanpercentile(self.error_members(), 84.13, axis=0) + up = np.nanpercentile(self.error_members(), 84.13, axis=0) return down, up def sample_values(self, size): @@ -801,6 +809,7 @@ class SymmHessianStats(Stats): central value. The rest of the indexes are results for each eigenvector. A 'rescale_factor is allowed in case the eigenvector confidence interval is not 68%'.""" + def __init__(self, data, rescale_factor=1): super().__init__(data) self.rescale_factor = rescale_factor @@ -810,13 +819,13 @@ def errorbar68(self): def std_error(self): data = self.data - diffsq = (data[0] - data[1:])**2 - return np.sqrt(diffsq.sum(axis=0))/self.rescale_factor + diffsq = (data[0] - data[1:]) ** 2 + return np.sqrt(diffsq.sum(axis=0)) / self.rescale_factor def moment(self, order): data = self.data - return np.sum( - np.power((data[0] - data[1:])/self.rescale_factor, order), axis=0) + return np.sum(np.power((data[0] - data[1:]) / self.rescale_factor, order), axis=0) + class HessianStats(SymmHessianStats): """Compute stats in the 'assymetric' hessian format: The first index (0) @@ -826,26 +835,27 @@ class HessianStats(SymmHessianStats): even are the upper eigenvectors.A 'rescale_factor is allowed in case the eigenvector confidence interval is not 68%'.""" + def std_error(self): data = self.data - diffsq = (data[1::2] - data[2::2])**2 - return np.sqrt(diffsq.sum(axis=0))/self.rescale_factor/2 + diffsq = (data[1::2] - data[2::2]) ** 2 + return np.sqrt(diffsq.sum(axis=0)) / self.rescale_factor / 2 def moment(self, order): data = self.data - return np.sum( - np.power((data[1::2] - data[2::2])/self.rescale_factor/2, order), axis=0) + return np.sum(np.power((data[1::2] - data[2::2]) / self.rescale_factor / 2, order), axis=0) STAT_TYPES = dict( - symmhessian = SymmHessianStats, - hessian = HessianStats, - replicas = MCStats, - ) + symmhessian=SymmHessianStats, + hessian=HessianStats, + replicas=MCStats, +) + class Filter: def __init__(self, indexes, label, **kwargs): - self.indexes = indexes + self.indexes = indexes self.label = label self.kwargs = kwargs diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index bcdd939ffb..8f442a9a35 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -3,9 +3,12 @@ dataframes). 
""" import dataclasses + import numpy as np import pandas as pd -from validphys.commondatawriter import write_systype_to_file, write_commondata_to_file + +from validphys.commondatawriter import write_commondata_to_file, write_systype_to_file + KIN_NAMES = ["kin1", "kin2", "kin3"] @@ -294,9 +297,9 @@ def central_values(self): return self.commondata_table["data"] def with_central_value(self, cv): - tb = self.commondata_table.copy() - tb["data"] = cv - return dataclasses.replace(self, commondata_table=tb) + tb = self.commondata_table.copy() + tb["data"] = cv + return dataclasses.replace(self, commondata_table=tb) def get_cv(self): return self.central_values.values @@ -367,7 +370,6 @@ def systematic_errors(self, central_values=None): converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100 return pd.concat((self.additive_errors, converted_mult_errors), axis=1) - def export(self, path): """Export the data, and error types Use the same format as libNNPDF: diff --git a/validphys2/src/validphys/correlations.py b/validphys2/src/validphys/correlations.py index 2841d2bb4b..0e72362b33 100644 --- a/validphys2/src/validphys/correlations.py +++ b/validphys2/src/validphys/correlations.py @@ -8,12 +8,12 @@ import numpy.linalg as la from reportengine import collect - -from validphys.core import Stats from validphys.checks import check_pdf_is_montecarlo +from validphys.core import Stats + -#This would be a good candidate to be optimized to calculate everything in one -#pass over x, +# This would be a good candidate to be optimized to calculate everything in one +# pass over x, def _basic_obs_pdf_correlation(pdf_val, obs_val): """Calculate the correlation between pdfs and observables. The expected format is two arrays @@ -30,23 +30,24 @@ def _basic_obs_pdf_correlation(pdf_val, obs_val): x = pdf_val - np.mean(pdf_val, axis=0) y = (obs_val - np.mean(obs_val, axis=-1, keepdims=True)).T - #We want to compute: - #sum(x*y)/(norm(x)*norm(y)) - #broadcast to the appropriate dimensions + # We want to compute: + # sum(x*y)/(norm(x)*norm(y)) + # broadcast to the appropriate dimensions - num = np.einsum('ij,ikm->jkm',y,x) + num = np.einsum('ij,ikm->jkm', y, x) xnorm = la.norm(x, axis=0) ynorm = la.norm(y, axis=0) - #like np.outer, but keeping the right shape - den = np.einsum('i,jk->ijk',ynorm, xnorm) + # like np.outer, but keeping the right shape + den = np.einsum('i,jk->ijk', ynorm, xnorm) + + return num / den - return num/den def _basic_obs_obs_correlation(obs1, obs2): """Calculate the correlation between two observables. 
The expected format is arrays instances of: - + obs1: (nbins, nreplicas) obs2: (nbins, nreplicas) @@ -56,7 +57,8 @@ def _basic_obs_obs_correlation(obs1, obs2): x = obs1 - np.mean(obs1, axis=1, keepdims=True) y = (obs2 - np.mean(obs2, axis=1, keepdims=True)).T - return x@y/np.outer(la.norm(x,axis=1),la.norm(y,axis=0)) + return x @ y / np.outer(la.norm(x, axis=1), la.norm(y, axis=0)) + @check_pdf_is_montecarlo def obs_pdf_correlations(pdf, results, xplotting_grid): @@ -74,6 +76,7 @@ def obs_pdf_correlations(pdf, results, xplotting_grid): corrpair_results = collect("results", ["corrpair"]) corrpair_datasets = collect("dataset", ["corrpair"]) + @check_pdf_is_montecarlo def obs_obs_correlations(pdf, corrpair_results): """Return the theoretical correlation matrix between a pair of observables.""" diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py index 5d272c514c..cb0adf6e68 100644 --- a/validphys2/src/validphys/covmats.py +++ b/validphys2/src/validphys/covmats.py @@ -9,21 +9,20 @@ from reportengine import collect from reportengine.table import table - -from validphys.calcutils import regularize_covmat, get_df_block +from validphys.calcutils import get_df_block, regularize_covmat from validphys.checks import ( + check_cuts_considered, + check_data_cuts_match_theorycovmat, check_dataset_cuts_match_theorycovmat, check_norm_threshold, check_pdf_is_montecarlo, check_speclabels_different, - check_data_cuts_match_theorycovmat, - check_cuts_considered, ) +from validphys.commondata import loaded_commondata_with_cuts from validphys.convolution import central_predictions from validphys.core import PDF, DataGroupSpec, DataSetSpec from validphys.covmats_utils import construct_covmat, systematics_matrix from validphys.results import ThPredictionsResult -from validphys.commondata import loaded_commondata_with_cuts log = logging.getLogger(__name__) @@ -35,7 +34,7 @@ def covmat_from_systematics( dataset_input, use_weights_in_covmat=True, norm_threshold=None, - _central_values=None + _central_values=None, ): """Take the statistical uncertainty and systematics table from a :py:class:`validphys.coredata.CommonData` object and @@ -117,15 +116,12 @@ def covmat_from_systematics( """ covmat = construct_covmat( loaded_commondata_with_cuts.stat_errors.to_numpy(), - loaded_commondata_with_cuts.systematic_errors(_central_values) + loaded_commondata_with_cuts.systematic_errors(_central_values), ) if use_weights_in_covmat: covmat = covmat / dataset_input.weight if norm_threshold is not None: - covmat = regularize_covmat( - covmat, - norm_threshold=norm_threshold - ) + covmat = regularize_covmat(covmat, norm_threshold=norm_threshold) return covmat @@ -198,11 +194,9 @@ def dataset_inputs_covmat_from_systematics( _list_of_central_values = [None] * len(dataset_inputs_loaded_cd_with_cuts) for cd, dsinp, central_values in zip( - dataset_inputs_loaded_cd_with_cuts, - data_input, - _list_of_central_values + dataset_inputs_loaded_cd_with_cuts, data_input, _list_of_central_values ): - #used if we want to separate additive and multiplicative errors in make_replica + # used if we want to separate additive and multiplicative errors in make_replica if _only_additive: sys_errors = cd.additive_errors else: @@ -212,8 +206,7 @@ def dataset_inputs_covmat_from_systematics( # separate out the special uncertainties which can be correlated across # datasets is_intra_dataset_error = sys_errors.columns.isin(INTRA_DATASET_SYS_NAME) - block_diags.append(construct_covmat( - stat_errors, sys_errors.loc[:, 
is_intra_dataset_error])) + block_diags.append(construct_covmat(stat_errors, sys_errors.loc[:, is_intra_dataset_error])) special_corrs.append(sys_errors.loc[:, ~is_intra_dataset_error]) # concat systematics across datasets @@ -229,10 +222,7 @@ def dataset_inputs_covmat_from_systematics( # returns C_ij / (sqrt(w_i) * sqrt(w_j)) covmat = (covmat / sqrt_weights).T / sqrt_weights if norm_threshold is not None: - covmat = regularize_covmat( - covmat, - norm_threshold=norm_threshold - ) + covmat = regularize_covmat(covmat, norm_threshold=norm_threshold) return covmat @@ -268,7 +258,7 @@ def t0_covmat_from_systematics( dataset_input, use_weights_in_covmat=True, norm_threshold=None, - dataset_t0_predictions + dataset_t0_predictions, ): """Like :py:func:`covmat_from_systematics` except uses the t0 predictions to calculate the absolute constributions to the covmat from multiplicative @@ -300,7 +290,7 @@ def t0_covmat_from_systematics( dataset_input, use_weights_in_covmat, norm_threshold=norm_threshold, - _central_values=dataset_t0_predictions + _central_values=dataset_t0_predictions, ) @@ -313,7 +303,7 @@ def dataset_inputs_t0_covmat_from_systematics( data_input, use_weights_in_covmat=True, norm_threshold=None, - dataset_inputs_t0_predictions + dataset_inputs_t0_predictions, ): """Like :py:func:`t0_covmat_from_systematics` except for all data @@ -342,23 +332,23 @@ def dataset_inputs_t0_covmat_from_systematics( data_input, use_weights_in_covmat, norm_threshold=norm_threshold, - _list_of_central_values=dataset_inputs_t0_predictions + _list_of_central_values=dataset_inputs_t0_predictions, ) def dataset_inputs_t0_total_covmat_separate( - dataset_inputs_t0_exp_covmat_separate, - loaded_theory_covmat + dataset_inputs_t0_exp_covmat_separate, loaded_theory_covmat ): """ Function to compute the covmat to be used for the sampling by make_replica. - In this case the t0 prescription is used for the experimental covmat and the multiplicative - errors are separated. Moreover, the theory covmat is added to experimental covmat. + In this case the t0 prescription is used for the experimental covmat and the multiplicative + errors are separated. Moreover, the theory covmat is added to experimental covmat. """ covmat = dataset_inputs_t0_exp_covmat_separate covmat += loaded_theory_covmat return covmat + def dataset_inputs_t0_exp_covmat_separate( dataset_inputs_loaded_cd_with_cuts, *, @@ -369,25 +359,34 @@ def dataset_inputs_t0_exp_covmat_separate( ): """ Function to compute the covmat to be used for the sampling by make_replica. - In this case the t0 prescription is used for the experimental covmat and the multiplicative + In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are separated. """ - covmat = generate_exp_covmat(dataset_inputs_loaded_cd_with_cuts, data_input, use_weights_in_covmat, norm_threshold, dataset_inputs_t0_predictions , True) + covmat = generate_exp_covmat( + dataset_inputs_loaded_cd_with_cuts, + data_input, + use_weights_in_covmat, + norm_threshold, + dataset_inputs_t0_predictions, + True, + ) return covmat + def dataset_inputs_total_covmat_separate( dataset_inputs_exp_covmat_separate, loaded_theory_covmat, ): """ Function to compute the covmat to be used for the sampling by make_replica. - In this case the t0 prescription is not used for the experimental covmat and the multiplicative + In this case the t0 prescription is not used for the experimental covmat and the multiplicative errors are separated. 
Moreover, the theory covmat is added to experimental covmat. """ covmat = dataset_inputs_exp_covmat_separate covmat += loaded_theory_covmat return covmat + def dataset_inputs_exp_covmat_separate( dataset_inputs_loaded_cd_with_cuts, *, @@ -397,25 +396,34 @@ def dataset_inputs_exp_covmat_separate( ): """ Function to compute the covmat to be used for the sampling by make_replica. - In this case the t0 prescription is not used for the experimental covmat and the multiplicative - errors are separated. + In this case the t0 prescription is not used for the experimental covmat and the multiplicative + errors are separated. """ - covmat = generate_exp_covmat(dataset_inputs_loaded_cd_with_cuts, data_input, use_weights_in_covmat, norm_threshold, None , True) + covmat = generate_exp_covmat( + dataset_inputs_loaded_cd_with_cuts, + data_input, + use_weights_in_covmat, + norm_threshold, + None, + True, + ) return covmat + def dataset_inputs_t0_total_covmat( dataset_inputs_t0_exp_covmat, loaded_theory_covmat, ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 - by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat + by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ covmat = dataset_inputs_t0_exp_covmat covmat += loaded_theory_covmat return covmat + def dataset_inputs_t0_exp_covmat( dataset_inputs_loaded_cd_with_cuts, *, @@ -426,25 +434,34 @@ def dataset_inputs_t0_exp_covmat( ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 - by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat - and the multiplicative errors are included in it. + by fitting_data_dict. In this case the t0 prescription is used for the experimental covmat + and the multiplicative errors are included in it. """ - covmat = generate_exp_covmat(dataset_inputs_loaded_cd_with_cuts, data_input, use_weights_in_covmat, norm_threshold, dataset_inputs_t0_predictions , False) + covmat = generate_exp_covmat( + dataset_inputs_loaded_cd_with_cuts, + data_input, + use_weights_in_covmat, + norm_threshold, + dataset_inputs_t0_predictions, + False, + ) return covmat + def dataset_inputs_total_covmat( dataset_inputs_exp_covmat, loaded_theory_covmat, ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 - by fitting_data_dict. In this case the t0 prescription is not used for the experimental covmat + by fitting_data_dict. In this case the t0 prescription is not used for the experimental covmat and the multiplicative errors are included in it. Moreover, the theory covmat is added to experimental covmat. """ covmat = dataset_inputs_exp_covmat covmat += loaded_theory_covmat return covmat + def dataset_inputs_exp_covmat( dataset_inputs_loaded_cd_with_cuts, *, @@ -454,18 +471,22 @@ def dataset_inputs_exp_covmat( ): """ Function to compute the covmat to be used for the sampling by make_replica and for the chi2 - by fitting_data_dict. In this case the t0 prescription is not used for the experimental covmat + by fitting_data_dict. In this case the t0 prescription is not used for the experimental covmat and the multiplicative errors are included in it. 
""" - covmat = generate_exp_covmat(dataset_inputs_loaded_cd_with_cuts, data_input, use_weights_in_covmat, norm_threshold, None , False) + covmat = generate_exp_covmat( + dataset_inputs_loaded_cd_with_cuts, + data_input, + use_weights_in_covmat, + norm_threshold, + None, + False, + ) return covmat -def generate_exp_covmat(datasets_input, - data, - use_weights, - norm_threshold, - _list_of_c_values, - only_add + +def generate_exp_covmat( + datasets_input, data, use_weights, norm_threshold, _list_of_c_values, only_add ): """ Function to generate the experimental covmat eventually using the t0 prescription. It is also @@ -492,7 +513,7 @@ def generate_exp_covmat(datasets_input, values are used. only_add: bool specifies whether to use only the additive errors to compute the covmat - + Returns ------- : np.array @@ -504,7 +525,7 @@ def generate_exp_covmat(datasets_input, use_weights, norm_threshold=norm_threshold, _list_of_central_values=_list_of_c_values, - _only_additive = only_add + _only_additive=only_add, ) @@ -570,9 +591,11 @@ def sqrt_covmat(covariance_matrix): if covariance_matrix.size == 0: raise ValueError("Attempting the decomposition of an empty matrix.") elif dimensions[0] != dimensions[1]: - raise ValueError("The input covariance matrix should be square but " - f"instead it has dimensions {dimensions[0]} x " - f"{dimensions[1]}") + raise ValueError( + "The input covariance matrix should be square but " + f"instead it has dimensions {dimensions[0]} x " + f"{dimensions[1]}" + ) sqrt_diags = np.sqrt(np.diag(covariance_matrix)) correlation_matrix = covariance_matrix / sqrt_diags[:, np.newaxis] / sqrt_diags @@ -581,8 +604,7 @@ def sqrt_covmat(covariance_matrix): return sqrt_matrix -def groups_covmat_no_table( - groups_data, groups_index, groups_covmat_collection): +def groups_covmat_no_table(groups_data, groups_index, groups_covmat_collection): """Export the covariance matrix for the groups. It exports the full (symmetric) matrix, with the 3 first rows and columns being: @@ -592,12 +614,11 @@ def groups_covmat_no_table( - index of the point within the dataset. """ - data = np.zeros((len(groups_index),len(groups_index))) + data = np.zeros((len(groups_index), len(groups_index))) df = pd.DataFrame(data, index=groups_index, columns=groups_index) - for group, group_covmat in zip( - groups_data, groups_covmat_collection): + for group, group_covmat in zip(groups_data, groups_covmat_collection): name = group.name - df.loc[[name],[name]] = group_covmat + df.loc[[name], [name]] = group_covmat return df @@ -608,36 +629,32 @@ def groups_covmat(groups_covmat_no_table): @table -def groups_sqrtcovmat( - groups_data, groups_index, groups_sqrt_covmat): +def groups_sqrtcovmat(groups_data, groups_index, groups_sqrt_covmat): """Like groups_covmat, but dump the lower triangular part of the Cholesky decomposition as used in the fit. The upper part indices are set to zero. 
""" - data = np.zeros((len(groups_index),len(groups_index))) + data = np.zeros((len(groups_index), len(groups_index))) df = pd.DataFrame(data, index=groups_index, columns=groups_index) - for group, group_sqrt_covmat in zip( - groups_data, groups_sqrt_covmat): + for group, group_sqrt_covmat in zip(groups_data, groups_sqrt_covmat): name = group.name group_sqrt_covmat[np.triu_indices_from(group_sqrt_covmat, k=1)] = 0 - df.loc[[name],[name]] = group_sqrt_covmat + df.loc[[name], [name]] = group_sqrt_covmat return df @table -def groups_invcovmat( - groups_data, groups_index, groups_covmat_collection): +def groups_invcovmat(groups_data, groups_index, groups_covmat_collection): """Compute and export the inverse covariance matrix. Note that this inverts the matrices with the LU method which is suboptimal.""" - data = np.zeros((len(groups_index),len(groups_index))) + data = np.zeros((len(groups_index), len(groups_index))) df = pd.DataFrame(data, index=groups_index, columns=groups_index) - for group, group_covmat in zip( - groups_data, groups_covmat_collection): + for group, group_covmat in zip(groups_data, groups_covmat_collection): name = group.name - #Improve this inversion if this method tuns out to be important + # Improve this inversion if this method tuns out to be important invcov = la.inv(group_covmat) - df.loc[[name],[name]] = invcov + df.loc[[name], [name]] = invcov return df @@ -646,7 +663,7 @@ def groups_normcovmat(groups_covmat, groups_data_values): """Calculates the grouped experimental covariance matrix normalised to data.""" df = groups_covmat groups_data_array = np.array(groups_data_values) - mat = df/np.outer(groups_data_array, groups_data_array) + mat = df / np.outer(groups_data_array, groups_data_array) return mat @@ -655,8 +672,8 @@ def groups_corrmat(groups_covmat): """Generates the grouped experimental correlation matrix with groups_covmat as input""" df = groups_covmat covmat = df.values - diag_minus_half = (np.diagonal(covmat))**(-0.5) - mat = diag_minus_half[:,np.newaxis]*df*diag_minus_half + diag_minus_half = (np.diagonal(covmat)) ** (-0.5) + mat = diag_minus_half[:, np.newaxis] * df * diag_minus_half return mat @@ -702,6 +719,7 @@ def pdferr_plus_covmat(dataset, pdf, covmat_t0_considered): pdf_cov = np.cov(th.error_members, rowvar=True) return pdf_cov + covmat_t0_considered + def reorder_thcovmat_as_expcovmat(fitthcovmat, data): """ Reorder the thcovmat in such a way to match the order of the experimental covmat, which @@ -712,12 +730,18 @@ def reorder_thcovmat_as_expcovmat(fitthcovmat, data): tmp = theory_covmat.droplevel(0, axis=0).droplevel(0, axis=1) return tmp.reindex(index=bb, columns=bb, level=0) + def pdferr_plus_dataset_inputs_covmat(data, pdf, dataset_inputs_covmat_t0_considered, fitthcovmat): """Like `pdferr_plus_covmat` except for an experiment""" # do checks get performed here? 
if fitthcovmat is not None: - #change ordering according to exp_covmat (so according to runcard order) - return pdferr_plus_covmat(data, pdf, dataset_inputs_covmat_t0_considered+ reorder_thcovmat_as_expcovmat(fitthcovmat,data).values) + # change ordering according to exp_covmat (so according to runcard order) + return pdferr_plus_covmat( + data, + pdf, + dataset_inputs_covmat_t0_considered + + reorder_thcovmat_as_expcovmat(fitthcovmat, data).values, + ) return pdferr_plus_covmat(data, pdf, dataset_inputs_covmat_t0_considered) @@ -727,10 +751,7 @@ def dataset_inputs_sqrt_covmat(dataset_inputs_covariance_matrix): def systematics_matrix_from_commondata( - loaded_commondata_with_cuts, - dataset_input, - use_weights_in_covmat=True, - _central_values=None + loaded_commondata_with_cuts, dataset_input, use_weights_in_covmat=True, _central_values=None ): """Returns a systematics matrix, :math:`A`, for the corresponding dataset. The systematics matrix is a square root of the covmat: @@ -745,12 +766,13 @@ def systematics_matrix_from_commondata( """ sqrt_covmat = systematics_matrix( loaded_commondata_with_cuts.stat_errors.to_numpy(), - loaded_commondata_with_cuts.systematic_errors(_central_values) + loaded_commondata_with_cuts.systematic_errors(_central_values), ) if use_weights_in_covmat: return sqrt_covmat / np.sqrt(dataset_input.weight) return sqrt_covmat + def covmat_stability_characteristic(systematics_matrix_from_commondata): """ Return a number characterizing the stability of an experimental covariance @@ -781,7 +803,7 @@ def covmat_stability_characteristic(systematics_matrix_from_commondata): """ sqrtcov = systematics_matrix_from_commondata # copied from calcutils.regularize_l2 but just return stability condition. - d = np.sqrt(np.sum(sqrtcov ** 2, axis=1))[:, np.newaxis] + d = np.sqrt(np.sum(sqrtcov**2, axis=1))[:, np.newaxis] sqrtcorr = sqrtcov / d _, s, _ = la.svd(sqrtcorr, full_matrices=False) return 1 / s[-1] @@ -816,7 +838,8 @@ def fit_name_with_covmat_label(fit, fitthcovmat): @table @check_norm_threshold def datasets_covmat_differences_table( - each_dataset, datasets_covmat_no_reg, datasets_covmat_reg, norm_threshold): + each_dataset, datasets_covmat_no_reg, datasets_covmat_reg, norm_threshold +): """For each dataset calculate and tabulate two max differences upon regularization given a value for `norm_threshold`: @@ -825,38 +848,36 @@ def datasets_covmat_differences_table( """ records = [] - for ds, reg, noreg in zip( - each_dataset, datasets_covmat_reg, datasets_covmat_no_reg): - cov_diag_rel_diff = np.diag(reg)/np.diag(noreg) + for ds, reg, noreg in zip(each_dataset, datasets_covmat_reg, datasets_covmat_no_reg): + cov_diag_rel_diff = np.diag(reg) / np.diag(noreg) d_reg = np.sqrt(np.diag(reg)) d_noreg = np.sqrt(np.diag(noreg)) - corr_reg = reg/d_reg[:, np.newaxis]/d_reg[np.newaxis, :] - corr_noreg = noreg/d_noreg[:, np.newaxis]/d_noreg[np.newaxis, :] + corr_reg = reg / d_reg[:, np.newaxis] / d_reg[np.newaxis, :] + corr_noreg = noreg / d_noreg[:, np.newaxis] / d_noreg[np.newaxis, :] corr_abs_diff = abs(corr_reg - corr_noreg) - records.append(dict( + records.append( + dict( dataset=str(ds), - covdiff= np.max(abs(cov_diag_rel_diff- 1))*100, #make percentage - corrdiff=np.max(corr_abs_diff) - )) - df = pd.DataFrame.from_records(records, - columns=("dataset", "covdiff", "corrdiff"), - index = ("dataset",) + covdiff=np.max(abs(cov_diag_rel_diff - 1)) * 100, # make percentage + corrdiff=np.max(corr_abs_diff), + ) ) + df = pd.DataFrame.from_records( + records, columns=("dataset", "covdiff", 
"corrdiff"), index=("dataset",) + ) df.columns = ["Variance rel. diff. (%)", "Correlation max abs. diff."] return df @check_speclabels_different @table -def dataspecs_datasets_covmat_differences_table( - dataspecs_speclabel, dataspecs_covmat_diff_tables -): +def dataspecs_datasets_covmat_differences_table(dataspecs_speclabel, dataspecs_covmat_diff_tables): """For each dataspec calculate and tabulate the two covmat differences described in `datasets_covmat_differences_table` (max relative difference in variance and max absolute correlation difference) """ - df = pd.concat( dataspecs_covmat_diff_tables, axis=1) + df = pd.concat(dataspecs_covmat_diff_tables, axis=1) cols = df.columns.get_level_values(0).unique() df.columns = pd.MultiIndex.from_product((dataspecs_speclabel, cols)) return df @@ -876,34 +897,34 @@ def _dataset_inputs_covmat_t0_considered(dataset_inputs_covmat_t0_considered, fi and ``use_pdferr`` """ if fitthcovmat is not None: - #change ordering according to exp_covmat (so according to runcard order) - return dataset_inputs_covmat_t0_considered + reorder_thcovmat_as_expcovmat(fitthcovmat,data).values - return dataset_inputs_covmat_t0_considered + # change ordering according to exp_covmat (so according to runcard order) + return ( + dataset_inputs_covmat_t0_considered + + reorder_thcovmat_as_expcovmat(fitthcovmat, data).values + ) + return dataset_inputs_covmat_t0_considered + groups_covmat_collection = collect( 'dataset_inputs_covariance_matrix', ('group_dataset_inputs_by_metadata',) ) -groups_sqrt_covmat = collect( - 'dataset_inputs_sqrt_covmat', - ('group_dataset_inputs_by_metadata',) -) +groups_sqrt_covmat = collect('dataset_inputs_sqrt_covmat', ('group_dataset_inputs_by_metadata',)) -dataspecs_covmat_diff_tables = collect( - "datasets_covmat_differences_table", ("dataspecs",) -) +dataspecs_covmat_diff_tables = collect("datasets_covmat_differences_table", ("dataspecs",)) fits_name_with_covmat_label = collect('fit_name_with_covmat_label', ('fits',)) -datasets_covmat_no_reg = collect( - "covariance_matrix", ("data", "no_covmat_reg")) +datasets_covmat_no_reg = collect("covariance_matrix", ("data", "no_covmat_reg")) -datasets_covmat_reg = collect( - "covariance_matrix", ("data",)) +datasets_covmat_reg = collect("covariance_matrix", ("data",)) datasets_covmat = collect('covariance_matrix', ('data',)) datasets_covariance_matrix = collect( 'covariance_matrix', - ('experiments', 'experiment',) + ( + 'experiments', + 'experiment', + ), ) diff --git a/validphys2/src/validphys/covmats_utils.py b/validphys2/src/validphys/covmats_utils.py index 1b2407f683..64efbef1ce 100644 --- a/validphys2/src/validphys/covmats_utils.py +++ b/validphys2/src/validphys/covmats_utils.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd + def systematics_matrix(stat_errors: np.array, sys_errors: pd.DataFrame): """Basic function to create a systematics matrix , :math:`A`, such that: @@ -42,7 +43,7 @@ def systematics_matrix(stat_errors: np.array, sys_errors: pd.DataFrame): of ``sys_errors`` before passing that to this function. """ - diagonal = stat_errors ** 2 + diagonal = stat_errors**2 is_uncorr = sys_errors.columns.isin(("UNCORR", "THEORYUNCORR")) diagonal += (sys_errors.loc[:, is_uncorr].to_numpy() ** 2).sum(axis=1) @@ -78,7 +79,7 @@ def construct_covmat(stat_errors: np.array, sys_errors: pd.DataFrame): of ``sys_errors`` before passing that to this function. 
""" - diagonal = stat_errors ** 2 + diagonal = stat_errors**2 is_uncorr = sys_errors.columns.isin(("UNCORR", "THEORYUNCORR")) diagonal += (sys_errors.loc[:, is_uncorr].to_numpy() ** 2).sum(axis=1) diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index 78aa2200b2..8ae5fe1012 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -4,32 +4,34 @@ """ from __future__ import generator_stop -import logging -import itertools from collections import defaultdict from collections.abc import Sequence +import itertools +import logging +import matplotlib as mpl +from matplotlib import cm +from matplotlib import colors as mcolors +from matplotlib import ticker as mticker import numpy as np import numpy.linalg as la -import matplotlib as mpl -from matplotlib import cm, colors as mcolors, ticker as mticker -import scipy.stats as stats import pandas as pd +import scipy.stats as stats +from reportengine import collect +from reportengine.checks import CheckError, check, make_argcheck, make_check from reportengine.figure import figure, figuregen -from reportengine.checks import make_check, CheckError, make_argcheck, check from reportengine.floatformatting import format_number -from reportengine import collect - -from validphys.core import MCStats, cut_mask, CutsPolicy -from validphys.results import chi2_stat_labels -from validphys.plotoptions import get_info, kitable, transform_result from validphys import plotutils -from validphys.utils import sane_groupby_iter, split_ranges, scale_from_grid +from validphys.core import CutsPolicy, MCStats, cut_mask from validphys.coredata import KIN_NAMES +from validphys.plotoptions import get_info, kitable, transform_result +from validphys.results import chi2_stat_labels +from validphys.utils import sane_groupby_iter, scale_from_grid, split_ranges log = logging.getLogger(__name__) + @figure def plot_chi2dist_experiments(total_chi2_data, experiments_chi2_stats, pdf): """Plot the distribution of chi²s of the members of the pdfset.""" @@ -62,10 +64,9 @@ def _chi2_distribution_plots(chi2_data, stats, pdf, plot_type): alldata, central, npoints = chi2_data if not isinstance(alldata, MCStats): ax.set_facecolor("#ffcccc") - log.warning("Chi² distribution plots have a " - "different meaning for non MC sets.") + log.warning("Chi² distribution plots have a " "different meaning for non MC sets.") label += " (%s!)" % pdf.error_type - label += '\n'+ '\n'.join(str(chi2_stat_labels[k])+(' %.2f' % v) for (k,v) in stats.items()) + label += '\n' + '\n'.join(str(chi2_stat_labels[k]) + (' %.2f' % v) for (k, v) in stats.items()) ax.set_xlabel(r"Replica $\chi^2$") if plot_type == "hist": @@ -94,6 +95,7 @@ def plot_phi(groups_data, groups_data_phi, processed_metadata_group): ax.set_title(rf"$\phi$ by {processed_metadata_group}") return fig + @figure def plot_fits_groups_data_phi(fits_groups_phi_table, processed_metadata_group): """Plots a set of bars for each fit, each bar represents the value of phi for the corresponding @@ -102,6 +104,7 @@ def plot_fits_groups_data_phi(fits_groups_phi_table, processed_metadata_group): ax.set_title(rf"$\phi$ by {processed_metadata_group}") return fig + @figure def plot_dataset_inputs_phi_dist(data, dataset_inputs_bootstrap_phi_data): """Generates a bootstrap distribution of phi and then plots a histogram @@ -110,14 +113,19 @@ def plot_dataset_inputs_phi_dist(data, dataset_inputs_bootstrap_phi_data): this number can be changed by specifying `bootstrap_samples` in the runcard """ 
phi = dataset_inputs_bootstrap_phi_data - label = '\n'.join([fr'$\phi$ mean = {format_number(phi.mean())}', - fr'$\phi$ std dev = {format_number(phi.std())}']) + label = '\n'.join( + [ + fr'$\phi$ mean = {format_number(phi.mean())}', + fr'$\phi$ std dev = {format_number(phi.std())}', + ] + ) fig, ax = plotutils.subplots() ax.hist(phi, label=label) ax.set_title(r"$\phi$ distribution for " + data.name) ax.legend() return fig + @make_argcheck def _check_same_group_data_name(dataspecs_groups): lst = dataspecs_groups @@ -125,21 +133,25 @@ def _check_same_group_data_name(dataspecs_groups): return for j, x in enumerate(lst[1:]): if len(x) != len(lst[0]): - raise CheckError("All dataspecs should have the same number " - "of groups of data") + raise CheckError("All dataspecs should have the same number of groups of data") for i, group in enumerate(x): if group.name != lst[0][i].name: - raise CheckError("\n".join(["All groups of data must have the " - "same name", - fr"dataspec {j+1}, " - fr"group {i+1}: {group.name}", - fr"dataspec 1, group {i+1}: " - fr"{lst[0][i].name}"])) + raise CheckError( + "\n".join( + [ + "All groups of data must have the same name", + fr"dataspec {j+1}, group {i+1}: {group.name}", + fr"dataspec 1, group {i+1}: {lst[0][i].name}", + ] + ) + ) + @_check_same_group_data_name @figure -def plot_phi_scatter_dataspecs(dataspecs_groups, - dataspecs_speclabel, dataspecs_groups_bootstrap_phi): +def plot_phi_scatter_dataspecs( + dataspecs_groups, dataspecs_speclabel, dataspecs_groups_bootstrap_phi +): """For each of the dataspecs, a bootstrap distribution of phi is generated for all specified groups of datasets. The distribution is then represented as a scatter point which is the median of the bootstrap distribution and an @@ -151,27 +163,30 @@ def plot_phi_scatter_dataspecs(dataspecs_groups, phis = dataspecs_groups_bootstrap_phi exps = dataspecs_groups xticks = [group.name for group in exps[0]] - x = range(1, len(xticks)+1) + x = range(1, len(xticks) + 1) fig, ax = plotutils.subplots() phi_stats = np.percentile(phis, [16, 50, 84], axis=2) for i, label in enumerate(labels): - phi_errs = np.vstack((phi_stats[2, i, :] - phi_stats[1, i, :], - phi_stats[1, i, :] - phi_stats[0, i, :])) - ax.errorbar(x, phi_stats[1, i, :], yerr=phi_errs, fmt='.', - label=label) + phi_errs = np.vstack( + (phi_stats[2, i, :] - phi_stats[1, i, :], phi_stats[1, i, :] - phi_stats[0, i, :]) + ) + ax.errorbar(x, phi_stats[1, i, :], yerr=phi_errs, fmt='.', label=label) ax.set_xticks(x, minor=False) ax.set_xticklabels(xticks, minor=False, rotation=45) ax.legend() return fig -#TODO: This should be simplified if at all possible. For now some more examples -#are needed for a spec to emerge. + +# TODO: This should be simplified if at all possible. For now some more examples +# are needed for a spec to emerge. @make_check def check_normalize_to(ns, **kwargs): """Transforn normalize_to into an index.""" - msg = ("normalize_to should be either 'data', a pdf id or an index of the " - "result (0 for the data, and i for the ith pdf)") + msg = ( + "normalize_to should be either 'data', a pdf id or an index of the " + "result (0 for the data, and i for the ith pdf)" + ) val = ns.get('normalize_to', None) if val is None: @@ -196,11 +211,12 @@ def check_normalize_to(ns, **kwargs): raise RuntimeError("Should not be here") -#TODO: This interface is horrible. 
-# We need to think how to adapt it to make this use case easier -def _plot_fancy_impl(results, commondata, cutlist, - normalize_to:(int,type(None)) = None, labellist=None): +# TODO: This interface is horrible. +# We need to think how to adapt it to make this use case easier +def _plot_fancy_impl( + results, commondata, cutlist, normalize_to: (int, type(None)) = None, labellist=None +): """Implementation of the data-theory comparison plots. Providers are supposed to call (yield from) this. Parameters @@ -230,9 +246,9 @@ def _plot_fancy_impl(results, commondata, cutlist, nkinlabels = len(table.columns) ndata = len(table) - #This is easier than cheking every time + # This is easier than cheking every time if labellist is None: - labellist = [None]*len(results) + labellist = [None] * len(results) if normalize_to is not None: norm_result = results[normalize_to] @@ -241,56 +257,48 @@ def _plot_fancy_impl(results, commondata, cutlist, cv[mask] = norm_result.central_value err = np.full(ndata, np.nan) - err[mask] = norm_result.std_error - #We modify the table, so we pass only the label columns - norm_cv, _ = transform_result(cv, - err, - table.iloc[:,:nkinlabels], info) - + err[mask] = norm_result.std_error + # We modify the table, so we pass only the label columns + norm_cv, _ = transform_result(cv, err, table.iloc[:, :nkinlabels], info) cvcols = [] - for i,(result, cuts) in enumerate(zip(results, cutlist)): - #We modify the table, so we pass only the label columns + for i, (result, cuts) in enumerate(zip(results, cutlist)): + # We modify the table, so we pass only the label columns mask = cut_mask(cuts) cv = np.full(ndata, np.nan) cv[mask] = result.central_value err = np.full(ndata, np.nan) err[mask] = result.std_error - cv, err = transform_result(cv, err, - table.iloc[:,:nkinlabels], info) + cv, err = transform_result(cv, err, table.iloc[:, :nkinlabels], info) - #By doing tuple keys we avoid all possible name collisions + # By doing tuple keys we avoid all possible name collisions cvcol = ('cv', i) if normalize_to is None: table[cvcol] = cv table[('err', i)] = err else: - table[cvcol] = cv/norm_cv - table[('err', i)] = np.abs(err/norm_cv) + table[cvcol] = cv / norm_cv + table[('err', i)] = np.abs(err / norm_cv) cvcols.append(cvcol) - figby = sane_groupby_iter(table, info.figure_by) - for samefig_vals, fig_data in figby: - #Nothing to plot if all data is cut away + # Nothing to plot if all data is cut away if np.all(np.isnan(fig_data[cvcols])): continue - #For some reason matplotlib doesn't set the axis right + # For some reason matplotlib doesn't set the axis right min_vals = [] max_vals = [] fig, ax = plotutils.subplots() - ax.set_title("%s %s"%(info.dataset_label, - info.group_label(samefig_vals, info.figure_by))) + ax.set_title("%s %s" % (info.dataset_label, info.group_label(samefig_vals, info.figure_by))) lineby = sane_groupby_iter(fig_data, info.line_by) first = True - - for (sameline_vals, line_data) in lineby: + for sameline_vals, line_data in lineby: ax.set_prop_cycle(None) labels = first first = False @@ -307,18 +315,16 @@ def _plot_fancy_impl(results, commondata, cutlist, x = np.arange(npoints) ax.set_xticks(x) ax.set_xticklabels(xticklabels) - #TODO: Remove this when mpl stops doing the wrong thing - #(in v2?) - ax.set_xlim(-npoints/20, npoints - 1+ npoints/20) + # TODO: Remove this when mpl stops doing the wrong thing + # (in v2?) + ax.set_xlim(-npoints / 20, npoints - 1 + npoints / 20) - - #Use black for the first iteration (data), - #and follow the cycle for - #the rest. 
+ # Use black for the first iteration (data), + # and follow the cycle for + # the rest. next_color = itertools.chain(['#262626'], plotutils.color_iter()) for i, (res, lb, color) in enumerate(zip(results, labellist, next_color)): - if labels: if lb: label = lb @@ -328,37 +334,45 @@ def _plot_fancy_impl(results, commondata, cutlist, label = None cv = line_data[('cv', i)].values err = line_data[('err', i)].values - ax.errorbar(x, cv, yerr=err, - lw=0.25, - label= label, - #elinewidth = 2, - capsize=2, - marker = 's', - markeredgewidth=0.25, - c=color, - zorder=1000, - transform=next(offset_iter)) - - - #We 'plot' the empty lines to get the labels. But - #if everything is rmpty we skip the plot. + ax.errorbar( + x, + cv, + yerr=err, + lw=0.25, + label=label, + # elinewidth = 2, + capsize=2, + marker='s', + markeredgewidth=0.25, + c=color, + zorder=1000, + transform=next(offset_iter), + ) + + # We 'plot' the empty lines to get the labels. But + # if everything is rmpty we skip the plot. if np.any(np.isfinite(cv)): - max_vals.append(np.nanmax(cv+err)) - min_vals.append(np.nanmin(cv-err)) + max_vals.append(np.nanmax(cv + err)) + min_vals.append(np.nanmin(cv - err)) glabel = info.group_label(sameline_vals, info.line_by) - #Use some anchor that is not in y=1 for ratio plots + # Use some anchor that is not in y=1 for ratio plots if normalize_to is not None: next_after_normalize = (normalize_to + 1) % len(results) annotate_point = x[-1], line_data[('cv', next_after_normalize)].values[-1] else: annotate_point = x[-1], line_data[('cv', 0)].values[-1] - #This is a workaround for https://github.com/matplotlib/matplotlib/issues/12648 + # This is a workaround for https://github.com/matplotlib/matplotlib/issues/12648 if np.isfinite(annotate_point).all(): - ax.annotate(glabel, annotate_point, xytext=(15 ,-10), - size='xx-small', - textcoords='offset points', zorder=10000) + ax.annotate( + glabel, + annotate_point, + xytext=(15, -10), + size='xx-small', + textcoords='offset points', + zorder=10000, + ) if info.x_scale: ax.set_xscale(info.x_scale) @@ -373,7 +387,6 @@ def _plot_fancy_impl(results, commondata, cutlist, lb = labellist[normalize_to] ax.set_ylabel(f"Ratio to {lb if lb else norm_result.label}") - ax.legend().set_zorder(100000) ax.set_xlabel(info.xlabel) fig.tight_layout() @@ -382,8 +395,7 @@ def _plot_fancy_impl(results, commondata, cutlist, @check_normalize_to @figuregen -def plot_fancy(one_or_more_results, commondata, cuts, - normalize_to: (int, str, type(None)) = None): +def plot_fancy(one_or_more_results, commondata, cuts, normalize_to: (int, str, type(None)) = None): """ Read the PLOTTING configuration for the dataset and generate the corrspondig data theory plot. @@ -402,10 +414,13 @@ def plot_fancy(one_or_more_results, commondata, cuts, files. 
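The ``normalize_to`` option handled in `_plot_fancy_impl` above turns each result into a ratio to the chosen reference, with the errors scaled by the same central values. A minimal sketch of that transformation (not the provider itself, which also applies cut masks and kinematic transforms):

```python
import numpy as np

def normalize_to_reference(cvs, errs, normalize_to):
    """cvs, errs: lists of arrays, one per result; normalize_to: index of the reference."""
    norm_cv = cvs[normalize_to]
    ratio_cvs = [cv / norm_cv for cv in cvs]
    ratio_errs = [np.abs(err / norm_cv) for err in errs]
    return ratio_cvs, ratio_errs

cvs = [np.array([1.0, 2.0]), np.array([1.1, 1.8])]
errs = [np.array([0.1, 0.2]), np.array([0.1, 0.1])]
print(normalize_to_reference(cvs, errs, 0))  # reference appears as a flat line at 1
```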
""" - yield from _plot_fancy_impl(results=one_or_more_results, - commondata=commondata, - cutlist=[cuts]*len(one_or_more_results), - normalize_to=normalize_to) + yield from _plot_fancy_impl( + results=one_or_more_results, + commondata=commondata, + cutlist=[cuts] * len(one_or_more_results), + normalize_to=normalize_to, + ) + @make_argcheck def _check_same_dataset_name(dataspecs_commondata): @@ -417,18 +432,21 @@ def _check_same_dataset_name(dataspecs_commondata): if x.name != ele: raise CheckError("All datasets must have the same name") + @make_argcheck def _check_dataspec_normalize_to(normalize_to, dataspecs): - if (normalize_to in (0, None) or - (isinstance(normalize_to, int) and normalize_to <= len(dataspecs))): + if normalize_to in (0, None) or ( + isinstance(normalize_to, int) and normalize_to <= len(dataspecs) + ): return if normalize_to == 'data': return {'normalize_to': 0} - raise CheckError("Unrecignized format for normalize_to. Must be either " - "'data', 0 or the 1-indexed index of the dataspec " - f"(<{len(dataspecs)}), not {normalize_to}") - + raise CheckError( + "Unrecignized format for normalize_to. Must be either " + "'data', 0 or the 1-indexed index of the dataspec " + f"(<{len(dataspecs)}), not {normalize_to}" + ) @_check_same_dataset_name @@ -469,21 +487,23 @@ def plot_fancy_dataspecs( A limitation at the moment is that the data cuts and errors will be taken from the first specifiaction. """ - #We have at least one element + # We have at least one element if not dataspecs_results: return - #For now, simply take the first data result. We'll need to improve this. + # For now, simply take the first data result. We'll need to improve this. results = [dataspecs_results[0][0], *[r[1] for r in dataspecs_results]] cutlist = [dataspecs_cuts[0], *dataspecs_cuts] commondata = dataspecs_commondata[0] labellist = [None, *dataspecs_speclabel] - yield from _plot_fancy_impl(results = results, commondata=commondata, - cutlist=cutlist, labellist=labellist, - normalize_to=normalize_to) - - + yield from _plot_fancy_impl( + results=results, + commondata=commondata, + cutlist=cutlist, + labellist=labellist, + normalize_to=normalize_to, + ) def _scatter_marked(ax, x, y, marked_dict, *args, **kwargs): @@ -491,25 +511,34 @@ def _scatter_marked(ax, x, y, marked_dict, *args, **kwargs): x = np.array(x, copy=False) y = np.array(y, copy=False) for label, indexes in marked_dict.items(): - ax.scatter(x[indexes],y[indexes], *args, **kwargs, label=label, - facecolors='none', linewidth=0.5, edgecolor='red') + ax.scatter( + x[indexes], + y[indexes], + *args, + **kwargs, + label=label, + facecolors='none', + linewidth=0.5, + edgecolor='red', + ) kwargs['s'] += 10 + @figure def plot_dataspecs_groups_chi2_spider(dataspecs_groups_chi2_table): fig, ax = _plot_chi2s_spider_df(dataspecs_groups_chi2_table) return fig + @figure -def plot_fits_chi2_spider(fits, fits_groups_chi2, - fits_groups_data, processed_metadata_group): +def plot_fits_chi2_spider(fits, fits_groups_chi2, fits_groups_data, processed_metadata_group): """Plots the chi²s of all groups of datasets on a spider/radar diagram.""" - fig, ax = plotutils.add_subplot(figsize=(12,12), projection='polar') + fig, ax = plotutils.add_subplot(figsize=(12, 12), projection='polar') for fit, fitchi2, fitgroup in zip(fits, fits_groups_chi2, fits_groups_data): - exchi2 = [group_res.central_result/group_res.ndata for group_res in fitchi2] + exchi2 = [group_res.central_result / group_res.ndata for group_res in fitchi2] xticks = [group.name for group in fitgroup] ax = 
plotutils.spiderplot(xticks, exchi2, fit) @@ -518,13 +547,12 @@ def plot_fits_chi2_spider(fits, fits_groups_chi2, return fig + @figure -def plot_fits_phi_spider( - fits, fits_groups_data, fits_groups_data_phi, processed_metadata_group -): +def plot_fits_phi_spider(fits, fits_groups_data, fits_groups_data_phi, processed_metadata_group): """Like plot_fits_chi2_spider but for phi.""" - fig, ax = plotutils.add_subplot(figsize=(12,12), projection='polar') + fig, ax = plotutils.add_subplot(figsize=(12, 12), projection='polar') for fit, fitphi, fitgroup in zip(fits, fits_groups_data_phi, fits_groups_data): phi = [exp_phi for (exp_phi, _npoints) in fitphi] @@ -536,11 +564,12 @@ def plot_fits_phi_spider( return fig + @figure def plot_groups_data_chi2_spider(groups_data, groups_chi2, processed_metadata_group, pdf): """Plot the chi² of all groups of datasets as a spider plot.""" - exchi2 = [group_res.central_result/group_res.ndata for group_res in groups_chi2] + exchi2 = [group_res.central_result / group_res.ndata for group_res in groups_chi2] xticks = [group.name for group in groups_data] fig, ax = plotutils.add_subplot(projection='polar') @@ -549,13 +578,14 @@ def plot_groups_data_chi2_spider(groups_data, groups_chi2, processed_metadata_gr ax.set_title(rf"$\chi^2$ by {processed_metadata_group}") return fig + @figure def plot_groups_data_phi_spider(groups_data, groups_data_phi, processed_metadata_group, pdf): """Plot the phi of all groups of datasets as a spider plot.""" phi = [exp_phi for (exp_phi, _npoints) in groups_data_phi] xticks = [group.name for group in groups_data] fig, ax = plotutils.add_subplot(projection='polar') - + ax = plotutils.spiderplot(xticks, phi, pdf) ax.set_title(rf"$\phi$ by {processed_metadata_group}") return fig @@ -564,14 +594,16 @@ def plot_groups_data_phi_spider(groups_data, groups_data_phi, processed_metadata @figure def plot_groups_data_chi2(groups_data, groups_chi2, processed_metadata_group): """Plot the chi² of all groups of datasets with bars.""" - exchi2 = [group_res.central_result/group_res.ndata for group_res in groups_chi2] + exchi2 = [group_res.central_result / group_res.ndata for group_res in groups_chi2] xticks = [group.name for group in groups_data] fig, ax = plotutils.barplot(exchi2, collabels=xticks, datalabels=[r'$\chi^2$']) ax.set_title(rf"$\chi^2$ by {processed_metadata_group}") return fig -plot_experiments_chi2 = collect("plot_groups_data_chi2", ("group_dataset_inputs_by_experiment",)) + +plot_experiments_chi2 = collect("plot_groups_data_chi2", ("group_dataset_inputs_by_experiment",)) + @figure def plot_datasets_chi2(groups_data, groups_chi2): @@ -580,14 +612,14 @@ def plot_datasets_chi2(groups_data, groups_chi2): xticks = [] for group, group_res in zip(groups_data, groups_chi2): xticks = [dataset.name for dataset in group] - dschi2 = [dsres.central_result/dsres.ndata for dsres in group_res] - fig,ax = plotutils.barplot(dschi2, collabels=xticks, - datalabels=[r'$\chi^2$']) + dschi2 = [dsres.central_result / dsres.ndata for dsres in group_res] + fig, ax = plotutils.barplot(dschi2, collabels=xticks, datalabels=[r'$\chi^2$']) ax.set_title(r"$\chi^2$ distribution for datasets") return fig + @figure def plot_datasets_chi2_spider(groups_data, groups_chi2): """Plot the chi² of all datasets with bars.""" @@ -595,7 +627,7 @@ def plot_datasets_chi2_spider(groups_data, groups_chi2): xticks = [] for group, group_res in zip(groups_data, groups_chi2): xticks = [dataset.name for dataset in group] - dschi2 = [dsres.central_result/dsres.ndata for dsres in group_res] + 
dschi2 = [dsres.central_result / dsres.ndata for dsres in group_res] fig, ax = plotutils.add_subplot(figsize=(4, 4), projection='polar') ax = plotutils.spiderplot(xticks, dschi2, label=[r'$\chi^2$']) @@ -609,7 +641,7 @@ def _plot_chis_df(df): """Takes a dataframe that is a reduced version of ``fits_dataset_chi2s_table`` and returns a bar plot. See ``plot_fits_datasets_chi2`` for use""" chilabel = df.columns.get_level_values(1)[1] - data = df.iloc[:, df.columns.get_level_values(1)==chilabel].T.values + data = df.iloc[:, df.columns.get_level_values(1) == chilabel].T.values fitnames = df.columns.get_level_values(0).unique() expnames = list(df.index.get_level_values(0)) fig, ax = plotutils.barplot(data, expnames, fitnames) @@ -617,16 +649,17 @@ def _plot_chis_df(df): ax.legend() return fig, ax + def _plot_chi2s_spider_df(df, size=6): """Like _plot_chis_df but for spider plot.""" chilabel = df.columns.get_level_values(1)[1] - data = df.iloc[:, df.columns.get_level_values(1)==chilabel].T.values + data = df.iloc[:, df.columns.get_level_values(1) == chilabel].T.values fitnames = df.columns.get_level_values(0).unique() expnames = list(df.index.get_level_values(0)) fig, ax = plotutils.add_subplot(figsize=(size, size), projection='polar') for dat, fitname in zip(data, fitnames): ax = plotutils.spiderplot(expnames, dat, fitname) - ax.legend(bbox_to_anchor=(0.3,-0.2), fontsize=15) + ax.legend(bbox_to_anchor=(0.3, -0.2), fontsize=15) return fig, ax @@ -645,6 +678,7 @@ def plot_fits_datasets_chi2(fits_datasets_chi2_table): ax.set_title(r"$\chi^2$ for datasets") return fig + @figure def plot_fits_datasets_chi2_spider(fits_datasets_chi2_table): """Generate a plot equivalent to ``plot_datasets_chi2_spider`` using all the @@ -660,28 +694,32 @@ def plot_fits_datasets_chi2_spider(fits_datasets_chi2_table): ax.set_title(r"$\chi^2$ for datasets") return fig + @figuregen def plot_fits_datasets_chi2_spider_bygroup(fits_datasets_chi2_table): """Same as plot_fits_datasets_chi2_spider but one plot for each group.""" tab = fits_datasets_chi2_table groups = tab.index.unique(level=0) - # dfs = [tab.T[group].T for group in groups] + # dfs = [tab.T[group].T for group in groups] for group in groups: df = tab.T[group].T fig, ax = _plot_chi2s_spider_df(df) ax.set_title(rf"$\chi^2$ for {group}") yield fig + @figure def plot_dataspecs_datasets_chi2(dataspecs_datasets_chi2_table): """Same as plot_fits_datasets_chi2 but for arbitrary dataspecs""" return plot_fits_datasets_chi2(dataspecs_datasets_chi2_table) + @figure def plot_dataspecs_datasets_chi2_spider(dataspecs_datasets_chi2_table): """Same as plot_fits_datasets_chi2_spider but for arbitrary dataspecs""" return plot_fits_datasets_chi2_spider(dataspecs_datasets_chi2_table) + @figure def plot_fits_groups_data_chi2(fits_groups_chi2_table, processed_metadata_group): """Generate a plot equivalent to ``plot_groups_data_chi2`` using all the @@ -690,11 +728,13 @@ def plot_fits_groups_data_chi2(fits_groups_chi2_table, processed_metadata_group) ax.set_title(rf"$\chi^2$ by {processed_metadata_group}") return fig + @figure def plot_dataspecs_groups_chi2(dataspecs_groups_chi2_table, processed_metadata_group): """Same as plot_fits_groups_data_chi2 but for arbitrary dataspecs""" return plot_fits_groups_data_chi2(dataspecs_groups_chi2_table, processed_metadata_group) + @figure def plot_training_length(replica_data, fit): """Generate an histogram for the distribution @@ -766,11 +806,10 @@ def plot_trainvaliddist(fit, replica_data): kde_train = stats.gaussian_kde(training, 
bw_method='silverman') kde_valid = stats.gaussian_kde(valid, bw_method='silverman') - mean = (np.array(training) + np.array(valid))*0.5 + mean = (np.array(training) + np.array(valid)) * 0.5 kde_mean = stats.gaussian_kde(mean, bw_method='silverman') - x = np.linspace(np.min([training,valid]), - np.max([training, valid]), 150) + x = np.linspace(np.min([training, valid]), np.max([training, valid]), 150) ax.plot(x, kde_train(x), label="Training") ax.plot(x, kde_valid(x), label="Validation") ax.plot(x, kde_mean(x), label="Mean") @@ -784,20 +823,21 @@ def plot_trainvaliddist(fit, replica_data): @figure -def plot_chi2_eigs(pdf,dataset,chi2_per_eig): +def plot_chi2_eigs(pdf, dataset, chi2_per_eig): fig, ax = plotutils.subplots() - x = np.arange(1,len(chi2_per_eig) + 1) + x = np.arange(1, len(chi2_per_eig) + 1) ax.plot(x, chi2_per_eig, 'o', markersize=10) ax.yaxis.grid(False) ax.set_title(fr"$\chi^2/N_{{dat}}$ {dataset}") ax.set_xlabel("# Eigenvalue") return fig + @figure def plot_replica_sum_rules(pdf, sum_rules, Q): """Plot the value of each sum rule as a function of the replica index""" fig, axes = plotutils.subplots(nrows=len(sum_rules), sharex=True) - #TODO: Get rid of this nonsense + # TODO: Get rid of this nonsense ncomputed = len(sum_rules[0]) if pdf.error_type == 'replicas': x = np.arange(1, ncomputed + 1) @@ -809,8 +849,9 @@ def plot_replica_sum_rules(pdf, sum_rules, Q): fig.suptitle(f'Sum rules for {pdf} at Q={Q} GeV') return fig + @figuregen -def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): +def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold: float = 0.9): """ Plot the correlations between the change in the observable and the change in the PDF in (x,fl) space. @@ -850,64 +891,62 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): plotting_var = info.get_xcol(table) - #TODO: vmin vmax should be global or by figure? - vmin,vmax = min(plotting_var), max(plotting_var) + # TODO: vmin vmax should be global or by figure? + vmin, vmax = min(plotting_var), max(plotting_var) if info.x_scale == 'log': norm = mcolors.LogNorm(vmin, vmax) else: norm = mcolors.Normalize(vmin, vmax) - #http://stackoverflow.com/a/11558629/1007990 + # http://stackoverflow.com/a/11558629/1007990 sm = cm.ScalarMappable(cmap=cm.viridis, norm=norm) for same_vals, fb in figby: - grid = fullgrid[ np.asarray(fb.index),...] - + grid = fullgrid[np.asarray(fb.index), ...] - #Use the maximum absolute correlation for plotting purposes + # Use the maximum absolute correlation for plotting purposes absgrid = np.max(np.abs(grid), axis=0) - mark_mask = absgrid > np.max(absgrid)*mark_threshold + mark_mask = absgrid > np.max(absgrid) * mark_threshold label = info.group_label(same_vals, info.figure_by) - #TODO: PY36ScalarMappable - #TODO Improve title? - title = "%s %s\n[%s]" % (info.dataset_label, '(%s)'%label if label else '' ,pdf.label) + # TODO: PY36ScalarMappable + # TODO Improve title? 
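`plot_smpdf` (continued below) shades the x regions where the absolute observable-PDF correlation exceeds ``mark_threshold`` times its maximum, using `split_ranges` from `validphys.utils`. A standalone sketch of that masking logic with a hand-rolled range splitter, assumed equivalent for this purpose:

```python
import numpy as np

def regions_above_threshold(x, absgrid, mark_threshold=0.9):
    """Return (x_start, x_end) intervals where absgrid exceeds the relative threshold."""
    mask = absgrid > absgrid.max() * mark_threshold
    idx = np.flatnonzero(mask)
    if idx.size == 0:
        return []
    breaks = np.flatnonzero(np.diff(idx) > 1)       # gaps between contiguous runs
    starts = np.r_[idx[0], idx[breaks + 1]]
    ends = np.r_[idx[breaks], idx[-1]]
    return [(x[s], x[e]) for s, e in zip(starts, ends)]

x = np.linspace(1e-5, 1, 50)
corr = np.abs(np.sin(10 * x))
print(regions_above_threshold(x, corr))
```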
+ title = "%s %s\n[%s]" % (info.dataset_label, '(%s)' % label if label else '', pdf.label) - #Start plotting + # Start plotting w, h = mpl.rcParams["figure.figsize"] - h*=2.5 + h *= 2.5 fig, axes = plotutils.subplots(nrows=nf, sharex=True, figsize=(w, h), sharey=True) fig.suptitle(title) colors = sm.to_rgba(info.get_xcol(fb)) for flindex, (ax, fl) in enumerate(zip(axes, fls)): - for i,color in enumerate(colors): - ax.plot(x, grid[i,flindex,:].T, color=color) + for i, color in enumerate(colors): + ax.plot(x, grid[i, flindex, :].T, color=color) - - flmask = mark_mask[flindex,:] + flmask = mark_mask[flindex, :] ranges = split_ranges(x, flmask, filter_falses=True) for r in ranges: ax.axvspan(r[0], r[-1], color='#eeeeff') - ax.set_ylabel("$%s$"%basis.elementlabel(fl)) + ax.set_ylabel("$%s$" % basis.elementlabel(fl)) ax.set_xscale(scale_from_grid(obs_pdf_correlations)) - ax.set_ylim(-1,1) + ax.set_ylim(-1, 1) ax.set_xlim(x[0], x[-1]) ax.set_xlabel('$x$') - #fig.subplots_adjust(hspace=0) + # fig.subplots_adjust(hspace=0) - fig.colorbar(sm, ax=axes.ravel().tolist(), label=info.xlabel, - aspect=100) - #TODO: Fix title for this - #fig.tight_layout() + fig.colorbar(sm, ax=axes.ravel().tolist(), label=info.xlabel, aspect=100) + # TODO: Fix title for this + # fig.tight_layout() yield fig + @figure def plot_obscorrs(corrpair_datasets, obs_obs_correlations, pdf): """NOTE: EXPERIMENTAL. Plot the correlation matrix between a pair of datasets.""" fig, ax = plotutils.subplots() ds1, ds2 = corrpair_datasets - #in1,in2 = get_info(ds1), get_info(ds2) + # in1,in2 = get_info(ds1), get_info(ds2) im = ax.imshow(obs_obs_correlations, cmap=cm.Spectral_r, vmin=-1, vmax=1) @@ -916,6 +955,7 @@ def plot_obscorrs(corrpair_datasets, obs_obs_correlations, pdf): fig.colorbar(im, ax=ax) return fig + @figure def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_kin=False): """Plot an errorbar spanning the central 68% CI of a positivity @@ -931,7 +971,7 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k ax.axhline(0, color='red') posset = posdataset.load_commondata() - ndata = posset.ndata + ndata = posset.ndata xvals = [] if pos_use_kin: @@ -955,7 +995,7 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k marker='s', label=str(pdf), lw=0.5, - transform=next(offsets) + transform=next(offsets), ) minscale = min(minscale, np.abs(np.min(cv))) ax.legend() @@ -971,9 +1011,8 @@ def plot_positivity(pdfs, positivity_predictions_for_pdfs, posdataset, pos_use_k @make_argcheck def _check_same_posdataset_name(dataspecs_posdataset): """Check that the ``posdataset`` key matches for ``dataspecs``""" - _check_same_dataset_name.__wrapped__( - [ds.commondata for ds in dataspecs_posdataset] - ) + _check_same_dataset_name.__wrapped__([ds.commondata for ds in dataspecs_posdataset]) + @figure @_check_same_posdataset_name @@ -1004,13 +1043,15 @@ def _check_display_cuts_requires_use_cuts(display_cuts, use_cuts): "The display_cuts option requires setting some cuts", ) + @make_argcheck def _check_marker_by(marker_by): markers = ('process type', 'experiment', 'dataset', 'group') if marker_by not in markers: raise CheckError("Unknown marker_by value", marker_by, markers) -#TODO: Right now this is hackish Could we turn it into a permanent interface? + +# TODO: Right now this is hackish Could we turn it into a permanent interface? 
@make_argcheck def _check_highlights(data_input, highlight_datasets): if highlight_datasets: @@ -1019,8 +1060,9 @@ def _check_highlights(data_input, highlight_datasets): diff = values - names_set if diff: formatted_diff = '\n'.join(diff) - raise CheckError(f"The following highlight elements are " - f"not dataset names:\n{formatted_diff}") + raise CheckError( + f"The following highlight elements are not dataset names:\n{formatted_diff}" + ) return {'highlight_datasets': values} @@ -1030,6 +1072,7 @@ def _check_aspect(aspect): if aspect not in aspects: raise CheckError(f"Unknown aspect {aspect}", aspect, aspects) + @figure @_check_display_cuts_requires_use_cuts @_check_marker_by @@ -1039,11 +1082,11 @@ def plot_xq2( dataset_inputs_by_groups_xq2map, use_cuts, data_input, - display_cuts:bool=True, - marker_by:str='process type', - highlight_label:str='highlight', - highlight_datasets:(Sequence,type(None))=None, - aspect:str='landscape', + display_cuts: bool = True, + marker_by: str = 'process type', + highlight_label: str = 'highlight', + highlight_datasets: (Sequence, type(None)) = None, + aspect: str = 'landscape', ): """Plot the (x,Q²) coverage based of the data based on some LO approximations. These are governed by the relevant kintransform. @@ -1155,15 +1198,15 @@ def plot_xq2( """ - w,h = mpl.rcParams["figure.figsize"] + w, h = mpl.rcParams["figure.figsize"] rescaling_factor = 1.6 w *= rescaling_factor h *= rescaling_factor - if aspect=='landscape': + if aspect == 'landscape': figsize = w, h - elif aspect=='portrait': + elif aspect == 'portrait': figsize = h, w - elif aspect=='square': + elif aspect == 'square': figsize = h, h else: raise ValueError(f"Unknown aspect {aspect}") @@ -1182,14 +1225,14 @@ def plot_xq2( highlight_datasets = set() def next_options(): - #Get the colors + # Get the colors prop_settings = mpl.rcParams['axes.prop_cycle'] - #Apparently calling the object gives us an infinite cycler + # Apparently calling the object gives us an infinite cycler settings_cycler = prop_settings() - #So far, I don't understand how this is done with mpl "cycler" - #objects, or wether I like it. So far this is godd enough - for markeropts, settings in zip(plotutils.marker_iter_plot(), settings_cycler): - #Override last with first + # So far, I don't understand how this is done with mpl "cycler" + # objects, or wether I like it. So far this is godd enough + for markeropts, settings in zip(plotutils.marker_iter_plot(), settings_cycler): + # Override last with first options = { 'linestyle': 'none', **markeropts, @@ -1200,7 +1243,7 @@ def next_options(): next_opts = next_options() key_options = {} - for (experiment, commondata, fitted, masked, group) in dataset_inputs_by_groups_xq2map: + for experiment, commondata, fitted, masked, group in dataset_inputs_by_groups_xq2map: info = get_info(commondata) if marker_by == 'process type': key = info.process_description @@ -1214,7 +1257,7 @@ def next_options(): else: raise ValueError('Unknown marker_by value') - #TODO: This is an ugly check. Is there a way to do it with .setdefault + # TODO: This is an ugly check. Is there a way to do it with .setdefault # or defaultdict? 
if key not in key_options: key_options[key] = next(next_opts) @@ -1238,33 +1281,48 @@ def next_options(): if key in x: coords = np.concatenate(x[key]), np.concatenate(q2[key]) else: - #This is to get the label key + # This is to get the label key coords = [], [] - ax.plot(*coords, + ax.plot( + *coords, label=key, markeredgewidth=1, markeredgecolor=None, **key_options[key], ) - #Iterate again so highlights are printed on top. + # Iterate again so highlights are printed on top. for key in xh: - ax.plot(np.concatenate(xh[key]), np.concatenate(q2h[key]), + ax.plot( + np.concatenate(xh[key]), + np.concatenate(q2h[key]), markeredgewidth=0.6, markeredgecolor="black", **key_options[key], ) if xh: - #Get legend key - ax.plot([], [], marker='s', markeredgewidth=0.6, color='none', + # Get legend key + ax.plot( + [], + [], + marker='s', + markeredgewidth=0.6, + color='none', markersize=5, - markeredgecolor="black", label= f'Black edge: {highlight_label}', + markeredgecolor="black", + label=f'Black edge: {highlight_label}', ) if display_cuts: - ax.scatter(np.concatenate(filteredx), np.concatenate(filteredq2), + ax.scatter( + np.concatenate(filteredx), + np.concatenate(filteredq2), marker='o', - facecolors='none', edgecolor='red', s=40, lw=0.8, label="Cut" + facecolors='none', + edgecolor='red', + s=40, + lw=0.8, + label="Cut", ) ax.set_title("Kinematic coverage") diff --git a/validphys2/src/validphys/deltachi2.py b/validphys2/src/validphys/deltachi2.py index d8d1b31c32..3769712cba 100644 --- a/validphys2/src/validphys/deltachi2.py +++ b/validphys2/src/validphys/deltachi2.py @@ -3,9 +3,9 @@ Plots and data processing that can be used in a delta chi2 analysis """ +from collections import namedtuple import logging import warnings -from collections import namedtuple from matplotlib.figure import Figure import numpy as np @@ -13,12 +13,10 @@ from reportengine.checks import CheckError, make_argcheck from reportengine.figure import figure, figuregen - from validphys import plotutils -from validphys.checks import check_scale, check_pdf_normalize_to, check_pdfs_noband +from validphys.checks import check_pdf_normalize_to, check_pdfs_noband, check_scale from validphys.core import PDF -from validphys.pdfplots import PDFPlotter, BandPDFPlotter - +from validphys.pdfplots import BandPDFPlotter, PDFPlotter log = logging.getLogger(__name__) @@ -28,9 +26,7 @@ def check_pdf_is_symmhessian(pdf, **kwargs): """Check ``pdf`` has error type of ``symmhessian``""" etype = pdf.error_type if etype != "symmhessian": - raise CheckError( - "Error: type of PDF %s must be 'symmhessian' and not %s" % (pdf, etype) - ) + raise CheckError("Error: type of PDF %s must be 'symmhessian' and not %s" % (pdf, etype)) @check_pdf_is_symmhessian @@ -40,8 +36,7 @@ def delta_chi2_hessian(pdf, total_chi2_data): each eigenvector of the Hessian set. 
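`delta_chi2_hessian` feeds `plot_kullback_leibler` in the next hunk, which compares the cumulative distribution of Delta chi2 with the CDF of a chi-square distribution with one degree of freedom. A hedged sketch of that comparison (the binning choices here are illustrative, not those of the provider):

```python
import numpy as np
import scipy.stats as stats

def kl_to_chi2_cdf(delta_chi2, nbins=50):
    """KL divergence between the chi2(1) CDF and the empirical Delta chi2 CDF."""
    vals, bin_edges = np.histogram(delta_chi2, bins=nbins, density=True)
    cdf_emp = np.cumsum(vals) * np.diff(bin_edges)   # cumulative distribution
    bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2
    cdf_emp[cdf_emp == 0] = 1e-8                     # avoid zeros in the KL computation
    return stats.entropy(stats.chi2.cdf(bin_centres, df=1), qk=cdf_emp)

# Toy input: if Delta chi2 follows a chi2(1) distribution the divergence is small.
rng = np.random.default_rng(2)
print(kl_to_chi2_cdf(rng.chisquare(df=1, size=1000)))
```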
""" delta_chi2 = ( - np.ravel(total_chi2_data.replica_result.error_members()) - - total_chi2_data.central_result + np.ravel(total_chi2_data.replica_result.error_members()) - total_chi2_data.central_result ) return delta_chi2 @@ -49,13 +44,13 @@ def delta_chi2_hessian(pdf, total_chi2_data): @figure def plot_kullback_leibler(delta_chi2_hessian): """ - Determines the Kullback–Leibler divergence by comparing the expectation value of Delta chi2 to - the cumulative distribution function of chi-square distribution with one degree of freedom + Determines the Kullback–Leibler divergence by comparing the expectation value of Delta chi2 to + the cumulative distribution function of chi-square distribution with one degree of freedom (see: https://en.wikipedia.org/wiki/Chi-square_distribution). - - The Kullback-Leibler divergence provides a measure of the difference between two distribution + + The Kullback-Leibler divergence provides a measure of the difference between two distribution functions, here we compare the chi-squared distribution and the cumulative distribution of the - expectation value of Delta chi2. + expectation value of Delta chi2. """ delta_chi2 = delta_chi2_hessian @@ -74,7 +69,7 @@ def plot_kullback_leibler(delta_chi2_hessian): label="cumulative $\Delta\chi^2$", ) # compute Kullback-Leibler (null values set to 1e-8) - vals_nnpdf[vals_nnpdf==0] = 1e-8 + vals_nnpdf[vals_nnpdf == 0] = 1e-8 kl_nnpdf = sp.stats.entropy(sp.stats.chi2.cdf(bin_central_nnpdf, 1), qk=vals_nnpdf) ax.plot(x, sp.stats.chi2.cdf(x, 1), label="$\chi^2$ CDF") @@ -121,7 +116,7 @@ def plot_delta_chi2_hessian_distribution(delta_chi2_hessian, pdf, total_chi2_dat fig, ax = plotutils.subplots() - bins = np.arange(np.floor(min(delta_chi2)), np.ceil(max(delta_chi2))+1) + bins = np.arange(np.floor(min(delta_chi2)), np.ceil(max(delta_chi2)) + 1) ax.hist( delta_chi2, @@ -201,7 +196,7 @@ class PDFEpsilonPlotter(PDFPlotter): def setup_flavour(self, flstate): flstate.labels = [] flstate.handles = [] - + def get_ylabel(self, parton_name): return '$\epsilon(x)$' @@ -232,7 +227,7 @@ def draw(self, pdf, grid, flstate): error68 = (error68up - error68down) / 2.0 epsilon = abs(1 - errorstd / error68) - handle, = ax.plot(xgrid, epsilon, linestyle="-", color=color) + (handle,) = ax.plot(xgrid, epsilon, linestyle="-", color=color) handles.append(handle) labels.append(pdf.label) @@ -240,11 +235,11 @@ def draw(self, pdf, grid, flstate): return [5 * epsilon] def legend(self, flstate): - return flstate.ax.legend(flstate.handles, flstate.labels, - handler_map={plotutils.HandlerSpec: - plotutils.ComposedHandler() - } - ) + return flstate.ax.legend( + flstate.handles, + flstate.labels, + handler_map={plotutils.HandlerSpec: plotutils.ComposedHandler()}, + ) @make_argcheck @@ -253,9 +248,7 @@ def check_pdfs_are_montecarlo(pdfs, **kwargs): for pdf in pdfs: etype = pdf.error_type if etype != "replicas": - raise CheckError( - "Error: type of PDF %s must be 'replicas' and not '%s'" % (pdf, etype) - ) + raise CheckError("Error: type of PDF %s must be 'replicas' and not '%s'" % (pdf, etype)) @figuregen diff --git a/validphys2/src/validphys/eff_exponents.py b/validphys2/src/validphys/eff_exponents.py index 2f5948db8b..0a3a6c7a1f 100644 --- a/validphys2/src/validphys/eff_exponents.py +++ b/validphys2/src/validphys/eff_exponents.py @@ -18,13 +18,11 @@ from reportengine.figure import figuregen from reportengine.floatformatting import format_number, significant_digits from reportengine.table import table - -from validphys.checks import check_positive, 
check_pdf_normalize_to, make_argcheck, check_xlimits +from validphys.checks import check_pdf_normalize_to, check_positive, check_xlimits, make_argcheck from validphys.core import PDF, FitSpec -from validphys.pdfbases import check_basis, Basis -from validphys.pdfplots import BandPDFPlotter, PDFPlotter - +from validphys.pdfbases import Basis, check_basis import validphys.pdfgrids as pdfgrids +from validphys.pdfplots import BandPDFPlotter, PDFPlotter log = logging.getLogger(__name__) @@ -35,13 +33,16 @@ @check_positive('Q') @make_argcheck(check_basis) @check_xlimits -def alpha_eff(pdf: PDF, *, - xmin: numbers.Real = 1e-6, - xmax: numbers.Real = 1e-3, - npoints: int = 200, - Q: numbers.Real = 1.65, - basis: (str, Basis), - flavours: (list, tuple, type(None)) = None): +def alpha_eff( + pdf: PDF, + *, + xmin: numbers.Real = 1e-6, + xmax: numbers.Real = 1e-3, + npoints: int = 200, + Q: numbers.Real = 1.65, + basis: (str, Basis), + flavours: (list, tuple, type(None)) = None, +): """Return a list of xplotting_grids containing the value of the effective exponent alpha at the specified values of x and flavour. alpha is relevant at small x, hence the linear scale. @@ -54,7 +55,7 @@ def alpha_eff(pdf: PDF, *, Q: The PDF scale in GeV. """ - #Loading the filter map of the fit/PDF + # Loading the filter map of the fit/PDF checked = check_basis(basis, flavours) basis = checked['basis'] flavours = checked['flavours'] @@ -64,28 +65,31 @@ def alpha_eff(pdf: PDF, *, else: xGrid = pdfgrids.xgrid(xmin, xmax, 'log', npoints) - pdfGrid = pdfgrids.xplotting_grid( - pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) + pdfGrid = pdfgrids.xplotting_grid(pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) pdfGrid_values = pdfGrid.grid_values.data # NOTE: without this I get "setting an array element with a sequence" xGrid = pdfGrid.xgrid with warnings.catch_warnings(): warnings.simplefilter('ignore', RuntimeWarning) - alphaGrid_values = -np.log(abs(pdfGrid_values/xGrid))/np.log(xGrid) - alphaGrid_values[alphaGrid_values == - np.inf] = np.nan # when PDF_i =0 + alphaGrid_values = -np.log(abs(pdfGrid_values / xGrid)) / np.log(xGrid) + alphaGrid_values[alphaGrid_values == -np.inf] = np.nan # when PDF_i =0 alphaGrid = pdfGrid.copy_grid(grid_values=pdf.stats_class(alphaGrid_values)) return alphaGrid + @check_positive('Q') @make_argcheck(check_basis) @check_xlimits -def beta_eff(pdf, *, - xmin: numbers.Real = 0.6, - xmax: numbers.Real = 0.9, - npoints: int = 200, - Q: numbers.Real = 1.65, - basis: (str, Basis), - flavours: (list, tuple, type(None)) = None): +def beta_eff( + pdf, + *, + xmin: numbers.Real = 0.6, + xmax: numbers.Real = 0.9, + npoints: int = 200, + Q: numbers.Real = 1.65, + basis: (str, Basis), + flavours: (list, tuple, type(None)) = None, +): """Return a list of xplotting_grids containing the value of the effective exponent beta at the specified values of x and flavour. beta is relevant at large x, hence the linear scale. 
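The effective exponents defined in this module follow the formulas visible in the hunks: alpha_eff(x) = -log|f(x)/x| / log(x) at small x and beta_eff(x) = log|f(x)/x| / log(1 - x) at large x, where the grid values play the role of x f(x) (that interpretation of the grid is an assumption here). A toy check of those definitions:

```python
import numpy as np

def alpha_eff_sketch(xfx, x):
    return -np.log(np.abs(xfx / x)) / np.log(x)

def beta_eff_sketch(xfx, x):
    return np.log(np.abs(xfx / x)) / np.log(1 - x)

# Toy PDF with x*f(x) = x**(1 - alpha) * (1 - x)**beta
alpha, beta = 1.2, 3.0
xs = np.array([1e-6, 1e-5, 1e-4])
print(alpha_eff_sketch(xs**(1 - alpha) * (1 - xs)**beta, xs))   # close to 1.2 at small x
xl = np.array([0.7, 0.8, 0.9])
print(beta_eff_sketch(xl**(1 - alpha) * (1 - xl)**beta, xl))    # approaches 3.0 as x -> 1
```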
@@ -107,26 +111,25 @@ def beta_eff(pdf, *, else: xGrid = pdfgrids.xgrid(xmin, xmax, 'linear', npoints) - - pdfGrid = pdfgrids.xplotting_grid( - pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) + pdfGrid = pdfgrids.xplotting_grid(pdf, Q, xgrid=xGrid, basis=basis, flavours=flavours) pdfGrid_values = pdfGrid.grid_values.data # NOTE: without this I get "setting an array element with a sequence" xGrid = pdfGrid.xgrid with warnings.catch_warnings(): warnings.simplefilter('ignore', RuntimeWarning) - betaGrid_values = np.log(abs(pdfGrid_values/xGrid))/np.log(1-xGrid) + betaGrid_values = np.log(abs(pdfGrid_values / xGrid)) / np.log(1 - xGrid) betaGrid_values[betaGrid_values == -np.inf] = np.nan # when PDF_i =0 betaGrid = pdfGrid.copy_grid(grid_values=pdf.stats_class(betaGrid_values)) return betaGrid # .grid_values + class PreprocessingPlotter(PDFPlotter): - """ Class inherenting from BandPDFPlotter, changing title and ylabel to reflect the effective + """Class inherenting from BandPDFPlotter, changing title and ylabel to reflect the effective exponent being plotted. """ - def __init__(self, exponent, *args, **kwargs): + def __init__(self, exponent, *args, **kwargs): self.exponent = exponent super().__init__(*args, **kwargs) @@ -139,6 +142,7 @@ def get_ylabel(self, parton_name): else: return fr"$\{self.exponent}_e$ for ${parton_name}$" + def get_alpha_lines(effective_exponents_table_internal): """Given an effective_exponents_table_internal returns the rows with bounds of the alpha effective exponent for all flavours, used to plot horizontal @@ -147,19 +151,22 @@ def get_alpha_lines(effective_exponents_table_internal): """ return effective_exponents_table_internal.iloc[0::2, :] + def get_beta_lines(effective_exponents_table_internal): """Same as `get_alpha_lines` but for beta""" return effective_exponents_table_internal.iloc[1::2, :] + pdfs_alpha_lines = collect('get_alpha_lines', ("pdfs",)) pdfs_beta_lines = collect('get_beta_lines', ("pdfs",)) fits_alpha_lines = collect('get_alpha_lines', ('fits', 'fitpdf')) fits_beta_lines = collect('get_beta_lines', ('fits', 'fitpdf')) + class ExponentBandPlotter(BandPDFPlotter, PreprocessingPlotter): - def __init__(self, hlines, exponent, *args, **kwargs): - super().__init__(exponent, *args, **kwargs) + def __init__(self, hlines, exponent, *args, **kwargs): + super().__init__(exponent, *args, **kwargs) self.hlines = hlines def draw(self, pdf, grid, flstate): @@ -192,27 +199,30 @@ def draw(self, pdf, grid, flstate): xmin=xmin, xmax=xmax, linestyle=INTERNAL_LINESTYLE[i], - color=INTERNAL_COLOR[pdf_index % len(INTERNAL_COLOR)] + color=INTERNAL_COLOR[pdf_index % len(INTERNAL_COLOR)], ) flstate.handles.append(handle) flstate.labels.append(label) # need to return xgrid shaped object but with hlines taken into account to get plots nice hline_positions = hlines.loc[table_fl_index, :].values.flatten() - new_errdown = min( - [*errdown, *hline_positions,]) - new_errup = max( - [*errup, *hline_positions,]) - return new_errdown*np.ones_like(errdown), new_errup*np.ones_like(errup) + new_errdown = min([*errdown, *hline_positions]) + new_errup = max([*errup, *hline_positions]) + return new_errdown * np.ones_like(errdown), new_errup * np.ones_like(errup) alpha_eff_pdfs = collect('alpha_eff', ('pdfs',)) + @figuregen @check_pdf_normalize_to def plot_alpha_eff_internal( - pdfs, alpha_eff_pdfs, pdfs_alpha_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): + pdfs, + alpha_eff_pdfs, + pdfs_alpha_lines, + normalize_to: (int, str, type(None)) = None, + 
ybottom=None, + ytop=None, +): """Plot the central value and the uncertainty of a list of effective exponents as a function of x for a given value of Q. If normalize_to is given, plot the ratios to the corresponding alpha effective. @@ -225,15 +235,22 @@ def plot_alpha_eff_internal( absolute values. """ yield from ExponentBandPlotter( - pdfs_alpha_lines, 'alpha', pdfs, alpha_eff_pdfs, 'log', normalize_to, ybottom, ytop) + pdfs_alpha_lines, 'alpha', pdfs, alpha_eff_pdfs, 'log', normalize_to, ybottom, ytop + ) + + +alpha_eff_fits = collect('alpha_eff', ('fits', 'fitpdf')) -alpha_eff_fits = collect('alpha_eff', ('fits', 'fitpdf',)) @figuregen def plot_alpha_eff( - fits_pdf, alpha_eff_fits, fits_alpha_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): + fits_pdf, + alpha_eff_fits, + fits_alpha_lines, + normalize_to: (int, str, type(None)) = None, + ybottom=None, + ytop=None, +): """Plot the central value and the uncertainty of a list of effective exponents as a function of x for a given value of Q. If normalize_to is given, plot the ratios to the corresponding alpha effective. @@ -249,33 +266,48 @@ def plot_alpha_eff( set based on the scale in xgrid, which should be used instead. """ return plot_alpha_eff_internal( - fits_pdf, alpha_eff_fits, fits_alpha_lines, normalize_to, ybottom, ytop) + fits_pdf, alpha_eff_fits, fits_alpha_lines, normalize_to, ybottom, ytop + ) + beta_eff_pdfs = collect('beta_eff', ('pdfs',)) + @figuregen @check_pdf_normalize_to def plot_beta_eff_internal( - pdfs, beta_eff_pdfs, pdfs_beta_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): - """ Same as plot_alpha_eff_internal but for beta effective exponent """ + pdfs, + beta_eff_pdfs, + pdfs_beta_lines, + normalize_to: (int, str, type(None)) = None, + ybottom=None, + ytop=None, +): + """Same as plot_alpha_eff_internal but for beta effective exponent""" yield from ExponentBandPlotter( - pdfs_beta_lines, 'beta', pdfs, beta_eff_pdfs, 'linear', normalize_to, ybottom, ytop) + pdfs_beta_lines, 'beta', pdfs, beta_eff_pdfs, 'linear', normalize_to, ybottom, ytop + ) + + +beta_eff_fits = collect('beta_eff', ('fits', 'fitpdf')) -beta_eff_fits = collect('beta_eff', ('fits', 'fitpdf',)) @figuregen def plot_beta_eff( - fits_pdf, beta_eff_fits, fits_beta_lines, - normalize_to: (int, str, type(None)) = None, - ybottom=None, ytop=None): - """ Same as plot_alpha_eff but for beta effective exponents """ + fits_pdf, + beta_eff_fits, + fits_beta_lines, + normalize_to: (int, str, type(None)) = None, + ybottom=None, + ytop=None, +): + """Same as plot_alpha_eff but for beta effective exponents""" return plot_beta_eff_internal( - fits_pdf, beta_eff_fits, fits_beta_lines, normalize_to, ybottom, ytop) + fits_pdf, beta_eff_fits, fits_beta_lines, normalize_to, ybottom, ytop + ) -def previous_effective_exponents(basis:str, fit: (FitSpec, type(None)) = None): +def previous_effective_exponents(basis: str, fit: (FitSpec, type(None)) = None): """If provided with a fit, check that the `basis` is the basis which was fitted if so then return the previous effective exponents read from the fit runcard. 
""" @@ -288,13 +320,14 @@ def previous_effective_exponents(basis:str, fit: (FitSpec, type(None)) = None): else: return None + @table def previous_effective_exponents_table(fit: FitSpec): """Given a fit, reads the previous exponents from the fit runcard""" fitting = fit.as_input()["fitting"] checked = check_basis( - fitting["fitbasis"], - [runcard_fl['fl'] for runcard_fl in fitting["basis"]]) + fitting["fitbasis"], [runcard_fl['fl'] for runcard_fl in fitting["basis"]] + ) basis = checked["basis"] flavours = checked["flavours"] prev_a_bounds = [runcard_fl['smallx'] for runcard_fl in fitting["basis"]] @@ -306,6 +339,7 @@ def previous_effective_exponents_table(fit: FitSpec): columns = pd.MultiIndex.from_product([[f"prev ({fit.label})"], ["Min", "Max"]]) return pd.DataFrame(data, index=ind, columns=columns) + @table @make_argcheck(check_basis) def next_effective_exponents_table( @@ -315,7 +349,7 @@ def next_effective_exponents_table( x2_alpha: numbers.Real = 1e-3, x1_beta: numbers.Real = 0.65, x2_beta: numbers.Real = 0.95, - basis:(str, Basis), + basis: (str, Basis), flavours: (list, tuple, type(None)) = None, ): """Given a PDF, calculate the next effective exponents @@ -342,9 +376,11 @@ def next_effective_exponents_table( Qmin = pdf.q_min alpha_effs = alpha_eff( - pdf, xmin=x1_alpha, xmax=x2_alpha, npoints=2, Q=Qmin, basis=basis, flavours=flavours) + pdf, xmin=x1_alpha, xmax=x2_alpha, npoints=2, Q=Qmin, basis=basis, flavours=flavours + ) beta_effs = beta_eff( - pdf, xmin=x1_beta, xmax=x2_beta, npoints=2, Q=Qmin, basis=basis, flavours=flavours) + pdf, xmin=x1_beta, xmax=x2_beta, npoints=2, Q=Qmin, basis=basis, flavours=flavours + ) eff_exp_data = [] @@ -356,7 +392,7 @@ def next_effective_exponents_table( alpha_cv = np.nanmean(alphastats.error_members(), axis=0) beta_cv = np.nanmean(betastats.error_members(), axis=0) - #tuple of low and high values repectively + # tuple of low and high values repectively alpha68 = alphastats.errorbar68() beta68 = betastats.errorbar68() @@ -365,29 +401,32 @@ def next_effective_exponents_table( alpha_sigdown = -alpha68[0] + alpha_cv beta_sigdown = -beta68[0] + beta_cv flavours_label = [] - for (j, fl) in enumerate(flavours): + for j, fl in enumerate(flavours): # the gluon/singlet case if fl in (r"\Sigma", "g"): new_alpha_bounds = [ - alpha_cv[j, 0] - 2*alpha_sigdown[j, 0], - min(2, alpha_cv[j, 0] + 2*alpha_sigup[j, 0])] + alpha_cv[j, 0] - 2 * alpha_sigdown[j, 0], + min(2, alpha_cv[j, 0] + 2 * alpha_sigup[j, 0]), + ] else: new_alpha_bounds = [ - min(alpha_cv[j, :] - 2*alpha_sigdown[j, :]), - min(2, max(alpha_cv[j, :] + 2*alpha_sigup[j, :]))] + min(alpha_cv[j, :] - 2 * alpha_sigdown[j, :]), + min(2, max(alpha_cv[j, :] + 2 * alpha_sigup[j, :])), + ] new_beta_bounds = [ - max(0, min(beta_cv[j, :] - 2*beta_sigdown[j, :])), - max(beta_cv[j, :] + 2*beta_sigup[j, :])] + max(0, min(beta_cv[j, :] - 2 * beta_sigdown[j, :])), + max(beta_cv[j, :] + 2 * beta_sigup[j, :]), + ] eff_exp_data.extend((new_alpha_bounds, new_beta_bounds)) flavours_label.append(f"${basis.elementlabel(fl)}$") ind = pd.MultiIndex.from_product([flavours_label, [r"$\alpha$", r"$\beta$"]]) eff_exp_columns = pd.MultiIndex.from_product([[f"next ({pdf.label})"], ["Min", "Max"]]) - df = pd.DataFrame(eff_exp_data, index=ind, - columns=eff_exp_columns) + df = pd.DataFrame(eff_exp_data, index=ind, columns=eff_exp_columns) return df + @table def effective_exponents_table_internal( next_effective_exponents_table, @@ -413,15 +452,13 @@ def effective_exponents_table_internal( return df -effective_exponents_table = 
collect( - 'effective_exponents_table_internal', ('fitpdfandbasis',)) +effective_exponents_table = collect('effective_exponents_table_internal', ('fitpdfandbasis',)) fmt = lambda a: float(significant_digits(a, 4)) next_fit_eff_exps_table = collect("next_effective_exponents_table", ("fitpdfandbasis",)) -def iterate_preprocessing_yaml( - fit, next_fit_eff_exps_table, _flmap_np_clip_arg=None): +def iterate_preprocessing_yaml(fit, next_fit_eff_exps_table, _flmap_np_clip_arg=None): """Using py:func:`next_effective_exponents_table` update the preprocessing exponents of the input ``fit``. This is part of the usual pipeline referred to as "iterating a fit", for more information see: :ref:`run-iterated-fit`. @@ -471,8 +508,7 @@ def iterate_preprocessing_yaml( basis = checked["basis"] # use order defined in runcard. - runcard_flavours = [ - f"{basis.elementlabel(ref_fl['fl'])}" for ref_fl in previous_exponents] + runcard_flavours = [f"{basis.elementlabel(ref_fl['fl'])}" for ref_fl in previous_exponents] for i, fl in enumerate(runcard_flavours): alphas = df_effexps.loc[(f"${fl}$", r"$\alpha$")].values betas = df_effexps.loc[(f"${fl}$", r"$\beta$")].values @@ -484,17 +520,12 @@ def iterate_preprocessing_yaml( alphas = np.clip(alphas, **smallx_args) if largex_args is not None: betas = np.clip(betas, **largex_args) - previous_exponents[i]["smallx"] = [ - fmt(alpha) for alpha in alphas - ] - previous_exponents[i]["largex"] = [ - fmt(beta) for beta in betas - ] + previous_exponents[i]["smallx"] = [fmt(alpha) for alpha in alphas] + previous_exponents[i]["largex"] = [fmt(beta) for beta in betas] return yaml.dump(filtermap, Dumper=yaml.RoundTripDumper) -def update_runcard_description_yaml( - iterate_preprocessing_yaml, _updated_description=None): +def update_runcard_description_yaml(iterate_preprocessing_yaml, _updated_description=None): """Take the runcard with iterated preprocessing and update the description if ``_updated_description`` is provided. 
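As an illustration of the clip-and-round step performed by `iterate_preprocessing_yaml` above, here is a minimal self-contained sketch; the bounds, the clipping limits and the use of plain `round()` in place of the `significant_digits` helper used by `fmt` are assumptions made only for the example.

```python
import numpy as np

# Hypothetical next-iteration bounds for one flavour, [min, max], as they
# would come out of next_effective_exponents_table.
alphas = np.array([0.95, 1.30])
betas = np.array([1.45, 3.20])

# Optional clipping, analogous to the smallx/largex np.clip calls above;
# the limits used here are invented.
alphas = np.clip(alphas, None, 2.0)
betas = np.clip(betas, 0.0, None)

# The real code formats to 4 significant digits via the significant_digits
# helper; round() merely stands in for it in this sketch.
smallx = [float(round(a, 4)) for a in alphas]
largex = [float(round(b, 4)) for b in betas]
print(smallx, largex)  # [0.95, 1.3] [1.45, 3.2]
```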
As with :py:func:`iterate_preprocessing_yaml` the result can be used in a report @@ -515,8 +546,7 @@ def update_runcard_description_yaml( return yaml.dump(filtermap, Dumper=yaml.RoundTripDumper) -def iterated_runcard_yaml( - fit, update_runcard_description_yaml): +def iterated_runcard_yaml(fit, update_runcard_description_yaml): """ Takes the runcard with preprocessing iterated and description updated then @@ -560,7 +590,7 @@ def iterated_runcard_yaml( if seed in filtermap: filtermap[seed] = random.randrange(0, maxint) elif fitting_data is not None and seed in fitting_data: - #BCH + # BCH # For older runcards the seeds are inside the `fitting` namespace fitting_data[seed] = random.randrange(0, maxint) diff --git a/validphys2/src/validphys/fitdata.py b/validphys2/src/validphys/fitdata.py index b2d07b7170..dd1e8eb900 100644 --- a/validphys2/src/validphys/fitdata.py +++ b/validphys2/src/validphys/fitdata.py @@ -2,26 +2,25 @@ """ Utilities for loading data from fit folders """ +from collections import OrderedDict, defaultdict, namedtuple +from io import StringIO import json import logging -from collections import namedtuple, OrderedDict, defaultdict -from io import StringIO import pathlib import numpy as np import pandas as pd -from reportengine.compat import yaml from reportengine import collect -from reportengine.table import table -from reportengine.checks import make_argcheck, CheckError +from reportengine.checks import CheckError, make_argcheck +from reportengine.compat import yaml from reportengine.floatformatting import ValueErrorTuple - -from validphys.core import PDF +from reportengine.table import table from validphys import checks +from validphys.core import PDF from validphys.plotoptions import get_info -#TODO: Add more stuff here as needed for postfit +# TODO: Add more stuff here as needed for postfit LITERAL_FILES = ['chi2exps.log'] REPLICA_FILES = ['.dat', '.json'] FIT_SUMRULES = [ @@ -31,9 +30,10 @@ "svalence", ] -#t = blessings.Terminal() +# t = blessings.Terminal() log = logging.getLogger(__name__) + def num_fitted_replicas(fit): """Function to obtain the number of nnfit replicas. That is the number of replicas before postfit was run. @@ -44,9 +44,9 @@ def num_fitted_replicas(fit): return len(veto["Positivity"]) -#TODO setup make_check on these +# TODO setup make_check on these def check_nnfit_results_path(path): - """ Returns True if the requested path is a valid results directory, + """Returns True if the requested path is a valid results directory, i.e if it is a directory and has a 'nnfit' subdirectory""" if not path.is_dir(): log.warning(f"Path is not a directory {path}") @@ -56,18 +56,20 @@ def check_nnfit_results_path(path): return False return True + def check_lhapdf_info(results_dir, fitname): - """ Check that an LHAPDF info metadata file is - present in the fit results """ + """Check that an LHAPDF info metadata file is + present in the fit results""" info_path = results_dir.joinpath('nnfit', f'{fitname}.info') if not info_path.is_file(): log.warning(f"Cannot find info file at {info_path}") return False return True -#TODO This should establish if the .dat files are corrupted or not + +# TODO This should establish if the .dat files are corrupted or not def check_replica_files(replica_path, prefix): - """ Verification of a replica results directory at `replica_path` + """Verification of a replica results directory at `replica_path` for a fit named `prefix`. 
Returns True if the results directory is complete""" @@ -77,11 +79,11 @@ def check_replica_files(replica_path, prefix): return False valid = True for f in LITERAL_FILES: - test_path = path/f + test_path = path / f if not test_path.is_file(): log.warning(f"Missing file: {test_path}") valid = False - main_path = path/prefix + main_path = path / prefix for f in REPLICA_FILES: test_path = main_path.with_suffix(f) if not test_path.is_file(): @@ -91,7 +93,11 @@ def check_replica_files(replica_path, prefix): log.warning(f"Found invalid replica {path}") return valid -FitInfo = namedtuple("FitInfo", ("nite", 'training', 'validation', 'chi2', 'is_positive', 'arclengths', 'integnumbers')) + +FitInfo = namedtuple( + "FitInfo", + ("nite", 'training', 'validation', 'chi2', 'is_positive', 'arclengths', 'integnumbers'), +) def _old_load_fitinfo(old_fitinfo): @@ -99,18 +105,26 @@ def _old_load_fitinfo(old_fitinfo): so that comparisons can still be run against very old fits """ with old_fitinfo.open("r", encoding="utf-8") as fitinfo_file: - fitinfo_line = fitinfo_file.readline().split() # General fit properties - fitinfo_arcl = fitinfo_file.readline() # Replica arc-lengths - fitinfo_integ = fitinfo_file.readline() # Replica integ-numbers + fitinfo_line = fitinfo_file.readline().split() # General fit properties + fitinfo_arcl = fitinfo_file.readline() # Replica arc-lengths + fitinfo_integ = fitinfo_file.readline() # Replica integ-numbers - n_iterations = int( fitinfo_line[0]) + n_iterations = int(fitinfo_line[0]) erf_validation = float(fitinfo_line[1]) - erf_training = float(fitinfo_line[2]) - chisquared = float(fitinfo_line[3]) - is_positive = fitinfo_line[4] == "POS_PASS" - arclengths = np.fromstring(fitinfo_arcl, sep=' ') - integnumbers = np.fromstring(fitinfo_integ, sep=' ') - return FitInfo(n_iterations, erf_training, erf_validation, chisquared, is_positive, arclengths, integnumbers) + erf_training = float(fitinfo_line[2]) + chisquared = float(fitinfo_line[3]) + is_positive = fitinfo_line[4] == "POS_PASS" + arclengths = np.fromstring(fitinfo_arcl, sep=' ') + integnumbers = np.fromstring(fitinfo_integ, sep=' ') + return FitInfo( + n_iterations, + erf_training, + erf_validation, + chisquared, + is_positive, + arclengths, + integnumbers, + ) def load_fitinfo(replica_path, prefix): @@ -130,15 +144,23 @@ def load_fitinfo(replica_path, prefix): is_positive = fitinfo_dict["pos_state"] == "POS_PASS" arclengths = np.array(fitinfo_dict["arc_lengths"]) integnumbers = np.array(fitinfo_dict["integrability"]) - return FitInfo(n_iterations, erf_training, erf_validation, chisquared, is_positive, arclengths, integnumbers) + return FitInfo( + n_iterations, + erf_training, + erf_validation, + chisquared, + is_positive, + arclengths, + integnumbers, + ) @checks.check_has_fitted_replicas def replica_paths(fit): """Return the paths of all the replicas""" - #Total number of members = number of replicas + 1 + # Total number of members = number of replicas + 1 l = len(PDF(fit.name)) - postfit_path = fit.path / 'postfit' + postfit_path = fit.path / 'postfit' old_postfit_path = fit.path / 'nnfit' if postfit_path.is_dir(): return [postfit_path / f'replica_{index}' for index in range(1, l)] @@ -157,20 +179,20 @@ def replica_data(fit, replica_paths): @table def fit_summary(fit_name_with_covmat_label, replica_data, total_chi2_data, total_phi_data): - """ Summary table of fit properties - - Central chi-squared - - Average chi-squared - - Training and Validation error functions - - Training lengths - - Phi + """Summary table of fit 
properties + - Central chi-squared + - Average chi-squared + - Training and Validation error functions + - Training lengths + - Phi - Note: - Chi-squared values from the replica_data are not used here (presumably - they are fixed to being t0) + Note: + Chi-squared values from the replica_data are not used here (presumably + they are fixed to being t0) - This uses a corrected form for the error on phi in comparison to the - vp1 value. The error is propagated from the uncertainty on the - average chi-squared only. + This uses a corrected form for the error on phi in comparison to the + vp1 value. The error is propagated from the uncertainty on the + average chi-squared only. """ nrep = len(replica_data) @@ -183,51 +205,63 @@ def fit_summary(fit_name_with_covmat_label, replica_data, total_chi2_data, total evalid = [x.validation for x in replica_data] phi, _ = total_phi_data - phi_err = np.std(member_chi2)/(2.0*phi*np.sqrt(nrep)) + phi_err = np.std(member_chi2) / (2.0 * phi * np.sqrt(nrep)) VET = ValueErrorTuple - data = OrderedDict( ((r"$\chi^2$", f"{central_chi2:.5f}"), - (r"$$", f"{VET(np.mean(etrain), np.std(etrain))}"), - (r"$$", f"{VET(np.mean(evalid), np.std(evalid))}"), - (r"$$", f"{VET(np.mean(nite), np.std(nite))}"), - (r"$<\chi^2>$", f"{VET(np.mean(member_chi2), np.std(member_chi2))}"), - (r"$\phi$", f"{VET(phi, phi_err)}"))) + data = OrderedDict( + ( + (r"$\chi^2$", f"{central_chi2:.5f}"), + (r"$$", f"{VET(np.mean(etrain), np.std(etrain))}"), + (r"$$", f"{VET(np.mean(evalid), np.std(evalid))}"), + (r"$$", f"{VET(np.mean(nite), np.std(nite))}"), + (r"$<\chi^2>$", f"{VET(np.mean(member_chi2), np.std(member_chi2))}"), + (r"$\phi$", f"{VET(phi, phi_err)}"), + ) + ) return pd.Series(data, index=data.keys(), name=fit_name_with_covmat_label) collected_fit_summaries = collect('fit_summary', ('fits', 'fitcontext')) + + @table def summarise_fits(collected_fit_summaries): - """ Produces a table of basic comparisons between fits, includes - all the fields used in fit_summary """ + """Produces a table of basic comparisons between fits, includes + all the fields used in fit_summary""" return pd.concat(collected_fit_summaries, axis=1) @checks.check_use_t0 @table def t0_chi2_info_table(pdf, dataset_inputs_abs_chi2_data, t0pdfset, use_t0): - """ Provides table with - - t0pdfset name - - Central t0-chi-squared - - Average t0-chi-squared + """Provides table with + - t0pdfset name + - Central t0-chi-squared + - Average t0-chi-squared """ ndata = dataset_inputs_abs_chi2_data.ndata central_chi2 = dataset_inputs_abs_chi2_data.central_result / ndata member_chi2 = dataset_inputs_abs_chi2_data.replica_result.error_members() / ndata VET = ValueErrorTuple - data = OrderedDict( (("t0pdfset", f"{t0pdfset}"), - (r"$\chi^2_{t0}$", f"{central_chi2:.5f}"), - (r"$<\chi^2_{t0}>$", f"{VET(np.mean(member_chi2), np.std(member_chi2))}"))) + data = OrderedDict( + ( + ("t0pdfset", f"{t0pdfset}"), + (r"$\chi^2_{t0}$", f"{central_chi2:.5f}"), + (r"$<\chi^2_{t0}>$", f"{VET(np.mean(member_chi2), np.std(member_chi2))}"), + ) + ) return pd.Series(data, index=data.keys(), name=pdf.label) + fits_replica_data = collect('replica_data', ('fits',)) -#Do collect in two parts so we get a list for each fit instead of a single list +# Do collect in two parts so we get a list for each fit instead of a single list all_datasets = collect('dataset', ('data',)) -fits_datasets = collect('all_datasets', ('fits', 'fitinputcontext',)) +fits_datasets = collect('all_datasets', ('fits', 'fitinputcontext')) + @make_argcheck def _assert_two_fits(fits): 
@@ -235,8 +269,10 @@ def _assert_two_fits(fits): if len(fits) != 2: raise CheckError("Exactly two fits are required") + DatasetComp = namedtuple('DatasetComp', ('common', 'first_only', 'second_only')) + @_assert_two_fits def match_datasets_by_name(fits, fits_datasets): """Return a tuple with common, first_only and second_only. @@ -256,9 +292,8 @@ def match_datasets_by_name(fits, fits_datasets): return DatasetComp(common, first_only, second_only) -#TODO: Do we do md output here or that's for the templates? -def print_dataset_differences(fits, match_datasets_by_name, - print_common:bool=True): +# TODO: Do we do md output here or that's for the templates? +def print_dataset_differences(fits, match_datasets_by_name, print_common: bool = True): """Given exactly two fits, print the datasets that are included in one " "but not in the other. If `print_common` is True, also print the datasets that are common.""" @@ -266,20 +301,26 @@ def print_dataset_differences(fits, match_datasets_by_name, first, second = fits res = StringIO() if m.common and print_common: - res.write("The following datasets are included in both `%s` and `%s`:\n\n" % (first, second)) - for k,v in m.common.items(): + res.write( + "The following datasets are included in both `%s` and `%s`:\n\n" % (first, second) + ) + for k, v in m.common.items(): info = get_info(v[0].commondata) res.write(' - %s\n' % info.dataset_label) res.write('\n') if m.first_only: - res.write("The following datasets are included in `%s` but not in `%s`:\n\n"% (first,second)) - for k,v in m.first_only.items(): + res.write( + "The following datasets are included in `%s` but not in `%s`:\n\n" % (first, second) + ) + for k, v in m.first_only.items(): info = get_info(v.commondata) res.write(' - %s\n' % info.dataset_label) res.write('\n') if m.second_only: - res.write("The following datasets are included in `%s` but not in `%s`:\n\n"% (second,first)) - for k,v in m.second_only.items(): + res.write( + "The following datasets are included in `%s` but not in `%s`:\n\n" % (second, first) + ) + for k, v in m.second_only.items(): info = get_info(v.commondata) res.write(' - %s\n' % info.dataset_label) res.write('\n') @@ -287,8 +328,10 @@ def print_dataset_differences(fits, match_datasets_by_name, res.write("The datasets included in the fits are identical.") return res.getvalue() + print_dataset_differences.highlight = 'markdown' + @_assert_two_fits def test_for_same_cuts(fits, match_datasets_by_name): """Given two fits, return a list of tuples `(first, second)` @@ -309,31 +352,44 @@ def test_for_same_cuts(fits, match_datasets_by_name): else: c2 = np.arange(second.commondata.ndata) if not np.array_equal(c1, c2): - msg = "Cuts for %s are not the same:\n%s:\n%s\n\n%s:\n%s" % (ds, first_fit, c1, second_fit, c2) + msg = "Cuts for %s are not the same:\n%s:\n%s\n\n%s:\n%s" % ( + ds, + first_fit, + c1, + second_fit, + c2, + ) log.info(msg) - res.append((first, second)) + res.append((first, second)) return res + def print_different_cuts(fits, test_for_same_cuts): """Print a summary of the datasets that are included in both fits but have different cuts.""" res = StringIO() first_fit, second_fit = fits if test_for_same_cuts: - res.write("The following datasets are both included but have different kinematical cuts:\n\n") - for (first, second) in test_for_same_cuts: + res.write( + "The following datasets are both included but have different kinematical cuts:\n\n" + ) + for first, second in test_for_same_cuts: info = get_info(first.commondata) total_points = first.commondata.ndata 
res.write(" - %s:\n" % info.dataset_label) first_len = len(first.cuts.load()) if first.cuts else total_points second_len = len(second.cuts.load()) if second.cuts else total_points - res.write(" * %s includes %d out of %d points.\n" % (first_fit, first_len, total_points)) - res.write(" * %s includes %d out of %d points.\n" % (second_fit, second_len, total_points)) + res.write( + " * %s includes %d out of %d points.\n" % (first_fit, first_len, total_points) + ) + res.write( + " * %s includes %d out of %d points.\n" % (second_fit, second_len, total_points) + ) res.write('\n') - return res.getvalue() + def fit_theory_covmat_summary(fit, fitthcovmat): """returns a table with a single column for the `fit`, with three rows indicating if the theory covariance matrix was used in the 'sampling' of the pseudodata, @@ -349,16 +405,20 @@ def fit_theory_covmat_summary(fit, fitthcovmat): df = pd.DataFrame( [sampling, fitting, report], columns=[fit.label], - index=['sampling', 'fitting', 'validphys statistical estimators']) + index=['sampling', 'fitting', 'validphys statistical estimators'], + ) return df + fits_theory_covmat_summary = collect('fit_theory_covmat_summary', ('fits',)) + @table def summarise_theory_covmat_fits(fits_theory_covmat_summary): """Collects the theory covmat summary for all fits and concatenates them into a single table""" return pd.concat(fits_theory_covmat_summary, axis=1) + def _get_fitted_index(pdf, i): """Return the nnfit index for the replica i""" p = pdf.infopath.with_name(f'{pdf.name}_{i:04d}.dat') @@ -367,21 +427,24 @@ def _get_fitted_index(pdf, i): metadata = next(it) return metadata['FromMCReplica'] + @make_argcheck def _check_has_replica_tags(pdf): """Check that the PDF has fitted index tags.""" try: - _get_fitted_index(pdf,1) + _get_fitted_index(pdf, 1) except KeyError as e: - raise CheckError("PDF replica file don't contain " - "the fitted replica tag.") from e + raise CheckError("PDF replica file don't contain the fitted replica tag.") from e + @_check_has_replica_tags def fitted_replica_indexes(pdf): """Return nnfit index of replicas 1 to N.""" - return [_get_fitted_index(pdf,i) for i in range(1, len(pdf))] + return [_get_fitted_index(pdf, i) for i in range(1, len(pdf))] + + +fits_replica_indexes = collect('fitted_replica_indexes', ('fits', 'fitpdf')) -fits_replica_indexes = collect('fitted_replica_indexes', ('fits','fitpdf')) def fits_replica_data_correlated(fits_replica_data, fits_replica_indexes, fits): """Return a table with the same columns as ``replica_data`` indexed by the @@ -397,23 +460,21 @@ def fits_replica_data_correlated(fits_replica_data, fits_replica_indexes, fits): dfs.append(pd.DataFrame(dt, columns=FitInfo._fields, index=inds)) return pd.concat(dfs, axis=1, keys=[fit.name for fit in fits]) + @table def datasets_properties_table(data_input): """Return dataset properties for each dataset in ``data_input``""" dataset_property_dict = defaultdict(list) for dataset in data_input: # only add elements if they don't evaluate to false - ds_input_dict = { - k: v for (k, v) in zip(dataset.argnames(), dataset.comp_tuple) - if v - } + ds_input_dict = {k: v for (k, v) in zip(dataset.argnames(), dataset.comp_tuple) if v} dataset_property_dict["Dataset"].append(ds_input_dict.pop("name")) dataset_property_dict["Training fraction"].append(ds_input_dict.pop("frac", "-")) dataset_property_dict["Weight"].append(ds_input_dict.pop("weight", "-")) dataset_property_dict["C-factors"].append(", ".join(ds_input_dict.pop("cfac", "-"))) dataset_property_dict["Other 
fields"].append( - ", ".join([f"{k}: {v}" for k, v in ds_input_dict.items()]) - if ds_input_dict else "-") + ", ".join([f"{k}: {v}" for k, v in ds_input_dict.items()]) if ds_input_dict else "-" + ) df = pd.DataFrame(dataset_property_dict) df.set_index("Dataset", inplace=True) df = df[["Training fraction", "Weight", "C-factors", "Other fields"]] @@ -425,9 +486,9 @@ def fit_datasets_properties_table(fitinputcontext): """Returns table of dataset properties for each dataset used in a fit.""" return datasets_properties_table(fitinputcontext["data_input"]) + dataset_inputs_commondata = collect("commondata", ("data_input",)) -groups_commondata = collect( - "dataset_inputs_commondata", ("group_dataset_inputs_by_metadata",)) +groups_commondata = collect("dataset_inputs_commondata", ("group_dataset_inputs_by_metadata",)) def print_systype_overlap(groups_commondata, group_dataset_inputs_by_metadata): @@ -460,10 +521,10 @@ def print_systype_overlap(groups_commondata, group_dataset_inputs_by_metadata): else: return "No overlap of systypes" + @table def fit_code_version(fit): - """ Returns table with the code version from ``replica_1/{fitname}.json`` files. - """ + """Returns table with the code version from ``replica_1/{fitname}.json`` files.""" vinfo = {} for json_path in fit.path.glob(f"nnfit/replica_*/{fit.name}.json"): tmp = json.loads(json_path.read_text(encoding="utf-8")).get("version") @@ -474,11 +535,13 @@ def fit_code_version(fit): return pd.DataFrame(vinfo.items(), columns=["module", fit.name]).set_index("module") + fits_fit_code_version = collect("fit_code_version", ("fits",)) + @table def fits_version_table(fits_fit_code_version): - """ Produces a table of version information for multiple fits.""" + """Produces a table of version information for multiple fits.""" vtable = pd.concat(fits_fit_code_version, axis=1) # Fill NaNs with "unavailable" vtable.fillna("unavailable", inplace=True) diff --git a/validphys2/src/validphys/fitveto.py b/validphys2/src/validphys/fitveto.py index 4b3ca9bb9e..a988a14203 100644 --- a/validphys2/src/validphys/fitveto.py +++ b/validphys2/src/validphys/fitveto.py @@ -12,6 +12,7 @@ import json import logging + import numpy as np log = logging.getLogger(__name__) @@ -23,7 +24,7 @@ def distribution_veto(dist, prior_mask, nsigma_threshold): - """ For a given distribution (a list of floats), returns a boolean mask + """For a given distribution (a list of floats), returns a boolean mask specifying the passing elements. The result is a new mask of the elements that satisfy: @@ -43,7 +44,7 @@ def distribution_veto(dist, prior_mask, nsigma_threshold): def integrability_veto(dist, integ_threshold): - """ For a given distribution (a list of floats), returns a boolean mask + """For a given distribution (a list of floats), returns a boolean mask specifying the passing elements. The result is a new mask of the elements that satisfy: value <= integ_threshold @@ -52,8 +53,13 @@ def integrability_veto(dist, integ_threshold): return dist <= integ_threshold -def determine_vetoes(fitinfos: list, nsigma_discard_chi2: float, nsigma_discard_arclength: float, integ_threshold: float): - """ Assesses whether replica fitinfo passes standard NNPDF vetoes +def determine_vetoes( + fitinfos: list, + nsigma_discard_chi2: float, + nsigma_discard_arclength: float, + integ_threshold: float, +): + """Assesses whether replica fitinfo passes standard NNPDF vetoes Returns a dictionary of vetoes and their passing boolean masks. Included in the dictionary is a 'Total' veto. 
""" @@ -78,18 +84,15 @@ def determine_vetoes(fitinfos: list, nsigma_discard_chi2: float, nsigma_discard_ log.warning(f"No integrability numbers in the fitinfo file") else: for i in range(0, len(fitinfos[0].integnumbers)): - values = [j.integnumbers[i] for j in fitinfos] + values = [j.integnumbers[i] for j in fitinfos] key = "IntegNumber_" + str(i) - vetoes[key] = integrability_veto( - values, integ_threshold=integ_threshold) - + vetoes[key] = integrability_veto(values, integ_threshold=integ_threshold) + # Distribution vetoes while True: for key in distributions: values, threshold = distributions[key] - vetoes[key] = distribution_veto( - values, total_mask, nsigma_threshold=threshold - ) + vetoes[key] = distribution_veto(values, total_mask, nsigma_threshold=threshold) new_total_mask = np.all(list(vetoes.values()), axis=0) if sum(new_total_mask) == sum(total_mask): break @@ -102,8 +105,10 @@ def determine_vetoes(fitinfos: list, nsigma_discard_chi2: float, nsigma_discard_ return vetoes -def save_vetoes_info(veto_dict: dict, chi2_threshold, arclength_threshold, integ_threshold, filepath): - """ Saves to file the chi2 and arclength thresholds used by postfit as well as veto +def save_vetoes_info( + veto_dict: dict, chi2_threshold, arclength_threshold, integ_threshold, filepath +): + """Saves to file the chi2 and arclength thresholds used by postfit as well as veto dictionaries which contain information on which replicas pass each veto.""" if filepath.exists(): log.warning(f"Veto file {filepath} already exists. Overwriting file") @@ -111,7 +116,7 @@ def save_vetoes_info(veto_dict: dict, chi2_threshold, arclength_threshold, integ thresholds_dict = { "chi2_threshold": chi2_threshold, "arclength_threshold": arclength_threshold, - "integrability_threshold": integ_threshold + "integrability_threshold": integ_threshold, } veto_dict_tolist = {key: val.tolist() for key, val in veto_dict.items()} combined_dict = {**thresholds_dict, **veto_dict_tolist} diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index 5ced2a8350..3ac464a8a0 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -17,15 +17,15 @@ fk = l.check_fktable(setname="ATLASTTBARTOT", theoryID=53, cfac=('QCD',)) res = load_fktable(fk) """ -import io +import dataclasses import functools +import io import tarfile -import dataclasses import numpy as np import pandas as pd -from validphys.coredata import FKTableData, CFactorData +from validphys.coredata import CFactorData, FKTableData from validphys.pineparser import pineappl_reader @@ -40,11 +40,13 @@ class BadFKTableError(Exception): @dataclasses.dataclass(frozen=True) class GridInfo: """Class containing the basic properties of an FKTable grid.""" + setname: str hadronic: bool ndata: int nx: int + @functools.lru_cache() def load_fktable(spec): """Load the data corresponding to a FKSpec object. 
The cfactors @@ -70,13 +72,13 @@ def load_fktable(spec): return tabledata.with_cfactor(cfprod) + def _get_compressed_buffer(path): archive = tarfile.open(path) members = archive.getmembers() l = len(members) if l != 1: - raise BadFKTableError( - f"Archive {path} should contain one file, but it contains {l}.") + raise BadFKTableError(f"Archive {path} should contain one file, but it contains {l}.") return archive.extractfile(members[0]) @@ -103,9 +105,11 @@ def open_fkpath(path): def _is_header_line(line): return line.startswith((b'_', b'{')) + def _bytes_to_bool(x): return bool(int(x)) + def _parse_fk_options(line_and_stream, value_parsers=None): """Parse a sequence of lines of the form *OPTION: VALUE @@ -146,21 +150,26 @@ def f_(line_and_stream): return processed, lineno, next_line buf.write(next_line) raise BadFKTableError("FKTable should end with FastKernel spec, not with a segment string") + return f_ + @_segment_parser def _parse_string(buf): return buf.getvalue().decode() + @_segment_parser def _parse_flavour_map(buf): buf.seek(0) return np.loadtxt(buf, dtype=bool) + @_segment_parser def _parse_xgrid(buf): return np.fromstring(buf.getvalue(), sep='\n') + # This used a different interface from segment parser because we want it to # be fast. # We assume it is going to be the last section. @@ -170,15 +179,16 @@ def _parse_hadronic_fast_kernel(f): # Note that we need the slower whitespace here because it turns out # that there are fktables where space and tab are used as separators # within the same table. - df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1,2)) - df.columns = list(range(14*14)) + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0, 1, 2)) + df.columns = list(range(14 * 14)) df.index.names = ['data', 'x1', 'x2'] return df + def _parse_dis_fast_kernel(f): """Parse the FastKernel section of a DIS FKTable into a DataFrame. ``f`` should be a stream containing only the section""" - df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0,1)) + df = pd.read_csv(f, sep=r'\s+', header=None, index_col=(0, 1)) df.columns = list(range(14)) df.index.names = ['data', 'x'] return df @@ -186,40 +196,35 @@ def _parse_dis_fast_kernel(f): def _parse_gridinfo(line_and_stream): dict_result, line_number, next_line = _parse_fk_options( - line_and_stream, - value_parsers={ - "HADRONIC": _bytes_to_bool, - "NDATA": int, - "NX": int - }) + line_and_stream, value_parsers={"HADRONIC": _bytes_to_bool, "NDATA": int, "NX": int} + ) gi = GridInfo(**{k.lower(): v for k, v in dict_result.items()}) return gi, line_number, next_line - def _parse_header(lineno, header): if not _is_header_line(header): - raise BadFKTableError(f"Bad header at line {lineno}: First " - "character should be either '_' or '{'") + raise BadFKTableError( + f"Bad header at line {lineno}: First character should be either '_' or '{{'" + ) try: endname = header.index(b'_', 1) except ValueError: raise BadFKTableError(f"Bad header at line {lineno}: Expected '_' after name") from None header_name = header[1:endname] - #Note: This is not the same as header[0]. Bytes iterate as ints. + # Note: This is not the same as header[0]. Bytes iterate as ints. 
return header[0:1], header_name.decode() def _build_sigma(f, res): gi = res["GridInfo"] fm = res["FlavourMap"] - table = ( - _parse_hadronic_fast_kernel(f) if gi.hadronic else _parse_dis_fast_kernel(f) - ) + table = _parse_hadronic_fast_kernel(f) if gi.hadronic else _parse_dis_fast_kernel(f) # Filter out empty flavour indices table = table.loc[:, fm.ravel()] return table + _KNOWN_SEGMENTS = { "GridDesc": _parse_string, "VersionInfo": _parse_fk_options, @@ -266,14 +271,14 @@ def _build_sigma(f, res): ), } + def _check_required_sections(res, lineno): """Check that we have found all the required sections by the time we reach 'FastKernel'""" for section in _KNOWN_SEGMENTS: if section not in res: - raise BadFKTableError( - f"{section} must come before 'FastKernel' section at {lineno}" - ) + raise BadFKTableError(f"{section} must come before 'FastKernel' section at {lineno}") + def parse_fktable(f): """Parse an open byte stream into an FKTableData. Raise a BadFKTableError @@ -327,9 +332,7 @@ def parse_fktable(f): out, lineno, header = parser(line_and_stream) except Exception as e: # Note that the old lineno is the one we want - raise BadFKTableError( - f"Failed processing header {header_name} on line {lineno}" - ) from e + raise BadFKTableError(f"Failed processing header {header_name} on line {lineno}") from e res[header_name] = out diff --git a/validphys2/src/validphys/gridvalues.py b/validphys2/src/validphys/gridvalues.py index d038faa7f4..83628a9997 100644 --- a/validphys2/src/validphys/gridvalues.py +++ b/validphys2/src/validphys/gridvalues.py @@ -46,6 +46,7 @@ "csbar": [4, -3], } + def _grid_values(lpdf, flmat, xmat, qmat): """Compute lpdf.grid_values with more forgiving argument types""" flmat = np.atleast_1d(np.asanyarray(flmat)) @@ -53,7 +54,8 @@ def _grid_values(lpdf, flmat, xmat, qmat): qmat = np.atleast_1d(np.asarray(qmat)) return lpdf.grid_values(flmat, xmat, qmat) -def grid_values(pdf:PDF, flmat, xmat, qmat): + +def grid_values(pdf: PDF, flmat, xmat, qmat): """ Evaluate ``x*f(x)`` on a grid of points in flavour, x and Q. @@ -97,7 +99,8 @@ def grid_values(pdf:PDF, flmat, xmat, qmat): """ return _grid_values(pdf.load(), flmat, xmat, qmat) -def central_grid_values(pdf:PDF, flmat, xmat, qmat): + +def central_grid_values(pdf: PDF, flmat, xmat, qmat): """Same as :py:func:`grid_values` but it returns only the central values. The return value is indexed as:: @@ -109,10 +112,12 @@ def central_grid_values(pdf:PDF, flmat, xmat, qmat): return _grid_values(pdf.load_t0(), flmat, xmat, qmat) -#TODO: Investigate writting these in cython/cffi/numba/... +# TODO: Investigate writting these in cython/cffi/numba/... -def evaluate_luminosity(pdf_set: LHAPDFSet, n: int, s:float, mx: float, - x1: float, x2: float, channel): + +def evaluate_luminosity( + pdf_set: LHAPDFSet, n: int, s: float, mx: float, x1: float, x2: float, channel +): """Returns PDF luminosity at specified values of mx, x1, x2, sqrts**2 for a given channel. @@ -123,7 +128,7 @@ def evaluate_luminosity(pdf_set: LHAPDFSet, n: int, s:float, mx: float, channel: The channel tag name from LUMI_CHANNELS. 
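A possible way to call the luminosity evaluation above directly from Python is sketched below. It assumes the named set is installed where LHAPDF can find it, and the scale, momentum fractions and member index are arbitrary examples rather than anything prescribed by the code.

```python
from validphys.core import PDF
from validphys.gridvalues import evaluate_luminosity

lpdf = PDF("NNPDF40_nnlo_as_01180").load()  # LHAPDFSet with all members
s = 13000.0**2                              # sqrts**2
mx, x1, x2 = 100.0, 0.1, 0.01               # example kinematics
lumi = evaluate_luminosity(lpdf, 0, s, mx, x1, x2, "gg")  # member 0, gg channel
print(lumi)
```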
""" - + # fmt: off res = 0 if channel == 'gg': res = pdf_set.xfxQ(x1, mx, n, 21) * pdf_set.xfxQ(x2, mx, n, 21) @@ -157,6 +162,7 @@ def evaluate_luminosity(pdf_set: LHAPDFSet, n: int, s:float, mx: float, else: raise ValueError("Bad channel") + # fmt: on # The following is equivalent to Eq.(2) in arXiv:1607.01831 - return res/x1/x2/s + return res / x1 / x2 / s diff --git a/validphys2/src/validphys/hyper_algorithm.py b/validphys2/src/validphys/hyper_algorithm.py index 8ab82edb58..35db03fa35 100644 --- a/validphys2/src/validphys/hyper_algorithm.py +++ b/validphys2/src/validphys/hyper_algorithm.py @@ -3,6 +3,7 @@ """ import itertools import logging + import pandas as pd log = logging.getLogger(__name__) diff --git a/validphys2/src/validphys/hyperoptplot.py b/validphys2/src/validphys/hyperoptplot.py index a35ec75515..c09b42f041 100644 --- a/validphys2/src/validphys/hyperoptplot.py +++ b/validphys2/src/validphys/hyperoptplot.py @@ -8,19 +8,21 @@ # still used for the plotting script -import os -import re import glob import json import logging +import os +import re from types import SimpleNamespace + import numpy as np import pandas as pd +import seaborn as sns + from reportengine.figure import figure from reportengine.table import table -import seaborn as sns -from validphys.hyper_algorithm import autofilter_dataframe from validphys import plotutils +from validphys.hyper_algorithm import autofilter_dataframe log = logging.getLogger(__name__) @@ -706,7 +708,7 @@ def plot_scans(df, best_df, plotting_parameter, include_best=True): key = plotting_parameter mode = plotting_styles[plotting_parameter] - if mode in (0,2): # normal scatter plot + if mode in (0, 2): # normal scatter plot ax = sns.scatterplot(x=key, y=loss_k, data=df, ax=ax) best_x = best_df.get(key) if mode == 2: @@ -743,7 +745,6 @@ def plot_scans(df, best_df, plotting_parameter, include_best=True): alpha=0.4, ) - # Finally plot the "best" one, which will be first if include_best: ax = sns.scatterplot(x=best_x, y=best_df.get(loss_k), ax=ax, color="orange", marker="s") diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index 316cf8d48e..295bcd30fb 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -11,30 +11,29 @@ import pandas as pd from reportengine import collect -from reportengine.table import table from reportengine.checks import check_positive - -from validphys.core import CutsPolicy +from reportengine.table import table from validphys import plotoptions +from validphys.core import CutsPolicy log = logging.getLogger(__name__) - @check_positive('titlelevel') -def describe_kinematics(commondata, titlelevel:int=1): +def describe_kinematics(commondata, titlelevel: int = 1): """Output a markdown text describing the stored metadata for a given commondata. titlelevel can be used to control the header level of the title. 
""" import inspect + cd = commondata info = plotoptions.get_info(cd) proc = cd.load_commondata().commondataproc src = inspect.getsource(info.kinematics_override.xq2map) - titlespec = '#'*titlelevel - return (f""" + titlespec = '#' * titlelevel + return f""" {titlespec} {cd} {info.dataset_label} @@ -56,7 +55,8 @@ def describe_kinematics(commondata, titlelevel:int=1): {src} ``` -""") +""" + describe_kinematics.highlight = 'markdown' @@ -64,7 +64,8 @@ def describe_kinematics(commondata, titlelevel:int=1): nfittedlabel = '$N_{fitted}$' ndatalabel = '$N_{data}$' -def kinlimits(commondata, cuts, use_cuts, use_kinoverride:bool=True): + +def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True): """Return a mapping containing the number of fitted and used datapoints, as well as the label, minimum and maximum value for each of the three kinematics. If ``use_kinoverride`` is set to False, the PLOTTING files will @@ -83,7 +84,7 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride:bool=True): else: nfitted = '-' - d = {'dataset': commondata, ndatalabel:ndata, nfittedlabel:nfitted} + d = {'dataset': commondata, ndatalabel: ndata, nfittedlabel: nfitted} for i, key in enumerate(['k1', 'k2', 'k3']): kmin = kintable[key].min() kmax = kintable[key].max() @@ -93,32 +94,47 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride:bool=True): d[key + ' max'] = kmax return d + all_kinlimits = collect(kinlimits, ('dataset_inputs',)) + @table -def all_kinlimits_table(all_kinlimits, use_kinoverride:bool=True): +def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True): """Return a table with the kinematic limits for the datasets given as input in dataset_inputs. If the PLOTTING overrides are not used, the information on sqrt(k2) will be displayed.""" - table = pd.DataFrame(all_kinlimits, - columns=['dataset', '$N_{data}$', '$N_{fitted}$', - 'k1', 'k1 min', 'k1 max', 'k2', 'k2 min', 'k2 max', 'k3', 'k3 min', 'k3 max' - ]) + table = pd.DataFrame( + all_kinlimits, + columns=[ + 'dataset', + '$N_{data}$', + '$N_{fitted}$', + 'k1', + 'k1 min', + 'k1 max', + 'k2', + 'k2 min', + 'k2 max', + 'k3', + 'k3 min', + 'k3 max', + ], + ) - #We really want to see the square root of the scale + # We really want to see the square root of the scale if not use_kinoverride: table['k2'] = 'sqrt(' + table['k2'] + ')' table['k2 min'] = np.sqrt(table['k2 min']) table['k2 max'] = np.sqrt(table['k2 max']) - #renaming the columns is overly complicated + # renaming the columns is overly complicated cols = list(table.columns) - cols[6:9] = ['sqrt(k2)', 'sqrt(k2) min', 'sqrt(k2) max'] + cols[6:9] = ['sqrt(k2)', 'sqrt(k2) min', 'sqrt(k2) max'] table.columns = cols - return table + @table def all_commondata_grouping(all_commondata, metadata_group): """Return a table with the grouping specified @@ -126,21 +142,22 @@ def all_commondata_grouping(all_commondata, metadata_group): """ records = [] for cd in all_commondata: - records.append({'dataset': str(cd), metadata_group: getattr(plotoptions.get_info(cd), metadata_group)}) + records.append( + {'dataset': str(cd), metadata_group: getattr(plotoptions.get_info(cd), metadata_group)} + ) df = pd.DataFrame.from_records(records, index='dataset') # sort first by grouping alphabetically and then dataset name return df.sort_values([metadata_group, 'dataset']) -def total_fitted_points(all_kinlimits_table)->int: + +def total_fitted_points(all_kinlimits_table) -> int: """Print the total number of fitted points in a given set of data""" tb = all_kinlimits_table return 
int(tb[nfittedlabel].sum()) -XQ2Map = namedtuple( - 'XQ2Map', - ('experiment', 'commondata', 'fitted', 'masked', "group") -) +XQ2Map = namedtuple('XQ2Map', ('experiment', 'commondata', 'fitted', 'masked', "group")) + def xq2map_with_cuts(commondata, cuts, group_name=None): """Return two (x,Q²) tuples: one for the fitted data and one for the @@ -154,22 +171,23 @@ def xq2map_with_cuts(commondata, cuts, group_name=None): boolmask[mask] = True fitted_kintable = kintable.loc[boolmask] masked_kitable = kintable.loc[~boolmask] - xq2fitted = plotoptions.get_xq2map(fitted_kintable, info) + xq2fitted = plotoptions.get_xq2map(fitted_kintable, info) xq2masked = plotoptions.get_xq2map(masked_kitable, info) - return XQ2Map( - info.experiment, commondata, xq2fitted, xq2masked, group_name - ) + return XQ2Map(info.experiment, commondata, xq2fitted, xq2masked, group_name) fitted_kintable = plotoptions.get_xq2map(kintable, info) empty = (np.array([]), np.array([])) - return XQ2Map( - info.experiment, commondata, fitted_kintable, empty, group_name - ) + return XQ2Map(info.experiment, commondata, fitted_kintable, empty, group_name) + dataset_inputs_by_groups_xq2map = collect( xq2map_with_cuts, - ('group_dataset_inputs_by_metadata', 'data_input',) + ( + 'group_dataset_inputs_by_metadata', + 'data_input', + ), ) + def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False): """ Table containing the kinematics of a commondata object, diff --git a/validphys2/src/validphys/lhaindex.py b/validphys2/src/validphys/lhaindex.py index 446fa01548..cd8df6401e 100644 --- a/validphys2/src/validphys/lhaindex.py +++ b/validphys2/src/validphys/lhaindex.py @@ -5,27 +5,34 @@ @author: zah """ +import fnmatch +from functools import lru_cache +import glob import os import os.path as osp import re -import glob -import fnmatch -from functools import lru_cache -from reportengine.compat import yaml import lhapdf +from reportengine.compat import yaml _indexes_to_names = None _names_to_indexes = None + def expand_index_names(globstr): return fnmatch.filter(get_names_to_indexes().keys(), globstr) + def expand_local_names(globstr): paths = get_lha_paths() - return [name for path in paths for name in glob.glob1(path, globstr) - if osp.isdir(osp.join(path, name))] + return [ + name + for path in paths + for name in glob.glob1(path, globstr) + if osp.isdir(osp.join(path, name)) + ] + def expand_names(globstr): """Return names of installed PDFs. 
If none is found, @@ -35,15 +42,17 @@ def expand_names(globstr): names = expand_index_names(globstr) return names + def get_indexes_to_names(): global _indexes_to_names if _indexes_to_names is None: _indexes_to_names = parse_index(get_index_path()) return _indexes_to_names + def finddir(name): for path in get_lha_paths(): - d = osp.join(path,name) + d = osp.join(path, name) if osp.isdir(d): return d raise FileNotFoundError(name) @@ -51,30 +60,32 @@ def finddir(name): def isinstalled(name): """Check that name exists in LHAPDF dir""" - return name and any( - osp.isdir(osp.join(path, name)) for path in get_lha_paths()) + return name and any(osp.isdir(osp.join(path, name)) for path in get_lha_paths()) def get_names_to_indexes(): global _names_to_indexes if _names_to_indexes is None: - _names_to_indexes = {name:index for index,name in - get_indexes_to_names().items()} + _names_to_indexes = {name: index for index, name in get_indexes_to_names().items()} return _names_to_indexes + def get_pdf_indexes(name): """Get index in the amc@nlo format""" info = parse_info(name) ind = info['SetIndex'] num_members = info['NumMembers'] - return {'lhapdf_id' : ind, - 'lhapdf_min': ind + (num_members > 1), - 'lhapdf_max': ind + num_members - 1} + return { + 'lhapdf_id': ind, + 'lhapdf_min': ind + (num_members > 1), + 'lhapdf_max': ind + num_members - 1, + } def get_pdf_name(index): return get_indexes_to_names()[str(index)] + def parse_index(index_file): d = {} name_re = '(\d+)\s+(\S+)' @@ -85,13 +96,15 @@ def parse_index(index_file): d[index] = m.group(2) return d + def get_collaboration(name): try: - col = name[:name.index('_')] + col = name[: name.index('_')] except ValueError: col = name return col + def as_from_name(name): """Annoying function needed because this is not in the info files. 
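A worked toy example of the index arithmetic in `get_pdf_indexes` above; the `SetIndex` and `NumMembers` values are invented and do not correspond to any real set.

```python
# Invented .info contents for a 100-replica set plus its central member.
info = {"SetIndex": 900000, "NumMembers": 101}

ind, num = info["SetIndex"], info["NumMembers"]
indexes = {
    "lhapdf_id": ind,               # the set itself
    "lhapdf_min": ind + (num > 1),  # first error member, skipping member 0
    "lhapdf_max": ind + num - 1,    # last member
}
print(indexes)
# {'lhapdf_id': 900000, 'lhapdf_min': 900001, 'lhapdf_max': 900100}
```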
as(M_z) there is actually as(M_ref).""" @@ -109,19 +122,23 @@ def infofilename(name): return info raise FileNotFoundError(name + ".info") + @lru_cache() def parse_info(name): with open(infofilename(name)) as infofile: result = yaml.safe_load(infofile) return result + def get_lha_paths(): return lhapdf.paths() + def get_lha_datapath(): return get_lha_paths()[-1] -def get_index_path(folder = None): + +def get_index_path(folder=None): if folder: index_file = os.path.join(folder, 'pdfsets.index') if folder is None or not osp.exists(index_file): diff --git a/validphys2/src/validphys/lhapdfset.py b/validphys2/src/validphys/lhapdfset.py index 7c777df893..860a4fcf39 100644 --- a/validphys2/src/validphys/lhapdfset.py +++ b/validphys2/src/validphys/lhapdfset.py @@ -27,8 +27,9 @@ 21: 0.007604124516892057} """ import logging -import numpy as np + import lhapdf +import numpy as np log = logging.getLogger(__name__) diff --git a/validphys2/src/validphys/lhio.py b/validphys2/src/validphys/lhio.py index 615afd63ca..6be9187318 100644 --- a/validphys2/src/validphys/lhio.py +++ b/validphys2/src/validphys/lhio.py @@ -18,40 +18,42 @@ log = logging.getLogger(__name__) + def split_sep(f): for line in f: if line.startswith(b'---'): break yield line -def read_xqf_from_file(f): +def read_xqf_from_file(f): lines = split_sep(f) try: (xtext, qtext, ftext) = [next(lines) for _ in range(3)] except StopIteration: return None - xvals = np.fromstring(xtext, sep = " ") - qvals = np.fromstring(qtext, sep = " ") - fvals = np.fromstring(ftext, sep = " ", dtype=int) - vals = np.fromstring(b''.join(lines), sep= " ") - return pd.Series(vals, index = pd.MultiIndex.from_product((xvals, qvals, fvals))) + xvals = np.fromstring(xtext, sep=" ") + qvals = np.fromstring(qtext, sep=" ") + fvals = np.fromstring(ftext, sep=" ", dtype=int) + vals = np.fromstring(b''.join(lines), sep=" ") + return pd.Series(vals, index=pd.MultiIndex.from_product((xvals, qvals, fvals))) def read_xqf_from_lhapdf(pdf, replica, kin_grids): indexes = tuple(kin_grids.index) - #Use LHAPDF directly to avoid the insanely deranged replica 0 convention - #of libnnpdf. - #TODO: Find a way around this + # Use LHAPDF directly to avoid the insanely deranged replica 0 convention + # of libnnpdf. 
+ # TODO: Find a way around this import lhapdf xfxQ = lhapdf.mkPDF(pdf.name, int(replica)).xfxQ vals = [] for x in indexes: - #TODO: Change this for a faster grid_values call - vals += [xfxQ(x[3],x[1],x[2])] - return pd.Series(vals, index = kin_grids.index) + # TODO: Change this for a faster grid_values call + vals += [xfxQ(x[3], x[1], x[2])] + return pd.Series(vals, index=kin_grids.index) + def read_all_xqf(f): while True: @@ -60,17 +62,15 @@ def read_all_xqf(f): return yield result -def load_replica(pdf, rep, kin_grids=None): +def load_replica(pdf, rep, kin_grids=None): suffix = str(rep).zfill(4) pdf_name = str(pdf) - path = osp.join(lhaindex.finddir(pdf_name), - pdf_name + "_" + suffix + ".dat") + path = osp.join(lhaindex.finddir(pdf_name), pdf_name + "_" + suffix + ".dat") - log.debug("Loading replica {rep} at {path}".format(rep=rep, - path=path)) + log.debug("Loading replica {rep} at {path}".format(rep=rep, path=path)) with open(path, 'rb') as inn: header = b"".join(split_sep(inn)) @@ -82,30 +82,32 @@ def load_replica(pdf, rep, kin_grids=None): xfqs = pd.concat(xfqs, keys=range(len(xfqs))) return header, xfqs -#Split this to debug easily + +# Split this to debug easily def _rep_to_buffer(out, header, subgrids): sep = b'---' out.write(header) out.write(sep) - for _,g in subgrids.groupby(level=0): + for _, g in subgrids.groupby(level=0): out.write(b'\n') ind = g.index.get_level_values(1).unique() - np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ') + np.savetxt(out, ind, fmt='%.7E', delimiter=' ', newline=' ') out.write(b'\n') ind = g.index.get_level_values(2).unique() - np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ') + np.savetxt(out, ind, fmt='%.7E', delimiter=' ', newline=' ') out.write(b'\n') - #Integer format + # Integer format ind = g.index.get_level_values(3).unique() - np.savetxt(out, ind, delimiter=' ', fmt="%d", - newline=' ') + np.savetxt(out, ind, delimiter=' ', fmt="%d", newline=' ') out.write(b'\n ') - #Reshape so printing is easy - reshaped = g.values.reshape((len(g.groupby(level=1))*len(g.groupby(level=2)), - len(g.groupby(level=3)))) + # Reshape so printing is easy + reshaped = g.values.reshape( + (len(g.groupby(level=1)) * len(g.groupby(level=2)), len(g.groupby(level=3))) + ) np.savetxt(out, reshaped, delimiter=" ", newline="\n", fmt='%14.7E') out.write(sep) + def write_replica(rep, set_root, header, subgrids): suffix = str(rep).zfill(4) target_file = set_root / f'{set_root.name}_{suffix}.dat' @@ -114,46 +116,54 @@ def write_replica(rep, set_root, header, subgrids): with open(target_file, 'wb') as out: _rep_to_buffer(out, header, subgrids) + def load_all_replicas(pdf, db=None): if db is not None: - #removing str() will crash as it casts to unicode due to pdf name + # removing str() will crash as it casts to unicode due to pdf name key = str("(load_all_replicas, %s)" % pdf.get_key()) if key in db: return db[key] rep0headers, rep0grids = load_replica(pdf, 0) - headers, grids = zip(*[load_replica(pdf, rep, rep0grids) - for rep in range(1, len(pdf))]) + headers, grids = zip(*[load_replica(pdf, rep, rep0grids) for rep in range(1, len(pdf))]) result = [rep0headers] + list(headers), [rep0grids] + list(grids) if db is not None: db[key] = result return result + def big_matrix(gridlist): """Return a properly indexes matrix of the differences between each member and the central value""" central_value = gridlist[0] - X = pd.concat(gridlist[1:], axis=1, - keys=range(1,len(gridlist)+1), #avoid confusion with rep0 - ).subtract(central_value, axis=0) + X = 
pd.concat( + gridlist[1:], + axis=1, + keys=range(1, len(gridlist) + 1), # avoid confusion with rep0 + ).subtract(central_value, axis=0) if np.any(X.isnull()) or X.shape[0] != len(central_value): raise ValueError("Incompatible grid specifications") return X + def rep_matrix(gridlist): """Return a properly indexes matrix of all the members""" - X = pd.concat(gridlist, axis=1, - keys=range(1,len(gridlist)+1), #avoid confusion with rep0 - ) + X = pd.concat( + gridlist, + axis=1, + keys=range(1, len(gridlist) + 1), # avoid confusion with rep0 + ) if np.ravel(pd.isnull(X)).any(): raise ValueError("Found null values in grid") return X -def _index_to_path(set_folder, set_name, index): - return set_folder/('%s_%04d.dat' % (set_name, index)) + +def _index_to_path(set_folder, set_name, index): + return set_folder / ('%s_%04d.dat' % (set_name, index)) + def generate_replica0(pdf, kin_grids=None, extra_fields=None): - """ Generates a replica 0 as an average over an existing set of LHAPDF + """Generates a replica 0 as an average over an existing set of LHAPDF replicas and outputs it to the PDF's parent folder Parameters @@ -188,15 +198,24 @@ def generate_replica0(pdf, kin_grids=None, extra_fields=None): try: M = rep_matrix(grids) except ValueError as e: - raise ValueError("Null values found in replica grid matrix. " - "This may indicate that the headers don't match" - "If this is intentional try using use_rep0grid=True") from e + raise ValueError( + "Null values found in replica grid matrix. " + "This may indicate that the headers don't match" + "If this is intentional try using use_rep0grid=True" + ) from e header = b'PdfType: central\nFormat: lhagrid1\n' write_replica(0, set_root, header, M.mean(axis=1)) + def new_pdf_from_indexes( - pdf, indexes, set_name=None, folder=None, - extra_fields=None, installgrid=False, use_rep0grid=False): + pdf, + indexes, + set_name=None, + folder=None, + extra_fields=None, + installgrid=False, + use_rep0grid=False, +): """Create a new PDF set from by selecting replicas from another one. Parameters @@ -227,10 +246,9 @@ def new_pdf_from_indexes( if folder is None: folder = pathlib.Path() - set_root = folder/set_name + set_root = folder / set_name if set_root.exists(): - log.warning("Target directory for new PDF already exists %s. " - "Deleting contents.", set_root) + log.warning("Target directory for new PDF already exists %s. 
Deleting contents.", set_root) if set_root.is_dir(): shutil.rmtree(str(set_root)) else: @@ -241,15 +259,15 @@ def new_pdf_from_indexes( original_info = pdf.infopath original_folder = original_info.parent - new_info = set_root/(set_name + '.info') + new_info = set_root / (set_name + '.info') - new_len = len(indexes)+1 + new_len = len(indexes) + 1 with original_info.open() as orig_file, new_info.open('w') as new_file: for line in orig_file: if line.find('SetDesc') >= 0: new_file.write('SetDesc: "Reweighted set from %s"\n' % pdf) - elif line.find('NumMembers') >=0: + elif line.find('NumMembers') >= 0: new_file.write('NumMembers: %d\n' % new_len) else: new_file.write(line) @@ -259,7 +277,7 @@ def new_pdf_from_indexes( else: rep0grid = None - for newindex,oldindex in enumerate(indexes, 1): + for newindex, oldindex in enumerate(indexes, 1): original_path = _index_to_path(original_folder, pdf, oldindex) new_path = _index_to_path(set_root, set_name, newindex) shutil.copy(original_path, new_path) @@ -274,18 +292,18 @@ def new_pdf_from_indexes( lhapdf.setPaths(oldpaths) if installgrid: - newpath = pathlib.Path(lhaindex.get_lha_datapath()) / set_name + newpath = pathlib.Path(lhaindex.get_lha_datapath()) / set_name log.info(f"Installing new PDF set at {newpath}") shutil.copytree(set_root, newpath) -def hessian_from_lincomb(pdf, V, set_name=None, folder = None, extra_fields=None): +def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None): """Construct a new LHAPDF grid from a linear combination of members""" # preparing output folder neig = V.shape[1] - base = pathlib.Path(lhapdf.paths()[-1]) / pdf.name + base = pathlib.Path(lhapdf.paths()[-1]) / pdf.name if set_name is None: set_name = pdf.name + "_hessian_" + str(neig) if folder is None: @@ -295,16 +313,16 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder = None, extra_fields=None # can lead to the wrong result if Neig is not the same between both PDF sets. if os.path.exists(set_root): shutil.rmtree(set_root) - log.warning("Target directory for new PDF, %s, already exists. " "Removing contents.", - set_root,) + log.warning( + "Target directory for new PDF, %s, already exists. Removing contents.", + set_root, + ) os.makedirs(os.path.join(set_root)) # copy replica 0 - shutil.copy(base/f'{pdf}_0000.dat', set_root / f"{set_name }_0000.dat") - - with open(base/f'{pdf}.info', 'r') as inn, \ - open(set_root / f'{set_name }.info', 'w') as out: + shutil.copy(base / f'{pdf}_0000.dat', set_root / f"{set_name }_0000.dat") + with open(base / f'{pdf}.info', 'r') as inn, open(set_root / f'{set_name }.info', 'w') as out: for l in inn.readlines(): if l.find("SetDesc:") >= 0: out.write(f"SetDesc: \"Hessian {pdf}_hessian\"\n") @@ -318,7 +336,10 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder = None, extra_fields=None yaml.dump(extra_fields, out, default_flow_style=False) _headers, grids = load_all_replicas(pdf) - result = (big_matrix(grids).dot(V)).add(grids[0], axis=0, ) + result = (big_matrix(grids).dot(V)).add( + grids[0], + axis=0, + ) hess_header = b"PdfType: error\nFormat: lhagrid1\n" for column in result.columns: write_replica(column + 1, set_root, hess_header, result[column]) diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index 25523cb15f..5f330fc845 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -7,68 +7,111 @@ Resolve paths to useful objects, and query the existence of different resources within the specified paths. 
""" -import sys -import pathlib import functools +from functools import cached_property import logging -import re -import tempfile -import shutil +import mimetypes import os import os.path as osp -import urllib.parse as urls -import mimetypes -from functools import cached_property - +import pathlib +import re +import shutil +import sys +import tempfile from typing import List +import urllib.parse as urls import requests -from reportengine.compat import yaml -from reportengine import filefinder -from validphys.core import (CommonDataSpec, FitSpec, TheoryIDSpec, FKTableSpec, - PositivitySetSpec, IntegrabilitySetSpec, DataSetSpec, PDF, Cuts, - DataGroupSpec, peek_commondata_metadata, CutsPolicy, - InternalCutsWrapper, HyperscanSpec) +from reportengine import filefinder +from reportengine.compat import yaml +from validphys import lhaindex, pineparser +from validphys.core import ( + PDF, + CommonDataSpec, + Cuts, + CutsPolicy, + DataGroupSpec, + DataSetSpec, + FitSpec, + FKTableSpec, + HyperscanSpec, + IntegrabilitySetSpec, + InternalCutsWrapper, + PositivitySetSpec, + TheoryIDSpec, + peek_commondata_metadata, +) from validphys.utils import tempfile_cleaner -from validphys import lhaindex -from validphys import pineparser DEFAULT_NNPDF_PROFILE_PATH = f"{sys.prefix}/share/NNPDF/nnprofile.yaml" log = logging.getLogger(__name__) -class LoaderError(Exception): pass -class LoadFailedError(FileNotFoundError, LoaderError): pass +class LoaderError(Exception): + pass + + +class LoadFailedError(FileNotFoundError, LoaderError): + pass -class DataNotFoundError(LoadFailedError): pass -class SysNotFoundError(LoadFailedError): pass +class DataNotFoundError(LoadFailedError): + pass -class FKTableNotFound(LoadFailedError): pass -class CfactorNotFound(LoadFailedError): pass +class SysNotFoundError(LoadFailedError): + pass -class CompoundNotFound(LoadFailedError): pass -class TheoryNotFound(LoadFailedError): pass +class FKTableNotFound(LoadFailedError): + pass -class TheoryDataBaseNotFound(LoadFailedError): pass -class FitNotFound(LoadFailedError): pass +class CfactorNotFound(LoadFailedError): + pass -class HyperscanNotFound(LoadFailedError): pass -class CutsNotFound(LoadFailedError): pass +class CompoundNotFound(LoadFailedError): + pass -class PDFNotFound(LoadFailedError): pass -class ProfileNotFound(LoadFailedError): pass +class TheoryNotFound(LoadFailedError): + pass -class RemoteLoaderError(LoaderError): pass -class InconsistentMetaDataError(LoaderError): pass +class TheoryDataBaseNotFound(LoadFailedError): + pass + + +class FitNotFound(LoadFailedError): + pass + + +class HyperscanNotFound(LoadFailedError): + pass + + +class CutsNotFound(LoadFailedError): + pass + + +class PDFNotFound(LoadFailedError): + pass + + +class ProfileNotFound(LoadFailedError): + pass + + +class RemoteLoaderError(LoaderError): + pass + + +class InconsistentMetaDataError(LoaderError): + pass + def _get_nnpdf_profile(profile_path=None): """Returns the NNPDF profile as a dictionary @@ -100,6 +143,7 @@ def _get_nnpdf_profile(profile_path=None): raise LoaderError(f"Could not parse profile file {mpath}: {e}") from e return profile_dict + class LoaderBase: """ Base class for the NNPDF loader. 
@@ -121,7 +165,7 @@ def __init__(self, profile=None): raise LoaderError(f"The data path {datapath} does not exist.") if not resultspath.exists(): - raise LoaderError(f"The results path {resultspath} does not exist.") + raise LoaderError(f"The results path {resultspath} does not exist.") # And save them up self.datapath = datapath @@ -144,13 +188,11 @@ def _vp_cache(self): log.info(f"Creating validphys cache directory: {vpcache}") vpcache.mkdir(parents=True, exist_ok=True) except Exception as e: - raise LoaderError("Could not create the cache directory " - f"at {vpcache}") from e + raise LoaderError("Could not create the cache directory " f"at {vpcache}") from e return vpcache -def rebuild_commondata_without_cuts( - filename_with_cuts, cuts, datapath_filename, newpath): +def rebuild_commondata_without_cuts(filename_with_cuts, cuts, datapath_filename, newpath): """Take a CommonData file that is stored with the cuts applied and write another file with no cuts. The points that were not present in the original file have the same kinematics as the file in @@ -166,49 +208,49 @@ def rebuild_commondata_without_cuts( return index_pattern = re.compile(r'(?P\s*)(?P\d+)') - data_line_pattern = re.compile(r'\s*(?P\d+)' - r'\s+(?P\S+)\s+' - r'(?P(\s*\S+){3})\s+') + data_line_pattern = re.compile( + r'\s*(?P\d+)' r'\s+(?P\S+)\s+' r'(?P(\s*\S+){3})\s+' + ) mask = cuts.load() maskiter = iter(mask) ndata = metadata.ndata nsys = metadata.nsys next_index = next(maskiter) - with open(filename_with_cuts, 'r') as fitfile, \ - open(datapath_filename) as dtfile, \ - open(newpath, 'w') as newfile: + with open(filename_with_cuts, 'r') as fitfile, open(datapath_filename) as dtfile, open( + newpath, 'w' + ) as newfile: newfile.write(dtfile.readline()) - #discard this line + # discard this line fitfile.readline() - for i in range(1 ,ndata+1): - #You gotta love mismatched indexing - if i-1 == next_index: + for i in range(1, ndata + 1): + # You gotta love mismatched indexing + if i - 1 == next_index: line = fitfile.readline() - line = re.sub( - index_pattern, rf'\g{i}', line, count=1) + line = re.sub(index_pattern, rf'\g{i}', line, count=1) newfile.write(line) next_index = next(maskiter, None) - #drop the data file line + # drop the data file line dtfile.readline() else: line = dtfile.readline() - #check that we know where we are + # check that we know where we are m = re.match(index_pattern, line) assert int(m.group('index')) == i - #We have index, process type, and 3*kinematics - #that we would like to keep. + # We have index, process type, and 3*kinematics + # that we would like to keep. m = re.match(data_line_pattern, line) - newfile.write(line[:m.end()]) - #And value, stat, *sys that we want to drop - #Do not use string join to keep up with the ugly format - #This should really be nan's, but the c++ streams that could read this - #do not have the right interface. - #https://stackoverflow.com/questions/11420263/is-it-possible-to-read-infinity-or-nan-values-using-input-streams - zeros = '-0\t'*(2 + 2*nsys) + newfile.write(line[: m.end()]) + # And value, stat, *sys that we want to drop + # Do not use string join to keep up with the ugly format + # This should really be nan's, but the c++ streams that could read this + # do not have the right interface. + # https://stackoverflow.com/questions/11420263/is-it-possible-to-read-infinity-or-nan-values-using-input-streams + zeros = '-0\t' * (2 + 2 * nsys) newfile.write(f'{zeros}\n') -#TODO: Deprecate get methods? + +# TODO: Deprecate get methods? 
class Loader(LoaderBase): """Load various resources from the NNPDF data path.""" @@ -230,13 +272,14 @@ def available_hyperscans(self): @functools.lru_cache() def available_theories(self): """Return a string token for each of the available theories""" - theory_token = 'theory_' - return {folder.name[len(theory_token):] - for folder in self.datapath.glob(theory_token+'*')} + theory_token = 'theory_' + return { + folder.name[len(theory_token) :] for folder in self.datapath.glob(theory_token + '*') + } + @property @functools.lru_cache() def available_datasets(self): - data_str = "DATA_" # We filter out the positivity sets here return { @@ -254,20 +297,19 @@ def available_pdfs(self): def commondata_folder(self): return self.datapath / 'commondata' - def check_commondata(self, setname, sysnum=None, use_fitcommondata=False, - fit=None): + def check_commondata(self, setname, sysnum=None, use_fitcommondata=False, fit=None): if use_fitcommondata: if not fit: - raise LoadFailedError( - "Must specify a fit when setting use_fitcommondata") - datafilefolder = (fit.path/'filter')/setname - newpath = datafilefolder/f'FILTER_{setname}.dat' + raise LoadFailedError("Must specify a fit when setting use_fitcommondata") + datafilefolder = (fit.path / 'filter') / setname + newpath = datafilefolder / f'FILTER_{setname}.dat' if not newpath.exists(): - oldpath = datafilefolder/f'DATA_{setname}.dat' + oldpath = datafilefolder / f'DATA_{setname}.dat' if not oldpath.exists(): - raise DataNotFoundError(f"Either {newpath} or {oldpath} " - "are needed with `use_fitcommondata`") - #This is to not repeat all the error handling stuff + raise DataNotFoundError( + f"Either {newpath} or {oldpath} are needed with `use_fitcommondata`" + ) + # This is to not repeat all the error handling stuff basedata = self.check_commondata(setname, sysnum=sysnum) basedata_path = basedata.datafile cuts = self.check_fit_cuts(basedata, fit=fit) @@ -278,9 +320,9 @@ def check_commondata(self, setname, sysnum=None, use_fitcommondata=False, f"Found fit using old commondata export settings: " f"'{fit}'. The commondata that are used in this run " "will be updated now." - "Please consider re-uploading it.") - log.warning( - f"Points that do not pass the cuts are set to zero!") + "Please consider re-uploading it." + ) + log.warning("Points that do not pass the cuts are set to zero!") log.info(f"Upgrading filtered commondata. Writing {newpath}") rebuild_commondata_without_cuts(oldpath, cuts, basedata_path, newpath) @@ -288,33 +330,36 @@ def check_commondata(self, setname, sysnum=None, use_fitcommondata=False, else: datafile = self.commondata_folder / f'DATA_{setname}.dat' if not datafile.exists(): - raise DataNotFoundError(("Could not find Commondata set: '%s'. " - "File '%s' does not exist.") - % (setname, datafile)) + raise DataNotFoundError( + ("Could not find Commondata set: '%s'. " "File '%s' does not exist.") + % (setname, datafile) + ) if sysnum is None: sysnum = 'DEFAULT' - sysfile = (self.commondata_folder / 'systypes' / - ('SYSTYPE_%s_%s.dat' % (setname, sysnum))) + sysfile = self.commondata_folder / 'systypes' / ('SYSTYPE_%s_%s.dat' % (setname, sysnum)) if not sysfile.exists(): - raise SysNotFoundError(("Could not find systype %s for " - "dataset '%s'. File %s does not exist.") % (sysnum, setname, - sysfile)) + raise SysNotFoundError( + "Could not find systype %s for dataset '%s'. File %s does not exist." 
+ % (sysnum, setname, sysfile) + ) plotfiles = [] - metadata = peek_commondata_metadata(datafile) - process_plotting_root = self.commondata_folder/f'PLOTTINGTYPE_{metadata.process_type}' - type_plotting = (process_plotting_root.with_suffix('.yml'), - process_plotting_root.with_suffix('.yaml'),) + process_plotting_root = self.commondata_folder / f'PLOTTINGTYPE_{metadata.process_type}' + type_plotting = ( + process_plotting_root.with_suffix('.yml'), + process_plotting_root.with_suffix('.yaml'), + ) - data_plotting_root = self.commondata_folder/f'PLOTTING_{setname}' + data_plotting_root = self.commondata_folder / f'PLOTTING_{setname}' - data_plotting = (data_plotting_root.with_suffix('.yml'), - data_plotting_root.with_suffix('.yaml'), - ) - #TODO: What do we do when both .yml and .yaml exist? + data_plotting = ( + data_plotting_root.with_suffix('.yml'), + data_plotting_root.with_suffix('.yaml'), + ) + # TODO: What do we do when both .yml and .yaml exist? for tp in (type_plotting, data_plotting): for p in tp: if p.exists(): @@ -331,15 +376,15 @@ def check_theoryID(self, theoryID): theoryID = str(theoryID) theopath = self.datapath / ('theory_%s' % theoryID) if not theopath.exists(): - raise TheoryNotFound(("Could not find theory %s. " - "Folder '%s' not found") % (theoryID, theopath) ) + raise TheoryNotFound( + "Could not find theory %s. Folder '%s' not found" % (theoryID, theopath) + ) return TheoryIDSpec(theoryID, theopath) @property def theorydb_file(self): - """Checks theory db file exists and returns path to it - """ - dbpath = self.datapath/'theory.db' + """Checks theory db file exists and returns path to it""" + dbpath = self.datapath / 'theory.db' if not dbpath.is_file(): raise TheoryDataBaseNotFound(f"could not find theory.db. File not found at {dbpath}") return dbpath @@ -352,10 +397,11 @@ def get_commondata(self, setname, sysnum): # @functools.lru_cache() def check_fktable(self, theoryID, setname, cfac): _, theopath = self.check_theoryID(theoryID) - fkpath = theopath/ 'fastkernel' / ('FK_%s.dat' % setname) + fkpath = theopath / 'fastkernel' / ('FK_%s.dat' % setname) if not fkpath.exists(): - raise FKTableNotFound(("Could not find FKTable for set '%s'. " - "File '%s' not found") % (setname, fkpath) ) + raise FKTableNotFound( + "Could not find FKTable for set '%s'. File '%s' not found" % (setname, fkpath) + ) cfactors = self.check_cfactor(theoryID, setname, cfac) return FKTableSpec(fkpath, cfactors) @@ -394,32 +440,29 @@ def check_compound(self, theoryID, setname, cfac): compound_spec_path = theopath / 'compound' / ('FK_%s-COMPOUND.dat' % setname) try: with compound_spec_path.open() as f: - #Drop first line with comment + # Drop first line with comment next(f) txt = f.read() except FileNotFoundError as e: - msg = ("Could not find COMPOUND set '%s' for theory %d: %s" % - (setname, int(thid), e)) + msg = "Could not find COMPOUND set '%s' for theory %d: %s" % (setname, int(thid), e) raise CompoundNotFound(msg) - #This is a little bit funny, but is the least amount of thinking... + # This is a little bit funny, but is the least amount of thinking... 
yaml_format = 'FK:\n' + re.sub('FK:', ' - ', txt) data = yaml.safe_load(yaml_format) - #we have to split out 'FK_' the extension to get a name consistent - #with everything else + # we have to split out 'FK_' the extension to get a name consistent + # with everything else try: - tables = [self.check_fktable(theoryID, name[3:-4], cfac) - for name in data['FK']] + tables = [self.check_fktable(theoryID, name[3:-4], cfac) for name in data['FK']] except FKTableNotFound as e: raise LoadFailedError( - f"Incorrect COMPOUND file '{compound_spec_path}'. " - f"Searching for non-existing FKTable:\n{e}") from e + f"Incorrect COMPOUND file '{compound_spec_path}'. " + f"Searching for non-existing FKTable:\n{e}" + ) from e op = data['OP'] return tuple(tables), op - def get_fktable(self, theoryID, setname, cfac): - - fkspec= self.check_fktable(theoryID, setname, cfac) + fkspec = self.check_fktable(theoryID, setname, cfac) return fkspec.load() def check_cfactor(self, theoryID, setname, cfactors): @@ -428,8 +471,10 @@ def check_cfactor(self, theoryID, setname, cfactors): for cfactor in cfactors: cfactorpath = theopath / "cfactor" / f"CF_{cfactor}_{setname}.dat" if not cfactorpath.exists(): - msg = (f"Could not find cfactor '{cfactor}' for FKTable {setname}." - f"File {cfactorpath} does not exist in {theoryID}") + msg = ( + f"Could not find cfactor '{cfactor}' for FKTable {setname}." + f"File {cfactorpath} does not exist in {theoryID}" + ) raise CfactorNotFound(msg) cf.append(cfactorpath) @@ -469,15 +514,9 @@ def check_fit(self, fitname): if p.is_dir(): return FitSpec(fitname, p) if not p.is_dir(): - msg = ( - f"Could not find fit '{fitname}' in '{resultspath}'. " - f"Folder '{p}' not found" - ) + msg = f"Could not find fit '{fitname}' in '{resultspath}'. Folder '{p}' not found" raise FitNotFound(msg) - msg = ( - f"Could not load fit '{fitname}' from '{resultspath}. " - f"'{p}' must be a folder" - ) + msg = f"Could not load fit '{fitname}' from '{resultspath}. '{p}' must be a folder" raise FitNotFound(msg) def check_hyperscan(self, hyperscan_name): @@ -495,15 +534,17 @@ def check_hyperscan(self, hyperscan_name): return hyperspec raise HyperscanNotFound(f"No hyperscan output find in {hyperscan_name}") - raise HyperscanNotFound(f"Could not find hyperscan '{hyperscan_name}' in '{resultspath}'." - f" Folder '{hyperscan_name}' not found") + raise HyperscanNotFound( + f"Could not find hyperscan '{hyperscan_name}' in '{resultspath}'." 
+ f" Folder '{hyperscan_name}' not found" + ) def check_default_filter_rules(self, theoryid, defaults=None): # avoid circular import from validphys.filters import ( - default_filter_settings_input, - default_filter_rules_input, Rule, + default_filter_rules_input, + default_filter_settings_input, ) th_params = theoryid.get_description() @@ -514,19 +555,20 @@ def check_default_filter_rules(self, theoryid, defaults=None): for inp in default_filter_rules_input() ] - def check_dataset(self, - name, - *, - rules=None, - sysnum=None, - theoryid, - cfac=(), - frac=1, - cuts=CutsPolicy.INTERNAL, - use_fitcommondata=False, - fit=None, - weight=1, - ): + def check_dataset( + self, + name, + *, + rules=None, + sysnum=None, + theoryid, + cfac=(), + frac=1, + cuts=CutsPolicy.INTERNAL, + use_fitcommondata=False, + fit=None, + weight=1, + ): """Loads a given dataset If the dataset contains new-type fktables, use the pineappl loading function, otherwise fallback to legacy @@ -537,7 +579,8 @@ def check_dataset(self, theoryno, _ = theoryid commondata = self.check_commondata( - name, sysnum, use_fitcommondata=use_fitcommondata, fit=fit) + name, sysnum, use_fitcommondata=use_fitcommondata, fit=fit + ) if theoryid.is_pineappl(): # If it is a pineappl theory, use the pineappl reader @@ -549,8 +592,8 @@ def check_dataset(self, fkspec = self.check_fktable(theoryno, name, cfac) op = None - #Note this is simply for convenience when scripting. The config will - #construct the actual Cuts object by itself + # Note this is simply for convenience when scripting. The config will + # construct the actual Cuts object by itself if isinstance(cuts, str): cuts = CutsPolicy(cuts) if isinstance(cuts, CutsPolicy): @@ -565,9 +608,16 @@ def check_dataset(self, elif cuts is CutsPolicy.FROM_CUT_INTERSECTION_NAMESPACE: raise LoaderError(f"Intersection cuts not supported in loader calls.") - return DataSetSpec(name=name, commondata=commondata, - fkspecs=fkspec, thspec=theoryid, cuts=cuts, - frac=frac, op=op, weight=weight) + return DataSetSpec( + name=name, + commondata=commondata, + fkspecs=fkspec, + thspec=theoryid, + cuts=cuts, + frac=frac, + op=op, + weight=weight, + ) def check_experiment(self, name: str, datasets: List[DataSetSpec]) -> DataGroupSpec: """Loader method for instantiating DataGroupSpec objects. The NNPDF::Experiment @@ -612,7 +662,7 @@ def check_fit_cuts(self, commondata, fit): if not isinstance(fit, FitSpec): fit = self.check_fit(fit) _, fitpath = fit - p = (fitpath/'filter')/setname/('FKMASK_' + setname+ '.dat') + p = (fitpath / 'filter') / setname / ('FKMASK_' + setname + '.dat') if not p.parent.exists(): raise CutsNotFound(f"Bad filter configuration. 
Could not find {p.parent}") if not p.exists(): @@ -628,8 +678,7 @@ def check_vp_output_file(self, filename, extra_paths=('.',)): try: vpcache = self._vp_cache() except KeyError as e: - log.warning("Entry validphys_cache_path expected but not found " - "in the nnprofile.") + log.warning("Entry validphys_cache_path expected but not found in the nnprofile.") else: extra_paths = (*extra_paths, vpcache) @@ -640,12 +689,11 @@ def check_vp_output_file(self, filename, extra_paths=('.',)): raise LoadFailedError(f"Could not find '{filename}'") from e except filefinder.FinderError as e: raise LoaderError(e) from e - return path/name + return path / name -#http://stackoverflow.com/a/15645088/1007990 +# http://stackoverflow.com/a/15645088/1007990 def _download_and_show(response, stream): - total_length = response.headers.get('content-length') if total_length is None or not log.isEnabledFor(logging.INFO): @@ -665,18 +713,19 @@ def _download_and_show(response, stream): sys.stdout.flush() sys.stdout.write('\n') + def download_file(url, stream_or_path, make_parents=False): """Download a file and show a progress bar if the INFO log level is enabled. If ``make_parents`` is ``True`` ``stream_or_path`` is path-like, all the parent folders will be created.""" - #There is a bug in CERN's - #Apache that incorrectly sets the Content-Encodig header to gzip, even - #though it doesn't compress two times. + # There is a bug in CERN's + # Apache that incorrectly sets the Content-Encodig header to gzip, even + # though it doesn't compress two times. # See: http://mail-archives.apache.org/mod_mbox/httpd-dev/200207.mbox/%3C3D2D4E76.4010502@talex.com.pl%3E # and e.g. https://bugzilla.mozilla.org/show_bug.cgi?id=610679#c30 - #If it looks like the url is already encoded, we do not request - #it to be compressed + # If it looks like the url is already encoded, we do not request + # it to be compressed headers = {} if mimetypes.guess_type(url)[1] is not None: headers['Accept-Encoding'] = None @@ -689,20 +738,19 @@ def download_file(url, stream_or_path, make_parents=False): p = pathlib.Path(stream_or_path) if p.is_dir(): raise IsADirectoryError(p) - log.info("Downloading %s to %s." , url, stream_or_path) + log.info("Downloading %s to %s.", url, stream_or_path) if make_parents: p.parent.mkdir(exist_ok=True, parents=True) - download_target = tempfile.NamedTemporaryFile(delete=False, - dir=p.parent, - prefix=p.name, - suffix='.part') + download_target = tempfile.NamedTemporaryFile( + delete=False, dir=p.parent, prefix=p.name, suffix='.part' + ) with download_target as f: _download_and_show(response, f) shutil.move(download_target.name, p) else: - log.info("Downloading %s." 
, url,) + log.info("Downloading %s.", url) _download_and_show(response, stream_or_path) @@ -714,14 +762,15 @@ def download_and_extract(url, local_path): name = url.split('/')[-1] archive_dest = tempfile.NamedTemporaryFile(delete=False, suffix=name, dir=local_path) with archive_dest as t: - log.debug("Saving data to %s" , t.name) + log.debug("Saving data to %s", t.name) download_file(url, t) - log.info("Extracting archive to %s" , local_path) + log.info("Extracting archive to %s", local_path) try: shutil.unpack_archive(t.name, extract_dir=local_path) except: - log.error(f"The original archive at {t.name} was only extracted " - f"partially at \n{local_path}") + log.error( + f"The original archive at {t.name} was only extracted partially at \n{local_path}" + ) raise else: os.unlink(archive_dest.name) @@ -733,17 +782,18 @@ def f_(*args, **kwargs): try: return f(*args, **kwargs) except KeyError as e: - log.error(f"nnprofile is configured " - f"improperly: Key {e} is missing from the profile!") - raise LoaderError("Cannot attempt download because " - "nnprofile is configured improperly: " - f"Missing key '{e}'") from e + log.error(f"nnprofile is configured improperly: Key {e} is missing from the profile!") + raise LoaderError( + "Cannot attempt download because " + "nnprofile is configured improperly: " + f"Missing key '{e}'" + ) from e + return f_ -#TODO: Make this async someday +# TODO: Make this async someday class RemoteLoader(LoaderBase): - @property @_key_or_loader_error def fit_urls(self): @@ -788,27 +838,30 @@ def nnpdf_pdfs_index(self): @_key_or_loader_error def lhapdf_urls(self): urls = self.nnprofile['lhapdf_urls'] - if len(urls)>1: + if len(urls) > 1: log.warning("Only one lhapdf_url is supported at the moment.") if len(urls) == 0: raise LoaderError("The specification for lhapdf_urls is empty in nnprofile") return urls - def _remote_files_from_url(self, url, index, thing='files'): index_url = url + index try: resp = requests.get(index_url) resp.raise_for_status() except Exception as e: - raise RemoteLoaderError("Failed to fetch remote %s index %s: %s" % (thing, index_url,e)) from e + raise RemoteLoaderError( + "Failed to fetch remote %s index %s: %s" % (thing, index_url, e) + ) from e try: info = resp.json()['files'] except Exception as e: - raise RemoteLoaderError("Malformed index %s. Expecting json with a key 'files': %s" % (index_url, e)) from e + raise RemoteLoaderError( + "Malformed index %s. 
Expecting json with a key 'files': %s" % (index_url, e) + ) from e - return {file.split('.')[0] : url+file for file in info} + return {file.split('.')[0]: url + file for file in info} def remote_files(self, urls, index, thing='files'): d = {} @@ -834,23 +887,22 @@ def remote_hyperscans(self): def remote_theories(self): token = 'theory_' rt = self.remote_files(self.theory_urls, self.theory_index, thing="theories") - return {k[len(token):]: v for k,v in rt.items()} + return {k[len(token) :]: v for k, v in rt.items()} @property @functools.lru_cache() def remote_nnpdf_pdfs(self): - return self.remote_files(self.nnpdf_pdfs_urls, self.nnpdf_pdfs_index, - thing="PDFs") + return self.remote_files(self.nnpdf_pdfs_urls, self.nnpdf_pdfs_index, thing="PDFs") @cached_property def remote_keywords(self): root = self.nnprofile['reports_root_url'] url = urls.urljoin(root, 'index.json') try: - req = requests.get(url) - req.raise_for_status() - keyobjs= req.json()['keywords'] - l = [k[0] for k in keyobjs] + req = requests.get(url) + req.raise_for_status() + keyobjs = req.json()['keywords'] + l = [k[0] for k in keyobjs] except requests.RequestException as e: raise RemoteLoaderError(e) from e return l @@ -879,10 +931,9 @@ def nnpdf_pdfs(self): def downloadable_pdfs(self): return set((*self.lhapdf_pdfs, *self.downloadable_fits, *self.nnpdf_pdfs)) - def download_fit(self, fitname): if not fitname in self.remote_fits: - raise FitNotFound("Could not find fit '{}' in remote index {}".format(fitname, self.fit_index)) + raise FitNotFound(f"Could not find fit '{fitname}' in remote index {self.fit_index}") with tempfile_cleaner( root=self.resultspath, @@ -891,24 +942,26 @@ def download_fit(self, fitname): prefix='fit_download_deleteme_', ) as tempdir: download_and_extract(self.remote_fits[fitname], tempdir) - #Handle old-style fits compressed with 'results' as root. - old_style_res = tempdir/'results' + # Handle old-style fits compressed with 'results' as root. + old_style_res = tempdir / 'results' if old_style_res.is_dir(): move_target = old_style_res / fitname else: - move_target = tempdir/fitname + move_target = tempdir / fitname if not move_target.is_dir(): - raise RemoteLoaderError(f"Unknown format for fit in {tempdir}. Expecting a folder {move_target}") + raise RemoteLoaderError( + f"Unknown format for fit in {tempdir}. Expecting a folder {move_target}" + ) fitpath = self.resultspath / fitname shutil.move(move_target, fitpath) - if lhaindex.isinstalled(fitname): log.warning( f"The PDF corresponding to the downloaded fit '{fitname}' " "exists in the LHAPDF path." - " Will be erased and replaced with the new one.") + " Will be erased and replaced with the new one." + ) p = pathlib.Path(lhaindex.finddir(fitname)) if p.is_symlink(): p.unlink() @@ -916,11 +969,11 @@ def download_fit(self, fitname): shutil.rmtree(p) else: p = pathlib.Path(lhaindex.get_lha_datapath()) / fitname - #This is needed here as well because the path may be a - #broken symlink. + # This is needed here as well because the path may be a + # broken symlink. 
if p.is_symlink(): p.unlink() - gridpath = fitpath / 'postfit' / fitname + gridpath = fitpath / 'postfit' / fitname gridpath_old = fitpath / 'nnfit' / fitname if gridpath.is_dir(): p.symlink_to(gridpath, target_is_directory=True) @@ -944,7 +997,7 @@ def download_hyperscan(self, hyperscan_name): prefix='fit_download_deleteme_', ) as tempdir: download_and_extract(self.remote_hyperscans[hyperscan_name], tempdir) - move_target = tempdir/hyperscan_name + move_target = tempdir / hyperscan_name if not move_target.is_dir(): raise RemoteLoaderError( f"Unknown format for fit in {tempdir}. Expecting a folder {move_target}" @@ -952,24 +1005,26 @@ def download_hyperscan(self, hyperscan_name): hyperscan_path = self.hyperscan_resultpath / hyperscan_name shutil.move(move_target, hyperscan_path) - def download_pdf(self, name): - #Check if the pdf is an existing fit first + # Check if the pdf is an existing fit first try: - #We don't want to download the fit here + # We don't want to download the fit here fit = Loader.check_fit(self, name) except FitNotFound: pass else: p = pathlib.Path(lhaindex.get_lha_datapath()) / fit.name - fitpath = fit.path / 'postfit' + fitpath = fit.path / 'postfit' fitpath_old = fit.path / 'nnfit' if fitpath.exists() or fitpath_old.exists(): - log.info("Found existing fit with the same name as the " - "requested PDF (%s). Symlinking the grid to the LHAPDF path (%s).", - name, p) - #This is needed here as well because the path may be a - #broken symlink. + log.info( + "Found existing fit with the same name as the " + "requested PDF (%s). Symlinking the grid to the LHAPDF path (%s).", + name, + p, + ) + # This is needed here as well because the path may be a + # broken symlink. if p.is_symlink(): p.unlink() if fitpath.exists(): @@ -978,53 +1033,56 @@ def download_pdf(self, name): p.symlink_to(fitpath_old / fit.name) return - #It would be good to use the LHAPDF command line, except that it does - #questionable things like returning 0 exit status when it fails to - #download. + # It would be good to use the LHAPDF command line, except that it does + # questionable things like returning 0 exit status when it fails to + # download. _saved_exception = False if name in self.lhapdf_pdfs: try: url = self.lhapdf_urls[0] + name + '.tar.gz' - #url = 'https://data.nnpdf.science/thisisatesttodelete/NNPDF31_nlo_as_0118.tar.gz' - #url = 'https://data.nnpdf.science/patata/NNPDF31_nlo_as_0118.tar.gz' + # url = 'https://data.nnpdf.science/thisisatesttodelete/NNPDF31_nlo_as_0118.tar.gz' + # url = 'https://data.nnpdf.science/patata/NNPDF31_nlo_as_0118.tar.gz' return download_and_extract(url, lhaindex.get_lha_datapath()) except shutil.ReadError as e: _saved_exception = e - log.error(f"{e}. It seems the LHAPDF URLs aren't behaving, " - f"attempting to find resource in other repositories") + log.error( + f"{e}. It seems the LHAPDF URLs aren't behaving, " + f"attempting to find resource in other repositories" + ) pass except requests.RequestException as e: _saved_exception = e - log.error(f"There was a problem with the connection: {e}. " - f"Attempting to find resource elsewhere.") + log.error( + f"There was a problem with the connection: {e}. " + f"Attempting to find resource elsewhere." + ) pass except RemoteLoaderError as e: _saved_exception = e - log.error(f"Failed to download resource: {e}. Attempting " - f"to find it elsewhere.") + log.error(f"Failed to download resource: {e}. 
Attempting " f"to find it elsewhere.") pass if name in self.downloadable_fits: try: return self.download_fit(name) except requests.RequestException as e: _saved_exception = e - log.error(f"There was a problem with the connection: {e}. " - f"Attempting to find resource elsewhere.") + log.error( + f"There was a problem with the connection: {e}. " + f"Attempting to find resource elsewhere." + ) pass except RemoteLoaderError as e: _saved_exception = e - log.error(f"Failed to download resource: {e}. Attempting " - f"to find it elsewhere.") + log.error(f"Failed to download resource: {e}. Attempting " f"to find it elsewhere.") pass if name in self.remote_nnpdf_pdfs: - return download_and_extract(self.remote_nnpdf_pdfs[name], - lhaindex.get_lha_datapath()) + return download_and_extract(self.remote_nnpdf_pdfs[name], lhaindex.get_lha_datapath()) elif _saved_exception: - raise LoadFailedError(f"{_saved_exception}. The resource could not " - f"be found elsewhere.") from _saved_exception + raise LoadFailedError( + f"{_saved_exception}. The resource could not " f"be found elsewhere." + ) from _saved_exception else: - raise PDFNotFound("PDF '%s' is neither an uploaded fit nor an " - "LHAPDF set." % name) + raise PDFNotFound("PDF '%s' is neither an uploaded fit nor an " "LHAPDF set." % name) def download_theoryID(self, thid): thid = str(thid) @@ -1039,36 +1097,38 @@ def download_vp_output_file(self, filename, **kwargs): except KeyError as e: raise LoadFailedError('Key report_root_url not found in nnprofile') try: - url = root_url + filename + url = root_url + filename except Exception as e: raise LoadFailedError(e) from e try: filename = pathlib.Path(filename) - download_file(url, self._vp_cache()/filename, make_parents=True) + download_file(url, self._vp_cache() / filename, make_parents=True) except requests.HTTPError as e: if e.response.status_code == requests.codes.not_found: - raise RemoteLoaderError(f"Resource {filename} could not " - f"be found on the validphys " - f"server {url}") from e + raise RemoteLoaderError( + f"Resource {filename} could not " f"be found on the validphys " f"server {url}" + ) from e elif e.response.status_code == requests.codes.unauthorized: - log.error("Could not access the validphys reports page " - "because the authentification is not provided. " - "Please, update your ~/.netrc file to contain the " - "following:\n\n" - f"machine {urls.urlsplit(root_url).netloc}\n" - f" login nnpdf\n" - f" password \n" - ) + log.error( + "Could not access the validphys reports page " + "because the authentification is not provided. " + "Please, update your ~/.netrc file to contain the " + "following:\n\n" + f"machine {urls.urlsplit(root_url).netloc}\n" + f" login nnpdf\n" + f" password \n" + ) raise + class FallbackLoader(Loader, RemoteLoader): """A loader that first tries to find resources locally (calling Loader.check_*) and if it fails, it tries to download them (calling RemoteLoader.download_*).""" def make_checker(self, resource): - #We are intercepting the check_ + # We are intercepting the check_ orig = super().__getattribute__('check_' + resource) download = getattr(self, 'download_' + resource) @@ -1078,9 +1138,11 @@ def f(*args, **kwargs): return orig(*args, **kwargs) except LoadFailedError as e: saved_exception = e - log.info("Could not find a resource " + log.info( + "Could not find a resource " f"({resource}): {saved_exception}. " - f"Attempting to download it.") + f"Attempting to download it." 
+ ) try: download(*args, **kwargs) except RemoteLoaderError as e: @@ -1094,19 +1156,17 @@ def f(*args, **kwargs): raise saved_exception from e except Exception as e: - #Simply raise these for now so we can find and fix them + # Simply raise these for now so we can find and fix them raise e else: return orig(*args, **kwargs) - return f - + return f def __getattribute__(self, attr): token = 'check_' if attr.startswith(token): - resname = attr[len(token):] + resname = attr[len(token) :] if hasattr(RemoteLoader, 'download_' + resname): return super().__getattribute__('make_checker')(resname) return super().__getattribute__(attr) - diff --git a/validphys2/src/validphys/mc2hessian.py b/validphys2/src/validphys/mc2hessian.py index 2f9ee758fc..49d890f1e1 100644 --- a/validphys2/src/validphys/mc2hessian.py +++ b/validphys2/src/validphys/mc2hessian.py @@ -12,19 +12,16 @@ import numpy as np from reportengine.checks import check, check_positive, make_argcheck - from validphys import lhaindex +from validphys.checks import check_pdf_is_montecarlo from validphys.lhio import hessian_from_lincomb from validphys.pdfgrids import xplotting_grid -from validphys.checks import check_pdf_is_montecarlo - log = logging.getLogger(__name__) def gridname(pdf, Neig, mc2hname: (str, type(None)) = None): - """If no custom `mc2hname' is specified, the name of the Hessian PDF is automatically generated. - """ + """If no custom `mc2hname' is specified, the name of the Hessian PDF is automatically generated.""" if mc2hname is None: grid_name = f"{pdf.name}_hessian_{Neig}" else: diff --git a/validphys2/src/validphys/mc_gen.py b/validphys2/src/validphys/mc_gen.py index 4f854dbf3b..83e36c8894 100644 --- a/validphys2/src/validphys/mc_gen.py +++ b/validphys2/src/validphys/mc_gen.py @@ -7,15 +7,15 @@ # The functions in this module have been ported to not use libNNPDF # but they should not be used as an example as they follow the libNNPDF logic import logging -import matplotlib.patches as mpatches + from matplotlib.figure import Figure +import matplotlib.patches as mpatches import numpy as np import pandas as pd from scipy.stats import moment as mom -from reportengine.table import table from reportengine.figure import figure - +from reportengine.table import table from validphys import plotutils log = logging.getLogger(__name__) @@ -51,9 +51,7 @@ def art_data_residuals(art_rep_generation, color="green"): normresiduals = residuals / real_data fig, ax = plotutils.subplots() - ax.hist( - normresiduals, bins=50, histtype="step", stacked=True, fill=False, color=color - ) + ax.hist(normresiduals, bins=50, histtype="step", stacked=True, fill=False, color=color) ax.set_ylabel(r"Data points") ax.set_xlabel(r"$(D^0-)/D^0$") @@ -63,9 +61,7 @@ def art_data_residuals(art_rep_generation, color="green"): @figure -def art_data_distribution( - art_rep_generation, title="Artificial Data Distribution", color="green" -): +def art_data_distribution(art_rep_generation, title="Artificial Data Distribution", color="green"): """ Plot of the distribution of pseudodata. 
""" @@ -74,9 +70,7 @@ def art_data_distribution( normart_data = art_data / real_data fig, ax = plotutils.subplots() - ax.hist( - normart_data, bins=50, histtype="step", stacked=True, fill=False, color=color - ) + ax.hist(normart_data, bins=50, histtype="step", stacked=True, fill=False, color=color) ax.set_ylabel(r"Data points") ax.set_xlabel(r"$/D^0$") @@ -93,9 +87,9 @@ def art_data_moments(art_rep_generation, color="green"): _, _, normart_replicas, _ = art_rep_generation artrep_array = np.asarray(normart_replicas) - + fig = Figure(figsize=(10, 12)) - axes = [fig.add_subplot(3, 1, i+1) for i in range(3)] + axes = [fig.add_subplot(3, 1, i + 1) for i in range(3)] # Plot histogram of moments for momno, ax in zip(range(1, 4), axes.flatten()): # Calculate moments @@ -120,9 +114,9 @@ def art_data_comparison(art_rep_generation, nreplica: int): artrep_array = np.asarray(normart_replicas) normart_data = art_data / real_data - nrows=len(artrep_array.T) + nrows = len(artrep_array.T) fig = Figure(figsize=(4, 2 * len(artrep_array.T))) - axes = [fig.add_subplot(nrows, 1, i+1) for i in range(nrows)] + axes = [fig.add_subplot(nrows, 1, i + 1) for i in range(nrows)] for i, ax, datapoint, normartdatapoint in zip( range(len(artrep_array.T)), axes.flatten(), artrep_array.T, normart_data @@ -141,12 +135,8 @@ def art_data_comparison(art_rep_generation, nreplica: int): linestyle="-", color="darkorchid", ) - ax.vlines( - 0, ax.get_ylim()[0], ax.get_ylim()[1], linestyle="-", color="dodgerblue" - ) - ax.vlines( - 2, ax.get_ylim()[0], ax.get_ylim()[1], linestyle="-", color="dodgerblue" - ) + ax.vlines(0, ax.get_ylim()[0], ax.get_ylim()[1], linestyle="-", color="dodgerblue") + ax.vlines(2, ax.get_ylim()[0], ax.get_ylim()[1], linestyle="-", color="dodgerblue") ax.legend(handles=handles) ax.set_xlabel(r"$D^{(r)}/D^0$") ax.set_ylabel("Frequency") diff --git a/validphys2/src/validphys/n3fit_data_utils.py b/validphys2/src/validphys/n3fit_data_utils.py index 5743439946..fe908d73cb 100644 --- a/validphys2/src/validphys/n3fit_data_utils.py +++ b/validphys2/src/validphys/n3fit_data_utils.py @@ -7,8 +7,9 @@ The ``validphys_group_extractor`` will loop over every dataset of a given group loading their fktables (and applying any necessary cuts). 
""" -from itertools import zip_longest import dataclasses +from itertools import zip_longest + import numpy as np @@ -31,6 +32,7 @@ class FittableDataSet: training_mask: bool training mask to apply to the fktable """ + name: str fktables_data: list # of validphys.coredata.FKTableData objects diff --git a/validphys2/src/validphys/overfit_metric.py b/validphys2/src/validphys/overfit_metric.py index 9d94d7b9b0..7c77d3827c 100644 --- a/validphys2/src/validphys/overfit_metric.py +++ b/validphys2/src/validphys/overfit_metric.py @@ -14,13 +14,13 @@ from reportengine import collect from reportengine.figure import figure from reportengine.table import table - -from validphys.checks import check_at_least_two_replicas from validphys import plotutils +from validphys.checks import check_at_least_two_replicas log = logging.getLogger(__name__) -preds = collect("predictions",("dataset_inputs",)) +preds = collect("predictions", ("dataset_inputs",)) + def _create_new_val_pseudodata(pdf_data_index, fit_data_indices_list): """Loads all validation pseudodata replicas used during the fiting of the @@ -34,15 +34,13 @@ def _create_new_val_pseudodata(pdf_data_index, fit_data_indices_list): """ vl_data_fitrep = [] for fitreplica_info in fit_data_indices_list: - vl_data_fitrep.append( - fitreplica_info.pseudodata.loc[pdf_data_index.val_idx] - ) + vl_data_fitrep.append(fitreplica_info.pseudodata.loc[pdf_data_index.val_idx]) return np.array(vl_data_fitrep)[:, :, 0] @check_at_least_two_replicas def calculate_chi2s_per_replica( - pdf, # for the check + pdf, # for the check fit_code_version, recreate_pdf_pseudodata_no_table, preds, @@ -74,7 +72,7 @@ def calculate_chi2s_per_replica( """ fit_name = fit_code_version.columns[0] nnpdf_version = fit_code_version[fit_name]['nnpdf'] - if nnpdf_version>='4.0.5': + if nnpdf_version >= '4.0.5': pp = [] for i, dss in enumerate(dataset_inputs): preds_witout_cv = preds[i].drop(0, axis=1) @@ -85,7 +83,6 @@ def calculate_chi2s_per_replica( chi2s_per_replica = [] for enum, pdf_data_index in enumerate(recreate_pdf_pseudodata_no_table): - prediction_filter = pdf_data_index.val_idx.droplevel(level=0) prediction_filter.rename(["dataset", "data"], inplace=True) PDF_predictions_val = PDF_predictions.loc[prediction_filter] @@ -96,9 +93,7 @@ def calculate_chi2s_per_replica( ) invcovmat_vl = np.linalg.inv( - groups_covmat_no_table[pdf_data_index.val_idx].T[ - pdf_data_index.val_idx - ] + groups_covmat_no_table[pdf_data_index.val_idx].T[pdf_data_index.val_idx] ) tmp = PDF_predictions_val - new_val_pseudodata_list @@ -107,8 +102,10 @@ def calculate_chi2s_per_replica( chi2s_per_replica.append(chi2) ret = np.array(chi2s_per_replica) else: - log.warning(f"""Since {fit_name} pseudodata generation has changed, - hence the overfit metric cannot be determined.""") + log.warning( + f"""Since {fit_name} pseudodata generation has changed, + hence the overfit metric cannot be determined.""" + ) ret = np.array(np.nan) return ret @@ -147,7 +144,7 @@ def array_expected_overfitting( (number_of_resamples*Npdfs,) sized array containing the mean delta chi2 values per resampled list. """ - # calculate_chi2s_per_replica is set to NaN if the pseudodata generation + # calculate_chi2s_per_replica is set to NaN if the pseudodata generation # has changed sinc the fit has been performed. As a result the overfitting # metric can no longer be determined. 
if (calculate_chi2s_per_replica != calculate_chi2s_per_replica).all(): @@ -158,9 +155,7 @@ def array_expected_overfitting( number_pdfs = calculate_chi2s_per_replica.shape[0] list_expected_overfitting = [] for _ in range(number_pdfs * number_of_resamples): - mask = np.random.randint( - 0, number_pdfs, size=int(resampling_fraction * number_pdfs) - ) + mask = np.random.randint(0, number_pdfs, size=int(resampling_fraction * number_pdfs)) res_tmp = calculate_chi2s_per_replica[mask][:, mask] fitted_val_erf_tmp = fitted_val_erf[mask] @@ -199,9 +194,7 @@ def plot_overfitting_histogram(fit, array_expected_overfitting): return fig -fits_overfitting_summary = collect( - "fit_overfitting_summary", ("fits", "fitcontext") -) +fits_overfitting_summary = collect("fit_overfitting_summary", ("fits", "fitcontext")) @table diff --git a/validphys2/src/validphys/pdfbases.py b/validphys2/src/validphys/pdfbases.py index 10ad6543ed..bfba0b5692 100644 --- a/validphys2/src/validphys/pdfbases.py +++ b/validphys2/src/validphys/pdfbases.py @@ -4,18 +4,17 @@ This holds the concrete labels data relative to the PDF bases, as declaratively as possible. """ +import abc import copy -import inspect import functools -import abc +import inspect import numpy as np from reportengine.checks import CheckError +from validphys.gridvalues import central_grid_values, grid_values -from validphys.gridvalues import grid_values, central_grid_values - - +# fmt: off #This mapping maps the keys passed to LHAPDF (PDG codes) to nice LaTeX labels. PDG_PARTONS = dict(( (-6, r'\bar{t}'), diff --git a/validphys2/src/validphys/pdfgrids.py b/validphys2/src/validphys/pdfgrids.py index 9e12533ec2..f131ad3f6a 100644 --- a/validphys2/src/validphys/pdfgrids.py +++ b/validphys2/src/validphys/pdfgrids.py @@ -4,33 +4,35 @@ """ from collections import namedtuple import dataclasses -import numbers import logging +import numbers import numpy as np import scipy.integrate as integrate from reportengine import collect -from reportengine.checks import make_argcheck, CheckError, check_positive, check - -from validphys.core import PDF, Stats -from validphys.gridvalues import (evaluate_luminosity) -from validphys.pdfbases import (Basis, check_basis) +from reportengine.checks import CheckError, check, check_positive, make_argcheck from validphys.checks import check_pdf_normalize_to, check_xlimits +from validphys.core import PDF, Stats +from validphys.gridvalues import evaluate_luminosity +from validphys.pdfbases import Basis, check_basis log = logging.getLogger(__name__) + @make_argcheck def _check_scale(scale): scales = ('linear', 'log') if scale not in scales: raise CheckError(f'Unrecognized scale {scale}.', scale, scales) + @_check_scale @check_xlimits @check_positive('npoints') -def xgrid(xmin:numbers.Real=1e-5, xmax:numbers.Real=1, - scale:str='log', npoints:int=200): +def xgrid( + xmin: numbers.Real = 1e-5, xmax: numbers.Real = 1, scale: str = 'log', npoints: int = 200 +): """Return a tuple ``(scale, array)`` where ``scale`` is the input scale ("linear" or "log") and ``array`` is generated from the input parameters and distributed according to scale.""" @@ -48,13 +50,14 @@ class XPlottingGrid: The `grid_values` attribute corresponds to a `Stats` instance in order to compute statistical estimators in a sensible manner. 
""" + Q: float basis: (str, Basis) flavours: (list, tuple, type(None)) xgrid: np.ndarray grid_values: Stats scale: str - derivative_degree: int = 0 # keep track of the degree of the derivative + derivative_degree: int = 0 # keep track of the degree of the derivative def __post_init__(self): """Enforce grid_values being a Stats instance""" @@ -135,21 +138,21 @@ def xplotting_grid( derivative (int): how many derivtives of the PDF should be taken (default=0) """ - #Make usable outside reportengine + # Make usable outside reportengine checked = check_basis(basis, flavours) basis = checked['basis'] flavours = checked['flavours'] if xgrid is None: - #Call the function that is shadowed + # Call the function that is shadowed xgrid = globals()['xgrid']() - if isinstance(xgrid, tuple) and len(xgrid)==2: + if isinstance(xgrid, tuple) and len(xgrid) == 2: scale, xgrid = xgrid elif isinstance(xgrid, np.ndarray): scale = 'unknown' else: raise TypeError(f"Invalid xgrid {xgrid!r}") gv = basis.grid_values(pdf, flavours, xgrid, Q) - #Eliminante Q axis + # Eliminante Q axis stats_gv = pdf.stats_class(gv.reshape(gv.shape[:-1])) res = XPlottingGrid(Q, basis, flavours, xgrid, stats_gv, scale) @@ -159,6 +162,7 @@ def xplotting_grid( return res + @make_argcheck(check_basis) def kinetic_xplotting_grid( pdf: PDF, @@ -177,9 +181,7 @@ def kinetic_xplotting_grid( k = \sqrt{1 + (d/dlogx f)^2} """ # Get the pdf derived wrt logx - xpg = xplotting_grid( - pdf=pdf, Q=Q, xgrid=xgrid, basis=basis, flavours=flavours, derivative=1 - ) + xpg = xplotting_grid(pdf=pdf, Q=Q, xgrid=xgrid, basis=basis, flavours=flavours, derivative=1) # Compute the kinetic energy kinen_rawdata = np.sqrt(1 + xpg.grid_values.data**2) kinen_gv = pdf.stats_class(kinen_rawdata) @@ -191,13 +193,17 @@ def kinetic_xplotting_grid( kinetic_xplotting_grids = collect(kinetic_xplotting_grid, ('pdfs',)) -Lumi2dGrid = namedtuple('Lumi2dGrid', ['y','m','grid_values']) - +Lumi2dGrid = namedtuple('Lumi2dGrid', ['y', 'm', 'grid_values']) -def lumigrid2d(pdf:PDF, lumi_channel, sqrts:numbers.Real, - y_lim:numbers.Real=5, nbins_m:int=100, - nbins_y:int=50): +def lumigrid2d( + pdf: PDF, + lumi_channel, + sqrts: numbers.Real, + y_lim: numbers.Real = 5, + nbins_m: int = 100, + nbins_y: int = 50, +): """ Return the differential luminosity in a grid of (nbins_m x nbins_y) points, for the allowed values of invariant mass and rpidity for given @@ -209,13 +215,12 @@ def lumigrid2d(pdf:PDF, lumi_channel, sqrts:numbers.Real, The results are computed for all relevant PDF members and wrapped in a stats class, to compute statistics regardless of the error_type. """ - s = sqrts*sqrts + s = sqrts * sqrts mxs = np.logspace(1, np.log10(sqrts), nbins_m) + ys = np.linspace(0, y_lim, nbins_y) - ys = np.linspace(0 , y_lim, nbins_y) - - y_kinlims = -np.log(mxs/sqrts) + y_kinlims = -np.log(mxs / sqrts) ys_max = np.searchsorted(ys, y_kinlims) # TODO: Write this in something fast @@ -225,16 +230,14 @@ def lumigrid2d(pdf:PDF, lumi_channel, sqrts:numbers.Real, weights = np.full(shape=(nmembers, nbins_m, nbins_y), fill_value=np.NaN) for irep in range(nmembers): - for im,mx in enumerate(mxs): - masked_ys = ys[:ys_max[im]] - for iy,y in enumerate(masked_ys): - #TODO: Fill this from lpdf.grid_values? - x1 = mx/sqrts*np.exp(y) - x2 = mx/sqrts*np.exp(-y) - res= evaluate_luminosity(lpdf, irep, - s, mx, x1, x2, lumi_channel) - weights[irep, im, iy] = res - + for im, mx in enumerate(mxs): + masked_ys = ys[: ys_max[im]] + for iy, y in enumerate(masked_ys): + # TODO: Fill this from lpdf.grid_values? 
+ x1 = mx / sqrts * np.exp(y) + x2 = mx / sqrts * np.exp(-y) + res = evaluate_luminosity(lpdf, irep, s, mx, x1, x2, lumi_channel) + weights[irep, im, iy] = res return Lumi2dGrid(ys, mxs, pdf.stats_class(weights)) @@ -242,11 +245,13 @@ def lumigrid2d(pdf:PDF, lumi_channel, sqrts:numbers.Real, lumigrids2d = collect('lumigrid2d', ['lumi_channels']) -Lumi1dGrid = namedtuple('Lumi1dGrid', ['m','grid_values']) +Lumi1dGrid = namedtuple('Lumi1dGrid', ['m', 'grid_values']) + def _default_mxmax(sqrts): return sqrts / 3 + @make_argcheck def _check_mx(mxmin, mxmax, sqrts): if mxmax is None: @@ -254,11 +259,13 @@ def _check_mx(mxmin, mxmax, sqrts): check( 0 <= mxmin < mxmax <= sqrts, - ("mxmin and mxmax not consistent: Should be 0 <= mxmin < mxmax <= sqrts, " - f"but mxmin={mxmin} GeV, mxmax={mxmax} GeV and sqrts={sqrts} GeV." + ( + "mxmin and mxmax not consistent: Should be 0 <= mxmin < mxmax <= sqrts, " + f"but mxmin={mxmin} GeV, mxmax={mxmax} GeV and sqrts={sqrts} GeV." ), ) + @_check_mx @check_positive("sqrts") @_check_scale @@ -295,7 +302,7 @@ def lumigrid1d( mxs = np.linspace(mxmin, mxmax, nbins_m) else: raise ValueError("Unknown scale") - sqrt_taus = (mxs / sqrts) + sqrt_taus = mxs / sqrts # TODO: Write this in something fast lpdf = pdf.load() @@ -304,20 +311,18 @@ def lumigrid1d( weights = np.full(shape=(nmembers, nbins_m), fill_value=np.NaN) for im, (mx, sqrt_tau) in enumerate(zip(mxs, sqrt_taus)): - y_min = -np.log(1/sqrt_tau) - y_max = np.log(1/sqrt_tau) + y_min = -np.log(1 / sqrt_tau) + y_max = np.log(1 / sqrt_tau) if y_cut is not None: - if -y_cut > y_min and y_cut < y_max: + if -y_cut > y_min and y_cut < y_max: y_min = -y_cut - y_max = y_cut + y_max = y_cut for irep in range(nmembers): # Eq.(3) in arXiv:1607.01831 f = lambda y: evaluate_luminosity( - lpdf, irep, s, mx, - sqrt_tau * np.exp(y), sqrt_tau * np.exp(-y), - lumi_channel + lpdf, irep, s, mx, sqrt_tau * np.exp(y), sqrt_tau * np.exp(-y), lumi_channel ) res = integrate.quad(f, y_min, y_max, epsrel=5e-4, limit=50)[0] @@ -331,7 +336,7 @@ def lumigrid1d( @check_pdf_normalize_to -def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None): +def distance_grids(pdfs, xplotting_grids, normalize_to: (int, str, type(None)) = None): """Return an object containing the value of the distance PDF at the specified values of x and flavour. @@ -349,7 +354,6 @@ def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None newgrids = [] for grid, pdf in zip(xplotting_grids, pdfs): - if pdf == pdfs[normalize_to]: # Zero the PDF we are normalizing against pdf_zero = pdf.stats_class(np.zeros_like(gr2_stats.data[0:1])) @@ -363,7 +367,7 @@ def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None N1 = pdf.get_members() # Wrap the distance into a Stats (1, flavours, points) - distance = Stats([np.sqrt((cv1-cv2)**2/(sg1**2/N1+sg2**2/N2))]) + distance = Stats([np.sqrt((cv1 - cv2) ** 2 / (sg1**2 / N1 + sg2**2 / N2))]) newgrid = grid.copy_grid(grid_values=distance) newgrids.append(newgrid) @@ -372,7 +376,7 @@ def distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None @check_pdf_normalize_to -def variance_distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(None))=None): +def variance_distance_grids(pdfs, xplotting_grids, normalize_to: (int, str, type(None)) = None): """Return an object containing the value of the variance distance PDF at the specified values of x and flavour. 
@@ -387,11 +391,10 @@ def variance_distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(No sg2 = gr2_stats.std_error() mo2 = gr2_stats.moment(4) N2 = pdfs[normalize_to].get_members() - s2 = (mo2-(N2-3)/(N2-1)*sg2**4)/N2 + s2 = (mo2 - (N2 - 3) / (N2 - 1) * sg2**4) / N2 newgrids = [] for grid, pdf in zip(xplotting_grids, pdfs): - if pdf == pdfs[normalize_to]: # Zero the PDF we are normalizing against pdf_zero = pdf.stats_class(np.zeros_like(gr2_stats.data[0])) @@ -403,10 +406,10 @@ def variance_distance_grids(pdfs, xplotting_grids, normalize_to:(int,str,type(No sg1 = g_stats.std_error() mo1 = g_stats.moment(4) N1 = pdf.get_members() - s1 = (mo1-(N1-3)/(N1-1)*sg1**4)/N1 + s1 = (mo1 - (N1 - 3) / (N1 - 1) * sg1**4) / N1 # Wrap the distance into a Stats (1, flavours, points) - variance_distance = Stats([np.sqrt((sg1**2-sg2**2)**2/(s1+s2))]) + variance_distance = Stats([np.sqrt((sg1**2 - sg2**2) ** 2 / (s1 + s2))]) newgrid = grid.copy_grid(grid_values=variance_distance) newgrids.append(newgrid) diff --git a/validphys2/src/validphys/pdfoutput.py b/validphys2/src/validphys/pdfoutput.py index 52234abefc..653710b0ee 100644 --- a/validphys2/src/validphys/pdfoutput.py +++ b/validphys2/src/validphys/pdfoutput.py @@ -15,18 +15,17 @@ import logging import re -from reportengine.checks import make_check, CheckError, check, make_argcheck - +from reportengine.checks import CheckError, check, make_argcheck, make_check from validphys import lhaindex log = logging.getLogger(__name__) PDFSETS_PATH = 'pdfsets' + @make_argcheck def _check_set_name(set_name): - check(re.fullmatch(r'[\w\-]+', set_name), - "Invalid set_name. Must be alphanumeric.") + check(re.fullmatch(r'[\w\-]+', set_name), "Invalid set_name. Must be alphanumeric.") @make_check @@ -35,42 +34,50 @@ def _setup_pdf_output(*, callspec, ns, **kwargs): rootns = ns.maps[-1] if lhaindex.isinstalled(set_name): - raise CheckError("The PDF set that would be " - "generated already exists in the LHAPDF path:\n" - f"{lhaindex.finddir(set_name)}\n" - "Either delete it or explicitly assign a set_name for " - "the new PDF.") + raise CheckError( + "The PDF set that would be " + "generated already exists in the LHAPDF path:\n" + f"{lhaindex.finddir(set_name)}\n" + "Either delete it or explicitly assign a set_name for " + "the new PDF." + ) output_path = ns['output_path'] - pdfpath = output_path/PDFSETS_PATH - pdfpath.mkdir(exist_ok=True ) + pdfpath = output_path / PDFSETS_PATH + pdfpath.mkdir(exist_ok=True) ns['output_path'] = pdfpath - #TODO: Enable this someday - #Ugly hack to allow analyzing the generated pdf some day (as in smpdf). - #For now, this is used to prevent duplicated reweighted sets. + # TODO: Enable this someday + # Ugly hack to allow analyzing the generated pdf some day (as in smpdf). + # For now, this is used to prevent duplicated reweighted sets. if '_future_pdfs' not in rootns: rootns['_future_pdfs'] = {} future_pdfs = rootns['_future_pdfs'] if set_name in future_pdfs: - raise CheckError(f"PDF set with name '{set_name}' would already be " - "generated by another action and would be overwritten." - ) + raise CheckError( + f"PDF set with name '{set_name}' would already be " + "generated by another action and would be overwritten." 
+ ) - #lhapdf.pathsAppend(str(ns['output_path'])) + # lhapdf.pathsAppend(str(ns['output_path'])) future_pdfs[set_name] = callspec + def _prepare(namespace, *args, **kwargs): - return {'set_name': namespace['set_name'], - #Prepare executes before the checks for some reason - 'output_path': namespace['output_path']/PDFSETS_PATH} + return { + 'set_name': namespace['set_name'], + # Prepare executes before the checks for some reason + 'output_path': namespace['output_path'] / PDFSETS_PATH, + } + def _return_set_name(result, set_name, output_path): if result is not None: log.warning("Result of provider marked with @pdfset discarded.") - return (output_path/set_name).relative_to(output_path.parent) + return (output_path / set_name).relative_to(output_path.parent) + def pdfset(f): """Mark the function as returning a PDF set. diff --git a/validphys2/src/validphys/pdfplots.py b/validphys2/src/validphys/pdfplots.py index f61aab2e91..07cc497f4c 100644 --- a/validphys2/src/validphys/pdfplots.py +++ b/validphys2/src/validphys/pdfplots.py @@ -4,32 +4,38 @@ Plots of quantities that are mostly functions of the PDFs only. """ import abc -import logging +import copy import functools -import warnings +import logging import numbers -import copy from types import SimpleNamespace +import warnings - +from matplotlib import cm +from matplotlib import colors as mcolors import numpy as np -from matplotlib import cm, colors as mcolors -from reportengine.figure import figure, figuregen from reportengine.checks import make_argcheck +from reportengine.figure import figure, figuregen from reportengine.floatformatting import format_number - from validphys import plotutils +from validphys.checks import ( + check_have_two_pdfs, + check_mixband_as_replicas, + check_pdf_normalize_to, + check_pdfs_noband, + check_scale, +) from validphys.core import MCStats from validphys.gridvalues import LUMI_CHANNELS from validphys.utils import scale_from_grid -from validphys.checks import check_pdf_normalize_to, check_scale, check_have_two_pdfs -from validphys.checks import check_pdfs_noband, check_mixband_as_replicas log = logging.getLogger(__name__) + class FlavourState(SimpleNamespace): """This is the namespace for the pats specific for each flavour""" + pass @@ -61,19 +67,21 @@ def normalize(self): normalize_grid = self._xplotting_grids[normalize_to] normvals = normalize_grid.grid_values.central_value() - #Handle division by zero more quietly + # Handle division by zero more quietly def fp_error(tp, flag): - log.warning("Invalid values found computing " + log.warning( + "Invalid values found computing " f"normalization to {normalize_pdf}: " - f"Floating point error ({tp}).") - #Show warning only once + f"Floating point error ({tp})." 
+ ) + # Show warning only once np.seterr(all='ignore') newgrids = [] with np.errstate(all='call'): np.seterrcall(fp_error) for pdf, grid in zip(self.pdfs, self._xplotting_grids): - newvalues = pdf.stats_class(grid.grid_values.data/normvals) + newvalues = pdf.stats_class(grid.grid_values.data / normvals) newgrids.append(grid.copy_grid(grid_values=newvalues)) return newgrids @@ -115,7 +123,6 @@ def firstgrid(self): return self.xplotting_grids[0] raise AttributeError("Need at least one xgrid") - @abc.abstractmethod def draw(self, pdf, grid, flstate): """Plot the desired function of the grid and return the array to be @@ -128,8 +135,7 @@ def legend(self, flstate): def __iter__(self): yield from self() - - def __call__(self,): + def __call__(self): if not self.xplotting_grids: return @@ -138,8 +144,7 @@ def __call__(self,): for flindex, fl in enumerate(self.firstgrid.flavours): fig, ax = plotutils.subplots() parton_name = basis.elementlabel(fl) - flstate = FlavourState(flindex=flindex, fl=fl, fig=fig, ax=ax, - parton_name=parton_name) + flstate = FlavourState(flindex=flindex, fl=fl, fig=fig, ax=ax, parton_name=parton_name) self.setup_flavour(flstate) ax.set_title(self.get_title(parton_name)) @@ -149,18 +154,17 @@ def __call__(self,): if limits is not None: all_vals.append(np.atleast_2d(limits)) - #Note these two lines do not conmute! + # Note these two lines do not conmute! ax.set_xscale(self.xscale) plotutils.frame_center(ax, self.firstgrid.xgrid, np.concatenate(all_vals)) - if (self.ymin is not None): + if self.ymin is not None: ax.set_ylim(ymin=self.ymin) - if (self.ymax is not None): + if self.ymax is not None: ax.set_ylim(ymax=self.ymax) ax.set_xlabel('$x$') ax.set_xlim(self.firstgrid.xgrid[0]) - ax.set_ylabel(self.get_ylabel(parton_name)) ax.set_axisbelow(True) @@ -169,15 +173,16 @@ def __call__(self,): yield fig, parton_name - @functools.lru_cache() def _warn_pdf_not_montecarlo(pdf): et = pdf.error_type if et != 'replicas': - log.warning("Plotting members of a non-Monte Carlo PDF set:" - f" {pdf.name} with error type '{et}'.") + log.warning( + f"Plotting members of a non-Monte Carlo PDF set: {pdf.name} with error type '{et}'." + ) -#Cant't add the lru_cache here because pdfs is not hashable at the moment + +# Cant't add the lru_cache here because pdfs is not hashable at the moment @make_argcheck def _warn_any_pdf_not_montecarlo(pdfs): for pdf in pdfs: @@ -192,19 +197,23 @@ def draw(self, pdf, grid, flstate): flavour_grid = grid.select_flavour(flstate.flindex) stats = flavour_grid.grid_values gv = stats.data - ax.plot(grid.xgrid, gv.T, alpha=0.2, linewidth=0.5, - color=color, zorder=1) - ax.plot(grid.xgrid, stats.central_value(), color=color, - linewidth=2, - label=pdf.label) + ax.plot(grid.xgrid, gv.T, alpha=0.2, linewidth=0.5, color=color, zorder=1) + ax.plot(grid.xgrid, stats.central_value(), color=color, linewidth=2, label=pdf.label) return gv + @figuregen @check_pdf_normalize_to @check_scale('xscale', allow_none=True) @_warn_any_pdf_not_montecarlo -def plot_pdfreplicas(pdfs, xplotting_grids, xscale:(str,type(None))=None, - normalize_to:(int,str,type(None))=None, ymin = None, ymax = None): +def plot_pdfreplicas( + pdfs, + xplotting_grids, + xscale: (str, type(None)) = None, + normalize_to: (int, str, type(None)) = None, + ymin=None, + ymax=None, +): """Plot the replicas of the specified PDFs. Otherise it works the same as plot_pdfs. 
@@ -213,11 +222,17 @@ def plot_pdfreplicas(pdfs, xplotting_grids, xscale:(str,type(None))=None, - normalize_to should be, a pdf id or an index of the pdf (starting from one). """ - yield from ReplicaPDFPlotter(pdfs=pdfs, xplotting_grids=xplotting_grids, - xscale=xscale, normalize_to=normalize_to, ymin=ymin, ymax=ymax) + yield from ReplicaPDFPlotter( + pdfs=pdfs, + xplotting_grids=xplotting_grids, + xscale=xscale, + normalize_to=normalize_to, + ymin=ymin, + ymax=ymax, + ) -class UncertaintyPDFPlotter(PDFPlotter): +class UncertaintyPDFPlotter(PDFPlotter): def get_ylabel(self, parton_name): if self.normalize_to is not None: return r"$\sigma($%s$)$" % super().get_ylabel(parton_name) @@ -246,8 +261,14 @@ def __call__(self): @figuregen @check_pdf_normalize_to @check_scale('xscale', allow_none=True) -def plot_pdf_uncertainties(pdfs, xplotting_grids, xscale:(str,type(None))=None, - normalize_to:(int,str,type(None))=None, ymin=None, ymax=None): +def plot_pdf_uncertainties( + pdfs, + xplotting_grids, + xscale: (str, type(None)) = None, + normalize_to: (int, str, type(None)) = None, + ymin=None, + ymax=None, +): """Plot the PDF standard deviations as a function of x. If normalize_to is set, the ratio to that PDF's central value is plotted. Otherwise it is the absolute values.""" @@ -258,9 +279,9 @@ class AllFlavoursPlotter(PDFPlotter): """Auxiliary class which groups multiple PDF flavours in one plot.""" def setup_flavour(self, flstate): - flstate.handles= self.handles - flstate.labels= self.doesnothing - flstate.hatchit= self.hatchit + flstate.handles = self.handles + flstate.labels = self.doesnothing + flstate.hatchit = self.hatchit def __call__(self): if not self.xplotting_grids: @@ -280,11 +301,9 @@ def __call__(self): all_vals = [] for flindex, fl in enumerate(self.firstgrid.flavours): - parton_name = basis.elementlabel(fl) self.labels.append(f'${parton_name}$') - flstate = FlavourState(flindex=flindex, fl=fl, fig=fig, ax=ax, - parton_name=parton_name) + flstate = FlavourState(flindex=flindex, fl=fl, fig=fig, ax=ax, parton_name=parton_name) self.setup_flavour(flstate) for pdf, grid in zip(self.pdfs, self.xplotting_grids): @@ -292,14 +311,13 @@ def __call__(self): if limits is not None: all_vals.append(np.atleast_2d(limits)) - #It can happen that we don't get anything to concatenate - #e.g. because we are comparing to the base PDF several times. + # It can happen that we don't get anything to concatenate + # e.g. because we are comparing to the base PDF several times. 
if all_vals: - plotutils.frame_center(ax, self.firstgrid.xgrid, - np.concatenate(all_vals)) - if (self.ymin is not None): + plotutils.frame_center(ax, self.firstgrid.xgrid, np.concatenate(all_vals)) + if self.ymin is not None: ax.set_ylim(ymin=self.ymin) - if (self.ymax is not None): + if self.ymax is not None: ax.set_ylim(ymax=self.ymax) ax.set_axisbelow(True) @@ -322,7 +340,6 @@ def get_ylabel(self, parton_name): return "Distance from {}".format(self.normalize_pdf.label) def draw(self, pdf, grid, flstate): - if pdf == self.normalize_pdf: return None @@ -354,19 +371,27 @@ def get_title(self, parton_name): return f'{self.pdfs[(1+self.normalize_to)%2]} Q={self.Q : .1f} GeV' -class FlavoursDistancePlotter(DistancePDFPlotter, AllFlavoursPlotter): pass +class FlavoursDistancePlotter(DistancePDFPlotter, AllFlavoursPlotter): + pass -class FlavoursVarDistancePlotter(VarDistancePDFPlotter, AllFlavoursPlotter): pass +class FlavoursVarDistancePlotter(VarDistancePDFPlotter, AllFlavoursPlotter): + pass @figure @check_pdf_normalize_to @check_have_two_pdfs @check_scale('xscale', allow_none=True) -def plot_pdfdistances(pdfs, distance_grids, *, - xscale:(str,type(None))=None, - normalize_to:(int,str),ymin=None,ymax=None): +def plot_pdfdistances( + pdfs, + distance_grids, + *, + xscale: (str, type(None)) = None, + normalize_to: (int, str), + ymin=None, + ymax=None, +): """Plots the distances between different PDF sets and a reference PDF set for all flavours. Distances are normalized such that a value of order 10 is unlikely to be explained by purely statistical fluctuations @@ -378,24 +403,27 @@ def plot_pdfdistances(pdfs, distance_grids, *, @check_pdf_normalize_to @check_have_two_pdfs @check_scale('xscale', allow_none=True) -def plot_pdfvardistances(pdfs, variance_distance_grids, *, - xscale:(str,type(None))=None, - normalize_to:(int,str),ymin=None,ymax=None): +def plot_pdfvardistances( + pdfs, + variance_distance_grids, + *, + xscale: (str, type(None)) = None, + normalize_to: (int, str), + ymin=None, + ymax=None, +): """Plots the distances between different PDF sets and a reference PDF set for all flavours. 
Distances are normalized such that a value of order 10 is unlikely to be explained by purely statistical fluctuations """ - return FlavoursVarDistancePlotter(pdfs, variance_distance_grids, xscale, normalize_to, ymin, ymax)() + return FlavoursVarDistancePlotter( + pdfs, variance_distance_grids, xscale, normalize_to, ymin, ymax + )() class BandPDFPlotter(PDFPlotter): def __init__( - self, - *args, - pdfs_noband=None, - show_mc_errors=True, - legend_stat_labels=True, - **kwargs + self, *args, pdfs_noband=None, show_mc_errors=True, legend_stat_labels=True, **kwargs ): if pdfs_noband is None: pdfs_noband = [] @@ -405,9 +433,9 @@ def __init__( super().__init__(*args, **kwargs) def setup_flavour(self, flstate): - flstate.handles=[] - flstate.labels=[] - flstate.hatchit=plotutils.hatch_iter() + flstate.handles = [] + flstate.labels = [] + flstate.hatchit = plotutils.hatch_iter() def draw(self, pdf, grid, flstate): ax = flstate.ax @@ -420,58 +448,55 @@ def draw(self, pdf, grid, flstate): next_prop = next(ax._get_lines.prop_cycler) cv = stats.central_value() xgrid = grid.xgrid - #Ignore spurious normalization warnings + # Ignore spurious normalization warnings with warnings.catch_warnings(): warnings.simplefilter('ignore', RuntimeWarning) err68down, err68up = stats.errorbar68() - #http://stackoverflow.com/questions/5195466/matplotlib-does-not-display-hatching-when-rendering-to-pdf + # http://stackoverflow.com/questions/5195466/matplotlib-does-not-display-hatching-when-rendering-to-pdf hatch = next(hatchit) color = next_prop['color'] - cvline, = ax.plot(xgrid, cv, color=color) + (cvline,) = ax.plot(xgrid, cv, color=color) if pdf in self.pdfs_noband: labels.append(pdf.label) handles.append(cvline) return [cv, cv] alpha = 0.5 - ax.fill_between(xgrid, err68up, err68down, color=color, alpha=alpha, - zorder=1) + ax.fill_between(xgrid, err68up, err68down, color=color, alpha=alpha, zorder=1) - ax.fill_between(xgrid, err68up, err68down, facecolor='None', alpha=alpha, - edgecolor=color, - hatch=hatch, - zorder=1) + ax.fill_between( + xgrid, + err68up, + err68down, + facecolor='None', + alpha=alpha, + edgecolor=color, + hatch=hatch, + zorder=1, + ) if isinstance(stats, MCStats) and self.show_mc_errors: errorstdup, errorstddown = stats.errorbarstd() ax.plot(xgrid, errorstdup, linestyle='--', color=color) ax.plot(xgrid, errorstddown, linestyle='--', color=color) label = ( - rf"{pdf.label} ($68\%$ c.l.+$1\sigma$)" - if self.legend_stat_labels - else pdf.label + rf"{pdf.label} ($68\%$ c.l.+$1\sigma$)" if self.legend_stat_labels else pdf.label ) outer = True else: outer = False - label = ( - rf"{pdf.label} ($68\%$ c.l.)" - if self.legend_stat_labels - else pdf.label - ) - handle = plotutils.HandlerSpec(color=color, alpha=alpha, - hatch=hatch, - outer=outer) + label = rf"{pdf.label} ($68\%$ c.l.)" if self.legend_stat_labels else pdf.label + handle = plotutils.HandlerSpec(color=color, alpha=alpha, hatch=hatch, outer=outer) handles.append(handle) labels.append(label) return [err68down, err68up] def legend(self, flstate): - return flstate.ax.legend(flstate.handles, flstate.labels, - handler_map={plotutils.HandlerSpec: - plotutils.ComposedHandler() - } - ) + return flstate.ax.legend( + flstate.handles, + flstate.labels, + handler_map={plotutils.HandlerSpec: plotutils.ComposedHandler()}, + ) @figuregen @@ -664,10 +689,17 @@ class FlavoursPlotter(AllFlavoursPlotter, BandPDFPlotter): def get_title(self, parton_name): return f'{self.pdfs[0]} Q={self.Q : .1f} GeV' + @figure @check_scale('xscale', allow_none=True) -def 
plot_flavours(pdf, xplotting_grid, xscale:(str,type(None))=None, - normalize_to:(int,str,type(None))=None,ymin=None,ymax=None): +def plot_flavours( + pdf, + xplotting_grid, + xscale: (str, type(None)) = None, + normalize_to: (int, str, type(None)) = None, + ymin=None, + ymax=None, +): """Plot the absolute central value and the uncertainty of all the flavours of a pdf as a function of x for a given value of Q. @@ -675,7 +707,10 @@ def plot_flavours(pdf, xplotting_grid, xscale:(str,type(None))=None, set based on the scale in xgrid, which should be used instead. """ - return FlavoursPlotter([pdf], [xplotting_grid], xscale, normalize_to=None, ymin= ymin, ymax=ymax)() + return FlavoursPlotter( + [pdf], [xplotting_grid], xscale, normalize_to=None, ymin=ymin, ymax=ymax + )() + @figure @check_pdf_normalize_to @@ -692,7 +727,7 @@ def plot_lumi1d( ymax: (numbers.Real, type(None)) = None, pdfs_noband=None, scale="log", - legend_stat_labels: bool=True, + legend_stat_labels: bool = True, ): """Plot PDF luminosities at a given center of mass energy. sqrts is the center of mass energy (GeV). @@ -743,9 +778,7 @@ def plot_lumi1d( labels.append(pdf.label) continue - ax.fill_between( - mx, err68down / norm, err68up / norm, color=color, alpha=alpha, zorder=1 - ) + ax.fill_between(mx, err68down / norm, err68up / norm, color=color, alpha=alpha, zorder=1) ax.fill_between( mx, err68down / norm, @@ -766,9 +799,7 @@ def plot_lumi1d( label_add = r"($68\%$ c.l.)" if legend_stat_labels else "" outer = False - handle = plotutils.HandlerSpec( - color=color, alpha=alpha, hatch=hatch, outer=outer - ) + handle = plotutils.HandlerSpec(color=color, alpha=alpha, hatch=hatch, outer=outer) handles.append(handle) labels.append(f"{pdf.label} {label_add}") @@ -784,7 +815,7 @@ def plot_lumi1d( ax.set_ylim(ymin, ymax) ax.set_xscale(scale) ax.grid(False) - if y_cut==None: + if y_cut == None: ax.set_title( f"${LUMI_CHANNELS[lumi_channel]}$ luminosity\n" f"$\\sqrt{{s}}={format_number(sqrts/1000)}$ TeV" @@ -810,7 +841,7 @@ def plot_lumi1d_uncertainties( normalize_to=None, ymin: (numbers.Real, type(None)) = None, ymax: (numbers.Real, type(None)) = None, - scale = "log", + scale="log", ): """Plot PDF luminosity uncertainties at a given center of mass energy. sqrts is the center of mass energy (GeV). 
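For orientation, the band-drawing idiom being reflowed in `BandPDFPlotter.draw` and `plot_lumi1d` above (a central-value line plus a filled, hatched 68% band) looks roughly like the self-contained sketch below; the grid and band values are invented.

```
import numpy as np
from matplotlib.backends.backend_agg import FigureCanvasAgg
from matplotlib.figure import Figure

xgrid = np.logspace(-4, 0, 50)             # toy x grid
cv = np.exp(-xgrid)                        # toy central value
err68down, err68up = 0.9 * cv, 1.1 * cv    # toy 68% band edges

fig = Figure()
FigureCanvasAgg(fig)  # attach an Agg canvas so the figure can be rasterized
ax = fig.add_subplot(1, 1, 1)

(cvline,) = ax.plot(xgrid, cv, color='C0')
ax.fill_between(xgrid, err68up, err68down, color='C0', alpha=0.5, zorder=1)
ax.fill_between(
    xgrid, err68up, err68down, facecolor='None', alpha=0.5, edgecolor='C0', hatch='//', zorder=1
)
ax.set_xscale('log')
fig.savefig('band_sketch.png')
```

Drawing the hatch as a second `fill_between` with `facecolor='None'` appears to be the workaround referenced in the StackOverflow link kept in the comment above, for hatching not rendering when exporting to PDF.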
@@ -845,7 +876,7 @@ def plot_lumi1d_uncertainties( ax.set_xlim(mx[0], mx[-1]) ax.set_xscale(scale) ax.grid(False) - if y_cut==None: + if y_cut is None: ax.set_title( f"${LUMI_CHANNELS[lumi_channel]}$ luminosity uncertainty\n" f"$\\sqrt{{s}}={format_number(sqrts/1000)}$ TeV" @@ -860,11 +891,9 @@ def plot_lumi1d_uncertainties( current_ymin, _ = ax.get_ylim() ax.set_ylim(max(0, current_ymin), None) - return fig - @figure @check_pdf_normalize_to def plot_lumi1d_replicas( @@ -917,10 +946,8 @@ def plot_lumi1d_replicas( color = next(pcycler) - ax.plot(mx, (replicas/norm).T, alpha=0.2, linewidth=0.5, - color=color, zorder=1) - line, = ax.plot(mx, cv/norm, color=color, - linewidth=2) + ax.plot(mx, (replicas / norm).T, alpha=0.2, linewidth=0.5, color=color, zorder=1) + (line,) = ax.plot(mx, cv / norm, color=color, linewidth=2) lines.append(line) labels.append(pdf.label) @@ -929,9 +956,9 @@ def plot_lumi1d_replicas( ax.set_xlim(mx[0], mx[-1]) ax.set_ylim(ymin, ymax) ax.set_xscale(scale) - ax.legend(lines,labels) + ax.legend(lines, labels) ax.grid(False) - if y_cut==None: + if y_cut is None: ax.set_title( f"${LUMI_CHANNELS[lumi_channel]}$ luminosity\n" f"$\\sqrt{{s}}={format_number(sqrts/1000)}$ TeV" @@ -946,8 +973,7 @@ def plot_lumi1d_replicas( return fig - -#TODO: Move these to utils somewhere? Find better implementations? +# TODO: Move these to utils somewhere? Find better implementations? def _reflect_matrl(mat, odd=False): """Reflect a matrix with positive values in the first axis to have the same balues for the nwgative axis. The first value is not reflected. @@ -956,13 +982,14 @@ def _reflect_matrl(mat, odd=False): """ mat = np.asarray(mat) - res = np.empty(shape=(mat.shape[0]*2-1, *mat.shape[1:]),dtype=mat.dtype) - neglen = mat.shape[0]-1 + res = np.empty(shape=(mat.shape[0] * 2 - 1, *mat.shape[1:]), dtype=mat.dtype) + neglen = mat.shape[0] - 1 fact = -1 if odd else 1 - res[:neglen,...] = fact*mat[:0:-1,...] - res[neglen:,...] = mat + res[:neglen, ...] = fact * mat[:0:-1, ...] + res[neglen:, ...] = mat return res + def _reflect_matud(mat, odd=False): """Reflect a matrix with positive values in the second axis to have the same balues for the nwgative axis. The first value is not reflected. @@ -971,18 +998,16 @@ def _reflect_matud(mat, odd=False): """ mat = np.asarray(mat) - res = np.empty(shape=(mat.shape[0], mat.shape[1]*2-1, *mat.shape[2:]), - dtype=mat.dtype) - neglen = mat.shape[1]-1 + res = np.empty(shape=(mat.shape[0], mat.shape[1] * 2 - 1, *mat.shape[2:]), dtype=mat.dtype) + neglen = mat.shape[1] - 1 fact = -1 if odd else 1 - res[:,:neglen,...] = fact*mat[:,:0:-1,...] - res[:,neglen:,...] = mat + res[:, :neglen, ...] = fact * mat[:, :0:-1, ...] + res[:, neglen:, ...] = mat return res @figure -def plot_lumi2d(pdf, lumi_channel, lumigrid2d, sqrts, - display_negative:bool=True): +def plot_lumi2d(pdf, lumi_channel, lumigrid2d, sqrts, display_negative: bool = True): """Plot the absolute luminosity on a grid of invariant mass and rapidity for a given center of mass energy `sqrts`. The color scale is logarithmic. @@ -993,7 +1018,6 @@ def plot_lumi2d(pdf, lumi_channel, lumigrid2d, sqrts, """ - cmap = copy.copy(cm.viridis_r) cmap.set_bad("white", alpha=0) fig, ax = plotutils.subplots() @@ -1006,38 +1030,46 @@ def plot_lumi2d(pdf, lumi_channel, lumigrid2d, sqrts, y = _reflect_matrl(lumigrid2d.y, odd=True) masked_weights = np.ma.masked_invalid(mat, copy=False) - #TODO: SymLogNorm is really the right thing to do here, but I can't be - #bothered to make it work. 
Mostly the ticks around zero are completely - #broken and looks like it takes a lot of fidlling wirh the mpl internals - #to fix it. + # TODO: SymLogNorm is really the right thing to do here, but I can't be + # bothered to make it work. Mostly the ticks around zero are completely + # broken and looks like it takes a lot of fidlling wirh the mpl internals + # to fix it. with np.errstate(invalid='ignore'): - positive_mask = masked_weights>0 - linlim = np.nanpercentile(masked_weights[positive_mask],90)/1e5 + positive_mask = masked_weights > 0 + linlim = np.nanpercentile(masked_weights[positive_mask], 90) / 1e5 - #norm = mcolors.SymLogNorm(linlim, vmin=None) + # norm = mcolors.SymLogNorm(linlim, vmin=None) norm = mcolors.LogNorm(vmin=linlim) with np.errstate(invalid='ignore'): - masked_weights[masked_weights 50 extdown = np.nanmin(masked_weights) < 1 - #TODO: Wrap this somewhere + # TODO: Wrap this somewhere if extup: if extdown: extend = 'both' @@ -1105,12 +1144,18 @@ def plot_lumi2d_uncertainty(pdf, lumi_channel, lumigrid2d, sqrts:numbers.Real): else: extend = None - fig.colorbar(mesh, label="Relative uncertainty (%)", - ticks=[1,5,10,25,50], format='%.0f', extend=extend) + fig.colorbar( + mesh, + label="Relative uncertainty (%)", + ticks=[1, 5, 10, 25, 50], + format='%.0f', + extend=extend, + ) ax.set_yscale('log') - ax.set_title("Relative uncertainty for $%s$-luminosity\n%s - " - "$\\sqrt{s}=%.1f$ GeV" % (LUMI_CHANNELS[channel], - pdf.label, sqrts)) + ax.set_title( + ("Relative uncertainty for $%s$-luminosity\n%s - $\\sqrt{s}=%.1f$ GeV") + % (LUMI_CHANNELS[channel], pdf.label, sqrts) + ) ax.set_ylabel('$m_{X}$ (GeV)') ax.set_xlabel('y') ax.grid(False) @@ -1123,6 +1168,7 @@ class MixBandPDFPlotter(BandPDFPlotter): depending on the type of PDF. Practical use: plot together the PDF central values with the NNPDF bands """ + def __init__(self, *args, mixband_as_replicas, **kwargs): self.mixband_as_replicas = mixband_as_replicas super().__init__(*args, **kwargs) @@ -1136,10 +1182,8 @@ def draw(self, pdf, grid, flstate): color = next_prop['color'] stats = grid.select_flavour(flstate.flindex).grid_values gv = stats.data - ax.plot(grid.xgrid, gv.T, alpha=0.2, linewidth=0.5, - color=color, zorder=1) - cv_line = ax.plot(grid.xgrid[0:1], stats.central_value()[0:1], - color=color, linewidth=2) + ax.plot(grid.xgrid, gv.T, alpha=0.2, linewidth=0.5, color=color, zorder=1) + cv_line = ax.plot(grid.xgrid[0:1], stats.central_value()[0:1], color=color, linewidth=2) handle = cv_line[0] labels.append(pdf.label) handles.append(handle) diff --git a/validphys2/src/validphys/pineparser.py b/validphys2/src/validphys/pineparser.py index 8edb23f166..cfe1db6e1f 100644 --- a/validphys2/src/validphys/pineparser.py +++ b/validphys2/src/validphys/pineparser.py @@ -8,7 +8,6 @@ import pandas as pd from reportengine.compat import yaml - from validphys.coredata import FKTableData ########### This part might eventually be part of whatever commondata reader @@ -239,6 +238,7 @@ def pineappl_reader(fkspec): an FKTableData object containing all necessary information to compute predictions """ from pineappl.fk_table import FkTable + pines = [FkTable.read(i) for i in fkspec.fkpath] cfactors = fkspec.load_cfactors() @@ -271,7 +271,6 @@ def pineappl_reader(fkspec): partial_fktables = [] ndata = 0 for i, p in enumerate(pines): - # Start by reading possible cfactors if cfactor is not empty cfprod = 1.0 if cfactors: @@ -297,9 +296,9 @@ def pineappl_reader(fkspec): missing_x_points = np.setdiff1d(xgrid, p.x_grid(), assume_unique=True) for 
x_point in missing_x_points: miss_index = list(xgrid).index(x_point) - raw_fktable = np.insert(raw_fktable, miss_index, 0., axis=2) + raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=2) if hadronic: - raw_fktable = np.insert(raw_fktable, miss_index, 0., axis=3) + raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=3) # Check conversion factors and remove the x* from the fktable raw_fktable *= fkspec.metadata.get("conversion_factor", 1.0) / xdivision diff --git a/validphys2/src/validphys/plotoptions/__init__.py b/validphys2/src/validphys/plotoptions/__init__.py index 9d20947d68..ed3c59a2b7 100644 --- a/validphys2/src/validphys/plotoptions/__init__.py +++ b/validphys2/src/validphys/plotoptions/__init__.py @@ -5,5 +5,9 @@ @author: Zahari Kassabov """ -from validphys.plotoptions.core import (get_info, kitable, transform_result, - get_xq2map) #analysis:ignore \ No newline at end of file +from validphys.plotoptions.core import ( # analysis:ignore + get_info, + get_xq2map, + kitable, + transform_result, +) diff --git a/validphys2/src/validphys/plotoptions/core.py b/validphys2/src/validphys/plotoptions/core.py index 45caa9fb18..941da091c8 100644 --- a/validphys2/src/validphys/plotoptions/core.py +++ b/validphys2/src/validphys/plotoptions/core.py @@ -7,22 +7,20 @@ import dataclasses import enum import logging +import numbers import typing import numpy as np import pandas as pd -import numbers - from validobj import ValidationError -from reportengine.floatformatting import format_number from reportengine.compat import yaml -from reportengine.utils import get_functions, ChainMap - -from validphys.core import CommonDataSpec, DataSetSpec, Cuts, InternalCutsWrapper +from reportengine.floatformatting import format_number +from reportengine.utils import ChainMap, get_functions +from validphys.core import CommonDataSpec, Cuts, DataSetSpec, InternalCutsWrapper from validphys.coredata import CommonData +from validphys.plotoptions import kintransforms, labelers, resulttransforms from validphys.plotoptions.utils import apply_to_all_columns, get_subclasses -from validphys.plotoptions import labelers, kintransforms, resulttransforms from validphys.utils import parse_yaml_inp log = logging.getLogger(__name__) @@ -36,6 +34,7 @@ ResultTransformations = enum.Enum('ResultTransformations', list(result_functions.keys())) TransformFunctions = enum.Enum('TransformFunctions', list(transform_functions.keys())) + def get_info(data, *, normalize=False, cuts=None, use_plotfiles=True): """Retrieve and process the plotting information for the input data (which could be a DatasetSpec or a CommonDataSpec). 
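One small aside on the `pineappl_reader` hunk further up: `np.insert` is what pads a zero column for every x-grid point missing from a partial fktable. A toy version of that padding, with made-up shapes, is:

```
import numpy as np

# Toy "fktable" with shape (ndata, nbasis, nx); real shapes differ.
raw_fktable = np.ones((2, 3, 4))
xgrid = np.array([0.1, 0.2, 0.3, 0.4, 0.5])      # target x grid (5 points)
partial_xgrid = np.array([0.1, 0.2, 0.4, 0.5])   # grid actually stored (4 points)

missing_x_points = np.setdiff1d(xgrid, partial_xgrid, assume_unique=True)
for x_point in missing_x_points:
    miss_index = list(xgrid).index(x_point)
    # Insert a column of zeros at the position of the missing x point
    raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=2)

assert raw_fktable.shape == (2, 3, 5)
```

For hadronic tables the hunk repeats the same insertion along the second x axis (`axis=3`).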
@@ -66,11 +65,12 @@ def get_info(data, *, normalize=False, cuts=None, use_plotfiles=True): if isinstance(data, DataSetSpec): data = data.commondata if not isinstance(data, CommonDataSpec): - raise TypeError("Unrecognized data type: %s" % type(data) ) + raise TypeError("Unrecognized data type: %s" % type(data)) info = PlotInfo.from_commondata(data, cuts=cuts, normalize=normalize) return info + class PlotInfo: def __init__( self, @@ -137,8 +137,6 @@ def get_xcol(self, table): else: return np.asarray(table[self.x]) - - def group_label(self, same_vals, groupby): if not groupby: return '' @@ -152,11 +150,8 @@ def group_label(self, same_vals, groupby): pieces.append('%s = %s' % (label, val)) return '%s' % ' '.join(pieces) - - @classmethod def from_commondata(cls, commondata, cuts=None, normalize=False): - plot_params = ChainMap() if commondata.plotfiles: for file in commondata.plotfiles: @@ -172,12 +167,12 @@ def from_commondata(cls, commondata, cuts=None, normalize=False): plot_params['dataset_label'] = commondata.name else: - plot_params = {'dataset_label':commondata.name} + plot_params = {'dataset_label': commondata.name} kinlabels = commondata.plot_kinlabels kinlabels = plot_params['kinematics_override'].new_labels(*kinlabels) if "extra_labels" in plot_params and cuts is not None: - cut_extra_labels ={ + cut_extra_labels = { k: [v[i] for i in cuts] for k, v in plot_params["extra_labels"].items() } plot_params["extra_labels"] = cut_extra_labels @@ -260,7 +255,6 @@ def parse_x(self): f"The label {self.x} is not in the set of known labels {self.all_labels}" ) - @property def all_labels(self): if self.extra_labels is None: @@ -269,9 +263,7 @@ def all_labels(self): def __post_init__(self): if self.kinematics_override is not None: - self.kinematics_override = transform_functions[ - self.kinematics_override.name - ]() + self.kinematics_override = transform_functions[self.kinematics_override.name]() if self.result_transform is not None: self.result_transform = result_functions[self.result_transform.name] @@ -311,7 +303,7 @@ def kitable(data, info, *, cuts=None): if isinstance(data, DataSetSpec): data = data.load_commondata() elif isinstance(data, CommonDataSpec): - data = data.load() + data = data.load() table = pd.DataFrame(data.get_kintable(), columns=default_labels[1:]) if isinstance(data, CommonData) and cuts is not None: @@ -321,9 +313,9 @@ def kitable(data, info, *, cuts=None): transform = apply_to_all_columns(table, info.kinematics_override) table = pd.DataFrame(np.array(transform).T, columns=table.columns, index=table.index) - #TODO: This is a little bit ugly. We want to call the functions - #with all the - #extra labels + # TODO: This is a little bit ugly. 
We want to call the functions + # with all the + # extra labels if info.extra_labels: vals = tuple(info.extra_labels.items()) else: @@ -340,21 +332,23 @@ def kitable(data, info, *, cuts=None): nreal_labels = len(table.columns) for label, func in funcs: - #Pass only the "real" labels and not the derived functions - table[label] = apply_to_all_columns(table.iloc[:,:nreal_labels], func) + # Pass only the "real" labels and not the derived functions + table[label] = apply_to_all_columns(table.iloc[:, :nreal_labels], func) return table + def transform_result(cv, error, kintable, info): if not info.result_transform: return cv, error f = info.result_transform - df = pd.DataFrame({'cv':cv, 'error':error}) - newcv, newerror = apply_to_all_columns(pd.concat([df,kintable], axis=1),f) + df = pd.DataFrame({'cv': cv, 'error': error}) + newcv, newerror = apply_to_all_columns(pd.concat([df, kintable], axis=1), f) return np.array(newcv), np.array(newerror) + def get_xq2map(kintable, info): """Return a tuple of (x,Q²) from the kinematic values defined in kitable (usually obtained by calling ``kitable``) using machinery specified in diff --git a/validphys2/src/validphys/plotoptions/kintransforms.py b/validphys2/src/validphys/plotoptions/kintransforms.py index 786d166f7b..5ba7878ea3 100644 --- a/validphys2/src/validphys/plotoptions/kintransforms.py +++ b/validphys2/src/validphys/plotoptions/kintransforms.py @@ -60,45 +60,53 @@ def xq2map(self, k1:np.array,k2:np.array,k3:np.array,**extra_labels) -> (np.arra """ -#TODO: fix the issue with Zmass and top mass - make them (globa?) constants -ZMASS=91.1876 -TMASS=173.3 +# TODO: fix the issue with Zmass and top mass - make them (globa?) constants +ZMASS = 91.1876 +TMASS = 173.3 import abc import numpy as np + class Kintransform(metaclass=abc.ABCMeta): @classmethod def __subclasshook__(cls, other): - return hasattr(other, 'xq2map') and hasattr(other, '__call__') and hasattr(other, 'new_labels') + return ( + hasattr(other, 'xq2map') and hasattr(other, '__call__') and hasattr(other, 'new_labels') + ) + + +# Common utilities on top of which we build the transforms -#Common utilities on top of which we build the transforms class SqrtScaleMixin: def __call__(self, k1, k2, k3): - return k1, np.sqrt(k2), k3 + return k1, np.sqrt(k2), k3 qlabel = NotImplemented def new_labels(self, s1, s2, s3): return s1, self.qlabel, s3 + class DISXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in DIS-like experiment k1 is x, k2 is Q""" - return k1, k2*k2 + return k1, k2 * k2 + class DYXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in DY-like experiments k1 is (pseudo)-rapidity and k2 is Q for each point in the experiment there are two points in the xQ2 map""" - ratio = k2/k3 - x1 = ratio*np.exp(k1) - x2 = ratio*np.exp(-k1) - q2 = k2*k2 - x = np.concatenate(( x1,x2 )) - return np.clip(x,a_min=None,a_max=1, out=x), np.concatenate(( q2,q2 )) + ratio = k2 / k3 + x1 = ratio * np.exp(k1) + x2 = ratio * np.exp(-k1) + q2 = k2 * k2 + x = np.concatenate((x1, x2)) + return np.clip(x, a_min=None, a_max=1, out=x), np.concatenate((q2, q2)) + class JETXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): @@ -106,12 +114,13 @@ def xq2map(self, k1, k2, k3, **extra_labels): k1 is (pseudo)-rapidity and k2 is pT plotting both x1 and x2 """ - ratio = k2/k3 + ratio = k2 / k3 x1 = 2 * ratio * np.exp(k1) x2 = 2 * ratio * np.exp(-k1) - q2 = k2*k2 - x = np.concatenate(( x1,x2 )) - return np.clip(x,a_min=None,a_max=1, out=x), np.concatenate(( q2,q2 )) + q2 = k2 * k2 + x = 
np.concatenate((x1, x2)) + return np.clip(x, a_min=None, a_max=1, out=x), np.concatenate((q2, q2)) + class DIJETXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): @@ -119,12 +128,13 @@ def xq2map(self, k1, k2, k3, **extra_labels): k1 is max(|y1|,|y2|) and k2 is m12 plotting both x1 and x2 """ - ratio = k2/k3 + ratio = k2 / k3 x1 = ratio * np.exp(k1) x2 = ratio * np.exp(-k1) - q2 = k2*k2 - x = np.concatenate(( x1,x2 )) - return np.clip(x,a_min=None,a_max=1, out=x), np.concatenate(( q2,q2 )) + q2 = k2 * k2 + x = np.concatenate((x1, x2)) + return np.clip(x, a_min=None, a_max=1, out=x), np.concatenate((q2, q2)) + class DIJETATLASXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): @@ -132,15 +142,16 @@ def xq2map(self, k1, k2, k3, **extra_labels): k1 is rapidity difference and k2 is m12 plotting both x1 and x2 """ - ratio = k2/k3 - #x1 = ratio - #x2 = np.full_like(x1, 1.0) + ratio = k2 / k3 + # x1 = ratio + # x2 = np.full_like(x1, 1.0) x1 = ratio * np.exp(k1) x2 = ratio * np.exp(-k1) - q2 = k2*k2 - x = np.concatenate(( x1,x2 )) - return np.clip(x,a_min=None,a_max=1, out=x), np.concatenate(( q2,q2 )) - + q2 = k2 * k2 + x = np.concatenate((x1, x2)) + return np.clip(x, a_min=None, a_max=1, out=x), np.concatenate((q2, q2)) + + class DIJET3DXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """ @@ -149,152 +160,203 @@ def xq2map(self, k1, k2, k3, **extra_labels): plotting both x1 and x2 """ sqrts = 8000 - ratio = k2/sqrts + ratio = k2 / sqrts prefactor = ratio * (np.exp(k1) + np.exp(-k1)) x1 = prefactor * np.exp(k3) x2 = prefactor * np.exp(-k3) - q2 = k2*k2 - x = np.concatenate(( x1,x2 )) - #print(k1[55],k2[55],k3[55],x[55],np.argwhere(x>1)) - return np.clip(x,a_min=None,a_max=1, out=x), np.concatenate(( q2,q2 )) - + q2 = k2 * k2 + x = np.concatenate((x1, x2)) + # print(k1[55],k2[55],k3[55],x[55],np.argwhere(x>1)) + return np.clip(x, a_min=None, a_max=1, out=x), np.concatenate((q2, q2)) - class EWPTXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in ZPt-like Experiments k1 is the pt, k2 is Q""" - zmass2 = ZMASS*ZMASS - Q = (np.sqrt(zmass2+k1*k1)+k1) - effQ = np.sqrt(zmass2+k1*k1) - return Q/k3, effQ*effQ + zmass2 = ZMASS * ZMASS + Q = np.sqrt(zmass2 + k1 * k1) + k1 + effQ = np.sqrt(zmass2 + k1 * k1) + return Q / k3, effQ * effQ + class DYMXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in DYM-like experiments the k1 is the mass, k2 is the mass""" - return k2/k3, k2*k2 + return k2 / k3, k2 * k2 + class HQPTXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in HQPt-like Experiments k1 is the pt, k2 is Q""" - QMASS2 = TMASS*TMASS - Q = (np.sqrt(QMASS2+k1*k1)+k1) - return Q/k3, Q*Q + QMASS2 = TMASS * TMASS + Q = np.sqrt(QMASS2 + k1 * k1) + k1 + return Q / k3, Q * Q + class HQQPTXQ2MapMixin: def xq2map(self, k1, k2, k3, **extra_labels): """in ZPt-like Experiments k1 is the pt, k2 is Q""" - QQMASS2 = (2*TMASS)*(2*TMASS) - Q = (np.sqrt(QQMASS2+k1*k1)+k1) - return Q/k3, Q*Q + QQMASS2 = (2 * TMASS) * (2 * TMASS) + Q = np.sqrt(QQMASS2 + k1 * k1) + k1 + return Q / k3, Q * Q class dyp_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): qlabel = '$M (GeV)$' -class jet_sqrt_scale(SqrtScaleMixin,JETXQ2MapMixin): +class jet_sqrt_scale(SqrtScaleMixin, JETXQ2MapMixin): def new_labels(self, *old_labels): return ('$|y|$', '$p_T$ (GeV)', r'$\sqrt{s} (GeV)$') -class dijet_sqrt_scale(SqrtScaleMixin,DIJETXQ2MapMixin): + +class dijet_sqrt_scale(SqrtScaleMixin, DIJETXQ2MapMixin): def new_labels(self, *old_labels): return ('$|y|$', '$m_{12}$ (GeV)', r'$\sqrt{s} (GeV)$') 
-class dijet_sqrt_scale_ATLAS(SqrtScaleMixin,DIJETATLASXQ2MapMixin): + +class dijet_sqrt_scale_ATLAS(SqrtScaleMixin, DIJETATLASXQ2MapMixin): def __call__(self, k1, k2, k3): return k1, k2, k3 def new_labels(self, *old_labels): return ('$|y^*|$', '$m_{12}$ (GeV)', r'$\sqrt{s} (GeV)$') -class dijet_CMS_3D(SqrtScaleMixin,DIJET3DXQ2MapMixin): + +class dijet_CMS_3D(SqrtScaleMixin, DIJET3DXQ2MapMixin): def new_labels(self, *old_labels): return ('$|y^*|$', '$p_{T,avg}$ (GeV)', r'$|y_b|$') -class dijet_CMS_5TEV(SqrtScaleMixin,DIJET3DXQ2MapMixin): + +class dijet_CMS_5TEV(SqrtScaleMixin, DIJET3DXQ2MapMixin): def new_labels(self, *old_labels): - return ('$\eta_{dijet}$', '$p_{T,avg}$ (GeV)', r'$\sqrt{s} (GeV)$') + return ('$\eta_{dijet}$', '$p_{T,avg}$ (GeV)', r'$\sqrt{s} (GeV)$') + class dis_sqrt_scale(DISXQ2MapMixin): def __call__(self, k1, k2, k3): - ecm = np.sqrt(k2/(k1*k3)) + ecm = np.sqrt(k2 / (k1 * k3)) return k1, np.sqrt(k2), np.ceil(ecm) def new_labels(self, *old_labels): return ('$x$', '$Q$ (GeV)', r'$\sqrt{s} (GeV)$') -class ewj_jpt_sqrt_scale(SqrtScaleMixin,EWPTXQ2MapMixin): #okay but it does not exist + +class ewj_jpt_sqrt_scale(SqrtScaleMixin, EWPTXQ2MapMixin): # okay but it does not exist qlabel = '$M (GeV)$' -class ewj_jrap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): #EWJ_JRAP->DY ----> okay but it does not exist + +class ewj_jrap_sqrt_scale( + SqrtScaleMixin, DYXQ2MapMixin +): # EWJ_JRAP->DY ----> okay but it does not exist qlabel = '$M (GeV)$' -class ewj_mll_sqrt_scale(SqrtScaleMixin,DYMXQ2MapMixin): #EWJ_MLL->DYm ----> okay but it does not exist + +class ewj_mll_sqrt_scale( + SqrtScaleMixin, DYMXQ2MapMixin +): # EWJ_MLL->DYm ----> okay but it does not exist qlabel = '$M_{ll} (GeV)$' -class ewj_pt_sqrt_scale(SqrtScaleMixin,EWPTXQ2MapMixin): #EWJ_PT->DY ----> Zpt, okay but it does not exist + +class ewj_pt_sqrt_scale( + SqrtScaleMixin, EWPTXQ2MapMixin +): # EWJ_PT->DY ----> Zpt, okay but it does not exist qlabel = '$M (GeV)$' -class ewj_ptrap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): # EWJ_PTRAP -> DY okay, but it does not exist + +class ewj_ptrap_sqrt_scale( + SqrtScaleMixin, DYXQ2MapMixin +): # EWJ_PTRAP -> DY okay, but it does not exist qlabel = r'$p_T (GeV)$' -class ewj_rap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): # EWJ_RAP -> DY okay (can we get rid of it also in commondata?) + +class ewj_rap_sqrt_scale( + SqrtScaleMixin, DYXQ2MapMixin +): # EWJ_RAP -> DY okay (can we get rid of it also in commondata?) 
qlabel = '$M (GeV)$' -class ewk_mll_sqrt_scale(SqrtScaleMixin,DYMXQ2MapMixin): # EWK_MLL -> DYM okay + +class ewk_mll_sqrt_scale(SqrtScaleMixin, DYMXQ2MapMixin): # EWK_MLL -> DYM okay qlabel = '$M_{ll} (GeV)$' -class ewk_pt_sqrt_scale(SqrtScaleMixin,EWPTXQ2MapMixin): # EWK_PT -> Zpt okay + +class ewk_pt_sqrt_scale(SqrtScaleMixin, EWPTXQ2MapMixin): # EWK_PT -> Zpt okay qlabel = '$M (GeV)$' -class ewk_ptrap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): # EWK_PT -> DY okay + +class ewk_ptrap_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): # EWK_PT -> DY okay qlabel = r'$p_T (GeV)$' -class ewk_rap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): # EWK_RAP -> DY okay + +class ewk_rap_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): # EWK_RAP -> DY okay qlabel = '$M (GeV)$' -class hig_rap_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): #okay, but it does not exist + +class hig_rap_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): # okay, but it does not exist qlabel = '$M_H (GeV)$' -class hqp_mqq_sqrt_scale(SqrtScaleMixin,DYMXQ2MapMixin): # HQP_MQQ -> DYM okay + +class hqp_mqq_sqrt_scale(SqrtScaleMixin, DYMXQ2MapMixin): # HQP_MQQ -> DYM okay qlabel = r'$\mu (GeV)$' -class hqp_ptq_sqrt_scale(SqrtScaleMixin,HQPTXQ2MapMixin): # HQP_PTQ -> HQPT okay + +class hqp_ptq_sqrt_scale(SqrtScaleMixin, HQPTXQ2MapMixin): # HQP_PTQ -> HQPT okay qlabel = r'$\mu (GeV)$' -class hqp_ptqq_sqrt_scale(SqrtScaleMixin,HQQPTXQ2MapMixin): # HQP_PTQQ -> HQQPT okay + +class hqp_ptqq_sqrt_scale(SqrtScaleMixin, HQQPTXQ2MapMixin): # HQP_PTQQ -> HQQPT okay qlabel = r'$\mu (GeV)$' -class hqp_yq_sqrt_scale(SqrtScaleMixin,JETXQ2MapMixin): # HQP_YQ->JETXQ2 okay + +class hqp_yq_sqrt_scale(SqrtScaleMixin, JETXQ2MapMixin): # HQP_YQ->JETXQ2 okay qlabel = r'$\mu (GeV)$' -class hqp_yqq_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): #HQP_YQQ->DYXQ2 okay + +class hqp_yqq_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): # HQP_YQQ->DYXQ2 okay qlabel = r'$\mu (GeV)$' -class inc_sqrt_scale(SqrtScaleMixin,DYMXQ2MapMixin): # INC -> DYM okay + +class inc_sqrt_scale(SqrtScaleMixin, DYMXQ2MapMixin): # INC -> DYM okay qlabel = r'$\mu (GeV)$' -class pht_sqrt_scale(SqrtScaleMixin,DYXQ2MapMixin): #okay but not in commondata + +class pht_sqrt_scale(SqrtScaleMixin, DYXQ2MapMixin): # okay but not in commondata qlabel = r'$E_{T,\gamma} (GeV)$' -class sia_sqrt_scale(SqrtScaleMixin,DISXQ2MapMixin): #okay but not in commondata + +class sia_sqrt_scale(SqrtScaleMixin, DISXQ2MapMixin): # okay but not in commondata qlabel = '$Q (GeV)$' class nmc_process(DISXQ2MapMixin): - def __call__(self, k1,k2,k3): - xBins = [0.0045, 0.008, 0.0125, 0.0175, - 0.025, 0.035, 0.05, 0.07, 0.09, 0.11, - 0.14, 0.18, 0.225, 0.275, 0.35, 0.5] + def __call__(self, k1, k2, k3): + xBins = [ + 0.0045, + 0.008, + 0.0125, + 0.0175, + 0.025, + 0.035, + 0.05, + 0.07, + 0.09, + 0.11, + 0.14, + 0.18, + 0.225, + 0.275, + 0.35, + 0.5, + ] for x in np.nditer(k1, op_flags=['readwrite']): - x[...] = min(xBins, key=lambda y:abs(x-y)) - ecm = np.sqrt(k2/(k1*k3)) + x[...] 
= min(xBins, key=lambda y: abs(x - y)) + ecm = np.sqrt(k2 / (k1 * k3)) return k1, np.sqrt(k2), np.ceil(ecm) def new_labels(self, *old_labels): return ('$x$', '$Q$ (GeV)', r'$\sqrt{s} (GeV)$') + class ewk_pseudorapity_sqrt_scale(ewk_rap_sqrt_scale): def new_labels(self, *old_labels): superlabels = super().new_labels(*old_labels) diff --git a/validphys2/src/validphys/plotoptions/labelers.py b/validphys2/src/validphys/plotoptions/labelers.py index bd04a3f7fa..a2f0f2d0ef 100644 --- a/validphys2/src/validphys/plotoptions/labelers.py +++ b/validphys2/src/validphys/plotoptions/labelers.py @@ -6,41 +6,47 @@ """ import numpy as np -from validphys.plotoptions.utils import label -from validphys.plotoptions.utils import bins +from validphys.plotoptions.utils import bins, label + @label(r"$I(x>10^{-2})\times I(Q > 1000 GeV)$") def high_xq(k1, k2, k3, **kwargs): return (k1 > 1e-2) & (k2 > 1000) -def pt_ratio(k1, k2, k3 , **kwargs): - return k1/k2 -def jet_eta(k1,k2,k3,**kwargs): +def pt_ratio(k1, k2, k3, **kwargs): + return k1 / k2 + + +def jet_eta(k1, k2, k3, **kwargs): return k1 -def k2bins5(k1,k2,k3,**kwargs): + +def k2bins5(k1, k2, k3, **kwargs): qbin = bins(k2) qbin[:] = [int(x / 5) for x in qbin] return qbin -def k2bins6(k1,k2,k3,**kwargs): + +def k2bins6(k1, k2, k3, **kwargs): qbin = bins(k2) qbin[:] = [int(x / 6) for x in qbin] return qbin -def k2bins10(k1,k2,k3,**kwargs): + +def k2bins10(k1, k2, k3, **kwargs): qbin = bins(k2) qbin[:] = [int(x / 10) for x in qbin] return qbin + @label("$Q^2$ (GeV²)") -def two_Q2_bins(k1,k2,k3,**kwargs): - min_, median, max_ = np.percentile(k2, (0,50,100)) - firstlabel = '[%.2f, %.2f)'%(min_, median) - #Use dtype=object to avoid longer strings in secondlabel getting trimmed +def two_Q2_bins(k1, k2, k3, **kwargs): + min_, median, max_ = np.percentile(k2, (0, 50, 100)) + firstlabel = '[%.2f, %.2f)' % (min_, median) + # Use dtype=object to avoid longer strings in secondlabel getting trimmed res = np.array([firstlabel] * len(k2), dtype=object) - secondlabel = '[%.2f, %.2f]'%(median, max_) - res[k2>=median] = secondlabel + secondlabel = '[%.2f, %.2f]' % (median, max_) + res[k2 >= median] = secondlabel return res diff --git a/validphys2/src/validphys/plotoptions/resulttransforms.py b/validphys2/src/validphys/plotoptions/resulttransforms.py index 1eee93d645..5da4e397a4 100644 --- a/validphys2/src/validphys/plotoptions/resulttransforms.py +++ b/validphys2/src/validphys/plotoptions/resulttransforms.py @@ -23,44 +23,52 @@ def xbinexp(cv, error, **labels): """ -from validphys.plotoptions.utils import bins import numpy +from validphys.plotoptions.utils import bins + + class MissingLabelError(KeyError): def __init__(self, key_error): msg = "A label is required to perform the operation: %s" % key_error.args[0] super().__init__(msg) + def half(cv, error, **labels): - return cv/2, error/2 + return cv / 2, error / 2 -#TODO: Refactor these so we don't write the same code over and over? +# TODO: Refactor these so we don't write the same code over and over? 
+ def qbinEMC(cv, error, **labels): q = labels['k2'] qbin = numpy.sqrt(q) - k = (float(10)**qbin) - return k*cv, k*error + k = float(10) ** qbin + return k * cv, k * error + def qbinexp(cv, error, **labels): q = labels['k2'] qbin = bins(q) - k = float(10)**qbin - return k*cv, k*error + k = float(10) ** qbin + return k * cv, k * error + def qbindis(cv, error, **labels): q = labels['k1'] qbin = bins(q) - k = float(10)**(10-qbin) - return k*cv, k*error + k = float(10) ** (10 - qbin) + return k * cv, k * error + def qbinjets(cv, error, **labels): qbin = labels['k1'] - k = float(1000)**(5-qbin) - return k*cv, k*error + k = float(1000) ** (5 - qbin) + return k * cv, k * error + def qbindyp(cv, error, **labels): qbin = labels['k1'] - k = float(10000)**(qbin) - return k*cv, k*error + k = float(10000) ** (qbin) + return k * cv, k * error diff --git a/validphys2/src/validphys/plotoptions/utils.py b/validphys2/src/validphys/plotoptions/utils.py index a5ad54d06f..1393d7f5b4 100644 --- a/validphys2/src/validphys/plotoptions/utils.py +++ b/validphys2/src/validphys/plotoptions/utils.py @@ -9,6 +9,7 @@ import numpy as np + def bins(arr): """Return bins corresponding to unique values of ``arr`` sorted by value. @@ -22,21 +23,26 @@ def bins(arr): arr = np.atleast_1d(arr) return np.unique(arr, return_inverse=True)[1] + def new_labels(k1label, k2lebel, k3label): def closure(f): f.new_labels = k1label, k2lebel, k3label return f + return closure + def label(label): def closure(f): f.label = label return f + return closure + def get_subclasses(obj, base): """Return the classes in ``obj`` that are subclasses of ``base``""" - predicate = lambda x: inspect.isclass(x) and issubclass(x, base) + predicate = lambda x: inspect.isclass(x) and issubclass(x, base) return collections.OrderedDict(inspect.getmembers(obj, predicate)) @@ -44,5 +50,5 @@ def apply_to_all_columns(df, func): """Apply a function to all columns of a dataframe at the saem time. The parameter names are the names of the column and the values are arrays containing the each column's values.""" - params = dict((col,df[col].values) for col in df.columns) + params = dict((col, df[col].values) for col in df.columns) return func(**params) diff --git a/validphys2/src/validphys/plotutils.py b/validphys2/src/validphys/plotutils.py index e53fdae56e..4d8e6d1b5e 100644 --- a/validphys2/src/validphys/plotutils.py +++ b/validphys2/src/validphys/plotutils.py @@ -5,22 +5,20 @@ @author: Zahari Kassabov """ +from collections import namedtuple import functools import itertools -from collections import namedtuple import logging -import scipy.stats as stats - -import numpy as np -from matplotlib.figure import Figure import matplotlib as mpl -import matplotlib.scale as mscale -import matplotlib.patches as mpatches +from matplotlib import ticker, transforms import matplotlib.collections as mcollections -from matplotlib import transforms +from matplotlib.figure import Figure from matplotlib.markers import MarkerStyle -from matplotlib import ticker +import matplotlib.patches as mpatches +import matplotlib.scale as mscale +import numpy as np +import scipy.stats as stats from reportengine.floatformatting import format_number @@ -85,30 +83,35 @@ def add_subplot(figsize=None, projection=None, **kwargs): def ax_or_gca(f): """A decorator. 
When applied to a function, the keyword argument ``ax`` will automatically be filled with the current axis, if it was None.""" + @functools.wraps(f) def _f(*args, **kwargs): - if 'ax' not in kwargs or kwargs['ax'] is None: - kwargs['ax'] = Figure().add_subplot(1, 1, 1) - return f(*args, **kwargs) + if 'ax' not in kwargs or kwargs['ax'] is None: + kwargs['ax'] = Figure().add_subplot(1, 1, 1) + return f(*args, **kwargs) + return _f + def ax_or_newfig(f): """A decorator. When applied to a function, the keyword argument ``ax`` will automatically be filled with the a new axis corresponding to an empty, if it was None.""" + @functools.wraps(f) def _f(*args, **kwargs): noax = 'ax' not in kwargs or kwargs['ax'] is None if noax: - fig = Figure() - kwargs['ax'] = fig.add_subplot(1, 1, 1) + fig = Figure() + kwargs['ax'] = fig.add_subplot(1, 1, 1) result = f(*args, **kwargs) if noax: - kwargs['ax'].legend(loc = 'best') + kwargs['ax'].legend(loc='best') return result return _f + def frame_center(ax, x, values): """Set the `ylims` of the axis ``ax`` to appropriately display ``values``, which can be 1 or 2D and are assumed to be sampled uniformly @@ -117,37 +120,34 @@ def frame_center(ax, x, values): scale = mscale.scale_factory(ax.xaxis.get_scale(), ax.xaxis) t = scale.get_transform() tx = t.transform(x) - absxmax, absxmin = np.max(tx) , np.min(tx) + absxmax, absxmin = np.max(tx), np.min(tx) l = absxmax - absxmin - center = l/2 + absxmin + center = l / 2 + absxmin - dist = np.abs(tx-center) + dist = np.abs(tx - center) - #Require some margin around the center - close_region_mask = np.where(dist < l/10) + # Require some margin around the center + close_region_mask = np.where(dist < l / 10) close_vals = values[:, close_region_mask] close_anchor_max = np.percentile(np.percentile(close_vals, 95, axis=0), 95) close_anchor_min = np.percentile(np.percentile(close_vals, 5, axis=0), 5) - close_anchor_min, close_anchor_max = expand_margin(close_anchor_min, - close_anchor_max, 1.4) + close_anchor_min, close_anchor_max = expand_margin(close_anchor_min, close_anchor_max, 1.4) - #And to see at least 50% everywhere in t - medium_region_mask = np.where(dist < l/2.5) + # And to see at least 50% everywhere in t + medium_region_mask = np.where(dist < l / 2.5) medium_vals = values[:, medium_region_mask] - medium_anchor_max = np.percentile(np.percentile(medium_vals, 75, axis=0),95) - medium_anchor_min = np.percentile(np.percentile(medium_vals, 25, axis=0),5) + medium_anchor_max = np.percentile(np.percentile(medium_vals, 75, axis=0), 95) + medium_anchor_min = np.percentile(np.percentile(medium_vals, 25, axis=0), 5) - medium_anchor_min, medium_anchor_max = expand_margin(medium_anchor_min, - medium_anchor_max, - 1.1) + medium_anchor_min, medium_anchor_max = expand_margin(medium_anchor_min, medium_anchor_max, 1.1) view_max = max((close_anchor_max, medium_anchor_max)) view_min = min((close_anchor_min, medium_anchor_min)) amin, amax = ax.get_ylim() - #Fix edge cases where the limits are nan or infinite - #(e.g. when dividing by zero in the whole range) + # Fix edge cases where the limits are nan or infinite + # (e.g. 
when dividing by zero in the whole range) if not np.isfinite(view_min): view_min = amin if not np.isfinite(view_max): @@ -156,13 +156,14 @@ def frame_center(ax, x, values): ax.set_ylim(max(view_min, amin), min(view_max, amax)) -def expand_margin(a,b,proportion): +def expand_margin(a, b, proportion): """Return a pair of numbers that have the same mean as ``(a,b)`` and their distance is ``proportion`` times bigger.""" - halfdiff = (b-a)/2 + halfdiff = (b - a) / 2 center = a + halfdiff - expansion = halfdiff*proportion - return center - expansion, center+expansion + expansion = halfdiff * proportion + return center - expansion, center + expansion + def hatch_iter(): """An infinite iterator that yields increasingly denser patterns of @@ -172,8 +173,9 @@ def hatch_iter(): i = 1 while True: for hatch in hatches: - yield hatch*i - i+=1 + yield hatch * i + i += 1 + def marker_iter_scatter(): """Yield the possible matplotplib.markers.Markersyle instances with @@ -186,12 +188,13 @@ def marker_iter_scatter(): for shape in MarkerStyle.filled_markers: yield MarkerStyle(marker=shape, fillstyle=fill) + def marker_iter_plot(): """Because of the mpl strange interface, markers work differently in plots and scatter. This is the same as `marker_iter_scatter`, but returns kwargs to be passed to ``plt.plot()``""" for ms in marker_iter_scatter(): - yield {'marker':ms.get_marker(),'fillstyle': ms.get_fillstyle()} + yield {'marker': ms.get_marker(), 'fillstyle': ms.get_fillstyle()} def color_iter(): @@ -199,7 +202,7 @@ def color_iter(): colores are exhausted a warning will be logged and the cycle will be repeated infinitely. Therefore this avoids the overflow error at runtime when using matplotlib's ``f'C{i}'`` color specification (equivalent to - ``colors[i]``) when ``i>len(colors)`` """ + ``colors[i]``) when ``i>len(colors)``""" color_list = [prop['color'] for prop in mpl.rcParams['axes.prop_cycle']] yield from color_list log.warning("Color cycle exhausted. 
Will repeat colors.") @@ -225,6 +228,7 @@ def scalar_log_formatter(): >>> ax.set_yscale("log") >>> ax.yaxis.set_major_formatter(scalar_log_formatter()) """ + # See https://stackoverflow.com/a/33213196 def formatter(y, _pos): decimalplaces = int(np.maximum(-np.log10(y), 0)) # =0 for numbers >=1 @@ -235,65 +239,74 @@ def formatter(y, _pos): return ticker.FuncFormatter(formatter) + HandlerSpec = namedtuple('HandelrSpec', ["color", "alpha", "hatch", "outer"]) + class ComposedHandler: """Legend artist for PDF plots.""" + def legend_artist(self, legend, orig_handle, fontsize, handlebox): x0, y0 = handlebox.xdescent, handlebox.ydescent width, height = handlebox.width, handlebox.height patches = [] if orig_handle.outer: - wpad = width*0.1 - hpad = height*0.1 + wpad = width * 0.1 + hpad = height * 0.1 edges = 'none' - outer = mpatches.Rectangle([x0, y0], width, height, - facecolor='none', - linestyle= 'dashed', - edgecolor = orig_handle.color, - transform=handlebox.get_transform()) + outer = mpatches.Rectangle( + [x0, y0], + width, + height, + facecolor='none', + linestyle='dashed', + edgecolor=orig_handle.color, + transform=handlebox.get_transform(), + ) handlebox.add_artist(outer) patches.append(outer) else: wpad = hpad = 0 edges = 'black' - patch = mpatches.Rectangle([x0+wpad, y0+hpad], - width-2*wpad, height-2*hpad, - facecolor=orig_handle.color, - alpha = orig_handle.alpha, - hatch=orig_handle.hatch, - edgecolor=edges, - transform=handlebox.get_transform()) + patch = mpatches.Rectangle( + [x0 + wpad, y0 + hpad], + width - 2 * wpad, + height - 2 * hpad, + facecolor=orig_handle.color, + alpha=orig_handle.alpha, + hatch=orig_handle.hatch, + edgecolor=edges, + transform=handlebox.get_transform(), + ) handlebox.add_artist(patch) patches.append(patch) return patches -def offset_xcentered(n, ax,*, offset_prop=0.05): + +def offset_xcentered(n, ax, *, offset_prop=0.05): """Yield ``n`` matplotlib transforms in such a way that the corresponding ``n`` transofrmed x values are centered around the middle. 
The offset between to consecutive points is ``offset_prop`` in units of the figure dpi scale.""" - first_offset = +(n//2) - #http://matplotlib.org/users/transforms_tutorial.html + first_offset = +(n // 2) + # http://matplotlib.org/users/transforms_tutorial.html for i in range(n): - - dx = offset_prop*(i-first_offset) - offset = transforms.ScaledTranslation(dx, 0, - ax.figure.dpi_scale_trans) - offset_transform = ax.transData + offset - yield offset_transform + dx = offset_prop * (i - first_offset) + offset = transforms.ScaledTranslation(dx, 0, ax.figure.dpi_scale_trans) + offset_transform = ax.transData + offset + yield offset_transform def centered_range(n, value=0, distance=1): """Generte a range of ``n`` points centered around ``value``, unifirmely sampled at intervals of ``distance``.""" - first_offset = +(n/2) - 0.5 + first_offset = +(n / 2) - 0.5 for i in range(n): - yield distance*(i-first_offset) + value + yield distance * (i - first_offset) + value def barplot(values, collabels, datalabels, orientation='auto'): @@ -342,89 +355,102 @@ def barplot(values, collabels, datalabels, orientation='auto'): ntypes, l = values.shape lc = len(collabels) if lc != l: - raise ValueError(f"Mismatch between the number of data points ({l}) and " - f"the number axis labels ({lc})") + raise ValueError( + f"Mismatch between the number of data points ({l}) and " + f"the number axis labels ({lc})" + ) width = 2 - #The tick positions - x = np.linspace(0, 1.1*width*ntypes*l-1, l) - w,h = mpl.rcParams["figure.figsize"] + # The tick positions + x = np.linspace(0, 1.1 * width * ntypes * l - 1, l) + w, h = mpl.rcParams["figure.figsize"] - #Rescale if we have too much data - rescale = max(1, 1+(width*l*ntypes-15)*0.05) - #Rescale if the labels are too long - lbrescale = max(1, 1+0.04*(max(len(l) for l in collabels)-5)) + # Rescale if we have too much data + rescale = max(1, 1 + (width * l * ntypes - 15) * 0.05) + # Rescale if the labels are too long + lbrescale = max(1, 1 + 0.04 * (max(len(l) for l in collabels) - 5)) if orientation == 'auto': - if l*ntypes > 20: + if l * ntypes > 20: orientation = 'horizontal' else: orientation = 'vertical' if orientation == 'vertical': - fig = Figure(figsize=(w*rescale, h*lbrescale)) + fig = Figure(figsize=(w * rescale, h * lbrescale)) ax = fig.subplots() barfunc = ax.bar infoaxis = ax.xaxis infolim = ax.set_xlim otheraxis = ax.yaxis rotation = 80 + def get_pos(val): if val >= 0: - xytext = (0,5) + xytext = (0, 5) else: - xytext = (0,-5) - horizontalalignment='center' - return {'xytext': xytext, - 'horizontalalignment':horizontalalignment} - - def xytext(x,y): return x,y - elif orientation =='horizontal': - fig = Figure(figsize=(w*lbrescale, h*rescale)) + xytext = (0, -5) + horizontalalignment = 'center' + return {'xytext': xytext, 'horizontalalignment': horizontalalignment} + + def xytext(x, y): + return x, y + + elif orientation == 'horizontal': + fig = Figure(figsize=(w * lbrescale, h * rescale)) ax = fig.subplots() barfunc = ax.barh infoaxis = ax.yaxis infolim = ax.set_ylim otheraxis = ax.xaxis rotation = 10 + def get_pos(val): if val >= 0: - xytext = (5,0) - horizontalalignment= 'left' + xytext = (5, 0) + horizontalalignment = 'left' else: - xytext = (-5,0) - horizontalalignment='right' - return {'xytext': xytext, - 'horizontalalignment':horizontalalignment} - def xytext(x,y): return y,x + xytext = (-5, 0) + horizontalalignment = 'right' + return {'xytext': xytext, 'horizontalalignment': horizontalalignment} + + def xytext(x, y): + return y, x else: raise 
ValueError("orientation must be one of ('auto', 'horizontal', 'vertical')") infoaxis.set_ticks(x) - infoaxis.set_ticklabels(collabels,rotation=rotation) + infoaxis.set_ticklabels(collabels, rotation=rotation) deltas = list(centered_range(ntypes, distance=width)) for row, delta, datalabel in zip(values, deltas, datalabels): - thisx = x+delta + thisx = x + delta barfunc(thisx, row, width, label=datalabel) for xp, v in zip(thisx, row): - #NaN coords cause error (https://github.com/NNPDF/nnpdf/issues/363) + # NaN coords cause error (https://github.com/NNPDF/nnpdf/issues/363) if np.all(np.isfinite([xp, v])): - ax.annotate(f'{format_number(v,3)}', xy=xytext(xp,v), - textcoords='offset points', - size='small', wrap=True, **get_pos(v) - ) + ax.annotate( + f'{format_number(v,3)}', + xy=xytext(xp, v), + textcoords='offset points', + size='small', + wrap=True, + **get_pos(v), + ) else: - #place label at zero for nan coordinate -> ensure `get_pos` is fed altered coords + # place label at zero for nan coordinate -> ensure `get_pos` is fed altered coords new_pos = [val if np.isfinite(val) else 0 for val in [xp, v]] - ax.annotate(f'{format_number(v,3)}', xy=xytext(*new_pos), - textcoords='offset points', - size='small', wrap=True, **get_pos(new_pos[0]) - ) - - - infolim(x[0]+deltas[0] - width/2, x[-1]+deltas[-1]+width/2) + ax.annotate( + f'{format_number(v,3)}', + xy=xytext(*new_pos), + textcoords='offset points', + size='small', + wrap=True, + **get_pos(new_pos[0]), + ) + + infolim(x[0] + deltas[0] - width / 2, x[-1] + deltas[-1] + width / 2) otheraxis.set_visible(False) ax.tick_params(length=0) ax.spines['left'].set_color('none') @@ -436,17 +462,17 @@ def xytext(x,y): return y,x ax._bar_orientation = orientation return fig, ax -def plot_horizontal_errorbars(cvs, errors, categorylabels, datalabels=None, - xlim=None): + +def plot_horizontal_errorbars(cvs, errors, categorylabels, datalabels=None, xlim=None): """A plots with a list of horizontal errorbars oriented vertically. ``cvs`` and ``errors`` are the central values and errors both of shape `ndatasets x ncategories`, ``cateogorylabels`` are the labels of each element for which errorbars are drawn and ``datalabels`` are the labels of the different datasets that are compared. """ - w,h = mpl.rcParams["figure.figsize"] - rescale = max(1, 1 + 0.1*(len(categorylabels) - 7)) - fig = Figure(figsize=(w*1.5, h*rescale)) + w, h = mpl.rcParams["figure.figsize"] + rescale = max(1, 1 + 0.1 * (len(categorylabels) - 7)) + fig = Figure(figsize=(w * 1.5, h * rescale)) ax = fig.subplots() if datalabels is None: datalabels = itertools.repeat(None) @@ -456,28 +482,25 @@ def plot_horizontal_errorbars(cvs, errors, categorylabels, datalabels=None, ax.yaxis.set_ticklabels(categorylabels) mi = marker_iter_plot() - - distance = 0.5/len(cvs) + distance = 0.5 / len(cvs) pos = centered_range(len(cvs), distance=distance) for cv, err, lb, markerspec, shift in zip(cvs, errors, datalabels, mi, pos): - ax.errorbar(cv, y+shift, xerr=err, linestyle='none', label=lb, - **markerspec) + ax.errorbar(cv, y + shift, xerr=err, linestyle='none', label=lb, **markerspec) if xlim is None: - #Support both single error and up, low. + # Support both single error and up, low. 
errmat = np.atleast_2d(errors) - low = cvs - errmat[0,:] - high = cvs + errmat[-1,:] - xlim = expand_margin(np.nanpercentile(low, 15), - np.nanpercentile(high,85), - 1.5) + low = cvs - errmat[0, :] + high = cvs + errmat[-1, :] + xlim = expand_margin(np.nanpercentile(low, 15), np.nanpercentile(high, 85), 1.5) ax.set_xlim(xlim) - ax.set_ylim(-0.5, len(categorylabels)-0.5) + ax.set_ylim(-0.5, len(categorylabels) - 0.5) ax.invert_yaxis() ax.grid(axis='y') return fig, ax + @ax_or_gca def kde_plot(a, height=0.05, ax=None, label=None, color=None, max_marks=100000): """Plot a Kernel Density Estimate of a 1D array, togther with individual @@ -557,6 +580,7 @@ def kde_plot(a, height=0.05, ax=None, label=None, color=None, max_marks=100000): ax.set_ylim(ymin=0) return ax + @ax_or_gca def spiderplot(xticks, vals, label, ax=None): """ diff --git a/validphys2/src/validphys/promptutils.py b/validphys2/src/validphys/promptutils.py index 955002a991..cac88678c2 100644 --- a/validphys2/src/validphys/promptutils.py +++ b/validphys2/src/validphys/promptutils.py @@ -5,6 +5,7 @@ # help with command line speed from prompt_toolkit import HTML + def yes_no_str(default=None): """Return a yes or no string for the prompt, with the default highlighted""" @@ -15,6 +16,7 @@ def yes_no_str(default=None): else: return HTML('[y/N]') + def confirm(message, default=None): """ This is like prompt_toolkit.shortcuts.confirm (implemented by @@ -23,9 +25,9 @@ def confirm(message, default=None): It also support defaults. """ + from prompt_toolkit.formatted_text import merge_formatted_text from prompt_toolkit.key_binding.key_bindings import KeyBindings from prompt_toolkit.keys import Keys - from prompt_toolkit.formatted_text import merge_formatted_text from prompt_toolkit.shortcuts import PromptSession bindings = KeyBindings() @@ -44,7 +46,7 @@ def no(event): @bindings.add(Keys.Any) def nothing(event): - " Disallow inserting other text. " + "Disallow inserting other text." pass if default: @@ -58,6 +60,7 @@ def nothing(event): session = PromptSession(complete_message, key_bindings=bindings) return session.prompt() + # We need some sort of cache because prompt_toolkit calls the callable # every time it tries to complete. class KeywordsWithCache: diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index feee158a98..f5d6fb0389 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -4,15 +4,18 @@ networks during the fitting. """ from collections import namedtuple -import logging import hashlib +import logging import numpy as np import pandas as pd -from validphys.covmats import INTRA_DATASET_SYS_NAME, sqrt_covmat, dataset_inputs_covmat_from_systematics - from reportengine import collect +from validphys.covmats import ( + INTRA_DATASET_SYS_NAME, + dataset_inputs_covmat_from_systematics, + sqrt_covmat, +) FILE_PREFIX = "datacuts_theory_fitting_" @@ -24,6 +27,7 @@ read_fit_pseudodata = collect('read_replica_pseudodata', ('fitreplicas', 'fitcontextwithcuts')) read_pdf_pseudodata = collect('read_replica_pseudodata', ('pdfreplicas', 'fitcontextwithcuts')) + def read_replica_pseudodata(fit, context_index, replica): """Function to handle the reading of training and validation splits for a fit that has been produced with the ``savepseudodata`` flag set to ``True``. 
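As a pointer for the pseudodata hunk that follows: `MultiIndex.sortlevel` returns a `(sorted_index, indexer)` pair, which is why the code keeps only element `[0]`. A toy example with an invented (group, dataset, id) index:

```
import pandas as pd

context_index = pd.MultiIndex.from_tuples(
    [('NC', 'DS2', 1), ('NC', 'DS1', 0), ('CC', 'DS1', 2)],
    names=['group', 'dataset', 'id'],
)
# Sort starting from levels 1 and 2 (dataset and id); [0] drops the indexer array
sorted_index = context_index.sortlevel(level=range(1, 3))[0]
print(sorted_index)
```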
@@ -66,7 +70,7 @@ def read_replica_pseudodata(fit, context_index, replica): # List of length 1 due to the collect context_index = context_index[0] # The [0] is because of how pandas handles sorting a MultiIndex - sorted_index = context_index.sortlevel(level=range(1,3))[0] + sorted_index = context_index.sortlevel(level=range(1, 3))[0] log.debug(f"Reading pseudodata & training/validation splits from {fit.name}.") replica_path = fit.path / "nnfit" / f"replica_{replica}" @@ -82,7 +86,9 @@ def read_replica_pseudodata(fit, context_index, replica): training_path = replica_path / "training.dat" validation_path = replica_path / "validation.dat" tr = pd.read_csv(training_path, index_col=[0, 1, 2], sep="\t", names=[f"replica {replica}"]) - val = pd.read_csv(validation_path, index_col=[0, 1, 2], sep="\t", names=[f"replica {replica}"]) + val = pd.read_csv( + validation_path, index_col=[0, 1, 2], sep="\t", names=[f"replica {replica}"] + ) except FileNotFoundError as e: raise FileNotFoundError( "Could not find saved training and validation data files. " @@ -91,22 +97,23 @@ def read_replica_pseudodata(fit, context_index, replica): tr["type"], val["type"] = "training", "validation" pseudodata = pd.concat((tr, val)) - pseudodata.sort_index(level=range(1,3), inplace=True) + pseudodata.sort_index(level=range(1, 3), inplace=True) pseudodata.index = sorted_index - tr = pseudodata[pseudodata["type"]=="training"] - val = pseudodata[pseudodata["type"]=="validation"] + tr = pseudodata[pseudodata["type"] == "training"] + val = pseudodata[pseudodata["type"] == "validation"] return DataTrValSpec(pseudodata.drop("type", axis=1), tr.index, val.index) + def make_replica( - groups_dataset_inputs_loaded_cd_with_cuts, - replica_mcseed, - dataset_inputs_sampling_covmat, - sep_mult, - genrep=True - ): + groups_dataset_inputs_loaded_cd_with_cuts, + replica_mcseed, + dataset_inputs_sampling_covmat, + sep_mult, + genrep=True, +): """Function that takes in a list of :py:class:`validphys.coredata.CommonData` objects and returns a pseudodata replica accounting for possible correlations between systematic uncertainties. 
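Editor's note: the pseudodata generation that `make_replica` performs (reformatted in the hunks below) combines a deterministic per-replica seed, derived from a hash of the dataset names, with correlated Gaussian shifts drawn through the Cholesky factor of the sampling covariance matrix. The following is a self-contained sketch of that pattern only: the dataset names, seed and covariance matrix are invented, and the multiplicative-error treatment and positivity re-draw loop of the real code (which uses `validphys.covmats.sqrt_covmat` and `CommonData` objects) are omitted.

```python
import hashlib

import numpy as np

# Deterministic per-replica seed: hash the dataset names and add the MC seed,
# mirroring the hunk below (names and numbers here are illustrative).
dataset_names = ["NMC", "TOY_SET"]
replica_mcseed = 42
name_salt = "-".join(dataset_names)
name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8
rng = np.random.default_rng(seed=replica_mcseed + name_seed)

# Correlated additive shifts: Cholesky factor of a toy covariance matrix
# applied to standard normals, then added to the central values.
central_values = np.array([1.0, 2.0, 3.0])
covmat = np.array([[0.04, 0.01, 0.00], [0.01, 0.09, 0.02], [0.00, 0.02, 0.16]])
covmat_sqrt = np.linalg.cholesky(covmat)
shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1])
pseudodata_replica = central_values + shifts
print(pseudodata_replica)
```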
@@ -131,10 +138,10 @@ def make_replica( separate_multiplicative: bool Specifies whether computing the shifts with the full covmat or separating multiplicative errors (in the latter case remember to generate the covmat coherently) - + genrep: bool Specifies whether computing replicas or not - + Returns ------- pseudodata: np.array @@ -157,16 +164,18 @@ def make_replica( 0.34206012, 0.31866286, 0.2790856 , 0.33257621, 0.33680007, """ if not genrep: - return np.concatenate([cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts]) + return np.concatenate( + [cd.central_values for cd in groups_dataset_inputs_loaded_cd_with_cuts] + ) # Seed the numpy RNG with the seed and the name of the datasets in this run name_salt = "-".join(i.setname for i in groups_dataset_inputs_loaded_cd_with_cuts) - name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10 ** 8 - rng = np.random.default_rng(seed=replica_mcseed+name_seed) - #construct covmat + name_seed = int(hashlib.sha256(name_salt.encode()).hexdigest(), 16) % 10**8 + rng = np.random.default_rng(seed=replica_mcseed + name_seed) + # construct covmat covmat = dataset_inputs_sampling_covmat covmat_sqrt = sqrt_covmat(covmat) - #Loading the data + # Loading the data pseudodatas = [] check_positive_masks = [] nonspecial_mult = [] @@ -176,13 +185,13 @@ def make_replica( pseudodata = cd.central_values.to_numpy() pseudodatas.append(pseudodata) - #Separation of multiplicative errors. If separate_multiplicative is True also the exp_covmat is produced + # Separation of multiplicative errors. If separate_multiplicative is True also the exp_covmat is produced # without multiplicative errors if sep_mult: mult_errors = cd.multiplicative_errors mult_uncorr_errors = mult_errors.loc[:, mult_errors.columns == "UNCORR"].to_numpy() mult_corr_errors = mult_errors.loc[:, mult_errors.columns == "CORR"].to_numpy() - nonspecial_mult.append( (mult_uncorr_errors, mult_corr_errors) ) + nonspecial_mult.append((mult_uncorr_errors, mult_corr_errors)) special_mult.append( mult_errors.loc[:, ~mult_errors.columns.isin(INTRA_DATASET_SYS_NAME)] ) @@ -190,7 +199,7 @@ def make_replica( check_positive_masks.append(np.zeros_like(pseudodata, dtype=bool)) else: check_positive_masks.append(np.ones_like(pseudodata, dtype=bool)) - #concatenating special multiplicative errors, pseudodatas and positive mask + # concatenating special multiplicative errors, pseudodatas and positive mask if sep_mult: special_mult_errors = pd.concat(special_mult, axis=0, sort=True).fillna(0).to_numpy() all_pseudodata = np.concatenate(pseudodatas, axis=0) @@ -201,7 +210,7 @@ def make_replica( mult_shifts = [] # Prepare the per-dataset multiplicative shifts for mult_uncorr_errors, mult_corr_errors in nonspecial_mult: - # convert to from percent to fraction + # convert to from percent to fraction mult_shift = ( 1 + mult_uncorr_errors * rng.normal(size=mult_uncorr_errors.shape) / 100 ).prod(axis=1) @@ -211,19 +220,18 @@ def make_replica( ).prod(axis=1) mult_shifts.append(mult_shift) - - #If sep_mult is true then the multiplicative shifts were not included in the covmat + + # If sep_mult is true then the multiplicative shifts were not included in the covmat shifts = covmat_sqrt @ rng.normal(size=covmat.shape[1]) - mult_part = 1. 
+ mult_part = 1.0 if sep_mult: special_mult = ( - 1 + special_mult_errors * rng.normal(size=(1, - special_mult_errors.shape[1])) / 100 - ).prod(axis=1) - mult_part = np.concatenate(mult_shifts, axis=0)*special_mult - #Shifting pseudodata - shifted_pseudodata = (all_pseudodata + shifts)*mult_part - #positivity control + 1 + special_mult_errors * rng.normal(size=(1, special_mult_errors.shape[1])) / 100 + ).prod(axis=1) + mult_part = np.concatenate(mult_shifts, axis=0) * special_mult + # Shifting pseudodata + shifted_pseudodata = (all_pseudodata + shifts) * mult_part + # positivity control if np.all(shifted_pseudodata[full_mask] >= 0): break @@ -231,11 +239,11 @@ def make_replica( def indexed_make_replica(groups_index, make_replica): - """Index the make_replica pseudodata appropriately - """ + """Index the make_replica pseudodata appropriately""" return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) + def level0_commondata_wc(data, fakepdf): """ Given a validphys.core.DataGroupSpec object, load commondata and @@ -264,6 +272,7 @@ def level0_commondata_wc(data, fakepdf): [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] """ from validphys.covmats import dataset_t0_predictions + level0_commondata_instances_wc = [] # ==== Load validphys.coredata.CommonData instance with cuts ====# @@ -279,22 +288,18 @@ def level0_commondata_wc(data, fakepdf): t0_prediction = dataset_t0_predictions( dataset=dataset, t0set=fakepdf ) # N.B. cuts already applied to th. pred. - level0_commondata_instances_wc.append( - commondata_wc.with_central_value(t0_prediction) - ) + level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) return level0_commondata_instances_wc -def make_level1_data( - data, level0_commondata_wc, filterseed, experiments_index, sep_mult -): +def make_level1_data(data, level0_commondata_wc, filterseed, experiments_index, sep_mult): """ Given a list of Level 0 commondata instances, return the same list with central values replaced by Level 1 data. Level 1 data is generated using validphys.make_replica. - The covariance matrix, from which the stochastic Level 1 + The covariance matrix, from which the stochastic Level 1 noise is sampled, is built from Level 0 commondata instances (level0_commondata_wc). This, in particular, means that the multiplicative systematics are generated @@ -307,12 +312,12 @@ def make_level1_data( Generate L1 data: L1 = L0 + eta, eta ~ N(0,CL0) Generate L2 data: L2_k = L1 + eps_k, eps_k ~ N(0,CL1) - + where CL0 and CL1 means that the multiplicative entries have been constructed from Level 0 and Level 1 central values respectively. 
- - + + Parameters ---------- @@ -379,12 +384,8 @@ def make_level1_data( _group_recreate_pseudodata = collect( 'indexed_make_replica', ('group_dataset_inputs_by_experiment',) ) -_recreate_fit_pseudodata = collect( - '_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment') -) -_recreate_pdf_pseudodata = collect( - '_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment') -) +_recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) +_recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) fit_tr_masks = collect('replica_training_mask_table', ('fitreplicas', 'fitenvironment')) pdf_tr_masks = collect('replica_training_mask_table', ('pdfreplicas', 'fitenvironment')) @@ -392,6 +393,7 @@ def make_level1_data( fitted_make_replicas = collect('make_replica', ('pdfreplicas',)) indexed_make_replicas = collect('indexed_make_replica', ('replicas',)) + def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_tr_masks): """Function used to reconstruct the pseudodata seen by each of the Monte Carlo fit replicas. @@ -425,6 +427,7 @@ def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_tr_masks) res.append(DataTrValSpec(df, tr_idx, val_idx)) return res + def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_tr_masks): """Like :py:func:`validphys.pseudodata.recreate_fit_pseudodata` but accounts for the postfit reshuffling of replicas. @@ -447,6 +450,9 @@ def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_tr_masks) """ return recreate_fit_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_tr_masks) + pdf_tr_masks_no_table = collect('replica_training_mask', ('pdfreplicas', 'fitenvironment')) + + def recreate_pdf_pseudodata_no_table(_recreate_pdf_pseudodata, pdfreplicas, pdf_tr_masks_no_table): return recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdfreplicas, pdf_tr_masks_no_table) diff --git a/validphys2/src/validphys/renametools.py b/validphys2/src/validphys/renametools.py index 90520b837c..c9812b0718 100644 --- a/validphys2/src/validphys/renametools.py +++ b/validphys2/src/validphys/renametools.py @@ -4,12 +4,12 @@ """ import os import sys -import time import threading +import time class Spinner: - """ Context manager to provide a spinning cursor + """Context manager to provide a spinning cursor while validphys performs some other task silently. When exececuted in a TTY, it shows a spinning cursor for the duration of @@ -24,6 +24,7 @@ class Spinner: ... 
time.sleep(5) """ + def __init__(self, delay=0.1): self.spinner_generator = self.spinning_cursor() self.delay = delay @@ -55,16 +56,16 @@ def __exit__(self, exception, value, tb): else: print("Done") - @staticmethod def spinning_cursor(): while True: - for cursor in '|/-\\': yield cursor + for cursor in '|/-\\': + yield cursor def rename_pdf(pdf_folder, initial_fit_name, final_name): for item in os.listdir(pdf_folder): - p = pdf_folder/item + p = pdf_folder / item if p.is_symlink(): replica = p.resolve().parent.name pointer = f'../../nnfit/{replica}/{final_name}.dat' @@ -76,19 +77,19 @@ def rename_pdf(pdf_folder, initial_fit_name, final_name): def rename_nnfit(nnfit_path, initial_fit_name, final_name): - info_file = nnfit_path/f'{initial_fit_name}.info' + info_file = nnfit_path / f'{initial_fit_name}.info' info_file.rename(info_file.with_name(f'{final_name}.info')) - #Some older fits have the PDF here + # Some older fits have the PDF here pdf_folder = nnfit_path / initial_fit_name if pdf_folder.is_dir(): rename_pdf(pdf_folder, initial_fit_name, final_name) - #Change replica names + # Change replica names for item in nnfit_path.glob('replica*'): if item.is_dir(): files = item.glob(initial_fit_name + '*') for i in files: newname = i.name.replace(initial_fit_name, final_name) - i.rename(item/newname) + i.rename(item / newname) def rename_postfit(postfit_path, initial_fit_name, final_name): @@ -96,14 +97,15 @@ def rename_postfit(postfit_path, initial_fit_name, final_name): rename_pdf(pdf_folder, initial_fit_name, final_name) os.system(f'sed -i -e "s/{initial_fit_name}/{final_name}/g" {postfit_path/"postfit.log"}') + def change_name(initial_path, final_name): """Function that takes initial fit name and final fit name and performs the renaming""" initial_fit_name = initial_path.name - nnfit = initial_path/'nnfit' + nnfit = initial_path / 'nnfit' if nnfit.exists(): rename_nnfit(nnfit, initial_fit_name, final_name) - postfit = initial_path/'postfit' + postfit = initial_path / 'postfit' if postfit.exists(): rename_postfit(postfit, initial_fit_name, final_name) newpath = initial_path.with_name(final_name) diff --git a/validphys2/src/validphys/replica_selector.py b/validphys2/src/validphys/replica_selector.py index 2bf6b533ce..e636d81531 100644 --- a/validphys2/src/validphys/replica_selector.py +++ b/validphys2/src/validphys/replica_selector.py @@ -4,19 +4,18 @@ Tools for filtering replica sets based on criteria on the replicas. 
""" import logging -import shutil import re +import shutil - -from reportengine.checks import make_argcheck, check +from reportengine.checks import check, make_argcheck from reportengine.compat import yaml - from validphys.core import PDF from validphys.renametools import rename_pdf from validphys.utils import tempfile_cleaner log = logging.getLogger(__name__) + def _fixup_new_replica(alphas_pdf: PDF, new_replica_file): """Helper function that takes in a :py:class:`validphys.core.PDF` object as well as @@ -33,6 +32,7 @@ def _fixup_new_replica(alphas_pdf: PDF, new_replica_file): out_stream.write(f"AlphaS_MZ: {alphas_mz}\nAlphaS_Vals: {alphas_vals}\n".encode()) out_stream.write(data) + @make_argcheck def _check_target_name(target_name): """Make sure this specifies a name and not some kid of path""" @@ -43,6 +43,7 @@ def _check_target_name(target_name): "`target_name` must contain alphnumeric characters and underscores only", ) + @_check_target_name def alpha_s_bundle_pdf(pdf, pdfs, output_path, target_name: (str, type(None)) = None): """Action that bundles PDFs for distributing to the LHAPDF @@ -101,8 +102,7 @@ def alpha_s_bundle_pdf(pdf, pdfs, output_path, target_name: (str, type(None)) = info_yaml['NumMembers'] = new_nrep info_yaml['ErrorType'] += '+as' extra_desc = '; '.join( - f"mem={i} => alphas(MZ)={val}" - for val, i in zip(alphas_values, range(nrep, new_nrep)) + f"mem={i} => alphas(MZ)={val}" for val, i in zip(alphas_values, range(nrep, new_nrep)) ) info_yaml['SetDesc'] += f"; {extra_desc}" with open(info_file, 'w') as stream: @@ -117,4 +117,3 @@ def alpha_s_bundle_pdf(pdf, pdfs, output_path, target_name: (str, type(None)) = new_pdf = new_pdf.rename(target_path) log.info(f"alpha_s bundle written at {new_pdf}") return target_name - diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 4bc74c5639..3ce629d8fa 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -14,30 +14,18 @@ import pandas as pd import scipy.linalg as la -from reportengine.checks import require_one, remove_outer, check_not_empty -from reportengine.table import table from reportengine import collect - +from reportengine.checks import check_not_empty, remove_outer, require_one +from reportengine.table import table +from validphys.calcutils import all_chi2, bootstrap_values, calc_chi2, calc_phi, central_chi2 from validphys.checks import ( check_cuts_considered, check_pdf_is_montecarlo, check_speclabels_different, check_two_dataspecs, ) - -from validphys.core import DataSetSpec, PDF, DataGroupSpec, Stats -from validphys.calcutils import ( - all_chi2, - central_chi2, - calc_chi2, - calc_phi, - bootstrap_values, -) -from validphys.convolution import ( - predictions, - PredictionsRequireCutsError, -) - +from validphys.convolution import PredictionsRequireCutsError, predictions +from validphys.core import PDF, DataGroupSpec, DataSetSpec, Stats log = logging.getLogger(__name__) @@ -209,26 +197,27 @@ def groups_index(groups_data): def experiments_index(experiments_data): return groups_index(experiments_data) + def procs_index(procs_data): return groups_index(procs_data) + def groups_data_values(group_result_table): """Returns list of data values for the input groups.""" data_central_values = group_result_table["data_central"] return data_central_values + def procs_data_values(proc_result_table): """Like groups_data_values but grouped by process.""" data_central_values = proc_result_table["data_central"] return data_central_values -groups_results = 
collect( - "dataset_inputs_results", ("group_dataset_inputs_by_metadata",) -) -procs_results = collect( - "dataset_inputs_results", ("group_dataset_inputs_by_process",) -) +groups_results = collect("dataset_inputs_results", ("group_dataset_inputs_by_metadata",)) + +procs_results = collect("dataset_inputs_results", ("group_dataset_inputs_by_process",)) + def group_result_table_no_table(groups_results, groups_index): """Generate a table containing the data central value, the central prediction, @@ -236,9 +225,7 @@ def group_result_table_no_table(groups_results, groups_index): result_records = [] for group_results in groups_results: dt, th = group_results - for index, (dt_central, th_central) in enumerate( - zip(dt.central_value, th.central_value) - ): + for index, (dt_central, th_central) in enumerate(zip(dt.central_value, th.central_value)): replicas = ( ("rep_%05d" % (i + 1), th_rep) for i, th_rep in enumerate(th.error_members[index, :]) @@ -256,9 +243,7 @@ def group_result_table_no_table(groups_results, groups_index): if not result_records: log.warning("Empty records for group results") return pd.DataFrame() - df = pd.DataFrame( - result_records, columns=result_records[0].keys(), index=groups_index - ) + df = pd.DataFrame(result_records, columns=result_records[0].keys(), index=groups_index) return df @@ -268,23 +253,21 @@ def group_result_table(group_result_table_no_table): """Duplicate of group_result_table_no_table but with a table decorator.""" return group_result_table_no_table + def proc_result_table_no_table(procs_results, procs_index): return group_result_table_no_table(procs_results, procs_index) + @table def proc_result_table(proc_result_table_no_table): return proc_result_table_no_table -experiment_result_table = collect( - "group_result_table", ("group_dataset_inputs_by_experiment",) -) +experiment_result_table = collect("group_result_table", ("group_dataset_inputs_by_experiment",)) @table -def group_result_table_68cl( - groups_results, group_result_table_no_table: pd.DataFrame, pdf: PDF -): +def group_result_table_68cl(groups_results, group_result_table_no_table: pd.DataFrame, pdf: PDF): """Generate a table containing the data central value, the data 68% confidence levels, the central prediction, and 68% confidence level bounds of the prediction. """ @@ -292,9 +275,7 @@ def group_result_table_68cl( # replica data is every columns after central values, transpose for stats class replica_data = df.iloc[:, 2:].values.T # Use pdf stats class but reshape output to have each row as a data point - th_unc_array = [ - level.reshape(-1, 1) for level in pdf.stats_class(replica_data).errorbar68() - ] + th_unc_array = [level.reshape(-1, 1) for level in pdf.stats_class(replica_data).errorbar68()] # concatenate for dataframe construction th_unc_array_reshaped = np.concatenate(th_unc_array, axis=1) data_unc_array = np.concatenate([i[0].std_error for i in groups_results]) @@ -313,18 +294,14 @@ def group_result_table_68cl( ) -def experiments_covmat_no_table( - experiments_data, experiments_index, experiments_covmat_collection -): +def experiments_covmat_no_table(experiments_data, experiments_index, experiments_covmat_collection): """Makes the total experiments covariance matrix, which can then be reindexed appropriately by the chosen grouping. 
The covariance matrix must first be grouped by experiments to ensure correlations within experiments are preserved.""" data = np.zeros((len(experiments_index), len(experiments_index))) df = pd.DataFrame(data, index=experiments_index, columns=experiments_index) - for experiment, experiment_covmat in zip( - experiments_data, experiments_covmat_collection - ): + for experiment, experiment_covmat in zip(experiments_data, experiments_covmat_collection): name = experiment.name df.loc[[name], [name]] = experiment_covmat return df @@ -333,15 +310,13 @@ def experiments_covmat_no_table( def relabel_experiments_to_groups(input_covmat, groups_index): """Takes a covmat grouped by experiments and relabels it by groups. This allows grouping over experiments to - preserve experimental correlations outwith the chosen + preserve experimental correlations outwith the chosen grouping.""" # Sorting along dataset axis so we can apply the groups index directly input_covmat = input_covmat.sort_index(axis=0, level=1) input_covmat = input_covmat.sort_index(axis=1, level=1) sorted_groups_index = groups_index.sortlevel(1)[0] - df = pd.DataFrame( - input_covmat.values, index=sorted_groups_index, columns=sorted_groups_index - ) + df = pd.DataFrame(input_covmat.values, index=sorted_groups_index, columns=sorted_groups_index) # Reindexing to fit with groups_index df = df.reindex(groups_index, axis=0) df = df.reindex(groups_index, axis=1) @@ -366,31 +341,30 @@ def groups_covmat(groups_covmat_no_table): """Duplicate of groups_covmat_no_table but with a table decorator.""" return groups_covmat_no_table + def procs_covmat_no_table(experiments_covmat_no_table, procs_index): return relabel_experiments_to_groups(experiments_covmat_no_table, procs_index) + @table def procs_covmat(procs_covmat_no_table): return procs_covmat_no_table + experiments_sqrt_covmat = collect( "dataset_inputs_sqrt_covmat", ("group_dataset_inputs_by_experiment",) ) @table -def experiments_sqrtcovmat( - experiments_data, experiments_index, experiments_sqrt_covmat -): +def experiments_sqrtcovmat(experiments_data, experiments_index, experiments_sqrt_covmat): """Like experiments_covmat, but dump the lower triangular part of the Cholesky decomposition as used in the fit. The upper part indices are set to zero. """ data = np.zeros((len(experiments_index), len(experiments_index))) df = pd.DataFrame(data, index=experiments_index, columns=experiments_index) - for experiment, experiments_sqrt_covmat in zip( - experiments_data, experiments_sqrt_covmat - ): + for experiment, experiments_sqrt_covmat in zip(experiments_data, experiments_sqrt_covmat): name = experiment.name experiments_sqrt_covmat[np.triu_indices_from(experiments_sqrt_covmat, k=1)] = 0 df.loc[[name], [name]] = experiments_sqrt_covmat @@ -404,17 +378,13 @@ def groups_sqrtcovmat(experiments_sqrtcovmat, groups_index): @table -def experiments_invcovmat( - experiments_data, experiments_index, experiments_covmat_collection -): +def experiments_invcovmat(experiments_data, experiments_index, experiments_covmat_collection): """Compute and export the inverse covariance matrix. 
Note that this inverts the matrices with the LU method which is suboptimal.""" data = np.zeros((len(experiments_index), len(experiments_index))) df = pd.DataFrame(data, index=experiments_index, columns=experiments_index) - for experiment, experiment_covmat in zip( - experiments_data, experiments_covmat_collection - ): + for experiment, experiment_covmat in zip(experiments_data, experiments_covmat_collection): name = experiment.name # Improve this inversion if this method tuns out to be important invcov = la.inv(experiment_covmat) @@ -439,10 +409,12 @@ def groups_normcovmat(groups_covmat, groups_data_values): mat = df / np.outer(groups_data_array, groups_data_array) return mat + @table def procs_normcovmat(procs_covmat, procs_data_values): return groups_normcovmat(procs_covmat, procs_data_values) + @table def groups_corrmat(groups_covmat): """Generates the grouped experimental correlation matrix with groups_covmat as input""" @@ -452,6 +424,7 @@ def groups_corrmat(groups_covmat): mat = diag_minus_half[:, np.newaxis] * df * diag_minus_half return mat + @table def procs_corrmat(procs_covmat): return groups_corrmat(procs_covmat) @@ -472,14 +445,11 @@ def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat): ) - def dataset_inputs_results( data, pdf: PDF, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat ): """Like `results` but for a group of datasets""" - return results( - data, pdf, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat - ) + return results(data, pdf, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat) # It's better to duplicate a few lines than to complicate the logic of @@ -531,9 +501,7 @@ def abs_chi2_data(results): central_result = central_chi2(results) - return Chi2Data( - th_result.stats_class(chi2s[:, np.newaxis]), central_result, len(data_result) - ) + return Chi2Data(th_result.stats_class(chi2s[:, np.newaxis]), central_result, len(data_result)) def dataset_inputs_abs_chi2_data(dataset_inputs_results): @@ -580,11 +548,11 @@ def total_phi_data_from_experiments(experiments_phi_data): """ unnorm_phi_squared, ndata = np.sum( - [(ndata*phi**2, ndata) for phi, ndata in experiments_phi_data], - axis=0 + [(ndata * phi**2, ndata) for phi, ndata in experiments_phi_data], axis=0 ) return np.sqrt(unnorm_phi_squared / ndata), ndata + @check_pdf_is_montecarlo def dataset_inputs_bootstrap_phi_data(dataset_inputs_results, bootstrap_samples=500): """Takes the data result and theory prediction given `dataset_inputs` and @@ -631,16 +599,22 @@ def dataset_inputs_bootstrap_chi2_central( def predictions_by_kinematics_table(results, kinematics_table_notable): """Return a table combining the output of :py:func:`validphys.kinematics.kinematics_table`` with the data and theory - central values. 
""" + central values.""" tb = kinematics_table_notable.copy() data, theory = results tb['data'] = data.central_value tb['prediction'] = theory.central_value return tb + groups_each_dataset_chi2 = collect("each_dataset_chi2", ("group_dataset_inputs_by_metadata",)) -groups_chi2_by_process = collect("dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_process",)) -groups_each_dataset_chi2_by_process = collect("each_dataset_chi2", ("group_dataset_inputs_by_process",)) +groups_chi2_by_process = collect( + "dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_process",) +) +groups_each_dataset_chi2_by_process = collect( + "each_dataset_chi2", ("group_dataset_inputs_by_process",) +) + @table def groups_chi2_table(groups_data, pdf, groups_chi2, groups_each_dataset_chi2): @@ -655,14 +629,11 @@ def groups_chi2_table(groups_data, pdf, groups_chi2, groups_each_dataset_chi2): return pd.DataFrame(records) -experiments_chi2_table = collect( - "groups_chi2_table", ("group_dataset_inputs_by_experiment",) -) +experiments_chi2_table = collect("groups_chi2_table", ("group_dataset_inputs_by_experiment",)) + @table -def procs_chi2_table( - procs_data, pdf, groups_chi2_by_process, groups_each_dataset_chi2_by_process -): +def procs_chi2_table(procs_data, pdf, groups_chi2_by_process, groups_each_dataset_chi2_by_process): """Same as groups_chi2_table but by process""" return groups_chi2_table( procs_data, @@ -681,6 +652,7 @@ def positivity_predictions_data_result(pdf, posdataset): dataspecs_positivity_predictions = collect(positivity_predictions_data_result, ("dataspecs",)) dataspecs_posdataset = collect("posdataset", ("dataspecs",)) + def count_negative_points(possets_predictions): """Return the number of replicas with negative predictions for each bin in the positivity observable.""" @@ -727,15 +699,15 @@ def experiments_chi2_stats(total_chi2_data): def chi2_stats(abs_chi2_data): """Compute several estimators from the chi²: - - central_mean + - central_mean - - npoints + - npoints - - perreplica_mean + - perreplica_mean - - perreplica_std + - perreplica_std - - chi2_per_data + - chi2_per_data """ rep_data, central_result, npoints = abs_chi2_data m = central_result.mean() @@ -757,15 +729,18 @@ def dataset_chi2_table(chi2_stats, dataset): return pd.DataFrame(chi2_stats, index=[dataset.name]) -groups_chi2 = collect( - "dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_metadata",) -) +groups_chi2 = collect("dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_metadata",)) -procs_chi2 = collect("dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_process",) -) +procs_chi2 = collect("dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_process",)) fits_groups_chi2_data = collect("groups_chi2", ("fits", "fitcontext")) -fits_groups = collect("groups_data", ("fits", "fitcontext",)) +fits_groups = collect( + "groups_data", + ( + "fits", + "fitcontext", + ), +) # TODO: Possibly get rid of the per_point_data parameter and have separate @@ -819,9 +794,7 @@ def fits_groups_phi_table(fits_name_with_covmat_label, fits_groups, fits_groups_ """ dfs = [] cols = ("ndata", r"$\phi$") - for label, groups, groups_phi in zip( - fits_name_with_covmat_label, fits_groups, fits_groups_phi - ): + for label, groups, groups_phi in zip(fits_name_with_covmat_label, fits_groups, fits_groups_phi): records = [] for group, (group_phi, npoints) in zip(groups, groups_phi): records.append(dict(group=str(group), npoints=npoints, phi=group_phi)) @@ -853,9 +826,7 @@ def dataspecs_groups_chi2_table( # we need this to reorder 
the datasets correctly, a potentially more satisfactory # method could be to make a datasets chi2 table which gets collected and concatenated -groups_datasets_chi2_data = collect( - "each_dataset_chi2", ("group_dataset_inputs_by_metadata",) -) +groups_datasets_chi2_data = collect("each_dataset_chi2", ("group_dataset_inputs_by_metadata",)) fits_datasets_chi2_data = collect("groups_datasets_chi2_data", ("fits", "fitcontext")) @@ -927,6 +898,7 @@ def dataspecs_datasets_chi2_table( fits_total_chi2_data = collect("total_chi2_data", ("fits", "fitcontext")) dataspecs_total_chi2_data = collect("total_chi2_data", ("dataspecs",)) + # TODO: Decide what to do with the horrible totals code. @table def fits_chi2_table( @@ -950,9 +922,7 @@ def fits_chi2_table( for lv in lvs: dfs.append(pd.concat((edf.loc[lv], ddf.loc[lv]), copy=False, axis=0)) if show_total: - total_points = np.array( - [total_chi2_data.ndata for total_chi2_data in fits_total_chi2_data] - ) + total_points = np.array([total_chi2_data.ndata for total_chi2_data in fits_total_chi2_data]) total_chi = np.array( [total_chi2_data.central_result for total_chi2_data in fits_total_chi2_data] ) @@ -999,10 +969,12 @@ def dataspecs_chi2_differences_table(dataspecs, dataspecs_chi2_table): df["difference"] = diff return df + experiments_chi2_data = collect( "dataset_inputs_abs_chi2_data", ("group_dataset_inputs_by_experiment",) ) + def total_chi2_data_from_experiments(experiments_chi2_data, pdf): """Like :py:func:`dataset_inputs_abs_chi2_data`, except sums the contribution from each experiment which is more efficient in the case that the total @@ -1020,16 +992,13 @@ def total_chi2_data_from_experiments(experiments_chi2_data, pdf): # we sum data, not error_members here because we feed it back into the stats # class, the stats class error_members cuts off the CV if needed data_sum = np.sum( - [exp_chi2_data.replica_result.data for exp_chi2_data in experiments_chi2_data], - axis=0 + [exp_chi2_data.replica_result.data for exp_chi2_data in experiments_chi2_data], axis=0 ) ndata = np.sum( [exp_chi2_data.ndata for exp_chi2_data in experiments_chi2_data], ) - return Chi2Data( - pdf.stats_class(data_sum), central_result, ndata - ) + return Chi2Data(pdf.stats_class(data_sum), central_result, ndata) def dataset_inputs_chi2_per_point_data(dataset_inputs_abs_chi2_data): @@ -1037,9 +1006,7 @@ def dataset_inputs_chi2_per_point_data(dataset_inputs_abs_chi2_data): Covariance matrix is fully correlated across datasets, with all known correlations. 
""" - return ( - dataset_inputs_abs_chi2_data.central_result / dataset_inputs_abs_chi2_data.ndata - ) + return dataset_inputs_abs_chi2_data.central_result / dataset_inputs_abs_chi2_data.ndata def total_chi2_per_point_data(total_chi2_data): @@ -1055,9 +1022,7 @@ def perreplica_chi2_table(groups_data, groups_chi2, total_chi2_data): and the second is the number of points.""" chs = groups_chi2 - total_chis = np.zeros( - (len(groups_data) + 1, 1 + len(chs[0].replica_result.error_members())) - ) + total_chis = np.zeros((len(groups_data) + 1, 1 + len(chs[0].replica_result.error_members()))) ls = [] for i, ch in enumerate(chs, 1): th, central, l = ch @@ -1096,14 +1061,17 @@ def groups_central_values(group_result_table): central_theory_values = group_result_table["theory_central"] return central_theory_values + def procs_central_values_no_table(proc_result_table_no_table): central_theory_values = proc_result_table_no_table["theory_central"] return central_theory_values + @table def procs_central_values(procs_central_values_no_table): return procs_central_values_no_table + dataspecs_each_dataset_chi2 = collect("each_dataset_chi2", ("dataspecs",)) each_dataset = collect("dataset", ("data",)) dataspecs_each_dataset = collect("each_dataset", ("dataspecs",)) @@ -1155,16 +1123,12 @@ def dataspecs_dataset_chi2_difference_table( fits_total_chi2 = collect("total_chi2_per_point_data", ("fits", "fitcontext")) -fits_total_chi2_for_groups = collect( - "total_chi2_per_point_data", ("fits", "fittheoryandpdf") -) +fits_total_chi2_for_groups = collect("total_chi2_per_point_data", ("fits", "fittheoryandpdf")) fits_pdf = collect("pdf", ("fits", "fitpdf")) -groups_data_phi = collect( - dataset_inputs_phi_data, ("group_dataset_inputs_by_metadata",) -) +groups_data_phi = collect(dataset_inputs_phi_data, ("group_dataset_inputs_by_metadata",)) fits_groups_data_phi = collect("groups_data_phi", ("fits", "fitcontext")) groups_bootstrap_phi = collect( dataset_inputs_bootstrap_phi_data, ("group_dataset_inputs_by_metadata",) diff --git a/validphys2/src/validphys/reweighting.py b/validphys2/src/validphys/reweighting.py index 8d3f7778ee..bfc8cf83bf 100644 --- a/validphys2/src/validphys/reweighting.py +++ b/validphys2/src/validphys/reweighting.py @@ -1,48 +1,51 @@ - # -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- """ Utilities for reweighting studies. Implements utilities for calculating the NNPDF weights and unweighted PDF sets. It also allows for some basic statistics. 
""" +from collections import OrderedDict import logging import warnings -from collections import OrderedDict - import numpy as np import pandas as pd from scipy import optimize from reportengine import collect -from reportengine.table import table from reportengine.figure import figure - +from reportengine.table import table +from validphys import checks, plotutils from validphys.core import PDF, Filter -from validphys import checks +from validphys.dataplots import plot_training_validation from validphys.lhio import new_pdf_from_indexes from validphys.pdfoutput import pdfset -from validphys.dataplots import plot_training_validation -from validphys import plotutils log = logging.getLogger(__name__) -__all__ = ('chi2_data_for_reweighting_experiments', 'make_unweighted_pdf', - 'nnpdf_weights', 'nnpdf_weights_numerator', 'p_alpha_study', - 'plot_p_alpha', 'reweighting_stats', 'unweighted_index', - 'make_pdf_from_filtered_outliers') +__all__ = ( + 'chi2_data_for_reweighting_experiments', + 'make_unweighted_pdf', + 'nnpdf_weights', + 'nnpdf_weights_numerator', + 'p_alpha_study', + 'plot_p_alpha', + 'reweighting_stats', + 'unweighted_index', + 'make_pdf_from_filtered_outliers', +) chi2_data_for_reweighting_experiments_inner = collect( 'abs_chi2_data_experiment', ['reweighting_experiments'] ) + # This is to add checks and to request that use_t0 is set explicitly # pylint: disable=unused-argument @checks.check_pdf_is_montecarlo -def chi2_data_for_reweighting_experiments( - chi2_data_for_reweighting_experiments_inner, use_t0 -): +def chi2_data_for_reweighting_experiments(chi2_data_for_reweighting_experiments_inner, use_t0): return chi2_data_for_reweighting_experiments_inner @@ -58,24 +61,26 @@ def nnpdf_weights_numerator(chi2_data_for_reweighting_experiments): chi2s = np.ravel(chi2s) - logw = ((total_ndata - 1)/2)*np.log(chi2s) - 0.5*chi2s + logw = ((total_ndata - 1) / 2) * np.log(chi2s) - 0.5 * chi2s logw -= np.max(logw) w = np.exp(logw) return w + @table -#will call list[0] +# will call list[0] @checks.check_not_empty('reweighting_experiments') def nnpdf_weights(chi2_data_for_reweighting_experiments): """Compute the replica weights according to the NNPDF formula.""" numerator = nnpdf_weights_numerator(chi2_data_for_reweighting_experiments) - return pd.DataFrame(numerator/np.sum(numerator), - index=np.arange(1, len(numerator) + 1)) + return pd.DataFrame(numerator / np.sum(numerator), index=np.arange(1, len(numerator) + 1)) + def effective_number_of_replicas(w): N = len(w) - w = w*N/np.sum(w) - return np.exp(np.nansum(w*np.log(N)-w*np.log(w))/N) + w = w * N / np.sum(w) + return np.exp(np.nansum(w * np.log(N) - w * np.log(w)) / N) + @table def reweighting_stats(pdf, nnpdf_weights, p_alpha_study): @@ -92,23 +97,27 @@ def reweighting_stats(pdf, nnpdf_weights, p_alpha_study): median = np.median(nnpdf_weights) max_alpha = p_alpha_study.idxmax() - result = OrderedDict([ - (r'$N_{\mathrm{initial}}$', initial_replicas), - (r'$N_{eff}$', er), - (r'median($w$)', median), - (r'$max_{[0.5,4]}P(\alpha)$', max_alpha) - ]) + result = OrderedDict( + [ + (r'$N_{\mathrm{initial}}$', initial_replicas), + (r'$N_{eff}$', er), + (r'median($w$)', median), + (r'$max_{[0.5,4]}P(\alpha)$', max_alpha), + ] + ) return pd.Series(result, index=result.keys(), name='Reweighting stats') + def _get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments): - new_chi2 = [((type(res)(res.error_members()/alpha**2)), central, ndata) - for (res,central,ndata) in - chi2_data_for_reweighting_experiments] + new_chi2 = [ + 
((type(res)(res.error_members() / alpha**2)), central, ndata) + for (res, central, ndata) in chi2_data_for_reweighting_experiments + ] - new_ws = nnpdf_weights_numerator(new_chi2) - val = np.sum(new_ws / alpha) - return val + new_ws = nnpdf_weights_numerator(new_chi2) + val = np.sum(new_ws / alpha) + return val def _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments): @@ -123,15 +132,13 @@ def _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments): def p_alpha_study(chi2_data_for_reweighting_experiments): """Compute P(α) in an automatic range""" - - small = 0.5 f = lambda alpha: -_get_p_alpha_val(alpha, chi2_data_for_reweighting_experiments) - #Ignore warnings for nonsensical alphas + # Ignore warnings for nonsensical alphas with warnings.catch_warnings(): warnings.simplefilter('ignore') - alphamax= optimize.fmin(f, 10, disp=False)[0] + alphamax = optimize.fmin(f, 10, disp=False)[0] """ #First find the maximum @@ -154,15 +161,14 @@ def p_alpha_study(chi2_data_for_reweighting_experiments): alphas = np.linspace(small, big, 100) """ - big = 1.4*alphamax - + big = 1.4 * alphamax alphas = np.linspace(small, big, 1000) vals = _get_p_alpha_vals(alphas, chi2_data_for_reweighting_experiments) - return pd.Series(np.array(vals), index=alphas) + @figure def plot_p_alpha(p_alpha_study): """Plot the results of ``p_alpha_study``.""" @@ -172,55 +178,64 @@ def plot_p_alpha(p_alpha_study): xmax = p_alpha_study.idxmax() ymax = p_alpha_study[xmax] ax.axvline(xmax, color='red', linestyle='--') - ax.annotate(r'$\alpha=%.2f$'%xmax, (xmax,(ymax-ax.get_ylim()[0])/2), ) + ax.annotate( + r'$\alpha=%.2f$' % xmax, + (xmax, (ymax - ax.get_ylim()[0]) / 2), + ) ax.set_yticklabels([]) ax.set_xlabel(r'$\alpha$') - ax.plot(p_alpha_study) return fig + @table -def unweighted_index(nnpdf_weights, nreplicas:int=100): +def unweighted_index(nnpdf_weights, nreplicas: int = 100): """The index of the input replicas that corresponds to an unweighted set, for the given weights. This can be saved for testing purposes.""" nnpdf_weights = np.ravel(nnpdf_weights) res = 1 + np.random.choice(len(nnpdf_weights), size=nreplicas, p=nnpdf_weights) - return pd.DataFrame(res, index=np.arange(1,nreplicas+1)) + return pd.DataFrame(res, index=np.arange(1, nreplicas + 1)) + @pdfset @checks.check_can_save_grid -def make_unweighted_pdf(pdf, unweighted_index, - set_name:str, output_path=None, - installgrid:bool=True): +def make_unweighted_pdf( + pdf, unweighted_index, set_name: str, output_path=None, installgrid: bool = True +): """Generate an unweighted PDF set, from the prior ``pdf`` and the reweighting_experiments. The PDF is written to a `pdfsets` directory of the output folder. Return the relative path of the newly created PDF.""" + new_pdf_from_indexes( + pdf=pdf, + indexes=np.ravel(unweighted_index), + set_name=set_name, + folder=output_path, + installgrid=installgrid, + ) - new_pdf_from_indexes(pdf=pdf, indexes=np.ravel(unweighted_index), - set_name=set_name, folder=output_path, - installgrid=installgrid) @checks.make_check def _check_cut(ns, *args, **kwargs): cut = ns['nsigma_cut'] msg = "'nsigma_cut' must be a float greater than zero." 
try: - if cut>0: + if cut > 0: return - #TODO: Check types for parameters automatically + # TODO: Check types for parameters automatically except TypeError as e: raise checks.CheckError(msg) from e raise checks.CheckError(msg) + @checks.check_has_fitted_replicas @_check_cut -def chi2filtered_index(fit, replica_data, nsigma_cut:float): +def chi2filtered_index(fit, replica_data, nsigma_cut: float): """Discard outliers until the χ² to fitted data of all replicas has - a smaller deviation than `nsigma_cut` - (in units of the standard deviation of the final ensemble) + a smaller deviation than `nsigma_cut` + (in units of the standard deviation of the final ensemble) """ chis = np.array([dt.chi2 for dt in replica_data]) @@ -229,53 +244,64 @@ def chi2filtered_index(fit, replica_data, nsigma_cut:float): while True: mean = np.mean(newchis) std = np.std(newchis) - limit = mean + std*nsigma_cut - #Replica indexes start at 1 + limit = mean + std * nsigma_cut + # Replica indexes start at 1 indexes = np.where(chis < limit)[0] - if len(newchis) == len(indexes): break - newchis = chis[indexes] - log.info("%s: Mean ̉χ² was %.2f, and the threshold is %.2f. Discarded %d " - "replicas out of %d. Now the mean is %.2f.", fit.label, oldmean, limit, - len(chis) - len(indexes), len(chis), np.mean(newchis)) - - label = '$\chi^2$ Filter: %.2f'%nsigma_cut + log.info( + "%s: Mean ̉χ² was %.2f, and the threshold is %.2f. Discarded %d " + "replicas out of %d. Now the mean is %.2f.", + fit.label, + oldmean, + limit, + len(chis) - len(indexes), + len(chis), + np.mean(newchis), + ) + label = '$\chi^2$ Filter: %.2f' % nsigma_cut return Filter(indexes, label, nsigma_cut=nsigma_cut) + def negative_filtered_index(count_negative_points, filter_Q=75): cut = np.percentile(count_negative_points, filter_Q) - indexes = np.where(count_negative_points <= cut)[0] + indexes = np.where(count_negative_points <= cut)[0] label = 'Positivity Selection Q=%.2f' % filter_Q log.info("Positivity cut for Q=%.2f is at %.0f negative points" % (filter_Q, cut)) return Filter(indexes, label, filter_Q=filter_Q) + @pdfset @checks.check_can_save_grid -def make_pdf_from_filtered_outliers(fit, chi2filtered_index, - set_name:str, - output_path=None, - installgrid:bool=True): +def make_pdf_from_filtered_outliers( + fit, chi2filtered_index, set_name: str, output_path=None, installgrid: bool = True +): """Produce a new grid with the result of chi2filtered_index""" - indexes = chi2filtered_index.indexes + 1 #libnnpdf nonsense - new_pdf_from_indexes(pdf=PDF(fit.name), indexes=indexes, - set_name=set_name, folder=output_path, - installgrid=installgrid) + indexes = chi2filtered_index.indexes + 1 # libnnpdf nonsense + new_pdf_from_indexes( + pdf=PDF(fit.name), + indexes=indexes, + set_name=set_name, + folder=output_path, + installgrid=installgrid, + ) -#TODO: Use the filter framework here when it exists + +# TODO: Use the filter framework here when it exists @figure def plot_chi2filtered_training_validation(fit, nsigma_cut, replica_data, chi2filtered_index): """Like `plot_training_validation`, but apply `chi2filtered_index` mask.""" return plot_training_validation(fit, replica_data, dict([chi2filtered_index.as_pair()])) -#TODO: Use the filter framework here when it exists + +# TODO: Use the filter framework here when it exists @figure def plot_posfiltered_training_validation(fit, replica_data, negative_filtered_index): """Like `plot_training_validation`, but apply `chi2filtered_index` mask.""" @@ -284,22 +310,28 @@ def plot_posfiltered_training_validation(fit, 
replica_data, negative_filtered_in p_alpha_for_all_datasets = collect(p_alpha_study, ('reweight_all_datasets',)) -chi_2_for_all_datasets = collect(chi2_data_for_reweighting_experiments, - ('reweight_all_datasets',)) +chi_2_for_all_datasets = collect(chi2_data_for_reweighting_experiments, ('reweight_all_datasets',)) + @table -def p_alpha_all_datasets_table(p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets): +def p_alpha_all_datasets_table( + p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets +): """Compute and display P(alpha) and chi² for all datasets in all experiments.""" data = [] chilabel = r'$\chi^2$' modelabel = r'mode P($\alpha$)' - for series, rexp, chis in zip(p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets): - central = chis[0][1]/chis[0][2] + for series, rexp, chis in zip( + p_alpha_for_all_datasets, reweight_all_datasets, chi_2_for_all_datasets + ): + central = chis[0][1] / chis[0][2] exp = rexp['reweighting_experiments'][0] - data.append({'name':exp.datasets[0].name, - modelabel:series.argmax(), - chilabel: central, - }) - return pd.DataFrame.from_records(data, index='name', - columns=('name', modelabel, chilabel)) + data.append( + { + 'name': exp.datasets[0].name, + modelabel: series.argmax(), + chilabel: central, + } + ) + return pd.DataFrame.from_records(data, index='name', columns=('name', modelabel, chilabel)) diff --git a/validphys2/src/validphys/sumrules.py b/validphys2/src/validphys/sumrules.py index c8f15ffb4a..91baed58dd 100644 --- a/validphys2/src/validphys/sumrules.py +++ b/validphys2/src/validphys/sumrules.py @@ -13,10 +13,9 @@ import pandas as pd from scipy.integrate import quad -from reportengine.table import table from reportengine.checks import check_positive from reportengine.floatformatting import format_error_value_columns - +from reportengine.table import table from validphys.core import PDF from validphys.pdfbases import parse_flarr @@ -25,6 +24,7 @@ def _momentum_sum_rule_integrand(x, lpdf, Q): xqvals = lpdf.xfxQ(x, Q) return sum([xqvals[f] for f in lpdf.flavors()]) + def _make_momentum_fraction_integrand(fldict): """Make a suitable integrand function, which takes x to be integrated over and a PDF member and Q that computes the momentum fraction based on ``fldict``. @@ -49,13 +49,11 @@ def _make_momentum_fraction_integrand(fldict): fldict = {parse_flarr([k])[0]: v for k, v in fldict.items()} def f(x, lpdf, Q): - return sum( - multiplier * lpdf.xfxQ(x, Q)[flavour] - for flavour, multiplier in fldict.items() - ) + return sum(multiplier * lpdf.xfxQ(x, Q)[flavour] for flavour, multiplier in fldict.items()) return f + def _make_pdf_integrand(fldict): """Make a suitable integrand function, which takes x to be integrated over and a PDF member and Q that computes the integrand of the PDFs based on ``fldict``. 
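Editor's note: the sum-rule helpers touched above build their integrands as closures over a flavour-to-multiplier mapping, and the `_integral` hunk below then integrates them over x in [1e-9, 1] split into three sub-intervals for `quad`. A runnable sketch of both ideas, with a toy PDF member standing in for an LHAPDF one and string flavour keys instead of the parsed PDG ids the real code obtains from `parse_flarr`:

```python
from scipy.integrate import quad


class ToyMember:
    """Stand-in for an LHAPDF member: xfxQ(x, Q) -> {flavour: x*f(x)}."""

    def xfxQ(self, x, Q):
        # Invented x*f(x) shapes, chosen only so the example runs.
        return {"u": 2 * x**0.5 * (1 - x) ** 3, "ubar": x**0.5 * (1 - x) ** 5}


def make_momentum_fraction_integrand(fldict):
    """Return f(x, member, Q) summing multiplier * x*f(x), as in the patch."""

    def f(x, member, Q):
        xq = member.xfxQ(x, Q)
        return sum(mult * xq[fl] for fl, mult in fldict.items())

    return f


def integral(rule_f, member, Q):
    """Integrate over x in [1e-9, 1], split into sub-intervals like _integral."""
    res = 0.0
    for lo, hi in [(1e-9, 1e-5), (1e-5, 1e-3), (1e-3, 1)]:
        res += quad(
            rule_f, lo, hi, args=(member, Q), limit=1000, epsabs=1e-4, epsrel=1e-4
        )[0]
    return res


uv_like = make_momentum_fraction_integrand({"u": 1, "ubar": -1})
print(integral(uv_like, ToyMember(), Q=1.65))
```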
@@ -81,11 +79,7 @@ def _make_pdf_integrand(fldict): def f(x, lpdf, Q): return ( - sum( - multiplier * lpdf.xfxQ(x, Q)[flavour] - for flavour, multiplier in fldict.items() - ) - / x + sum(multiplier * lpdf.xfxQ(x, Q)[flavour] for flavour, multiplier in fldict.items()) / x ) return f @@ -110,9 +104,7 @@ def f(x, lpdf, Q): "cm momentum fraction": _make_momentum_fraction_integrand({"c": 1, "cbar": -1}), "g momentum fraction": _make_momentum_fraction_integrand({"g": 1}), "T3": _make_pdf_integrand({"u": 1, "ubar": 1, "d": -1, "dbar": -1}), - "T8": _make_pdf_integrand( - {"u": 1, "ubar": 1, "d": 1, "dbar": 1, "s": -2, "sbar": -2} - ), + "T8": _make_pdf_integrand({"u": 1, "ubar": 1, "d": 1, "dbar": 1, "s": -2, "sbar": -2}), } KNOWN_SUM_RULES_EXPECTED = { @@ -129,7 +121,7 @@ def _integral(rule_f, pdf_member, Q, config=None): separating the regions of integration. Uses quad. """ if config is None: - config = {"limit":1000, "epsabs": 1e-4, "epsrel": 1e-4} + config = {"limit": 1000, "epsabs": 1e-4, "epsrel": 1e-4} res = 0.0 lims = [(1e-9, 1e-5), (1e-5, 1e-3), (1e-3, 1)] for lim in lims: @@ -139,11 +131,11 @@ def _integral(rule_f, pdf_member, Q, config=None): def _sum_rules(rules_dict, lpdf, Q): """Compute a SumRulesGrid from the loaded PDF, at Q""" - return {k: [_integral(r, m, Q) for m in lpdf.members] for k,r in rules_dict.items()} + return {k: [_integral(r, m, Q) for m in lpdf.members] for k, r in rules_dict.items()} @check_positive('Q') -def sum_rules(pdf:PDF, Q:numbers.Real): +def sum_rules(pdf: PDF, Q: numbers.Real): """Compute the momentum, uvalence, dvalence, svalence and cvalence sum rules for each member, at the energy scale ``Q``. Return a SumRulesGrid object with the list of values for each sum rule. @@ -153,7 +145,7 @@ def sum_rules(pdf:PDF, Q:numbers.Real): @check_positive('Q') -def central_sum_rules(pdf:PDF, Q:numbers.Real): +def central_sum_rules(pdf: PDF, Q: numbers.Real): """Compute the sum rules for the central member, at the scale Q""" lpdf = pdf.load_t0() return _sum_rules(KNOWN_SUM_RULES, lpdf, Q) @@ -162,21 +154,22 @@ def central_sum_rules(pdf:PDF, Q:numbers.Real): @check_positive('Q') def unknown_sum_rules(pdf: PDF, Q: numbers.Real): """Compute the following integrals - - u momentum fraction - - ubar momentum fraction - - d momentum fraction - - dbar momentum fraction - - s momentum fraction - - sbar momentum fraction - - cp momentum fraction - - cm momentum fraction - - g momentum fraction - - T3 - - T8 + - u momentum fraction + - ubar momentum fraction + - d momentum fraction + - dbar momentum fraction + - s momentum fraction + - sbar momentum fraction + - cp momentum fraction + - cm momentum fraction + - g momentum fraction + - T3 + - T8 """ lpdf = pdf.load() return _sum_rules(UNKNOWN_SUM_RULES, lpdf, Q) + def _simple_description(d): res = {} for k, arr in d.items(): @@ -187,29 +180,31 @@ def _simple_description(d): d["max"] = np.max(arr) return pd.DataFrame(res).T + def _err_mean_table(d): res = {} for k, arr in d.items(): res[k] = d = {} d["mean"] = np.mean(arr) d["std"] = np.std(arr) - df = pd.DataFrame(res) + df = pd.DataFrame(res) return format_error_value_columns(df.T, "mean", "std") - @table def sum_rules_table(sum_rules): """Return a table with the descriptive statistics of the sum rules, over members of the PDF.""" return _simple_description(sum_rules) + @table def central_sum_rules_table(central_sum_rules): """Construct a table with the value of each sum rule for the central member""" return pd.DataFrame(central_sum_rules, index=["Central value"]).T + @table def 
unknown_sum_rules_table(unknown_sum_rules): return _err_mean_table(unknown_sum_rules) diff --git a/validphys2/src/validphys/tableloader.py b/validphys2/src/validphys/tableloader.py index 8d4a493fbb..06209012ae 100644 --- a/validphys2/src/validphys/tableloader.py +++ b/validphys2/src/validphys/tableloader.py @@ -13,54 +13,62 @@ log = logging.getLogger(__name__) -#NOTE:Considering the first columns as index by default (the index_col=0) -#is not particularly sane, but turns out that it is advantageous for backward -#compatibility with the older DataFrame.from_csv method, that was employed -#previously. +# NOTE:Considering the first columns as index by default (the index_col=0) +# is not particularly sane, but turns out that it is advantageous for backward +# compatibility with the older DataFrame.from_csv method, that was employed +# previously. sane_load = functools.partial(pd.read_csv, sep='\t', index_col=0) + class TableLoaderError(Exception): """Errors in the tableloader module.""" + pass + def fixup_header(df, head_index, dtype): """Set the type of the column index in place""" oldcols = df.columns good = oldcols.levels[head_index].map(dtype) - newcols = oldcols.set_levels([*oldcols.levels[:head_index], - good, - *oldcols.levels[head_index+1:]]) + newcols = oldcols.set_levels( + [*oldcols.levels[:head_index], good, *oldcols.levels[head_index + 1 :]] + ) df.columns = newcols + def parse_data_cv(filename): """Useful for reading DataFrames with just one column.""" df = sane_load(filename, index_col=[0, 1, 2]) return df + def parse_exp_mat(filename): """Parse a dump of a matrix like experiments_covmat.""" - df = sane_load(filename, header=[0,1,2], index_col=[0,1,2]) + df = sane_load(filename, header=[0, 1, 2], index_col=[0, 1, 2]) fixup_header(df, 2, int) return df + load_experiments_covmat = parse_exp_mat load_experiments_invcovmat = parse_exp_mat + def load_perreplica_chi2_table(filename): """Load the output of ``perreplica_chi2_table``.""" - df = sane_load(filename, index_col=0 ,header=[0,1]) + df = sane_load(filename, index_col=0, header=[0, 1]) fixup_header(df, 1, int) return df def load_fits_computed_pseudoreplicas_chi2(filename): """Load the output of ``fits_computed_psedorreplicas_chi2``""" - return sane_load(filename, index_col=[0,1,2,3], header=[0,1,]) + return sane_load(filename, index_col=[0, 1, 2, 3], header=[0, 1]) def load_fits_chi2_table(filename): """Load the result of fits_chi2_tavle or similar.""" - return sane_load(filename, header=[0,1], index_col=[0,1]) + return sane_load(filename, header=[0, 1], index_col=[0, 1]) + def load_adapted_fits_chi2_table(filename): """Load the fits_chi2_table and adapt it in the way that suits the @@ -68,17 +76,17 @@ def load_adapted_fits_chi2_table(filename): another with the number of points.""" df = load_fits_chi2_table(filename) ndatalabel = df.columns[0][1] - dns= df.sort_index(axis=1).loc[:, pd.IndexSlice[:,ndatalabel]] + dns = df.sort_index(axis=1).loc[:, pd.IndexSlice[:, ndatalabel]] if not (dns.apply(pd.Series.nunique, axis=1) == 1).all(): raise TableLoaderError("Expecting all entries to have the same ndata") - ndatas = dns.iloc[:,0] + ndatas = dns.iloc[:, 0] - f = lambda x: x[x.columns[0]]*x[x.columns[1]] + f = lambda x: x[x.columns[0]] * x[x.columns[1]] df = df.groupby(axis=1, level=0).apply(f) df.columns = pd.MultiIndex.from_product([list(df.columns), ['chi2']]) - return ndatas, df + return ndatas, df def set_actual_column_level0(df, new_levels): @@ -89,9 +97,10 @@ def set_actual_column_level0(df, new_levels): 
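Editor's note: the `tableloader` changes above are formatting-only, but the `fixup_header` helper they touch exists because `read_csv` returns header labels as strings, so numeric header levels (such as replica numbers in `load_perreplica_chi2_table`) have to be cast back. A small sketch of that repair on a toy frame with invented labels:

```python
import pandas as pd

# A toy table shaped like the output of load_perreplica_chi2_table: the second
# header level holds replica numbers, but as strings, as read_csv returns them.
cols = pd.MultiIndex.from_arrays([["chi2", "chi2"], ["1", "2"]])
df = pd.DataFrame([[1.1, 1.3], [0.9, 1.2]], index=["groupA", "groupB"], columns=cols)

# fixup_header-style repair: map the offending level through the desired dtype
# and rebuild the column index with set_levels.
head_index, dtype = 1, int
oldcols = df.columns
good = oldcols.levels[head_index].map(dtype)
df.columns = oldcols.set_levels(
    [*oldcols.levels[:head_index], good, *oldcols.levels[head_index + 1 :]]
)
print(df.columns.levels[1].dtype)  # now integer rather than object
```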
cols.set_levels(new_levels, inplace=True, level=0) -#TODO: Find a better place for this function +# TODO: Find a better place for this function def combine_pseudoreplica_tables( - dfs, combined_names, * ,blacklist_datasets=None, min_points_required=2): + dfs, combined_names, *, blacklist_datasets=None, min_points_required=2 +): """Return a table in the same format as perreplica_chi2_table with th e minimum value of the chi² for each batch of fits.""" @@ -100,7 +109,7 @@ def combine_pseudoreplica_tables( if blacklist_datasets: m = np.ones(df.shape[0], dtype=bool) for it in blacklist_datasets: - dsmask = (df.index.get_level_values(1) != it) + dsmask = df.index.get_level_values(1) != it m &= dsmask if m.all(): log.warning(f"Did not blacklist any dataset from the list {blacklist_datasets}") @@ -111,28 +120,26 @@ def combine_pseudoreplica_tables( total = together.loc[(slice(None), 'Total'), :] - total_chis = total.groupby(level=3).sum(min_count=1) + total_chis = total.groupby(level=3).sum(min_count=1) def fixup_min_points(df): - m = (~df.isnull()).sum(axis=1, min_count=1)>=min_points_required + m = (~df.isnull()).sum(axis=1, min_count=1) >= min_points_required df[df[m].isnull()] = np.inf return df - #The idea is: Set to inf the nans of the valid curves, so that we select - #the minimum (which is not infinite). Leave the bad nans as nans, so we - #write nan always for those. - total_chis = total_chis.groupby(axis=1,level=1).apply(fixup_min_points) - + # The idea is: Set to inf the nans of the valid curves, so that we select + # the minimum (which is not infinite). Leave the bad nans as nans, so we + # write nan always for those. + total_chis = total_chis.groupby(axis=1, level=1).apply(fixup_min_points) - #Note, asarray is needed because it ignores NANs otherwise. + # Note, asarray is needed because it ignores NANs otherwise. argmin = lambda x: pd.Series(np.argmin(np.asarray(x), axis=1), index=x.index) best_replicas = total_chis.groupby(axis=1, level=1).apply(argmin) gb = together.groupby(axis=1, level=1) def inner_select(df, indexes): - return df.iloc[:,indexes[df.name]] - + return df.iloc[:, indexes[df.name]] def select_best_replicas(df): indexes = best_replicas[df.name] @@ -142,23 +149,26 @@ def select_best_replicas(df): res.index = res.index.droplevel(0) res.sort_index(inplace=True) - #TODO: Why in earth did I decide to keep this?! + # TODO: Why in earth did I decide to keep this?! 
res.columns = pd.MultiIndex.from_product((res.columns, ['chi2'])) - return res + def get_extrasum_slice(df, components): """Extract a slice of a table that has the components in the format that extra_sums expects.""" df = pd.DataFrame(df) df.sort_index(inplace=True) total_token = ' Total' - keys = [(c[:-len(total_token)], 'Total') if c.endswith(total_token) else - (slice(None), c) for c in components] + keys = [ + (c[: -len(total_token)], 'Total') if c.endswith(total_token) else (slice(None), c) + for c in components + ] locs = [flat for key in keys for flat in df.index.get_locs(key)] return df.iloc[locs, :] + # Define aliases for functions with spelling mistakes in their names which have now been corrected # Do this so that old runcards still work load_fits_computed_psedorreplicas_chi2 = load_fits_computed_pseudoreplicas_chi2 diff --git a/validphys2/src/validphys/theorycovariance/construction.py b/validphys2/src/validphys/theorycovariance/construction.py index 460e159daf..5feb6835df 100644 --- a/validphys2/src/validphys/theorycovariance/construction.py +++ b/validphys2/src/validphys/theorycovariance/construction.py @@ -5,38 +5,35 @@ """ from __future__ import generator_stop +from collections import defaultdict, namedtuple import logging -from collections import defaultdict, namedtuple import numpy as np -import scipy.linalg as la import pandas as pd +import scipy.linalg as la -from reportengine.table import table from reportengine import collect - -from validphys.results import ( - procs_central_values, - procs_central_values_no_table, -) -from validphys.results import Chi2Data, results -from validphys.calcutils import calc_chi2, all_chi2_theory, central_chi2_theory +from reportengine.table import table +from validphys.calcutils import all_chi2_theory, calc_chi2, central_chi2_theory +from validphys.results import Chi2Data, procs_central_values, procs_central_values_no_table, results from validphys.theorycovariance.theorycovarianceutils import ( - process_lookup, check_correct_theory_combination, check_fit_dataset_order_matches_grouped, + process_lookup, ) - log = logging.getLogger(__name__) theoryids_procs_central_values = collect(procs_central_values, ("theoryids",)) -theoryids_procs_central_values_no_table = collect( - procs_central_values_no_table, ("theoryids",) -) +theoryids_procs_central_values_no_table = collect(procs_central_values_no_table, ("theoryids",)) -collected_theoryids = collect("theoryids", ["theoryconfig",]) +collected_theoryids = collect( + "theoryids", + [ + "theoryconfig", + ], +) def make_scale_var_covmat(predictions): @@ -59,9 +56,11 @@ def make_scale_var_covmat(predictions): @check_correct_theory_combination def theory_covmat_singleprocess_no_table( - theoryids_procs_central_values_no_table, procs_index, theoryids, fivetheories, + theoryids_procs_central_values_no_table, + procs_index, + theoryids, + fivetheories, ): - """Calculates the theory covariance matrix for scale variations. 
The matrix is a dataframe indexed by procs_index.""" s = make_scale_var_covmat(theoryids_procs_central_values_no_table) @@ -223,9 +222,7 @@ def covmap(combine_by_type, dataset_names): start_exp[dataset] = running_index running_index += size start = 0 - names_by_proc_list = [ - item for sublist in process_info.namelist.values() for item in sublist - ] + names_by_proc_list = [item for sublist in process_info.namelist.values() for item in sublist] for dataset in names_by_proc_list: for i in range(process_info.sizes[dataset]): mapping[start + i] = start_exp[dataset] + i @@ -268,9 +265,9 @@ def covmat_5pt(name1, name2, deltas1, deltas2): if name1 == name2: s = 0.5 * sum(np.outer(d, d) for d in deltas1) else: - s = 0.5 * ( - np.outer(deltas1[0], deltas2[0]) + np.outer(deltas1[1], deltas2[1]) - ) + 0.25 * (np.outer((deltas1[2] + deltas1[3]), (deltas2[2] + deltas2[3]))) + s = 0.5 * (np.outer(deltas1[0], deltas2[0]) + np.outer(deltas1[1], deltas2[1])) + 0.25 * ( + np.outer((deltas1[2] + deltas1[3]), (deltas2[2] + deltas2[3])) + ) return s @@ -479,9 +476,7 @@ def fromfile_covmat(covmatpath, procs_data, procs_index): if (ds1 in shortlist) and (ds2 in shortlist): # If both datasets in the fromfile covmat, use the piece of the fromfile covmat covmat = ( - cut_df.xs(ds1, level=1, drop_level=False) - .T.xs(ds2, level=1, drop_level=False) - .T + cut_df.xs(ds1, level=1, drop_level=False).T.xs(ds2, level=1, drop_level=False).T ) else: # Otherwise use a covmat of 0s @@ -514,13 +509,13 @@ def fromfile_covmat(covmatpath, procs_data, procs_index): @table def user_covmat(procs_data, procs_index, loaded_user_covmat_path): """ - General theory covariance matrix provided by the user. + General theory covariance matrix provided by the user. Useful for testing the impact of externally produced - covariance matrices. Matrices must be produced as a + covariance matrices. Matrices must be produced as a csv of pandas DataFrame, and uploaded to the validphys - server. The server path is then provided via - ``user_covmat_path`` in ``theorycovmatconfig`` in the - runcard. For more information see documentation. + server. The server path is then provided via + ``user_covmat_path`` in ``theorycovmatconfig`` in the + runcard. For more information see documentation. 
""" return fromfile_covmat(loaded_user_covmat_path, procs_data, procs_index) @@ -535,29 +530,27 @@ def total_theory_covmat(theory_covmat_custom, user_covmat): def theory_covmat_custom_fitting(theory_covmat_custom, procs_index_matched): - """theory_covmat_custom but reindexed so the order of the datasets matches + """theory_covmat_custom but reindexed so the order of the datasets matches those in the experiment covmat so they are aligned when fitting.""" - df = theory_covmat_custom.reindex(procs_index_matched).T.reindex( - procs_index_matched - ) + df = theory_covmat_custom.reindex(procs_index_matched).T.reindex(procs_index_matched) return df def total_theory_covmat_fitting(total_theory_covmat, procs_index_matched): - """total_theory_covmat but reindexed so the order of the datasets matches - those in the experiment covmat so they are aligned when fitting.""" + """total_theory_covmat but reindexed so the order of the datasets matches + those in the experiment covmat so they are aligned when fitting.""" return theory_covmat_custom_fitting(total_theory_covmat, procs_index_matched) def user_covmat_fitting(user_covmat, procs_index_matched): - """user_covmat but reindexed so the order of the datasets matches - those in the experiment covmat so they are aligned when fitting.""" + """user_covmat but reindexed so the order of the datasets matches + those in the experiment covmat so they are aligned when fitting.""" return theory_covmat_custom_fitting(user_covmat, procs_index_matched) def procs_index_matched(groups_index, procs_index): """procs_index but matched to the dataset order given - by groups_index. """ + by groups_index.""" # Making list with exps ordered like in groups_index groups_ds_order = groups_index.get_level_values(level=1).unique().tolist() # Tuples to make multiindex, ordered like in groups_index @@ -671,7 +664,7 @@ def experimentplustheory_normcovmat_singleprocess( procs_covmat, theory_covmat_singleprocess, procs_data ): """Calculates the experiment + theory covariance matrix for scale - variations normalised to data.""" + variations normalised to data.""" df = procs_covmat + theory_covmat_singleprocess procs_data_array = np.array(procs_data) mat = df / np.outer(procs_data_array, procs_data_array) @@ -686,10 +679,8 @@ def experimentplusblocktheory_normcovmat( experimentplustheory_normcovmat, ): """Calculates the experiment + theory covariance matrix for scale - variations normalised to data, block diagonal by data set.""" - mat = experimentplustheory_normcovmat( - procs_covmat, theory_block_diag_covmat, procs_data_values - ) + variations normalised to data, block diagonal by data set.""" + mat = experimentplustheory_normcovmat(procs_covmat, theory_block_diag_covmat, procs_data_values) return mat @@ -701,18 +692,14 @@ def experimentplustheory_normcovmat_custom( experimentplustheory_normcovmat, ): """Calculates the experiment + theory covariance matrix for scale - variations normalised to data, correlations by process type.""" - mat = experimentplustheory_normcovmat( - procs_covmat, theory_covmat_custom, procs_data_values - ) + variations normalised to data, correlations by process type.""" + mat = experimentplustheory_normcovmat(procs_covmat, theory_covmat_custom, procs_data_values) return mat @table -def experimentplustheory_corrmat_singleprocess( - procs_covmat, theory_covmat_singleprocess -): +def experimentplustheory_corrmat_singleprocess(procs_covmat, theory_covmat_singleprocess): """Calculates the correlation matrix for the experimental plus theory covariance matrices.""" 
total_df = procs_covmat + theory_covmat_singleprocess @@ -726,9 +713,7 @@ def experimentplustheory_corrmat_singleprocess( def experimentplusblocktheory_corrmat(procs_covmat, theory_block_diag_covmat): """Calculates the correlation matrix for the experimental plus theory covariance matrices, block diagonal by dataset.""" - corrmat = experimentplustheory_corrmat_singleprocess( - procs_covmat, theory_block_diag_covmat - ) + corrmat = experimentplustheory_corrmat_singleprocess(procs_covmat, theory_block_diag_covmat) return corrmat @@ -736,9 +721,7 @@ def experimentplusblocktheory_corrmat(procs_covmat, theory_block_diag_covmat): def experimentplustheory_corrmat_custom(procs_covmat, theory_covmat_custom): """Calculates the correlation matrix for the experimental plus theory covariance matrices, correlations by prescription.""" - corrmat = experimentplustheory_corrmat_singleprocess( - procs_covmat, theory_covmat_custom - ) + corrmat = experimentplustheory_corrmat_singleprocess(procs_covmat, theory_covmat_custom) return corrmat @@ -837,10 +820,6 @@ def abs_chi2_data_diagtheory_proc(procs_results, total_covmat_diagtheory_procs): return abs_chi2_data_theory_proc(procs_results, total_covmat_diagtheory_procs) -def abs_chi2_data_diagtheory_dataset( - each_dataset_results, total_covmat_diagtheory_datasets -): +def abs_chi2_data_diagtheory_dataset(each_dataset_results, total_covmat_diagtheory_datasets): """For a diagonal theory covmat""" - return abs_chi2_data_theory_dataset( - each_dataset_results, total_covmat_diagtheory_datasets - ) + return abs_chi2_data_theory_dataset(each_dataset_results, total_covmat_diagtheory_datasets) diff --git a/validphys2/src/validphys/theorycovariance/output.py b/validphys2/src/validphys/theorycovariance/output.py index f3616ca723..2eab9d20c3 100644 --- a/validphys2/src/validphys/theorycovariance/output.py +++ b/validphys2/src/validphys/theorycovariance/output.py @@ -6,17 +6,17 @@ from __future__ import generator_stop import logging - from math import inf -import pandas as pd + +from matplotlib import cm +from matplotlib import colors as mcolors import numpy as np +import pandas as pd import scipy.linalg as la -from matplotlib import cm, colors as mcolors +from reportengine import collect from reportengine.figure import figure from reportengine.table import table -from reportengine import collect - from validphys import plotutils from validphys.results import groups_chi2_table @@ -241,23 +241,24 @@ def plot_expcorrmat_heatmap(procs_corrmat): def plot_normthblockcovmat_heatmap(theory_normblockcovmat): """Matrix plot for block diagonal theory covariance matrix""" fig = plot_covmat_heatmap( - theory_normblockcovmat, "Block diagonal theory covariance matrix by dataset", + theory_normblockcovmat, + "Block diagonal theory covariance matrix by dataset", ) return fig @figure def plot_normthcovmat_heatmap_custom( - theory_normcovmat_custom, theoryids, fivetheories, + theory_normcovmat_custom, + theoryids, + fivetheories, ): """Matrix plot for block diagonal theory covariance matrix by process type""" l = len(theoryids) if l == 5: if fivetheories == "bar": l = r"$\bar{5}$" - fig = plot_covmat_heatmap( - theory_normcovmat_custom, f"Theory Covariance matrix ({l} pt)" - ) + fig = plot_covmat_heatmap(theory_normcovmat_custom, f"Theory Covariance matrix ({l} pt)") return fig @@ -272,16 +273,16 @@ def plot_thblockcorrmat_heatmap(theory_blockcorrmat): @figure def plot_thcorrmat_heatmap_custom( - theory_corrmat_custom, theoryids, fivetheories, + theory_corrmat_custom, + theoryids, + 
fivetheories, ): """Matrix plot of the theory correlation matrix, correlations by process type""" l = len(theoryids) if l == 5: if fivetheories == "bar": l = r"$\bar{5}$" - fig = plot_corrmat_heatmap( - theory_corrmat_custom, f"Theory Correlation matrix ({l} pt)" - ) + fig = plot_corrmat_heatmap(theory_corrmat_custom, f"Theory Correlation matrix ({l} pt)") return fig @@ -297,7 +298,9 @@ def plot_normexpplusblockthcovmat_heatmap(experimentplusblocktheory_normcovmat): @figure def plot_normexpplusthcovmat_heatmap_custom( - experimentplustheory_normcovmat_custom, theoryids, fivetheories, + experimentplustheory_normcovmat_custom, + theoryids, + fivetheories, ): """Matrix plot of the exp + theory covariance matrix normalised to data""" l = len(theoryids) @@ -323,7 +326,9 @@ def plot_expplusblockthcorrmat_heatmap(experimentplusblocktheory_corrmat): @figure def plot_expplusthcorrmat_heatmap_custom( - experimentplustheory_corrmat_custom, theoryids, fivetheories, + experimentplustheory_corrmat_custom, + theoryids, + fivetheories, ): """Matrix plot of the exp + theory correlation matrix""" l = len(theoryids) @@ -340,20 +345,20 @@ def plot_expplusthcorrmat_heatmap_custom( @figure def plot_blockcovdiff_heatmap(theory_block_diag_covmat, procs_covmat): """Matrix plot (thcov + expcov)/expcov""" - df = (theory_block_diag_covmat.as_matrix() + procs_covmat.values) / np.mean( - procs_covmat.values - ) + df = (theory_block_diag_covmat.as_matrix() + procs_covmat.values) / np.mean(procs_covmat.values) fig = plot_covmat_heatmap( df, - "(Theory + experiment)/mean(experiment)" - + "for block diagonal theory covmat by dataset", + "(Theory + experiment)/mean(experiment)" + "for block diagonal theory covmat by dataset", ) return fig @figure def plot_covdiff_heatmap_custom( - theory_covmat_custom, procs_covmat, theoryids, fivetheories, + theory_covmat_custom, + procs_covmat, + theoryids, + fivetheories, ): """Matrix plot (thcov + expcov)/expcov""" l = len(theoryids) @@ -363,15 +368,18 @@ def plot_covdiff_heatmap_custom( df = (theory_covmat_custom + procs_covmat) / np.mean(procs_covmat.values) fig = plot_covmat_heatmap( df, - "(Theory + experiment)/mean(experiment)" - + f"covariance matrices for {l} points", + f"(Theory + experiment)/mean(experiment) covariance matrices for {l} points", ) return fig @figure def plot_diag_cov_comparison( - theory_covmat_custom, procs_covmat, procs_data_values, theoryids, fivetheories, + theory_covmat_custom, + procs_covmat, + procs_data_values, + theoryids, + fivetheories, ): """Plot of sqrt(cov_ii)/|data_i| for cov = exp, theory, exp+theory""" l = len(theoryids) @@ -420,7 +428,11 @@ def plot_diag_cov_comparison( @figure def plot_diag_cov_impact( - theory_covmat_custom, procs_covmat, procs_data_values, theoryids, fivetheories, + theory_covmat_custom, + procs_covmat, + procs_data_values, + theoryids, + fivetheories, ): """Plot ((expcov)^-1_ii)^-0.5 versus ((expcov + thcov)^-1_ii)^-0.5""" l = len(theoryids) @@ -430,9 +442,7 @@ def plot_diag_cov_impact( matrix_theory = theory_covmat_custom.values matrix_experiment = procs_covmat.values inv_exp = (np.diag(la.inv(matrix_experiment))) ** (-0.5) / procs_data_values - inv_tot = (np.diag(la.inv(matrix_theory + matrix_experiment))) ** ( - -0.5 - ) / procs_data_values + inv_tot = (np.diag(la.inv(matrix_theory + matrix_experiment))) ** (-0.5) / procs_data_values plot_index = theory_covmat_custom.index df_inv_exp = pd.DataFrame(inv_exp, index=plot_index) df_inv_exp.sort_index(0, inplace=True) @@ -461,9 +471,7 @@ def plot_diag_cov_impact( @figure -def 
plot_datasets_chi2_theory( - procs_data, each_dataset_chi2, abs_chi2_data_theory_dataset -): +def plot_datasets_chi2_theory(procs_data, each_dataset_chi2, abs_chi2_data_theory_dataset): """Plot the chi² of all datasets, before and after adding theory errors, with bars.""" ds = iter(each_dataset_chi2) dstheory = iter(abs_chi2_data_theory_dataset) diff --git a/validphys2/src/validphys/theorycovariance/tests.py b/validphys2/src/validphys/theorycovariance/tests.py index 4e0d94e6a8..fff466d676 100644 --- a/validphys2/src/validphys/theorycovariance/tests.py +++ b/validphys2/src/validphys/theorycovariance/tests.py @@ -5,42 +5,34 @@ """ from __future__ import generator_stop +from collections import namedtuple import logging -from collections import namedtuple -import numpy as np -import scipy.linalg as la from matplotlib.figure import Figure import matplotlib.patches as mpatches +import numpy as np import pandas as pd +import scipy.linalg as la +from reportengine import collect, floatformatting from reportengine.figure import figure from reportengine.table import table -from reportengine import collect -from reportengine import floatformatting - +from validphys import plotutils from validphys.checks import check_two_dataspecs - from validphys.theorycovariance.construction import ( combine_by_type, - process_starting_points, -) -from validphys.theorycovariance.construction import theory_corrmat_singleprocess -from validphys.theorycovariance.construction import ( covmap, covs_pt_prescrip, + process_starting_points, + theory_corrmat_singleprocess, theory_covmat_custom, ) - -from validphys.theorycovariance.output import matrix_plot_labels, _get_key -from validphys.theorycovariance.theorycovarianceutils import ( - process_lookup, - check_correct_theory_combination_theoryconfig, -) +from validphys.theorycovariance.output import _get_key, matrix_plot_labels from validphys.theorycovariance.theorycovarianceutils import ( check_correct_theory_combination_dataspecs, + check_correct_theory_combination_theoryconfig, + process_lookup, ) -from validphys import plotutils log = logging.getLogger(__name__) @@ -50,9 +42,7 @@ @check_two_dataspecs -def dataspecs_dataset_prediction_shift( - matched_dataspecs_results, process, dataset_name -): +def dataspecs_dataset_prediction_shift(matched_dataspecs_results, process, dataset_name): """Compute the difference in theory predictions between two dataspecs. 
This can be used in combination with `matched_datasets_from_dataspecs` It returns a ``LabeledShifts`` containing ``dataset_name``, @@ -68,16 +58,10 @@ def dataspecs_dataset_prediction_shift( ) -def shift_vector( - matched_dataspecs_dataset_prediction_shift, matched_dataspecs_dataset_theory -): +def shift_vector(matched_dataspecs_dataset_prediction_shift, matched_dataspecs_dataset_theory): """Returns a DataFrame of normalised shift vectors for matched dataspecs.""" - all_shifts = np.concatenate( - [val.shifts for val in matched_dataspecs_dataset_prediction_shift] - ) - all_theory = np.concatenate( - [val.shifts for val in matched_dataspecs_dataset_theory] - ) + all_shifts = np.concatenate([val.shifts for val in matched_dataspecs_dataset_prediction_shift]) + all_theory = np.concatenate([val.shifts for val in matched_dataspecs_dataset_theory]) norm_shifts = all_shifts / all_theory dsnames = np.concatenate( [ @@ -86,14 +70,9 @@ def shift_vector( ] ) point_indexes = np.concatenate( - [ - np.arange(len(val.shifts)) - for val in matched_dataspecs_dataset_prediction_shift - ] - ) - index = pd.MultiIndex.from_arrays( - [dsnames, point_indexes], names=["Dataset name", "Point"] + [np.arange(len(val.shifts)) for val in matched_dataspecs_dataset_prediction_shift] ) + index = pd.MultiIndex.from_arrays([dsnames, point_indexes], names=["Dataset name", "Point"]) return pd.DataFrame(norm_shifts, index=index) @@ -111,9 +90,7 @@ def dataspecs_dataset_theory(matched_dataspecs_results, process, dataset_name): def theory_vector(matched_dataspecs_dataset_theory): """Returns a DataFrame of the central theory vector for matched dataspecs.""" - all_theory = np.concatenate( - [val.shifts for val in matched_dataspecs_dataset_theory] - ) + all_theory = np.concatenate([val.shifts for val in matched_dataspecs_dataset_theory]) dsnames = np.concatenate( [ np.full(len(val.shifts), val.dataset_name, dtype=object) @@ -123,9 +100,7 @@ def theory_vector(matched_dataspecs_dataset_theory): point_indexes = np.concatenate( [np.arange(len(val.shifts)) for val in matched_dataspecs_dataset_theory] ) - index = pd.MultiIndex.from_arrays( - [dsnames, point_indexes], names=["Dataset name", "Point"] - ) + index = pd.MultiIndex.from_arrays([dsnames, point_indexes], names=["Dataset name", "Point"]) return pd.DataFrame(all_theory, index=index) @@ -138,19 +113,13 @@ def dataspecs_dataset_alltheory(matched_dataspecs_results, process, dataset_name return LabeledShifts(dataset_name=dataset_name, process=process, shifts=res) -matched_dataspecs_dataset_alltheory = collect( - "dataspecs_dataset_alltheory", ["dataspecs"] -) +matched_dataspecs_dataset_alltheory = collect("dataspecs_dataset_alltheory", ["dataspecs"]) -def alltheory_vector( - matched_dataspecs_dataset_alltheory, matched_dataspecs_dataset_theory -): +def alltheory_vector(matched_dataspecs_dataset_alltheory, matched_dataspecs_dataset_theory): """Returns a DataFrame with the theory vectors for matched dataspecs for the scale-varied theories (not the central one).""" - all_theory = np.concatenate( - [val.shifts for val in matched_dataspecs_dataset_alltheory], axis=1 - ) + all_theory = np.concatenate([val.shifts for val in matched_dataspecs_dataset_alltheory], axis=1) dsnames = np.concatenate( [ np.full(len(val.shifts), val.dataset_name, dtype=object) @@ -160,9 +129,7 @@ def alltheory_vector( point_indexes = np.concatenate( [np.arange(len(val.shifts)) for val in matched_dataspecs_dataset_theory] ) - index = pd.MultiIndex.from_arrays( - [dsnames, point_indexes], names=["Dataset name", 
"Point"] - ) + index = pd.MultiIndex.from_arrays([dsnames, point_indexes], names=["Dataset name", "Point"]) theory_vectors = [] for theoryvector in all_theory: theory_vectors.append(pd.DataFrame(theoryvector, index=index)) @@ -233,9 +200,7 @@ def matched_experiments_index(matched_dataspecs_dataset_name, all_matched_data_l [np.full(l, dsname, dtype=object) for (l, dsname) in zip(lens, dsnames)] ) point_indexes = np.concatenate([np.arange(l) for l in lens]) - index = pd.MultiIndex.from_arrays( - [dsnames, point_indexes], names=["Dataset name", "Point"] - ) + index = pd.MultiIndex.from_arrays([dsnames, point_indexes], names=["Dataset name", "Point"]) return index @@ -268,17 +233,11 @@ def theory_covmat_custom_dataspecs( "all_matched_results", ["combined_shift_and_theory_dataspecs", "theoryconfig"] ) -shx_vector = collect( - "shift_vector", ["combined_shift_and_theory_dataspecs", "shiftconfig"] -) +shx_vector = collect("shift_vector", ["combined_shift_and_theory_dataspecs", "shiftconfig"]) -thx_vector = collect( - "theory_vector", ["combined_shift_and_theory_dataspecs", "theoryconfig"] -) +thx_vector = collect("theory_vector", ["combined_shift_and_theory_dataspecs", "theoryconfig"]) -allthx_vector = collect( - "alltheory_vector", ["combined_shift_and_theory_dataspecs", "theoryconfig"] -) +allthx_vector = collect("alltheory_vector", ["combined_shift_and_theory_dataspecs", "theoryconfig"]) def theory_matrix_threshold(theory_threshold: (int, float) = 0): @@ -512,8 +471,7 @@ def evals_nonzero_basis( covmat = thx_covmat[0] / (np.outer(thx_vector[0], thx_vector[0])) # constructing vectors of shifts due to scale variation diffs = [ - ((thx_vector[0] - scalevarvector) / thx_vector[0]) - for scalevarvector in allthx_vector[0] + ((thx_vector[0] - scalevarvector) / thx_vector[0]) for scalevarvector in allthx_vector[0] ] # number of points in point prescription num_pts = len(diffs) + 1 @@ -563,9 +521,7 @@ def evals_nonzero_basis( ys = [x / np.linalg.norm(x) for x in xs] for i in range(1, len(xs)): for j in range(0, i): - ys[i] = ys[i] - (ys[i].T.dot(ys[j]))[0][0] * ys[j] / np.linalg.norm( - ys[j] - ) + ys[i] = ys[i] - (ys[i].T.dot(ys[j]))[0][0] * ys[j] / np.linalg.norm(ys[j]) ys[i] = ys[i] / np.linalg.norm(ys[i]) p = pd.concat(ys, axis=1) # Orthonormalising vectors according to singular value decomposition @@ -632,8 +588,8 @@ def efficiency(theory_shift_test): f = theory_shift_test[3] fmiss = theory_shift_test[4] fs = f - fmiss - fmod = np.sqrt(np.sum(f ** 2)) - fs_mod = np.sqrt(np.sum(fs ** 2)) + fmod = np.sqrt(np.sum(f**2)) + fs_mod = np.sqrt(np.sum(fs**2)) efficiency = fs_mod / fmod print(f"efficiency = {efficiency}") return efficiency @@ -645,7 +601,7 @@ def validation_theory_chi2(theory_shift_test): projectors = theory_shift_test[2] evals = theory_shift_test[0] ratio = projectors / np.sqrt(np.abs(evals)) - th_chi2 = 1 / len(evals) * np.sum(ratio ** 2) + th_chi2 = 1 / len(evals) * np.sum(ratio**2) print(f"Theory chi2 = {th_chi2}") return th_chi2 @@ -657,8 +613,8 @@ def theta(theory_shift_test): f = theory_shift_test[3] fmiss = theory_shift_test[4] fs = f - fmiss - fmod = np.sqrt(np.sum(f ** 2)) - fs_mod = np.sqrt(np.sum(fs ** 2)) + fmod = np.sqrt(np.sum(f**2)) + fs_mod = np.sqrt(np.sum(fs**2)) costheta = f @ fs / (fmod * fs_mod) th = np.arccos(costheta) return th @@ -671,7 +627,7 @@ def projector_eigenvalue_ratio(theory_shift_test): evals = theory_shift_test[0][::-1] projectors = theory_shift_test[2][::-1] fmiss = theory_shift_test[4] - fmiss_mod = np.sqrt(np.sum(fmiss ** 2)) + fmiss_mod = 
np.sqrt(np.sum(fmiss**2)) ratio = np.abs(projectors) / np.sqrt(np.abs(evals)) # Initialise array of zeros and set precision to same as FK tables # Ordering according to shift vector @@ -684,7 +640,7 @@ def projector_eigenvalue_ratio(theory_shift_test): fig = Figure(figsize=(5, 5)) ax1 = fig.add_subplot(2, 1, 1) ax2 = fig.add_subplots(2, 1, 2) - + ax1.plot(xvals, np.abs(projectors), "s", label=r"|$\delta_a$|") ax1.plot(xvals, np.sqrt(np.abs(evals)), "o", label=r"$|s_a|$") ax1.plot(0, fmiss_mod, "*", label=r"$|\delta_{miss}|$", color="b") @@ -732,7 +688,7 @@ def eigenvector_plot(evals_nonzero_basis, shx_vector): newindex = sorted(oldindex, key=_get_key) f = f.reindex(newindex) fig, axes = plotutils.subplots(figsize=(10, 2 * len(evecs)), nrows=len(evecs)) - + fig.subplots_adjust(hspace=0.8) for ax, evec, eval in zip(axes.flatten(), evecs, evals): eval_3sf = floatformatting.significant_digits(eval.item(), 3) @@ -743,9 +699,7 @@ def eigenvector_plot(evals_nonzero_basis, shx_vector): ticklocs, ticklabels, startlocs = matrix_plot_labels(evec) # Shift startlocs elements 0.5 to left so lines are between indexes startlocs_lines = [x - 0.5 for x in startlocs] - ax.vlines( - startlocs_lines, ax.get_ylim()[0], ax.get_ylim()[1], linestyles="dashed" - ) + ax.vlines(startlocs_lines, ax.get_ylim()[0], ax.get_ylim()[1], linestyles="dashed") ax.margins(x=0, y=0) # Adding eigenvalue to legend extraString = f"Eigenvalue = {eval_3sf}" @@ -793,13 +747,11 @@ def deltamiss_plot(theory_shift_test, allthx_vector, evals_nonzero_basis, shx_ve # Plotting fig, ax = plotutils.subplots(figsize=(20, 10)) ax.plot(f.values * 100, ".-", label="NNLO-NLO Shift", color="black") - ax.plot( - fmiss.values * 100, ".-", label=r"$\delta_{miss}$" + f" ({l} pt)", color="blue" - ) + ax.plot(fmiss.values * 100, ".-", label=r"$\delta_{miss}$" + f" ({l} pt)", color="blue") ticklocs, ticklabels, startlocs = matrix_plot_labels(f) ax.set_xticks(ticklocs) ax.set_xticklabels(ticklabels, rotation=45, fontsize=20) - + # Shift startlocs elements 0.5 to left so lines are between indexes startlocs_lines = [x - 0.5 for x in startlocs] ax.vlines(startlocs_lines, -70, 70, linestyles="dashed") diff --git a/validphys2/src/validphys/theorycovariance/theorycovarianceutils.py b/validphys2/src/validphys/theorycovariance/theorycovarianceutils.py index 104d59ee4c..369d0b258c 100644 --- a/validphys2/src/validphys/theorycovariance/theorycovarianceutils.py +++ b/validphys2/src/validphys/theorycovariance/theorycovarianceutils.py @@ -5,7 +5,7 @@ """ import logging -from reportengine.checks import make_argcheck, check +from reportengine.checks import check, make_argcheck from validphys.loader import Loader from validphys.plotoptions import get_info @@ -63,9 +63,7 @@ def check_correct_theory_combination_internal( ) -check_correct_theory_combination = make_argcheck( - check_correct_theory_combination_internal -) +check_correct_theory_combination = make_argcheck(check_correct_theory_combination_internal) @make_argcheck @@ -76,9 +74,7 @@ def check_correct_theory_combination_theoryconfig(collected_theoryids, fivetheor @make_argcheck def check_correct_theory_combination_dataspecs(dataspecs_theoryids, fivetheories): """Like check_correct_theory_combination but for matched dataspecs.""" - return check_correct_theory_combination.__wrapped__( - dataspecs_theoryids, fivetheories - ) + return check_correct_theory_combination.__wrapped__(dataspecs_theoryids, fivetheories) @make_argcheck @@ -86,10 +82,10 @@ def check_fit_dataset_order_matches_grouped( 
group_dataset_inputs_by_metadata, data_input, processed_metadata_group ): """ - Check for use with theory covmat generation. + Check for use with theory covmat generation. Makes sure that the order of datasets listed in the fit runcard is the same - as that specified by the metadata grouping. Otherwise there can be a + as that specified by the metadata grouping. Otherwise there can be a misalignment between the experiment covmat and theory covmat. """ data_input_iter = iter(data_input) diff --git a/validphys2/src/validphys/theorydbutils.py b/validphys2/src/validphys/theorydbutils.py index 6b0d5850ee..4a0c7e6954 100644 --- a/validphys2/src/validphys/theorydbutils.py +++ b/validphys2/src/validphys/theorydbutils.py @@ -6,14 +6,17 @@ data as a python object. """ from pathlib import Path + # Keep the sqlite3 dependence in one location import sqlite3 import pandas as pd + class TheoryNotFoundInDatabase(Exception): pass + def make_query(query: str, dbpath: Path): """Base level function which executes a `query` given the path to a sqlite3 database @@ -52,6 +55,7 @@ def make_query(query: str, dbpath: Path): res = cursor.execute(query) return res + def fetch_theory(theory_database: Path, theoryID: int): """Looks in the theory database and returns a dictionary of theory info for the theory number specified by `theoryID`. @@ -68,7 +72,7 @@ def fetch_theory(theory_database: Path, theoryID: int): theory_info_dict: dict dictionary filled with relevant entry from theory database """ - #int casting is intentional to avoid malformed querys. + # int casting is intentional to avoid malformed querys. query = f"SELECT * FROM TheoryIndex WHERE ID={int(theoryID)};" res = make_query(query, theory_database) val = res.fetchone() @@ -76,6 +80,7 @@ def fetch_theory(theory_database: Path, theoryID: int): raise TheoryNotFoundInDatabase(f"ID {theoryID} not found in database.") return {k[0]: v for k, v in zip(res.description, val)} + def fetch_all(theory_database: Path): """Looks in the theory database and returns a dataframe with theory info for all theories diff --git a/validphys2/src/validphys/theoryinfo.py b/validphys2/src/validphys/theoryinfo.py index fcbc290218..c7ac8df380 100644 --- a/validphys2/src/validphys/theoryinfo.py +++ b/validphys2/src/validphys/theoryinfo.py @@ -7,8 +7,8 @@ from pandas import DataFrame from reportengine.table import table +from validphys.theorydbutils import fetch_all, fetch_theory -from validphys.theorydbutils import fetch_theory, fetch_all @table def all_theory_info_table(theory_database): @@ -34,6 +34,7 @@ def all_theory_info_table(theory_database): """ return fetch_all(theory_database) + @table def theory_info_table(theory_database, theory_db_id): """fetches theory info for given `theory_db_id` constructs DataFrame from it diff --git a/validphys2/src/validphys/uploadutils.py b/validphys2/src/validphys/uploadutils.py index 62b90b6a44..bc459c7a17 100644 --- a/validphys2/src/validphys/uploadutils.py +++ b/validphys2/src/validphys/uploadutils.py @@ -3,38 +3,44 @@ Tools to upload resources to remote servers. 
""" -import time -import subprocess +import base64 +import contextlib +from glob import glob +import hashlib import logging import os -import shutil +import pathlib import re -import uuid -import base64 +import shutil +import subprocess import sys -import contextlib -import pathlib import tempfile -from glob import glob +import time from urllib.parse import urljoin -import hashlib +import uuid import prompt_toolkit from prompt_toolkit.completion import WordCompleter -from reportengine.compat import yaml from reportengine.colors import t -from validphys.loader import RemoteLoader, Loader +from reportengine.compat import yaml +from validphys.loader import Loader, RemoteLoader from validphys.renametools import Spinner log = logging.getLogger(__name__) -class UploadError(Exception): pass -class BadSSH(UploadError): pass +class UploadError(Exception): + pass + + +class BadSSH(UploadError): + pass + def _profile_key(k): """Return a property that fetches a given key from ``self._profile``.""" + @property def f(self): try: @@ -44,7 +50,8 @@ def f(self): return f -class Uploader(): + +class Uploader: """Base class for implementing upload behaviour. The main abstraction is a context manager ``upload_context`` which checks that the upload seems possible, then does the work inside the context and then uploads the @@ -59,8 +66,14 @@ def get_relative_path(self, output_path): def check_auth(self): """Check that we can authenticate with a certificate.""" - ssh_command_line = ('ssh', '-o', 'PreferredAuthentications=publickey', - '-q', self.upload_host, 'exit') + ssh_command_line = ( + 'ssh', + '-o', + 'PreferredAuthentications=publickey', + '-q', + self.upload_host, + 'exit', + ) str_line = ' '.join(repr(ele) for ele in ssh_command_line) @@ -69,33 +82,39 @@ def check_auth(self): try: subprocess.run(ssh_command_line, check=True) except subprocess.CalledProcessError as e: - raise BadSSH(("Could not validate the SSH key. " - "The command\n%s\nreturned a non zero exit status. " - "Please make sure that your public SSH key is on the server.") - % str_line) from e + raise BadSSH( + ( + "Could not validate the SSH key. " + "The command\n%s\nreturned a non zero exit status. " + "Please make sure that your public SSH key is on the server." + ) + % str_line + ) from e except OSError as e: raise BadSSH("Could not run the command\n%s\n: %s" % (str_line, e)) from e log.info("Connection seems OK.") - def check_rsync(self): """Check that the rsync command exists""" if not shutil.which('rsync'): - raise BadSSH("Could not find the rsync command. " - "Please make sure it is installed.") - + raise BadSSH("Could not find the rsync command. Please make sure it is installed.") def upload_output(self, output_path): """Rsync ``output_path`` to the server and print the resulting URL. If specific_file is given""" - #Set the date to now + # Set the date to now pathlib.Path(output_path).touch() randname = self.get_relative_path(output_path) newdir = self.target_dir + randname - rsync_command = ('rsync', '-aLz', '--chmod=ug=rwx,o=rx', - f"{output_path}/", f'{self.upload_host}:{newdir}') + rsync_command = ( + 'rsync', + '-aLz', + '--chmod=ug=rwx,o=rx', + f"{output_path}/", + f"{self.upload_host}:{newdir}", + ) log.info(f"Uploading output ({output_path}) to {self.upload_host}") try: @@ -105,12 +124,10 @@ def upload_output(self, output_path): raise BadSSH(msg) from e return randname - def _print_output(self, name): url = urljoin(self.root_url, name) log.info(f"Upload completed. 
The result is available at:\n{t.bold_blue(url)}") - def check_upload(self): """Check that it looks possible to upload something. Raise an UploadError if not.""" @@ -140,14 +157,15 @@ def upload_or_exit_context(self, output): class ReportUploader(Uploader): """An uploader for validphys reports.""" + target_dir = _profile_key('reports_target_dir') root_url = _profile_key('reports_root_url') - class FileUploader(Uploader): """Uploader for individual files for single-file resources. It does the " "same but prints the URL of the file.""" + def _print_output(self, result, name): url = urljoin(result, name) log.info(f"Upload completed. The result is available at:\n{t.bold_blue(url)}") @@ -158,24 +176,26 @@ def upload_context(self, output_and_file): self.check_upload() yield res = self.upload_output(output) - self._print_output(self.root_url+'/'+res+'/', specific_file) + self._print_output(self.root_url + '/' + res + '/', specific_file) + class ReportFileUploader(FileUploader, ReportUploader): pass class ArchiveUploader(FileUploader): - """ Uploader for objects comprising many files such as fits or PDFs """ + """Uploader for objects comprising many files such as fits or PDFs""" + target_dir = None root_url = None - _loader_name = None # vp loader for this kind of archive - _resource_type = "Archive" # name used during logging + _loader_name = None # vp loader for this kind of archive + _resource_type = "Archive" # name used during logging def get_relative_path(self, output_path=None): return '' def _check_existence(self, resource_name): - """ Check whether the given resource exists on the server. + """Check whether the given resource exists on the server. Returns true if the resource exists with the same name on the server or false otherwise. Note that the type of resource being checked is defined by the ``_loader_name`` attribute @@ -183,32 +203,38 @@ def _check_existence(self, resource_name): l = RemoteLoader() resource_list = getattr(l, self._loader_name) - return resource_name in resource_list + return resource_name in resource_list def _check_is_indexed(self, resource_name): - """ Check whether the fit is correctly indexed in the server - """ + """Check whether the fit is correctly indexed in the server""" log.info("Checking whether %s was correctly uploaded...", resource_name) time.sleep(3) if self._check_existence(resource_name): log.info("It has been correctly indexed by the server!") else: - log.error("The object is uploaded but hasn't been indexed yet by the server. " - "You should upload it again to ensure it is indexed: vp-upload %s", resource_name) + log.error( + "The object is uploaded but hasn't been indexed yet by the server. 
" + "You should upload it again to ensure it is indexed: vp-upload %s", + resource_name, + ) def _compress(self, output_path): """Compress the folder and put in in a directory inside its parent.""" - #make_archive fails if we give it relative paths for some reason + # make_archive fails if we give it relative paths for some reason output_path = output_path.resolve() - tempdir = tempfile.mkdtemp(prefix=f'{self._resource_type}_upload_deleteme_', - dir=output_path.parent) + tempdir = tempfile.mkdtemp( + prefix=f'{self._resource_type}_upload_deleteme_', dir=output_path.parent + ) log.info(f"Compressing {self._resource_type} to {tempdir}") - archive_path_without_extension = pathlib.Path(tempdir)/(output_path.name) + archive_path_without_extension = pathlib.Path(tempdir) / (output_path.name) try: with Spinner(): - shutil.make_archive(base_name=archive_path_without_extension, - format='gztar', - root_dir=output_path.parent, base_dir=output_path.name) + shutil.make_archive( + base_name=archive_path_without_extension, + format='gztar', + root_dir=output_path.parent, + base_dir=output_path.name, + ) except Exception as e: log.error(f"Couldn't compress archive: {e}") raise UploadError(e) from e @@ -220,10 +246,13 @@ def upload_output(self, output_path, force): if not force: if self._check_existence(fit_name): - log.error("A %s with the same name already exists on " - "the server. To overwrite it use the " - "--force flag, as in `vp-upload <%s_name> --force.", - self._resource_type, self._resource_type) + log.error( + "A %s with the same name already exists on " + "the server. To overwrite it use the " + "--force flag, as in `vp-upload <%s_name> --force.", + self._resource_type, + self._resource_type, + ) raise UploadError new_out, name = self._compress(output_path) @@ -255,6 +284,7 @@ def upload_or_exit_context(self, output, force): class FitUploader(ArchiveUploader): """An uploader for fits. Fits will be automatically compressed before uploading.""" + target_dir = _profile_key('fits_target_dir') root_url = _profile_key('fits_root_url') _loader_name = "downloadable_fits" @@ -298,6 +328,7 @@ def upload_output(self, output_path, force): class HyperscanUploader(FitUploader): """Uploader for hyperopt scans, which are just special cases of fits""" + _resource_type = "hyperscans" _loader_name = "downloadable_hyperscans" target_dir = _profile_key('hyperscan_target_dir') @@ -307,6 +338,7 @@ class HyperscanUploader(FitUploader): class PDFUploader(ArchiveUploader): """An uploader for PDFs. PDFs will be automatically compressed before uploading.""" + target_dir = _profile_key('pdfs_target_dir') root_url = _profile_key('pdfs_root_url') _loader_name = "downloadable_pdfs" @@ -329,10 +361,10 @@ def check_for_meta(path): """ if "meta.yaml" not in os.listdir(path): raise FileNotFoundError( - "No meta.yaml file found. Please either add " - "the meta tags to the runcard or use the --interactive flag " - "with vp-upload to interactively create one" - ) + "No meta.yaml file found. 
Please either add " + "the meta tags to the runcard or use the --interactive flag " + "with vp-upload to interactively create one" + ) return True @@ -365,7 +397,8 @@ def interactive_meta(path): kwinp = prompt_toolkit.prompt( "Enter keywords: ", completer=WordCompleter(words=KeywordsWithCache(RemoteLoader())), - complete_in_thread=True) + complete_in_thread=True, + ) keywords = [k.strip() for k in kwinp.split(",") if k] meta_dict = {"title": title, "author": author, "keywords": keywords} @@ -417,9 +450,13 @@ def check_input(path): elif list(filter(info_reg.match, files)) and list(filter(rep0_reg.match, files)): return 'pdf' else: - log.error(f"Specified input directory: {path} did not fall under the known " - "categories of validphys (report, fit, or pdf).") - raise ValueError("Unrecognized type of input, " - "please save to the server using rsync or wiki-upload. " - "The --interactive flag will generate a meta file which " - "will cause the input to be registered as a report.") + log.error( + f"Specified input directory: {path} did not fall under the known " + "categories of validphys (report, fit, or pdf)." + ) + raise ValueError( + "Unrecognized type of input, " + "please save to the server using rsync or wiki-upload. " + "The --interactive flag will generate a meta file which " + "will cause the input to be registered as a report." + ) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 0c2956daaa..e78fc866bc 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -5,13 +5,12 @@ @author: Zahari Kassabov """ import contextlib -import shutil import pathlib +import shutil import tempfile import numpy as np - -from validobj import parse_input, ValidationError +from validobj import ValidationError, parse_input def parse_yaml_inp(inp, spec, path): @@ -56,7 +55,7 @@ def parse_yaml_inp(inp, spec, path): current_exc = current_exc.__cause__ raise ValidationError('\n'.join(error_text_lines)) from e - + @contextlib.contextmanager def tempfile_cleaner(root, exit_func, exc, prefix=None, **kwargs): """A context manager to handle temporary directory creation and @@ -146,6 +145,7 @@ def experiments_to_dataset_inputs(experiments_list): return dataset_inputs + def split_by(it, crit): """Split ``it`` in two lists, the first is such that ``crit`` evaluates to True and the second such it doesn't. Crit can be either a function or an @@ -160,7 +160,7 @@ def split_by(it, crit): else: false.append(ele) elif hasattr(crit, '__iter__'): - for keep, ele in zip(crit,it): + for keep, ele in zip(crit, it): if keep: true.append(ele) else: @@ -170,8 +170,9 @@ def split_by(it, crit): return true, false -#Copied from smpdf.utils -def split_ranges(a,cond=None,*, filter_falses=False): + +# Copied from smpdf.utils +def split_ranges(a, cond=None, *, filter_falses=False): """Split ``a`` so that each range has the same value for ``cond`` . 
If ``filter_falses`` is true, only the ranges for which the @@ -179,11 +180,11 @@ def split_ranges(a,cond=None,*, filter_falses=False): if cond is None: cond = a cond = cond.astype(bool) - d = np.r_[False, cond[1:]^cond[:-1]] + d = np.r_[False, cond[1:] ^ cond[:-1]] split_at = np.argwhere(d) splits = np.split(a, np.ravel(split_at)) if filter_falses: - #Evaluate condition at split points + # Evaluate condition at split points it = iter(cond[np.r_[0, np.ravel(split_at)]]) return [s for s in splits if next(it)] else: @@ -200,12 +201,13 @@ def sane_groupby_iter(df, by, *args, **kwargs): if by is None or not by: yield ('',), df return - gb = df.groupby(by, *args,**kwargs) + gb = df.groupby(by, *args, **kwargs) for same_vals, table in gb: if not isinstance(same_vals, tuple): same_vals = (same_vals,) yield same_vals, table + def common_prefix(*s): """Return the longest string that is a prefix to both s1 and s2""" small, big = min(s), max(s) @@ -214,6 +216,7 @@ def common_prefix(*s): return small[:i] return small + def scale_from_grid(grid): """Guess the appropriate matplotlib scale from a grid object. Returns ``'linear'`` if the scale of the grid object is linear, diff --git a/validphys2/src/validphys/version.py b/validphys2/src/validphys/version.py index adf81433ef..845e4b697a 100644 --- a/validphys2/src/validphys/version.py +++ b/validphys2/src/validphys/version.py @@ -4,7 +4,7 @@ def __give_git(): from pathlib import Path file_dir = Path(__file__).parent - from subprocess import run, CalledProcessError + from subprocess import CalledProcessError, run try: result = run( @@ -28,9 +28,7 @@ def __give_git(): check=True, cwd=file_dir, ).stdout.strip() - version = result.replace(f"-g{githash}", f"+g{githash}").replace( - f"{tag}-", f"{tag}." - ) + version = result.replace(f"-g{githash}", f"+g{githash}").replace(f"{tag}-", f"{tag}.") except CalledProcessError: # In principle this function should not exist on an installed version # but who knows. Also maybe git doesn't work on the machine or whatever
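For reference, a minimal sketch of the version-string rewrite performed in ``__give_git`` above, using hypothetical values for the tag, short hash and ``git describe`` output (these names and values are illustrative, not taken from the patch):

```python
# Hypothetical inputs standing in for the values computed in __give_git.
tag = "4.0.4"                 # assumed latest tag
githash = "abc1234"           # assumed short commit hash
result = "4.0.4-17-gabc1234"  # assumed ``git describe --tags`` output

# Same two replacements as in the hunk above: "-g<hash>" -> "+g<hash>",
# then "<tag>-" -> "<tag>." to produce a PEP 440-style local version.
version = result.replace(f"-g{githash}", f"+g{githash}").replace(f"{tag}-", f"{tag}.")
print(version)  # 4.0.4.17+gabc1234
```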