diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 7d764cddaf..20ee2193c2 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -38,6 +38,7 @@
     SimilarCuts,
     ThCovMatSpec,
 )
+from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
 from validphys.loader import (
     Loader,
     LoaderError,
@@ -237,6 +238,52 @@ def parse_fit(self, fit: str):
         except LoadFailedError as e:
             raise ConfigError(str(e), fit, self.loader.available_fits)
 
+    def produce_fitreplicas(self, fit):
+        """Production rule mapping the ``replica`` key to each Monte Carlo
+        fit replica.
+        """
+        num_replicas = num_fitted_replicas(fit)
+        return NSList(range(1, num_replicas + 1), nskey='replica')
+
+    def produce_pdfreplicas(self, fitpdf):
+        """Production rule mapping the ``replica`` key to each postfit
+        replica.
+        """
+        pdf = fitpdf['pdf']
+        replicas = fitted_replica_indexes(pdf)
+        return NSList(replicas, nskey='replica')
+
+    def produce_fitenvironment(self, fit, fitinputcontext):
+        """Like fitcontext, but additionally forcing various other
+        parameters, such as the cuts policy and Monte Carlo seeding, to be
+        the same as those of the fit.
+
+        Notes
+        -----
+        - This production rule is designed to be used as a namespace
+          to collect over, for use with
+          :py:func:`validphys.pseudodata.recreate_fit_pseudodata` and
+          can be added to freely, e.g. by setting trvlseed to be from
+          the fit runcard.
+        """
+        log.warning(f"Using mcseed and trvlseed from fit: {fit}")
+        theoryid = fitinputcontext['theoryid']
+        data_input = fitinputcontext['data_input']
+
+        runcard = fit.as_input()
+        trvlseed = runcard['trvlseed']
+        mcseed = runcard['mcseed']
+        genrep = runcard['genrep']
+
+        return {
+            "dataset_inputs": data_input,
+            "theoryid": theoryid,
+            "use_cuts": CutsPolicy.FROMFIT,
+            "mcseed": mcseed,
+            "trvlseed": trvlseed,
+            "genrep": genrep,
+        }
+
     def produce_fitcontext(self, fitinputcontext, fitpdf):
         """Set PDF, theory ID and data input from the fit config"""
 
diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py
index ca7fed9959..fc631c68b5 100644
--- a/validphys2/src/validphys/n3fit_data.py
+++ b/validphys2/src/validphys/n3fit_data.py
@@ -405,29 +405,77 @@ def validation_pseudodata(pseudodata_table, training_mask):
 replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))
 
 
-def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
+@table
+def replica_training_mask_table(exps_tr_masks, replica, experiments_index):
     """Save the boolean mask used to split data into training and validation
-    for each replica as a pandas DataFrame, indexed by
+    for a given replica as a pandas DataFrame, indexed by
     :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
     the training and validation data used in a fit.
 
     Parameters
     ----------
-    replicas_exps_tr_masks: list[list[list[np.array]]]
-        Result of :py:func:`tr_masks` collected over experiments then replicas,
-        which creates the nested structure. The outer list is len(replicas),
-        the next list is len(group_dataset_inputs_by_experiment) and the
-        inner-most list has an array for each dataset in that particular
-        experiment - as defined by the metadata. The arrays should be 1-D
-        boolean arrays which can be used as masks.
-    replicas: NSlist
-        Namespace list of replica numbers to tabulate masks for, each element
-        of the list should be a `replica`. See example below for more
-        information.
+    exps_tr_masks: list[list[np.array]]
+        Result of :py:func:`tr_masks` collected over experiments, which creates
+        the nested structure. The outer list is
+        len(group_dataset_inputs_by_experiment) and the inner-most list has an
+        array for each dataset in that particular experiment - as defined by the
+        metadata. The arrays should be 1-D boolean arrays which can be used as
+        masks.
+    replica: int
+        The index of the replica.
     experiments_index: pd.MultiIndex
         Index returned by :py:func:`validphys.results.experiments_index`.
 
+    Example
+    -------
+    >>> from validphys.api import API
+    >>> ds_inp = [
+    ...     {'dataset': 'NMC', 'frac': 0.75},
+    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75},
+    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75}
+    ... ]
+    >>> API.replica_training_mask_table(dataset_inputs=ds_inp, replica=1, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False)
+                         replica 1
+    group dataset    id
+    NMC   NMC        0        True
+                     1        True
+                     2       False
+                     3        True
+                     4        True
+    ...                        ...
+    CMS   CMSZDIFF12 45       True
+                     46       True
+                     47       True
+                     48      False
+                     49       True
+
+    [345 rows x 1 columns]
+    """
+    all_masks = np.concatenate([
+        ds_mask
+        for exp_masks in exps_tr_masks
+        for ds_mask in exp_masks
+    ])
+    return pd.DataFrame(
+        all_masks,
+        columns=[f"replica {replica}"],
+        index=experiments_index
+    )
+
+replicas_training_mask_table = collect("replica_training_mask_table", ("replicas",))
+@table
+def training_mask_table(replicas_training_mask_table):
+    """Save the boolean mask used to split data into training and validation
+    for each replica as a pandas DataFrame, indexed by
+    :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
+    the training and validation data used in a fit.
+
+    Parameters
+    ----------
+    replicas_training_mask_table: list[pd.DataFrame]
+        Result of :py:func:`replica_training_mask_table` collected over replicas.
+
     Example
     -------
     >>> from validphys.api import API
@@ -457,26 +505,8 @@ def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
 
     [345 rows x 3 columns]
     """
-    rep_dfs = []
-    for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas):
-        # create flat list with all dataset masks in, then concatenate to single
-        # array.
-        all_masks = np.concatenate([
-            ds_mask
-            for exp_masks in rep_exps_masks
-            for ds_mask in exp_masks
-        ])
-        rep_dfs.append(pd.DataFrame(
-            all_masks,
-            columns=[f"replica {rep}"],
-            index=experiments_index
-        ))
-    return pd.concat(rep_dfs, axis=1)
-
+    return pd.concat(replicas_training_mask_table, axis=1)
 
-@table
-def training_mask_table(training_mask):
-    return training_mask
 
 def fitting_pos_dict(posdataset):
     """Loads a positivity dataset.
For more information see diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 5937f8a35f..f449958898 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -5,31 +5,26 @@ """ from collections import namedtuple import logging -import multiprocessing as mp -import os import pathlib import numpy as np import pandas as pd -from validphys.checks import check_cuts_fromfit, check_darwin_single_process from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect -from validphys.n3fit_data import replica_mcseed, replica_trvlseed -import validphys.n3fit_data_utils as reader +FILE_PREFIX = "datacuts_theory_fitting_" log = logging.getLogger(__name__) DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) -fitted_pseudodata = collect('fitted_pseudodata_internal', ('fitcontext',)) - context_index = collect("groups_index", ("fitcontext",)) +read_fit_pseudodata = collect('read_replica_pseudodata', ('fitreplicas', 'fitenvironment')) +read_pdf_pseudodata = collect('read_replica_pseudodata', ('pdfreplicas', 'fitenvironment')) -@check_cuts_fromfit -def read_fit_pseudodata(fitcontext, context_index): +def read_replica_pseudodata(fit, context_index, replica): """Function to handle the reading of training and validation splits for a fit that has been produced with the ``savepseudodata`` flag set to ``True``. @@ -55,64 +50,49 @@ def read_fit_pseudodata(fitcontext, context_index): Example ------- >>> from validphys.api import API - >>> data_indices_list = API.read_fit_pseudodata(fit="NNPDF31_nnlo_as_0118_DISonly_pseudodata", use_cuts="fromfit") + >>> data_indices_list = API.read_fit_pseudodata(fit="pseudodata_test_fit_n3fit") >>> len(data_indices_list) # Same as nrep - 100 + 10 >>> rep_info = data_indices_list[0] >>> rep_info.pseudodata.loc[rep_info.tr_idx].head() - data - group dataset id - BCDMS BCDMSD 0 0.371510 - 1 0.365659 - 2 0.350234 - 4 0.355560 - 6 0.346234 - data + replica 1 + group dataset id + ATLAS ATLASZPT8TEVMDIST 1 30.665835 + 3 15.795880 + 4 8.769734 + 5 3.117819 + 6 0.771079 """ # List of length 1 due to the collect context_index = context_index[0] # The [0] is because of how pandas handles sorting a MultiIndex sorted_index = context_index.sortlevel(level=range(1,3))[0] - pdf = fitcontext["pdf"] - log.debug(f"Using same pseudodata & training/validation splits as {pdf.name}.") - nrep = len(pdf) - path = pathlib.Path(pdf.infopath) - - data_indices_list = [] - for rep_number in range(1, nrep): - # This is a symlink (usually). - replica = path.with_name(pdf.name + "_" + str(rep_number).zfill(4) + ".dat") - # we resolve the symlink - if replica.parent.is_symlink(): - replica = pathlib.Path(os.path.realpath(replica)) - - training_path = replica.with_name("training.dat") - validation_path = replica.with_name("validation.dat") - - try: - tr = pd.read_csv(training_path, index_col=[0, 1, 2], sep="\t", names=["data"]) - val = pd.read_csv(validation_path, index_col=[0, 1, 2], sep="\t", names=["data"]) - except FileNotFoundError as e: - raise FileNotFoundError( - "Could not find saved training and validation data files. 
" - f"Please ensure {pdf} was generated with the savepseudodata flag set to true" - ) from e - tr["type"], val["type"] = "training", "validation" - - pseudodata = pd.concat((tr, val)) - pseudodata.sort_index(level=range(1,3), inplace=True) - - pseudodata.index = sorted_index - - tr = pseudodata[pseudodata["type"]=="training"] - val = pseudodata[pseudodata["type"]=="validation"] - - data_indices_list.append( - DataTrValSpec(pseudodata.drop("type", axis=1), tr.index, val.index) - ) + log.debug(f"Reading pseudodata & training/validation splits from {fit.name}.") + replica_path = fit.path / "nnfit" / f"replica_{replica}" + + training_path = replica_path / (FILE_PREFIX + "training_pseudodata.csv") + validation_path = replica_path / (FILE_PREFIX + "validation_pseudodata.csv") + + try: + tr = pd.read_csv(training_path, index_col=[0, 1, 2], sep="\t", header=0) + val = pd.read_csv(validation_path, index_col=[0, 1, 2], sep="\t", header=0) + except FileNotFoundError as e: + raise FileNotFoundError( + "Could not find saved training and validation data files. " + f"Please ensure {fit} was generated with the savepseudodata flag set to true" + ) from e + tr["type"], val["type"] = "training", "validation" + + pseudodata = pd.concat((tr, val)) + pseudodata.sort_index(level=range(1,3), inplace=True) + + pseudodata.index = sorted_index - return data_indices_list + tr = pseudodata[pseudodata["type"]=="training"] + val = pseudodata[pseudodata["type"]=="validation"] + + return DataTrValSpec(pseudodata.drop("type", axis=1), tr.index, val.index) def make_replica(dataset_inputs_loaded_cd_with_cuts, replica_mcseed): @@ -238,233 +218,70 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -@check_darwin_single_process -def fitted_pseudodata_internal(fit, experiments, num_fitted_replicas, t0pdfset=None, NPROC=None): - """A function to obtain information about the pseudodata that went - into an N3FIT fit. - - Parameters - ---------- - fit: :py:class:`validphys.core.FitSpec` - experiments: - List of :py:class:`validphys.core.ExeperimentSpec` - num_nnfit_replicas: ``int`` - Provided for by :py:mod:`validphys.fitdata`. Equal to the number of - pre-postfit replicas. - t0pdfset: :py:class:`validphys.core.PDF` - NPROC: ``int`` - Integer specifying how many cores to run on. Default is - ``mp.cpu_count()`` - - Example - ------- - Create a ``YAML`` file say ``runcard_for_pseudodata.yaml`` - - .. code-block:: YAML - :caption: runcard_for_pseudodata.yaml - - pdf: PN3_DIS_130519 - fit: PN3_DIS_130519 - - experiments: - from_: fit - - theory: - from_: fit - - t0pdfset: - from_: datacuts - - datacuts: - from_: fit - - theoryid: - from_: theory - - use_cuts: fromfit - - Then run - - >>> with open("./runcard_for_pseudodata.yaml", 'r') as stream: - ... from reportengine.compat import yaml - ... runcard = yaml.safe_load(stream) - >>> from validphys.api import API - >>> API.get_pseudodata_internal(**runcard) - - Notes - ----- - - This is a wrapper for the ``fitted_pseudodata`` action - which knows that ``experiments``, *must* come from fit - and similarly ``PDF`` and ``theoryid`` *must* be the same as - that of ``fit`` and so on. - - This function returns the pseudodata for the replicas - pre-postfit. Postfit discards some replicas and rearranges - the order. The correpsondence is done by the - :py:func:`get_pseudodata` - function. - - This code runs in parallel to increase efficiency. 
- """ - if t0pdfset is not None: - t0pdfset = t0pdfset.load_t0() - - # The + 1 coming from the fact that we wish to - # include the last replica - replica = range(1, num_fitted_replicas + 1) - - trvlseed, mcseed, genrep = [ - fit.as_input().get(i) - for i in ["trvlseed", "mcseed", "genrep"] - ] - - # common_data_reader expects None if genrep is False - if genrep: - replicas_mcseed = [ - replica_mcseed(rep, mcseed, genrep) for rep in replica - ] - else: - replicas_mcseed = None - - replicas_trvlseeds = [replica_trvlseed(rep, trvlseed) for rep in replica] - - def task(d, mcseeds, trvlseeds, replicas): - all_exp_infos = [[] for _ in range(len(mcseeds))] - for exp in experiments: - all_exp_dicts = reader.common_data_reader( - exp, t0pdfset, replica_seeds=mcseeds, trval_seeds=trvlseeds - ) - for i, exp_dict in enumerate(all_exp_dicts): - all_exp_infos[i].append(exp_dict) - for i, j in zip(all_exp_infos, replicas): - d[j] = i - - if NPROC == 1: - pseudodata_dicts = dict() - task(pseudodata_dicts, replicas_mcseed, replicas_trvlseeds, replica) - else: - with mp.Manager() as manager: - d = manager.dict() - - if NPROC is None: - NPROC = mp.cpu_count() - log.warning( - f"Using all {NPROC} cores available, this may be dangerous " - "especially for use on a cluster. Consider setting the NPROC " - "variable to something sensible." - ) - processes = [] - - # convert sub arrays back to lists, use tolist to get builtin python - # types. - list_split = lambda lst, n: [ - arr.tolist() for arr in np.array_split(lst, n) - ] - batched_mcseeds = list_split(replicas_mcseed, NPROC) - batched_trvlseeds = list_split(replicas_trvlseeds, NPROC) - batched_replica_num = list_split(replica, NPROC) - for mc_batch, trvl_batch, replica_batch in zip( - batched_mcseeds, batched_trvlseeds, batched_replica_num - ): - p = mp.Process( - target=task, - args=(d, mc_batch, trvl_batch, replica_batch,), - ) - p.start() - processes.append(p) - for p in processes: - p.join() - pseudodata_dicts = dict(d) - return pseudodata_dicts - - -def get_pseudodata(fitted_pseudodata, fitted_replica_indexes): - """Pseudodata used during fitting but correctly accounting for - the postfit reordering. - """ - # By collecting over `fitcontext` we create a list of length - # one. - fitted_pseudodata = fitted_pseudodata[0] - return [fitted_pseudodata[i] for i in fitted_replica_indexes] +_recreate_fit_pseudodata = collect('indexed_make_replica', ('fitreplicas', 'fitenvironment')) +_recreate_pdf_pseudodata = collect('indexed_make_replica', ('pdfreplicas', 'fitenvironment')) +fit_tr_masks = collect('replica_training_mask_table', ('fitreplicas', 'fitenvironment')) +pdf_tr_masks = collect('replica_training_mask_table', ('pdfreplicas', 'fitenvironment')) -def _datasets_mask(experiment_list): - """Function to obtain a per datasets training/validation - mask given the mask for the corresponding experiment. +def recreate_fit_pseudodata(_recreate_fit_pseudodata, fitreplicas, fit_tr_masks): + """Function used to reconstruct the pseudodata seen by each of the + Monte Carlo fit replicas. Returns ------- - dict: - - tr_mask: training mask for the datasets in the experiment - - vl_mask: validation mask for the datasets in the experiment + res : list[namedtuple] + List of namedtuples, each of which contains a dataframe + containing all the data points, the training indices, and + the validation indices. 
+ + Example + ------- + >>> from validphys.api import API + >>> API.recreate_fit_pseudodata(fit="pseudodata_test_fit_n3fit") + + Notes + ----- + - This function does not account for the postfit reshuffling. + + See Also + -------- + :py:func:`validphys.pseudodata.recreate_pdf_pseudodata` """ - tr_mask = experiment_list["trmask"] - vl_mask = experiment_list["vlmask"] - slices = [] - start = 0 - for i in experiment_list["datasets"]: - ndata = i["ndata"] - slices.append(start + ndata) - start += ndata - - return { - "trmask": np.split(tr_mask, slices[:-1]), - "vlmask": np.split(vl_mask, slices[:-1]), - } - - -def training_validation_pseudodata(get_pseudodata): - """Generator to yield a dictionary of training and validation DataFrame - per replica indexed appropriately using a MultiIndex + res = [] + for pseudodata, mask, rep in zip(_recreate_fit_pseudodata, fit_tr_masks, fitreplicas): + df = pseudodata + df.columns = [f"replica {rep}"] + tr_idx = df.loc[mask.values].index + val_idx = df.loc[~mask.values].index + res.append(DataTrValSpec(pseudodata, tr_idx, val_idx)) + return res + +def recreate_pdf_pseudodata(_recreate_pdf_pseudodata, pdf_tr_masks, pdfreplicas): + """Like :py:func:`validphys.pseudodata.recreate_fit_pseudodata` + but accounts for the postfit reshuffling of replicas. + + Returns + ------- + res : list[namedtuple] + List of namedtuples, each of which contains a dataframe + containing all the data points, the training indices, and + the validation indices. + + Example + ------- + >>> from validphys.api import API + >>> API.recreate_pdf_pseudodata(fit="pseudodata_test_fit_n3fit") + + See Also + -------- + :py:func:`validphys.pseudodata.recreate_fit_pseudodata` """ - exp_infos = get_pseudodata - columns = ["experiment", "dataset", "id"] - # Loop over all initial replicas - for replica in exp_infos: - tr_records, tr_central_values = [], [] - vl_records, vl_central_values = [], [] - # Loop over experiments in given replica - for experiment in replica: - split_masks = _datasets_mask(experiment) - tr_mask, vl_mask = split_masks["trmask"], split_masks["vlmask"] - # While we're here extend the central_values of the experiment - tr_central_values.extend(np.squeeze(experiment["expdata"])) - vl_central_values.extend(np.squeeze(experiment["expdata_vl"])) - # Loop over datasets in experiment - for i, dataset in enumerate(experiment["datasets"]): - tr_dataset_mask = tr_mask[i] - vl_dataset_mask = vl_mask[i] - tr_indices = np.array((range(dataset["ndata"])))[tr_dataset_mask] - vl_indices = np.array((range(dataset["ndata"])))[vl_dataset_mask] - for tr_idat in tr_indices: - tr_records.append( - dict( - [ - ("experiment", experiment["name"]), - ("dataset", dataset["name"]), - ("id", tr_idat), - ] - ) - ) - for vl_idat in vl_indices: - vl_records.append( - dict( - [ - ("experiment", experiment["name"]), - ("dataset", dataset["name"]), - ("id", vl_idat), - ] - ) - ) - - tr_df = pd.DataFrame(tr_records, columns=columns) - vl_df = pd.DataFrame(vl_records, columns=columns) - - tr_df.set_index(columns, inplace=True) - vl_df.set_index(columns, inplace=True) - - tr_index = tr_df.index - vl_index = vl_df.index - tr_vl_dict = { - "trdata": pd.DataFrame(tr_central_values, index=tr_index, columns=["data"]), - "vldata": pd.DataFrame(vl_central_values, index=vl_index, columns=["data"]), - } - yield tr_vl_dict + res = [] + for pseudodata, mask, rep in zip(_recreate_pdf_pseudodata, pdf_tr_masks, pdfreplicas): + df = pseudodata + df.columns = [f"replica {rep}"] + tr_idx = df.loc[mask.values].index + val_idx = 
df.loc[~mask.values].index + res.append(DataTrValSpec(pseudodata, tr_idx, val_idx)) + return res \ No newline at end of file diff --git a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle b/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle deleted file mode 100644 index e854305bfb..0000000000 Binary files a/validphys2/src/validphys/tests/regressions/test_exp_infos.pickle and /dev/null differ diff --git a/validphys2/src/validphys/tests/test_pseudodata.py b/validphys2/src/validphys/tests/test_pseudodata.py index 6643cf97dc..604c3211a2 100644 --- a/validphys2/src/validphys/tests/test_pseudodata.py +++ b/validphys2/src/validphys/tests/test_pseudodata.py @@ -2,111 +2,83 @@ Test to ensure the validphys.pseudodata.get_pseudodata action correctly obtains the appropriate pseudodata for an n3fit fit. -To this end, a 10 replica fit has been uploaded named -`pseudodata_test_fit` obtained using 100 epochs, theoryID 162 and a -subset of DIS datasets. When this fit was performed, the `all_exp_infos` -was pickled and stored in `exp_infos.pickle` which is the benchmark -we use to ensure the action is working appropriately. +A fit has been generated called pseudodata_test_fit_n3fit +which has the pseudodata saved as training and validation splits. +This is used to benchmark the correctness of the pseudodata +recreation. """ -import pickle -from importlib.resources import read_binary - -import numpy as np +import pandas as pd import pytest from validphys.api import API -from validphys.pseudodata import training_validation_pseudodata -import validphys.tests.regressions +from validphys.tests.conftest import FIT -from reportengine.checks import CheckError -from reportengine.compat import yaml -from reportengine.resourcebuilder import ResourceError +PSEUDODATA_FIT = "pseudodata_test_fit_n3fit" -EXAMPLE_RUNCARD = """fit: pseudodata_test_fit -pdf: pseudodata_test_fit -experiments: - from_: fit +def test_read_fit_pseudodata(): + fit_pseudodata = API.read_fit_pseudodata(fit=PSEUDODATA_FIT) -t0pdfset: - from_: datacuts + nrep = API.num_fitted_replicas(fit=PSEUDODATA_FIT) + assert nrep == len(fit_pseudodata) -datacuts: - from_: fit + for data, tr_idx, val_idx in fit_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) -use_cuts: fromfit -""" +def test_read_pdf_pseudodata(): + pdf_pseudodata = API.read_pdf_pseudodata(fit=PSEUDODATA_FIT) -@pytest.fixture( - scope="session", - params=[1, pytest.param(None, marks=pytest.mark.linux)], -) -def setup_dicts(request): - n_process_config = dict(NPROC=request.param) - exp_infos_bytes = read_binary(validphys.tests.regressions, "test_exp_infos.pickle") - ns = yaml.safe_load(EXAMPLE_RUNCARD) - # This is what all the fitted replicas saw - exp_infos = pickle.loads(exp_infos_bytes) + pdf = API.pdf(pdf=PSEUDODATA_FIT) + # -1 because we ignore replica 0 + assert len(pdf) - 1 == len(pdf_pseudodata) - # We now need to convert these to postfit replicas - fitted_indices = API.fitted_replica_indexes(**ns) - fit_postfit_mapping = dict(enumerate(exp_infos, 1)) - exp_infos = [fit_postfit_mapping[i] for i in fitted_indices] + for data, tr_idx, val_idx in pdf_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) - pseudodata_info = API.get_pseudodata(**ns, **n_process_config) - return exp_infos, pseudodata_info +def test_recreate_fit_pseudodata(): + fit_pseudodata = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT) + nrep = 
API.num_fitted_replicas(fit=PSEUDODATA_FIT) + assert nrep == len(fit_pseudodata) -def test_read_fit_pseudodata(): - data_indices_list = API.read_fit_pseudodata( - fit="dummy_pseudodata_read_test_fit", - use_cuts="fromfit" - ) - - # Only bother checking the first ten replicas - for data_indices in data_indices_list[:10]: - data, tr_idx, val_idx = data_indices - # Check the training and validation index are disjoint - assert set(tr_idx).isdisjoint(set(val_idx)) - - - with pytest.raises(FileNotFoundError): - # Check a FileNotFoundError is raised - # if the input fit wasn't generated - # with the savepseudodata flag set to true - bad_gen = API.read_fit_pseudodata( - fit="dummy_pseudodata_read_failure_test_fit", use_cuts="fromfit" - ) - next(bad_gen) - - with pytest.raises(ResourceError) as e_info: - # Check the enforcement of use_cuts being set - # to fromfit is in place - API.read_fit_pseudodata( - fit="dummy_pseudodata_read_test_fit", - use_cuts="nocuts" + for data, tr_idx, val_idx in fit_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) + + +def test_recreate_pdf_pseudodata(): + pdf_pseudodata = API.recreate_pdf_pseudodata(fit=PSEUDODATA_FIT) + + pdf = API.pdf(pdf=PSEUDODATA_FIT) + # -1 because we ignore replica 0 + assert len(pdf) - 1 == len(pdf_pseudodata) + + for data, tr_idx, val_idx in pdf_pseudodata: + assert set(tr_idx).isdisjoint(set(val_idx)) + assert set(tr_idx).union(val_idx) == set(data.index) + + +def test_no_savepseudodata(): + for func in (API.read_fit_pseudodata, API.read_pdf_pseudodata): + with pytest.raises(FileNotFoundError): + # Check a FileNotFoundError is raised + # if the input fit wasn't generated + # with the savepseudodata flag set to true + func(fit=FIT) + + +def test_read_matches_recreate(): + reads = API.read_fit_pseudodata(fit=PSEUDODATA_FIT) + recreates = API.recreate_fit_pseudodata(fit=PSEUDODATA_FIT) + for read, recreate in zip(reads, recreates): + # We ignore the absolute ordering of the dataframes and just check + # that they contain identical elements. + pd.testing.assert_frame_equal( + read.pseudodata, recreate.pseudodata, check_like=True ) - assert isinstance(e_info.__cause__, CheckError) - - -def test_pseudodata(setup_dicts): - exp_infos, pseudodata_info = setup_dicts - # Loop over replicas - for i, j in zip(exp_infos, pseudodata_info): - # For each replica, loop over experiments - for exp1, exp2 in zip(i, j): - assert np.allclose(exp1["expdata"], exp2["expdata"]) - assert np.allclose(exp1["expdata_vl"], exp2["expdata_vl"]) - - -def test_pseudodata_generator(setup_dicts): - exp_infos, pseudodata_info = setup_dicts - gen = training_validation_pseudodata(pseudodata_info) - for i, j in enumerate(gen): - continue - # There is only one postfit replica in this fit - assert i == 0 - # The training and validation split should be disjoint - assert set(j["trdata"].index).isdisjoint(j["vldata"].index) + pd.testing.assert_index_equal(read.tr_idx, recreate.tr_idx, check_order=False) + pd.testing.assert_index_equal(read.val_idx, recreate.val_idx, check_order=False)
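
For reference, a minimal usage sketch of the actions introduced in this patch (not part of the diff itself). It assumes the fit ``pseudodata_test_fit_n3fit`` used by the tests above is available locally and was generated with the ``savepseudodata`` flag set to true; any fit produced that way should work equally.

.. code-block:: python

    # Sketch: read back the pseudodata saved to disk during the fit and
    # cross-check it against the seed-based recreation, mirroring what
    # test_read_matches_recreate does above.
    from validphys.api import API

    FIT = "pseudodata_test_fit_n3fit"  # assumed to be installed locally

    # Both actions return one DataTrValSpec namedtuple per fit replica,
    # with fields (pseudodata, tr_idx, val_idx).
    saved = API.read_fit_pseudodata(fit=FIT)
    recreated = API.recreate_fit_pseudodata(fit=FIT)

    for rep_saved, rep_recreated in zip(saved, recreated):
        # The training and validation indices partition the data points.
        assert set(rep_saved.tr_idx).isdisjoint(rep_saved.val_idx)
        # Select the training split of the saved pseudodata.
        print(rep_saved.pseudodata.loc[rep_saved.tr_idx].head())

The ``read_*`` actions require the training/validation files written out when ``savepseudodata`` is true, whereas the ``recreate_*`` actions regenerate the pseudodata from the ``mcseed`` and ``trvlseed`` stored in the fit runcard; the ``pdf`` variants additionally account for the postfit reshuffling of replicas.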