Skip to content

Commit

Permalink
Merge pull request #1328 from NNPDF/replicate_pseudodata
Browse files Browse the repository at this point in the history
Improving reading and recreating fit pseudodata
  • Loading branch information
siranipour committed Sep 15, 2021
2 parents 6744f99 + 3114c38 commit b7e1e14
Show file tree
Hide file tree
Showing 5 changed files with 268 additions and 402 deletions.
47 changes: 47 additions & 0 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SimilarCuts,
ThCovMatSpec,
)
from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
from validphys.loader import (
Loader,
LoaderError,
Expand Down Expand Up @@ -237,6 +238,52 @@ def parse_fit(self, fit: str):
except LoadFailedError as e:
raise ConfigError(str(e), fit, self.loader.available_fits)

def produce_fitreplicas(self, fit):
    """Production rule that expands the ``replica`` key over every Monte
    Carlo replica of ``fit``.

    The returned ``NSList`` is keyed by ``replica`` and its entries run from
    1 up to and including ``num_fitted_replicas(fit)``.
    """
    last_replica = num_fitted_replicas(fit)
    replica_numbers = range(1, last_replica + 1)
    return NSList(replica_numbers, nskey='replica')

def produce_pdfreplicas(self, fitpdf):
    """Production rule that expands the ``replica`` key over each postfit
    replica.

    The replica indexes are obtained by passing ``fitpdf['pdf']`` to
    ``fitted_replica_indexes`` and are wrapped in an ``NSList`` keyed by
    ``replica``.
    """
    postfit_indexes = fitted_replica_indexes(fitpdf['pdf'])
    return NSList(postfit_indexes, nskey='replica')

def produce_fitenvironment(self, fit, fitinputcontext):
    """Like fitcontext, but additionally forces further parameters, such as
    the cuts policy and the Monte Carlo seeding, to match those of the fit.

    The theory ID and data input are read from ``fitinputcontext``, while
    ``trvlseed``, ``mcseed`` and ``genrep`` are read from the fit runcard
    (``fit.as_input()``). Cuts are always set to ``CutsPolicy.FROMFIT``.

    Notes
    -----
        - This production rule is designed to be used as a namespace
          to collect over, for use with
          :py:func:`validphys.pseudodata.recreate_fit_pseudodata` and
          can be added to freely, e.g. by setting trvlseed to be from
          the fit runcard.
    """
    log.warning(f"Using mcseed and trvlseed from fit: {fit}")

    # Context lookups come first, then the runcard keys in the order
    # trvlseed, mcseed, genrep, so a missing key fails at the same point.
    theory = fitinputcontext['theoryid']
    datasets = fitinputcontext['data_input']

    fit_runcard = fit.as_input()
    from_runcard = {
        key: fit_runcard[key] for key in ('trvlseed', 'mcseed', 'genrep')
    }

    return {
        "dataset_inputs": datasets,
        "theoryid": theory,
        "use_cuts": CutsPolicy.FROMFIT,
        "mcseed": from_runcard['mcseed'],
        "trvlseed": from_runcard['trvlseed'],
        "genrep": from_runcard['genrep'],
    }

def produce_fitcontext(self, fitinputcontext, fitpdf):
"""Set PDF, theory ID and data input from the fit config"""

Expand Down
94 changes: 62 additions & 32 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,29 +405,77 @@ def validation_pseudodata(pseudodata_table, training_mask):
# The ``exps_tr_masks`` results gathered over the ``replicas`` namespace list.
replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))


def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
@table
def replica_training_mask_table(exps_tr_masks, replica, experiments_index):
    """Save the boolean mask used to split data into training and validation
    for a given replica as a pandas DataFrame, indexed by
    :py:func:`validphys.results.experiments_index`. Can be used to reconstruct
    the training and validation data used in a fit.

    Parameters
    ----------
    exps_tr_masks: list[list[np.array]]
        Result of :py:func:`tr_masks` collected over experiments, which creates
        the nested structure. The outer list is
        len(group_dataset_inputs_by_experiment) and the inner-most list has an
        array for each dataset in that particular experiment - as defined by the
        metadata. The arrays should be 1-D boolean arrays which can be used as
        masks.
    replica: int
        The index of the replica.
    experiments_index: pd.MultiIndex
        Index returned by :py:func:`validphys.results.experiments_index`.

    Returns
    -------
    pd.DataFrame
        A single-column frame named ``"replica {replica}"`` holding the
        concatenated masks, indexed by ``experiments_index``.

    Example
    -------
    >>> from validphys.api import API
    >>> ds_inp = [
    ...     {'dataset': 'NMC', 'frac': 0.75},
    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75},
    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75}
    ... ]
    >>> API.replica_training_mask_table(dataset_inputs=ds_inp, replica=1, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False)
                         replica 1
    group dataset    id
    NMC   NMC        0        True
                     1        True
                     2       False
                     3        True
                     4        True
    ...                        ...
    CMS   CMSZDIFF12 45       True
                     46       True
                     47       True
                     48      False
                     49       True
    [345 rows x 1 columns]
    """
    # Flatten the experiment -> dataset nesting into one list of dataset
    # masks, then join them into a single 1-D array.
    all_masks = np.concatenate([
        ds_mask
        for exp_masks in exps_tr_masks
        for ds_mask in exp_masks
    ])
    return pd.DataFrame(
        all_masks,
        columns=[f"replica {replica}"],
        index=experiments_index
    )

# The per-replica ``replica_training_mask_table`` results gathered over the
# ``replicas`` namespace list; consumed by ``training_mask_table``.
replicas_training_mask_table = collect("replica_training_mask_table", ("replicas",))
@table
def training_mask_table(replicas_training_mask_table):
"""Save the boolean mask used to split data into training and validation
for each replica as a pandas DataFrame, indexed by
:py:func:`validphys.results.experiments_index`. Can be used to reconstruct
the training and validation data used in a fit.
Parameters
----------
replicas_training_mask_table: list[pd.DataFrame]
Result of :py:func:`replica_training_mask_table` collected over replicas
Example
-------
>>> from validphys.api import API
Expand Down Expand Up @@ -457,26 +505,8 @@ def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
[345 rows x 3 columns]
"""
rep_dfs = []
for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas):
# create flat list with all dataset masks in, then concatenate to single
# array.
all_masks = np.concatenate([
ds_mask
for exp_masks in rep_exps_masks
for ds_mask in exp_masks
])
rep_dfs.append(pd.DataFrame(
all_masks,
columns=[f"replica {rep}"],
index=experiments_index
))
return pd.concat(rep_dfs, axis=1)

return pd.concat(replicas_training_mask_table, axis=1)

@table
def training_mask_table(training_mask):
return training_mask

def fitting_pos_dict(posdataset):
"""Loads a positivity dataset. For more information see
Expand Down
Loading

0 comments on commit b7e1e14

Please sign in to comment.