Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving reading and recreating fit pseudodata #1328

Merged
merged 20 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SimilarCuts,
ThCovMatSpec,
)
from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
from validphys.loader import (
Loader,
LoaderError,
Expand Down Expand Up @@ -237,6 +238,52 @@ def parse_fit(self, fit: str):
except LoadFailedError as e:
raise ConfigError(str(e), fit, self.loader.available_fits)

def produce_fitreplicas(self, fit):
    """Expand ``fit`` into a namespace list with one ``replica`` per
    Monte Carlo fit replica.

    The replica numbers run from 1 up to and including
    ``num_fitted_replicas(fit)``; each one is exposed under the
    ``replica`` namespace key.
    """
    last_replica = num_fitted_replicas(fit)
    replica_numbers = range(1, last_replica + 1)
    return NSList(replica_numbers, nskey='replica')

def produce_pdfreplicas(self, fitpdf):
    """Expand the fit PDF into a namespace list with one ``replica`` per
    postfit replica.

    The PDF is read from ``fitpdf['pdf']`` and the replica numbers are
    whatever ``fitted_replica_indexes`` returns for it; each one is
    exposed under the ``replica`` namespace key.
    """
    return NSList(fitted_replica_indexes(fitpdf['pdf']), nskey='replica')

def produce_fitenvironment(self, fit, fitinputcontext):
    """Build a namespace reproducing the environment of ``fit``.

    Like fitcontext, but additionally forces other parameters, namely
    the cuts policy, the Monte Carlo seeding and the ``genrep`` flag, to
    be the same as in the fit.

    Parameters
    ----------
    fit
        The fit whose runcard (``fit.as_input()``) supplies ``trvlseed``,
        ``mcseed`` and ``genrep``.
    fitinputcontext
        Mapping supplying ``theoryid`` and ``data_input``.

    Returns
    -------
    dict
        Mapping with the keys ``dataset_inputs``, ``theoryid``,
        ``use_cuts`` (always ``CutsPolicy.FROMFIT``), ``mcseed``,
        ``trvlseed`` and ``genrep``.

    Notes
    -----
    - This production rule is designed to be used as a namespace
      to collect over, for use with
      :py:func:`validphys.pseudodata.recreate_fit_pseudodata` and
      can be added to freely, e.g. by setting trvlseed to be from
      the fit runcard.
    """
    log.warning(f"Using mcseed and trvlseed from fit: {fit}")

    # Look-ups are kept in the same order as before so that a missing key
    # surfaces as the same first KeyError.
    theory, data = fitinputcontext['theoryid'], fitinputcontext['data_input']
    fit_runcard = fit.as_input()
    from_runcard = {
        key: fit_runcard[key] for key in ('trvlseed', 'mcseed', 'genrep')
    }

    return {
        "dataset_inputs": data,
        "theoryid": theory,
        "use_cuts": CutsPolicy.FROMFIT,
        "mcseed": from_runcard['mcseed'],
        "trvlseed": from_runcard['trvlseed'],
        "genrep": from_runcard['genrep'],
    }

def produce_fitcontext(self, fitinputcontext, fitpdf):
"""Set PDF, theory ID and data input from the fit config"""

Expand Down
94 changes: 62 additions & 32 deletions validphys2/src/validphys/n3fit_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,29 +405,77 @@ def validation_pseudodata(pseudodata_table, training_mask):
# ``exps_tr_masks`` gathered over the ``replicas`` namespace list.
replicas_exps_tr_masks = collect("exps_tr_masks", ("replicas",))


def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
@table
def replica_training_mask_table(exps_tr_masks, replica, experiments_index):
    """Tabulate the training/validation mask of a single replica.

    The boolean mask used to split data into training and validation for
    the given replica is returned as a one-column pandas DataFrame, indexed
    by :py:func:`validphys.results.experiments_index`. Can be used to
    reconstruct the training and validation data used in a fit.

    Parameters
    ----------
    exps_tr_masks: list[list[np.array]]
        Result of :py:func:`tr_masks` collected over experiments, which
        creates the nested structure. The outer list is
        len(group_dataset_inputs_by_experiment) and the inner-most list has
        an array for each dataset in that particular experiment - as defined
        by the metadata. The arrays should be 1-D boolean arrays which can
        be used as masks.
    replica: int
        The index of the replica.
    experiments_index: pd.MultiIndex
        Index returned by :py:func:`validphys.results.experiments_index`.

    Returns
    -------
    pd.DataFrame
        A single column named ``replica <replica>`` holding the
        concatenated dataset masks, indexed by ``experiments_index``.

    Example
    -------
    >>> from validphys.api import API
    >>> ds_inp = [
    ...     {'dataset': 'NMC', 'frac': 0.75},
    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD'], 'frac': 0.75},
    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10, 'frac': 0.75}
    ... ]
    >>> API.replica_training_mask_table(dataset_inputs=ds_inp, replica=1, trvlseed=123, theoryid=162, use_cuts="nocuts", mcseed=None, genrep=False)
                         replica 1
    group dataset    id
    NMC   NMC        0        True
                     1        True
                     2       False
                     3        True
                     4        True
    ...                        ...
    CMS   CMSZDIFF12 45       True
                     46       True
                     47       True
                     48      False
                     49       True

    [345 rows x 1 columns]
    """
    # Flatten experiment -> dataset nesting into one list of dataset masks,
    # preserving order, then join them into a single 1-D array.
    dataset_masks = []
    for masks_in_experiment in exps_tr_masks:
        dataset_masks.extend(masks_in_experiment)
    flat_mask = np.concatenate(dataset_masks)

    column_label = f"replica {replica}"
    return pd.DataFrame(
        data=flat_mask,
        index=experiments_index,
        columns=[column_label],
    )

# Per-replica tables from ``replica_training_mask_table`` gathered over the
# ``replicas`` namespace list.
replicas_training_mask_table = collect("replica_training_mask_table", ("replicas",))
@table
def training_mask_table(replicas_training_mask_table):
siranipour marked this conversation as resolved.
Show resolved Hide resolved
"""Save the boolean mask used to split data into training and validation
for each replica as a pandas DataFrame, indexed by
:py:func:`validphys.results.experiments_index`. Can be used to reconstruct
the training and validation data used in a fit.

Parameters
----------
replicas_training_mask_table: list[pd.DataFrame]
Result of :py:func:`replica_training_mask_table` collected over replicas

Example
-------
>>> from validphys.api import API
Expand Down Expand Up @@ -457,26 +505,8 @@ def training_mask(replicas_exps_tr_masks, replicas, experiments_index):
[345 rows x 3 columns]

"""
rep_dfs = []
for rep_exps_masks, rep in zip(replicas_exps_tr_masks, replicas):
# create flat list with all dataset masks in, then concatenate to single
# array.
all_masks = np.concatenate([
ds_mask
for exp_masks in rep_exps_masks
for ds_mask in exp_masks
])
rep_dfs.append(pd.DataFrame(
all_masks,
columns=[f"replica {rep}"],
index=experiments_index
))
return pd.concat(rep_dfs, axis=1)

return pd.concat(replicas_training_mask_table, axis=1)

@table
def training_mask_table(training_mask):
    """Return ``training_mask`` unchanged, wrapped by the ``@table``
    decorator.

    NOTE(review): in this diff view this looks like the pre-change
    definition that shares its name with the newer
    ``training_mask_table`` - confirm which one is current.
    """
    return training_mask

def fitting_pos_dict(posdataset):
"""Loads a positivity dataset. For more information see
Expand Down
Loading