Fix dataset/prediction ordering problem in theory covmat creation #1899

Merged · 14 commits · Dec 21, 2023
2 changes: 1 addition & 1 deletion validphys2/src/validphys/app.py
@@ -39,7 +39,7 @@
"validphys.paramfits.plots",
"validphys.theorycovariance.construction",
"validphys.theorycovariance.output",
- "validphys.theorycovariance.tests",
+ # "validphys.theorycovariance.tests",
"validphys.replica_selector",
"validphys.closuretest",
"validphys.mc_gen",
13 changes: 11 additions & 2 deletions validphys2/src/validphys/results.py
@@ -75,6 +75,7 @@ def __init__(self, dataset, covmat, sqrtcovmat):
         stats = Stats(self._central_value)
         self._covmat = covmat
         self._sqrtcovmat = sqrtcovmat
+        self._name = dataset.name
         super().__init__(stats)

@property
@@ -98,15 +99,19 @@ def sqrtcovmat(self):
"""Lower part of the Cholesky decomposition"""
return self._sqrtcovmat

+    @property
+    def name(self):
+        return self._name

class ThPredictionsResult(StatsResult):
"""Class holding theory prediction, inherits from StatsResult
When created with `from_convolution`, it keeps track of the PDF for which it was computed
"""

-    def __init__(self, dataobj, stats_class, label=None, pdf=None, theoryid=None):
+    def __init__(self, dataobj, stats_class, datasetnames=None, label=None, pdf=None, theoryid=None):
         self.stats_class = stats_class
         self.label = label
+        self._datasetnames = datasetnames
statsobj = stats_class(dataobj.T)
self._pdf = pdf
self._theoryid = theoryid
@@ -149,8 +154,12 @@ def from_convolution(cls, pdf, dataset, central_only=False):

         label = cls.make_label(pdf, dataset)
         thid = dataset.thspec.id
-        return cls(th_predictions, pdf.stats_class, label, pdf=pdf, theoryid=thid)
+        datasetnames = [i.name for i in datasets]
+        return cls(th_predictions, pdf.stats_class, datasetnames, label, pdf=pdf, theoryid=thid)

+    @property
+    def datasetnames(self):
+        return self._datasetnames


class ThUncertaintiesResult(StatsResult):
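For context on the additions above: carrying the dataset name on each result lets downstream covmat code pair data with predictions by name instead of by position. A minimal illustrative sketch with hypothetical stand-in classes and made-up dataset names (not the validphys implementation):

class ToyDataResult:
    """Hypothetical stand-in for DataResult carrying the new name attribute."""
    def __init__(self, name, central_value):
        self.name = name
        self.central_value = central_value

class ToyThPredictionsResult:
    """Hypothetical stand-in for ThPredictionsResult with datasetnames."""
    def __init__(self, datasetnames, central_value):
        self.datasetnames = datasetnames
        self.central_value = central_value

data = [ToyDataResult("ATLAS_DY", [1.0, 2.0]), ToyDataResult("CMS_JETS", [3.0])]
preds = ToyThPredictionsResult(["CMS_JETS", "ATLAS_DY"], [2.9, 1.1, 2.1])

# With names attached to both sides, a mismatch between the two orderings
# can be detected instead of silently pairing the wrong rows.
assert set(preds.datasetnames) == {d.name for d in data}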
225 changes: 104 additions & 121 deletions validphys2/src/validphys/theorycovariance/construction.py
@@ -63,9 +63,7 @@ def theory_covmat_singleprocess(theory_covmat_singleprocess_no_table, fivetheories):


results_central_bytheoryids = collect(results_central, ("theoryids",))
-each_dataset_results_central_bytheory = collect(
-    "results_central_bytheoryids", ("group_dataset_inputs_by_process", "data")
-)
+each_dataset_results_central_bytheory = collect("results_central_bytheoryids", ("data",))

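This collect change is the heart of the fix: datasets are now collected in runcard order instead of being pre-grouped by process. A toy sketch of the two orderings, with made-up dataset names and process assignments:

from collections import defaultdict

runcard_order = ["ATLAS_DY", "CMS_JETS", "LHCB_DY"]
process_of = {"ATLAS_DY": "DY", "CMS_JETS": "JET", "LHCB_DY": "DY"}

# Old behaviour: collecting over ("group_dataset_inputs_by_process", "data")
# yielded the datasets grouped by process, not in runcard order.
by_process = defaultdict(list)
for name in runcard_order:
    by_process[process_of[name]].append(name)
grouped_order = [n for names in by_process.values() for n in names]

print(grouped_order)  # ['ATLAS_DY', 'LHCB_DY', 'CMS_JETS'] -- process order
print(runcard_order)  # ['ATLAS_DY', 'CMS_JETS', 'LHCB_DY'] -- runcard order
# Collecting over ("data",) alone keeps the runcard order; the process
# grouping is now done explicitly inside combine_by_type.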

@check_using_theory_covmat
@@ -100,48 +98,48 @@ def theory_covmat_dataset(
return thcovmat


-@check_correct_theory_combination
-def theory_covmat_datasets(each_dataset_results_central_bytheory, fivetheories):
-    """Produces an array of theory covariance matrices. Each matrix corresponds
-    to a different dataset, which must be specified in the runcard."""
-    dataset_covmats = []
-    for dataset in each_dataset_results_central_bytheory:
-        theory_centrals = [x[1].central_value for x in dataset]
-        s = make_scale_var_covmat(theory_centrals)
-        dataset_covmats.append(s)
-    return dataset_covmats


-@check_correct_theory_combination
-def total_covmat_datasets(each_dataset_results_central_bytheory, fivetheories):
-    """Produces an array of total covariance matrices; the sum of experimental
-    and scale-varied theory covariance matrices. Each matrix corresponds
-    to a different dataset, which must be specified in the runcard.
-    These are needed for calculation of chi2 per dataset."""
-    dataset_covmats = []
-    for dataset in each_dataset_results_central_bytheory:
-        theory_centrals = [x[1].central_value for x in dataset]
-        s = make_scale_var_covmat(theory_centrals)
-        sigma = dataset[0][0].covmat
-        cov = s + sigma
-        dataset_covmats.append(cov)
-    return dataset_covmats


-@check_correct_theory_combination
-def total_covmat_diagtheory_datasets(each_dataset_results_central_bytheory, fivetheories):
-    """Same as total_covmat_theory_datasets but for diagonal theory only"""
-    dataset_covmats = []
-    for dataset in each_dataset_results_central_bytheory:
-        theory_centrals = [x[1].central_value for x in dataset]
-        s = make_scale_var_covmat(theory_centrals)
-        # Initialise array of zeros and set precision to same as FK tables
-        s_diag = np.zeros((len(s), len(s)), dtype=np.float32)
-        np.fill_diagonal(s_diag, np.diag(s))
-        sigma = dataset[0][0].covmat
-        cov = s_diag + sigma
-        dataset_covmats.append(cov)
-    return dataset_covmats
+# @check_correct_theory_combination
+# def theory_covmat_datasets(each_dataset_results_central_bytheory, fivetheories):
+#     """Produces an array of theory covariance matrices. Each matrix corresponds
+#     to a different dataset, which must be specified in the runcard."""
+#     dataset_covmats = []
+#     for dataset in each_dataset_results_central_bytheory:
+#         theory_centrals = [x[1].central_value for x in dataset]
+#         s = make_scale_var_covmat(theory_centrals)
+#         dataset_covmats.append(s)
+#     return dataset_covmats


+# @check_correct_theory_combination
+# def total_covmat_datasets(each_dataset_results_central_bytheory, fivetheories):
+#     """Produces an array of total covariance matrices; the sum of experimental
+#     and scale-varied theory covariance matrices. Each matrix corresponds
+#     to a different dataset, which must be specified in the runcard.
+#     These are needed for calculation of chi2 per dataset."""
+#     dataset_covmats = []
+#     for dataset in each_dataset_results_central_bytheory:
+#         theory_centrals = [x[1].central_value for x in dataset]
+#         s = make_scale_var_covmat(theory_centrals)
+#         sigma = dataset[0][0].covmat
+#         cov = s + sigma
+#         dataset_covmats.append(cov)
+#     return dataset_covmats


+# @check_correct_theory_combination
+# def total_covmat_diagtheory_datasets(each_dataset_results_central_bytheory, fivetheories):
+#     """Same as total_covmat_theory_datasets but for diagonal theory only"""
+#     dataset_covmats = []
+#     for dataset in each_dataset_results_central_bytheory:
+#         theory_centrals = [x[1].central_value for x in dataset]
+#         s = make_scale_var_covmat(theory_centrals)
+#         # Initialise array of zeros and set precision to same as FK tables
+#         s_diag = np.zeros((len(s), len(s)), dtype=np.float32)
+#         np.fill_diagonal(s_diag, np.diag(s))
+#         sigma = dataset[0][0].covmat
+#         cov = s_diag + sigma
+#         dataset_covmats.append(cov)
+#     return dataset_covmats


@table
@@ -182,27 +180,31 @@ def total_covmat_procs(procs_results_theory, fivetheories):
return proc_result_covmats


-def dataset_names(data_input):
-    """Returns a list of the names of the datasets, in the same order as
-    they are inputted in the runcard"""
-    return [el.name for el in data_input]

-ProcessInfo = namedtuple("ProcessInfo", ("theory", "namelist", "sizes"))
+ProcessInfo = namedtuple("ProcessInfo", ("preds", "namelist", "sizes"))

-def combine_by_type(each_dataset_results_central_bytheory, dataset_names):
-    """Groups the datasets according to processes and returns three objects:
-    theories_by_process: the relevant theories grouped by process type
-    ordered_names: dictionary with keys of process type and values being the
-                   corresponding list of names of datasets, in the order they
-                   are appended to theories_by_process
-    dataset_size: dictionary with keys of dataset name and values being the
-                  number of points in that dataset"""
+def combine_by_type(each_dataset_results_central_bytheory):
+    """Groups the datasets according to processes and returns an instance of the ProcessInfo class
+
+    Parameters
+    ----------
+    each_dataset_results_central_bytheory: list(list((DataResult,ThPredictionsResult)))
+
+    Returns
+    -------
+    ProcessInfo
+        Class with info needed to construct the theory covmat.
+
+    Raises
+    ------
+    ValueError
+        If the order of the inputs is not the same
+    """
dataset_size = defaultdict(list)
theories_by_process = defaultdict(list)
ordered_names = defaultdict(list)
-    for dataset, name in zip(each_dataset_results_central_bytheory, dataset_names):
+    for dataset in each_dataset_results_central_bytheory:
+        name = dataset[0][0].name
theory_centrals = [x[1].central_value for x in dataset]
dataset_size[name] = len(theory_centrals[0])
proc_type = process_lookup(name)
@@ -211,44 +213,11 @@ def combine_by_type(each_dataset_results_central_bytheory, dataset_names):
for key, item in theories_by_process.items():
theories_by_process[key] = np.concatenate(item, axis=1)
process_info = ProcessInfo(
-        theory=theories_by_process, namelist=ordered_names, sizes=dataset_size
+        preds=theories_by_process, namelist=ordered_names, sizes=dataset_size
)
return process_info

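A self-contained sketch of the grouping combine_by_type performs, using made-up dataset names, process assignments, and array shapes; each block is (ntheories x ndata), and blocks of the same process are concatenated along the data axis:

import numpy as np
from collections import defaultdict

preds = {"ATLAS_DY": np.ones((3, 2)), "LHCB_DY": 2 * np.ones((3, 4)),
         "CMS_JETS": 3 * np.ones((3, 5))}
process_of = {"ATLAS_DY": "DY", "LHCB_DY": "DY", "CMS_JETS": "JET"}

theories_by_process = defaultdict(list)
ordered_names = defaultdict(list)
sizes = {}
for name, block in preds.items():
    sizes[name] = block.shape[1]                      # points in this dataset
    theories_by_process[process_of[name]].append(block)
    ordered_names[process_of[name]].append(name)      # order of appending
for proc, blocks in theories_by_process.items():
    theories_by_process[proc] = np.concatenate(blocks, axis=1)

assert theories_by_process["DY"].shape == (3, 6)      # 2 + 4 DY points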

-def process_starting_points(combine_by_type):
-    """Returns a dictionary of indices in the covariance matrix corresponding
-    to the starting point of each process."""
-    process_info = combine_by_type
-    running_index = 0
-    start_proc = defaultdict(list)
-    for name in process_info.theory:
-        size = len(process_info.theory[name][0])
-        start_proc[name] = running_index
-        running_index += size
-    return start_proc


-def covmap(combine_by_type, dataset_names):
-    """Creates a map between the covmat indices from matrices ordered by
-    process to matrices ordered by experiment as listed in the runcard"""
-    mapping = defaultdict(list)
-    start_exp = defaultdict(list)
-    process_info = combine_by_type
-    running_index = 0
-    for dataset in dataset_names:
-        size = process_info.sizes[dataset]
-        start_exp[dataset] = running_index
-        running_index += size
-    start = 0
-    names_by_proc_list = [item for sublist in process_info.namelist.values() for item in sublist]
-    for dataset in names_by_proc_list:
-        for i in range(process_info.sizes[dataset]):
-            mapping[start + i] = start_exp[dataset] + i
-        start += process_info.sizes[dataset]
-    return mapping

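For reference, a toy illustration (made-up sizes and names) of the index reshuffle the removed covmap performed: position i in the process-ordered covmat was mapped to position mapping[i] in the runcard-ordered covmat. The mapping silently assumed both orderings referred to the same datasets, which is the fragility this PR removes:

sizes = {"ATLAS_DY": 2, "CMS_JETS": 2, "LHCB_DY": 1}   # runcard order
names_by_proc = ["ATLAS_DY", "LHCB_DY", "CMS_JETS"]    # process order

start_exp, running = {}, 0
for name in sizes:                                     # offsets in runcard order
    start_exp[name] = running
    running += sizes[name]

mapping, start = {}, 0
for name in names_by_proc:                             # offsets in process order
    for i in range(sizes[name]):
        mapping[start + i] = start_exp[name] + i
    start += sizes[name]

print(mapping)  # {0: 0, 1: 1, 2: 4, 3: 2, 4: 3}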

def covmat_3fpt(name1, name2, deltas1, deltas2):
"""Returns theory covariance sub-matrix for 3pt factorisation
scale variation *only*, given two dataset names and collections
@@ -490,58 +459,71 @@ def compute_covs_pt_prescrip(


@check_correct_theory_combination
-def covs_pt_prescrip(
-    combine_by_type,
-    process_starting_points,
-    theoryids,
-    point_prescription,
-    fivetheories,
-    seventheories,
-):
+def covs_pt_prescrip(combine_by_type, theoryids, point_prescription, fivetheories, seventheories):
"""Produces the sub-matrices of the theory covariance matrix according
to a point prescription which matches the number of input theories.
If 5 theories are provided, a scheme 'bar' or 'nobar' must be
chosen in the runcard in order to specify the prescription. Sub-matrices
correspond to applying the scale variation prescription to each pair of
processes in turn, using a different procedure for the case where the
processes are the same relative to when they are different."""
-    l = len(theoryids)
-    start_proc = process_starting_points
     process_info = combine_by_type
+    running_index = 0
+    start_proc = defaultdict(list)
+    for name in process_info.preds:
+        size = len(process_info.preds[name][0])
+        start_proc[name] = running_index
+        running_index += size

     covmats = defaultdict(list)
-    for name1 in process_info.theory:
-        for name2 in process_info.theory:
-            central1, *others1 = process_info.theory[name1]
+    for name1 in process_info.preds:
+        for name2 in process_info.preds:
+            central1, *others1 = process_info.preds[name1]
             deltas1 = list((other - central1 for other in others1))
-            central2, *others2 = process_info.theory[name2]
+            central2, *others2 = process_info.preds[name2]
             deltas2 = list((other - central2 for other in others2))
             s = compute_covs_pt_prescrip(
-                point_prescription, l, name1, deltas1, name2, deltas2, fivetheories, seventheories
+                point_prescription, len(theoryids), name1, deltas1, name2, deltas2, fivetheories, seventheories
             )
             start_locs = (start_proc[name1], start_proc[name2])
             covmats[start_locs] = s
     return covmats


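A minimal sketch of the inlined bookkeeping: start_proc assigns each process an offset, and each process-pair sub-matrix is written into the full matrix at those offsets (toy sizes, with a constant stand-in for compute_covs_pt_prescrip):

import numpy as np

proc_sizes = {"DY": 6, "JET": 5}          # points per process, made up
start_proc, running_index = {}, 0
for name, size in proc_sizes.items():
    start_proc[name] = running_index
    running_index += size

covmats = {}
for name1, size1 in proc_sizes.items():
    for name2, size2 in proc_sizes.items():
        s = np.full((size1, size2), 0.1)  # stand-in sub-covmat
        covmats[(start_proc[name1], start_proc[name2])] = s

mat = np.zeros((running_index, running_index))
for (i, j), s in covmats.items():
    mat[i:i + s.shape[0], j:j + s.shape[1]] = s
assert mat.shape == (11, 11)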
@table
-def theory_covmat_custom(covs_pt_prescrip, covmap, procs_index):
+def theory_covmat_custom(covs_pt_prescrip, procs_index, combine_by_type):
"""Takes the individual sub-covmats between each two processes and assembles
them into a full covmat. Then reshuffles the order from ordering by process
to ordering by experiment as listed in the runcard"""
-    matlength = int(
-        sum([len(covmat) for covmat in covs_pt_prescrip.values()])
-        / int(np.sqrt(len(covs_pt_prescrip)))
-    )
+    process_info = combine_by_type

+    # the order is important for the construction of covmat_index below
+    if procs_index.names != ['group', 'dataset', 'id']:
+        raise ValueError

+    # construct covmat_index based on the order of experiments as they are in combine_by_type
+    indexlist = []
+    for procname in process_info.preds:
+        for expname in process_info.namelist[procname]:
+            for ind in procs_index:
+                # we need procs_index for the datapoint ids of the datapoints that survived the
+                # cuts, or do we just assume they're the same as for the exp covmat? Perhaps this
+                # additional layer is a bit pointless.
+                if ind[0] == procname and ind[1] == expname:
+                    data_id = ind[2]
+                    indexlist.append((procname, expname, data_id))
+    # Is this always the exact same as procs_index? This depends on how procs_index orders
+    # datasets within a process.
+    covmat_index = pd.MultiIndex.from_tuples(indexlist, names=procs_index.names)

     # Initialise arrays of zeros and set precision to same as FK tables
-    mat = np.zeros((matlength, matlength), dtype=np.float32)
-    cov_by_exp = np.zeros((matlength, matlength), dtype=np.float32)
+    total_datapoints = sum(combine_by_type.sizes.values())
+    mat = np.zeros((total_datapoints, total_datapoints), dtype=np.float32)
     for locs in covs_pt_prescrip:
         cov = covs_pt_prescrip[locs]
         mat[locs[0] : (len(cov) + locs[0]), locs[1] : (len(cov.T) + locs[1])] = cov
-    for i in range(matlength):
-        for j in range(matlength):
-            cov_by_exp[covmap[i]][covmap[j]] = mat[i][j]
-    df = pd.DataFrame(cov_by_exp, index=procs_index, columns=procs_index)
+    df = pd.DataFrame(mat, index=covmat_index, columns=covmat_index)
return df


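A self-contained sketch of the new index construction, with hypothetical names and sizes: the tuples follow the same process-grouped ordering used to fill the matrix, so labels and data cannot drift apart. Here the datapoint ids are simply 0..n-1 rather than the post-cuts ids taken from procs_index:

import numpy as np
import pandas as pd

namelist = {"DY": ["ATLAS_DY", "LHCB_DY"], "JET": ["CMS_JETS"]}
sizes = {"ATLAS_DY": 2, "LHCB_DY": 1, "CMS_JETS": 2}

# Tuples in process-grouped order, mirroring how the matrix was assembled.
indexlist = [(proc, exp, i)
             for proc, exps in namelist.items()
             for exp in exps
             for i in range(sizes[exp])]
covmat_index = pd.MultiIndex.from_tuples(indexlist, names=["group", "dataset", "id"])

n = sum(sizes.values())
df = pd.DataFrame(np.eye(n), index=covmat_index, columns=covmat_index)
print(df.loc["DY", "DY"])  # process-process sub-block selected by label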
@@ -763,7 +745,8 @@ def theory_normcovmat_custom(theory_covmat_custom, procs_data_values):
"""Calculates the theory covariance matrix for scale variations normalised
to data, with variations according to the relevant prescription."""
df = theory_covmat_custom
-    mat = df / np.outer(procs_data_values, procs_data_values)
+    vals = procs_data_values.reindex(df.index)
+    mat = df / np.outer(vals, vals)
return mat


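A minimal sketch of why the reindex matters: the covmat index is ordered by process while the data values may follow the runcard order, so the values must be aligned to the covmat index before the outer product (toy labels, plain pd.Index instead of the real MultiIndex):

import numpy as np
import pandas as pd

idx_covmat = pd.Index(["b", "a"])            # covmat ordering (by process)
df = pd.DataFrame(np.ones((2, 2)), index=idx_covmat, columns=idx_covmat)
data_values = pd.Series([1.0, 2.0], index=["a", "b"])  # runcard ordering

vals = data_values.reindex(df.index)         # aligned: [2.0, 1.0]
normed = df / np.outer(vals, vals)           # element-wise normalisation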