Commit e5af9b31 authored by Dennis Noll's avatar Dennis Noll
Browse files

[tasks] sync: adds single event comparison

parent 0e7b91c4
# coding: utf-8
from collections import defaultdict
from functools import cached_property
from utils.util import DotDict
import itertools
import numpy as np
......@@ -13,7 +13,10 @@ import luigi
import boost_histogram as bh
import hist
from tqdm import tqdm
from rich.console import Console
from rich.table import Table
from utils.util import NumpyEncoder
from tasks.base import BaseTask, ConfigTask
from tasks.mixins import PGroupMixin, RecipeMixin
from tasks.coffea import CoffeaProcessor
......@@ -79,11 +82,7 @@ class SyncSelection(PlotHistsBase, ConfigTask):
req = {}
if self.own:
req.update(
{
"own": CoffeaProcessor.req(
self, processor="SyncSelectionExporter", explorative=True
)
}
{"own": CoffeaProcessor.req(self, processor="SyncSelectionExporter", debug=True)}
)
req.update({name: SyncFile.req(self, filepath=file) for name, file in self._files.items()})
return req
......@@ -122,50 +121,95 @@ class SyncSelection(PlotHistsBase, ConfigTask):
def output(self):
return {
"plots": self.local_target("plots"),
"statistical_tests": self.local_target("statistical_tests.json"),
"metrics": self.local_target("metrics.json"),
}
lut = {"lep0_energy": "lep_pt"}
def run(self):
statistical_tests = {}
for mask_key in tqdm(self.mask_keys, desc="categories"):
for feature_key in self.feature_keys:
metrics = {}
for mask_key in tqdm(self.mask_keys, desc="categories", position=0):
metrics[mask_key] = defaultdict(dict)
for feature_key in tqdm(self.feature_keys, desc="features", position=1, leave=None):
# extract name + binning from variables
var_name = self.lut.get(feature_key, feature_key)
try:
var = self.config_inst.get_variable(var_name)
var_name = var.name
var = self.config_inst.get_variable(feature_key)
binning = var.binning
except ValueError:
# default binning
binning = (100, -400, +400)
compare = hist.Hist(
hist.axis.StrCategory(list(self.data.keys()), name="group"),
# currently bug in boost-histogram: https://github.com/scikit-hep/boost-histogram/issues/621
# optimally would use one histogram - as patch using two until fixed
compare0 = hist.Hist(
hist.axis.StrCategory(list(self.data.keys())[:1], name="group"),
hist.axis.Regular(*binning, name="variable"),
storage=hist.storage.Weight(),
)
compare1 = hist.Hist(
hist.axis.StrCategory(list(self.data.keys())[1:], name="group"),
hist.axis.Regular(*binning, name="variable"),
storage=hist.storage.Weight(),
)
comparison, comparison_eventnr = None, None
setdiffs, cleareddiffs, eventdiff = [], [], []
for key, dat in self.data.items():
mask = dat[mask_key].astype(bool)
values = dat[feature_key][mask]
compare.fill(group=key, variable=values)
eventnr = dat["eventnr_eventnr"][mask]
if comparison is None:
comparison = values
comparison_eventnr = eventnr
compare0.fill(group=key, variable=values)
else:
compare1.fill(group=key, variable=values)
diff = len(np.setdiff1d(values, comparison))
rel_setdiff = diff / len(comparison) if len(comparison) != 0 else 1e5
setdiffs.append(rel_setdiff)
_, m, comp_m = np.intersect1d(eventnr, comparison_eventnr, return_indices=True)
cleareddiff = values[m] - comparison[comp_m]
rel_cleareddiff = (
np.sum(cleareddiff != 0) / len(cleareddiff)
if len(cleareddiff) != 0
else 1e5
)
cleareddiffs.append(rel_cleareddiff)
metrics[mask_key][feature_key]["setdiff"] = setdiffs
metrics[mask_key][feature_key]["cleareddiff"] = cleareddiffs
# plotting of histograms
with self.plot_silencer():
# ratio plot still wrong due to https://github.com/scikit-hep/boost-histogram/issues/621
self.plot(
targets=[self.local_target(f"plots/{mask_key}/{feature_key}.pdf")],
stack=compare[:1, :],
lines=compare[1:, :],
stack=compare0,
lines=compare1,
ratio="lines",
)
# statistical tests
hnew = histutils.kstest(compare[:1, :], compare[1:, :], 0, -1)
statistical_tests[f"{mask_key}_{feature_key}"] = hnew.view()[0]
self.output()["statistical_tests"].dump(statistical_tests)
# class SyncSelectionWrapper(SyncTask):
# def requires(self):
# return {
# "upload": SyncSelectionUpload.req(self),
# "plots": SyncSelectionPlots.req(self),
# }
# statistical tests on histograms
hnew = histutils.kstest(compare0, compare1, 0, -1)
metrics[mask_key][feature_key]["kstest"] = hnew.view()
break
# print summary metrics to console
table = Table(title="Maximum discrepancies")
table.add_column("Category", justify="right")
table.add_column("max setdiff")
table.add_column("max cleareddiff")
table.add_column("max kstest")
for k, v in metrics.items():
max_var_setdiff = max(v, key=lambda k: np.max(v[k]["setdiff"]))
max_var_cleareddiff = max(v, key=lambda k: np.max(v[k]["cleareddiff"]))
max_var_kstest = max(v, key=lambda k: np.max(v[k]["kstest"]))
table.add_row(
k,
f"{max_var_setdiff}: {np.max(v[max_var_setdiff]['setdiff'])}",
f"{max_var_cleareddiff}: {np.max(v[max_var_cleareddiff]['cleareddiff'])}",
f"{max_var_kstest}: {np.max(v[max_var_kstest]['kstest'])}",
)
console = Console()
console.print(table)
self.output()["metrics"].dump(metrics, cls=NumpyEncoder)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment