Commit 61a9dc59 authored by Peter Fackeldey
Browse files

[tasks/sync] add multiprocessing to sync selection task

parent 975adf8b
......@@ -2,23 +2,20 @@
from collections import defaultdict
from functools import cached_property
import itertools
import numpy as np
import scipy
import uproot
import law
from law.task.base import ExternalTask
import luigi
import boost_histogram as bh
import hist
from tqdm import tqdm
from rich.console import Console
from rich.table import Table
from utils.util import NumpyEncoder
from tasks.base import BaseTask, ConfigTask
from tasks.mixins import PGroupMixin, RecipeMixin
from tasks.base import ConfigTask, PoolMap
from tasks.mixins import RecipeMixin
from tasks.coffea import CoffeaProcessor
from tasks.plotting import PlotHistsBase
import utils.hist as histutils
......@@ -63,10 +60,12 @@ class SyncFile(ExternalTask):
return law.LocalFileTarget(self.filepath)
class SyncSelection(PlotHistsBase, ConfigTask):
class SyncSelection(PlotHistsBase, ConfigTask, PoolMap):
files = law.CSVParameter(default=[])
own = luigi.BoolParameter()
n_parallel_max = 32
@property
def _files(self):
out = {}
......@@ -130,74 +129,90 @@ class SyncSelection(PlotHistsBase, ConfigTask):
"metrics": self.local_target("metrics.json"),
}
def run(self):
def sync(self, keys):
    """Compare one feature in one category across all entries of ``self.data``.

    The first entry of ``self.data`` acts as the reference; every further
    entry is compared against it. A comparison plot is written to
    ``plots/<mask_key>/<feature_key>.pdf`` as a side effect.

    Args:
        keys: ``(mask_key, feature_key)`` pair. ``mask_key`` names the
            selection mask and ``feature_key`` the quantity to compare; both
            are used to index each entry of ``self.data``.

    Returns:
        ``(keys, metrics)`` where ``metrics`` is a dict with

        * ``"setdiff"``: per non-reference entry, the number of values absent
          from the reference divided by the reference length,
        * ``"cleareddiff"``: per non-reference entry, the fraction of events
          (matched via ``"eventnr_eventnr"``) whose values differ,
        * ``"kstest"``: the view of the histogram returned by
          ``histutils.kstest``.

        ``keys`` is passed back unchanged so the caller can associate the
        result with its work item.
    """
    metrics = {}
    mask_key, feature_key = keys

    # extract name + binning from variables
    try:
        var = self.config_inst.get_variable(feature_key)
        binning = var.binning
    except ValueError:
        # default binning
        binning = (100, -400, +400)

    # currently bug in boost-histogram: https://github.com/scikit-hep/boost-histogram/issues/621
    # optimally would use one histogram - as patch using two until fixed
    groups = list(self.data.keys())
    # compare0 holds the reference (first) group, compare1 all remaining groups
    compare0 = hist.Hist(
        hist.axis.StrCategory(groups[:1], name="group"),
        hist.axis.Regular(*binning, name="variable"),
        storage=hist.storage.Weight(),
    )
    compare1 = hist.Hist(
        hist.axis.StrCategory(groups[1:], name="group"),
        hist.axis.Regular(*binning, name="variable"),
        storage=hist.storage.Weight(),
    )

    comparison, comparison_eventnr = None, None
    setdiffs, cleareddiffs = [], []
    for key, dat in self.data.items():
        mask = dat[mask_key].astype(bool)
        values = dat[feature_key][mask]
        eventnr = dat["eventnr_eventnr"][mask]

        if comparison is None:
            # first entry: remember it as the reference for all later entries
            comparison = values
            comparison_eventnr = eventnr
            compare0.fill(group=key, variable=values)
        else:
            compare1.fill(group=key, variable=values)

            # values that do not occur in the reference at all;
            # 1e5 is the sentinel for an empty reference selection
            diff = len(np.setdiff1d(values, comparison))
            rel_setdiff = diff / len(comparison) if len(comparison) != 0 else 1e5
            setdiffs.append(rel_setdiff)

            # restrict to events present in both selections, then compare
            # value by value; 1e5 is the sentinel for "no common events"
            _, m, comp_m = np.intersect1d(eventnr, comparison_eventnr, return_indices=True)
            cleareddiff = values[m] - comparison[comp_m]
            rel_cleareddiff = (
                np.sum(cleareddiff != 0) / len(cleareddiff) if len(cleareddiff) != 0 else 1e5
            )
            cleareddiffs.append(rel_cleareddiff)

    metrics["setdiff"] = setdiffs
    metrics["cleareddiff"] = cleareddiffs

    # plotting of histograms
    with self.plot_silencer():
        self.plot(
            targets=[self.local_target(f"plots/{mask_key}/{feature_key}.pdf")],
            stack=compare0,
            lines=compare1,
            ratio="lines",
        )

    # statistical tests on histograms
    hnew = histutils.kstest(compare0, compare1, 0, -1)
    metrics["kstest"] = hnew.view()

    return keys, metrics
def run(self):
metrics = defaultdict(dict)
# touch cached properties
self.data
self.keys
self.mask_keys
self.feature_keys
work = [(m, v) for m in self.mask_keys for v in self.feature_keys]
for (mask_key, feature_key), metric in self.pmap(
self.sync,
work,
unit="sync",
unordered=True,
):
metrics[mask_key][feature_key] = metric
# print summary metrics to console
table = Table(title="Maximum discrepancies")
table.add_column("Category", justify="right")
......@@ -219,3 +234,9 @@ class SyncSelection(PlotHistsBase, ConfigTask):
console.print(table)
self.output()["metrics"].dump(metrics, cls=NumpyEncoder)
console.print(
"For PScan on VISPA:",
f"Directory: {self.output()['plots'].path}",
r"Regex: (?P<category>.+)/(?P<variable>.+).pdf",
sep="\n\t",
)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment