Commit d57be8df authored by Peter Fackeldey

Merge branch 'master' into datasets-data-v7

parents 82231844 4b198b13
fail_fast: true
exclude: '^(config/campaigns/|bbbb/|bbww_dl/|bbww_sl/|jet_tagging_sf/|jtsf/|tools/|vh/|wh/|zh/)'
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: check-added-large-files
        args: ['--maxkb=1000']
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-yaml
      - id: debug-statements
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black
    rev: 21.7b0
    hooks:
      - id: black
        args: [--line-length=100, --diff, --check]
  - repo: https://github.com/PyCQA/isort
    rev: 5.9.3
    hooks:
      - id: isort
@@ -2,6 +2,15 @@
### Info on git
##### Install pre-commit hooks
```
pip install pre-commit # or brew install pre-commit on macOS
pre-commit install # Will install a pre-commit hook into the git repo
```
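Note that `pre-commit install` only registers the hooks for future commits; to apply all configured hooks to the files already in the repository once, run `pre-commit run --all-files`.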
##### `[user]` config
Make sure to have your name and email as registered at CERN in the `.git/config` of the cloned repository. Example:
@@ -33,5 +42,5 @@ once.
### Fix Kerberos ticket and open ssh connection
1. `ssh -O exit lxplus`
2. `dhi_law.sh --help` (triggers `kinit`)
@@ -286,7 +286,7 @@ files = {
    "btag": {
        "deepjet": {
            "POG_full": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation94X/DeepFlavour_94XSF_V4_B_F.csv",
-           "POG_reduced": "/eos/user/m/mfackeld/btagsf_reduced_jes/deepjet_2017.csv",
            "POG_reduced": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation94X/DeepFlavour_94XSF_V4_B_F_JESreduced.csv",  # "/eos/user/m/mfackeld/btagsf_reduced_jes/deepjet_2017.csv",
        },
        "subjet_deepb": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation94X/subjet_DeepCSV_94XSF_V4_B_F_v2.csv",
    },
......
@@ -235,7 +235,7 @@ files = {
    "btag": {
        "deepjet": {
            "POG_full": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation102X/DeepJet_102XSF_V2.csv",
-           "POG_reduced": "/eos/user/m/mfackeld/sf_2018/deepjet_2018.csv",
            "POG_reduced": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation102X/DeepJet_102XSF_V2_JESreduced.csv",  # "/eos/user/m/mfackeld/sf_2018/deepjet_2018.csv",
        },
        "subjet_deepb": "https://twiki.cern.ch/twiki/pub/CMS/BtagRecommendation102X/subjet_DeepCSV_102XSF_V1.csv",
    },
......
@@ -98,6 +98,24 @@ class GeneratorHelper:
                weightDown=self.ones,
            )

        # add envelope
        self.weights.add(
            "ScaleWeight_Envelope",
            self.ones,
            weightUp=np.maximum.reduce(
                [
                    self.weights._modifiers[f"ScaleWeight_{suffix}Up"]
                    for suffix in ("Fact", "Renorm", "Mixed")
                ]
            ),
            weightDown=np.minimum.reduce(
                [
                    self.weights._modifiers[f"ScaleWeight_{suffix}Down"]
                    for suffix in ("Fact", "Renorm", "Mixed")
                ]
            ),
        )

    def gen_weight(
        self,
        output: Dict[str, object],
......
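The envelope added above takes the per-event extremum across the factorization, renormalization, and mixed scale variations; note that it reaches into coffea's private `Weights._modifiers` dict, which holds the up/down modifier arrays keyed by variation name. A minimal numpy sketch of the reduction, with toy per-event weights (not from the commit):

```python
import numpy as np

# hypothetical up/down variations for (Fact, Renorm, Mixed)
ups = [np.array([1.10, 0.90]), np.array([1.20, 1.00]), np.array([1.05, 0.95])]
downs = [np.array([0.90, 1.10]), np.array([0.80, 1.00]), np.array([0.95, 1.05])]

env_up = np.maximum.reduce(ups)      # element-wise max -> array([1.2, 1. ])
env_down = np.minimum.reduce(downs)  # element-wise min -> array([0.8, 1. ])
```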
[tool.black]
line-length = 100

[tool.isort]
profile = "black"
multi_line_output = 3
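These settings keep the formatters consistent with the pre-commit hooks above: `line-length = 100` matches black's `--line-length=100` argument, and isort's `profile = "black"` (with `multi_line_output = 3`, vertical hanging indent) produces import layouts that black will not reformat.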
@@ -256,7 +256,7 @@ class VptSF:
        if nlo:
            nlo = self.nlo(dataset, LHE.Vpt, LHE.Nb, shifts=True)
            if nlo is None:
-               sf = np.ones_like(LHE.Vpt)
                sf = np.ones((len(events.MET),))
                shift = 0
            else:
                sf, shift = nlo
......
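The replaced fallback is presumably a shape fix: `np.ones_like(LHE.Vpt)` mirrors whatever shape and dtype the LHE branch happens to have, which is not guaranteed to be one flat float per event for every sample, while `np.ones((len(events.MET),))` always yields exactly one scale factor per event (the MET collection is used only to obtain the event count).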
# coding: utf-8

from collections import defaultdict
from functools import cached_property
from utils.util import DotDict
import itertools

import numpy as np
import scipy
import uproot

import law
from law.task.base import ExternalTask
import luigi
import boost_histogram as bh
import hist
from tqdm import tqdm
from rich.console import Console
from rich.table import Table

-from tasks.base import BaseTask, ConfigTask
-from tasks.mixins import PGroupMixin, RecipeMixin
from utils.util import NumpyEncoder
from tasks.base import ConfigTask, PoolMap
from tasks.mixins import RecipeMixin
from tasks.coffea import CoffeaProcessor
from tasks.plotting import PlotHistsBase
import utils.hist as histutils
-class SyncTask(RecipeMixin, ConfigTask):
class Sync(ConfigTask):
    @cached_property
    def sync_config(self):
        return self.config_inst.get_aux("sync", {})

    @cached_property
    def lookup_set(self):
        return self.sync_config.get("lookup_set", {})

    @cached_property
    def categories(self):
        return self.sync_config.get("categories", None)
-class SyncSelectionUpload(SyncTask):
class SyncSelectionUpload(Sync, RecipeMixin):
    upload_identifier = luigi.Parameter(default="")

    def requires(self):
        return CoffeaProcessor.req(self, processor="SyncSelectionExporter", debug=True)

    def output(self):
-       return self.wlcg_target("sync.root", fs="wlcg_fs_public")
        return self.wlcg_target(f"{self.upload_identifier}sync.root", fs="wlcg_fs_public")
    def lookup(self, old_key):
        key = old_key
        for (old, new) in self.lookup_set:
            key = key.replace(old, new)
        return key

    def run(self):
        intree = uproot.open(self.input().path)
        with self.output().localize("w") as target:
            self.input().copy_to(target)
            with uproot.recreate(target.path) as file:
                outtree = {k: intree["tree"][k].array() for k in intree["tree"].keys()}
                for k_old in list(outtree.keys()):
                    k_new = self.lookup(k_old.decode())
                    outtree[k_new] = outtree.pop(k_old)
                file["tree"] = uproot.newtree({n: v.dtype for n, v in outtree.items()})
                file["tree"].extend(outtree)
class SyncFile(ExternalTask):
@@ -41,53 +70,12 @@ class SyncFile(ExternalTask):
        return law.LocalFileTarget(self.filepath)
-def reduce_along_axis(func, axis, h):
-    axes = np.array(h.axes, dtype=object)
-    axes = np.delete(axes, axis)
-    hnew = hist.Hist(*axes)
-    hnew.view()[:] = np.apply_along_axis(func, axis, h.view())
-    return hnew
-
-
-def _kstest(arr):
-    return scipy.stats.kstest(arr[:, 0], arr[:, 1]).statistic
-
-
-def kstest(h1, h2, compare_axis, feature_axis):
-    assert feature_axis == -1, "Other axes not implemented yet"
-    axes1 = np.array(h1.axes, dtype=object)
-    axes2 = np.array(h2.axes, dtype=object)
-    # currently only works for StrCategory
-    compared_axis = [
-        f"{a}_{b}" for (a, b) in itertools.product(axes1[compare_axis], axes2[compare_axis])
-    ]
-    new_axes = np.copy(axes1)
-    new_axes[compare_axis] = type(axes1[compare_axis])(compared_axis)
-    new_axes = np.delete(new_axes, feature_axis)
-    hnew = hist.Hist(*new_axes)
-    a1, a2 = h1.view(), h2.view()
-    c, d = np.broadcast_arrays(
-        np.expand_dims(a1, compare_axis), np.expand_dims(a2, compare_axis + 1)
-    )
-    s = np.stack([c, d], axis=-2)
-    # reshape
-    o = np.array(s.shape)
-    o[compare_axis] = o[compare_axis] * o[compare_axis + 1]
-    o = np.delete(o, compare_axis + 1)
-    s = np.reshape(s, o)
-    # perform test
-    kstest = np.vectorize(_kstest, signature="(2,n)->()")
-    hnew.view()[:] = kstest(s)
-    return hnew
-class SyncSelection(PlotHistsBase, ConfigTask):
class SyncSelection(PlotHistsBase, Sync, PoolMap):
    files = law.CSVParameter(default=[])
    own = luigi.BoolParameter()
    debug = luigi.BoolParameter()

    n_parallel_max = 32

    @property
    def _files(self):
@@ -103,8 +91,15 @@ class SyncSelection(PlotHistsBase, ConfigTask):
    def requires(self):
        req = {}
        if self.own:
            kwargs = dict(debug=True) if self.debug else dict(explorative=True)
            req.update(
-               {"own": CoffeaProcessor.req(self, processor="SyncSelectionExporter", debug=True)}
                {
                    "own": CoffeaProcessor.req(
                        self,
                        processor="SyncSelectionExporter",
                        **kwargs,
                    )
                }
            )
        req.update({name: SyncFile.req(self, filepath=file) for name, file in self._files.items()})
        return req
@@ -115,7 +110,7 @@ class SyncSelection(PlotHistsBase, ConfigTask):
        assert len(inps) > 0, "No inputs defined, use `--own` and/or `--files`"

        out = {}
-       for key, target in inps.items():
        for key, target in tqdm(inps.items(), desc="load data"):
            tree = uproot.open(target.path)["tree"]
            keys, size = tree.keys(), len(tree["eventnr_eventnr"].array())
            dtype = [(k.decode(), "i4" if self.is_mask_key(k.decode()) else "f4") for k in keys]
@@ -143,49 +138,120 @@ class SyncSelection(PlotHistsBase, ConfigTask):
    def output(self):
        return {
            "plots": self.local_target("plots"),
-           "statistical_tests": self.local_target("statistical_tests.json"),
            "metrics": self.local_target("metrics.json"),
        }

-   lut = {"lep0_energy": "lep_pt"}
    def sync(self, keys):
        metrics = {}
        mask_key, feature_key = keys

        # extract name + binning from variables
        try:
            var = self.config_inst.get_variable(feature_key)
            binning = var.binning
        except ValueError:
            # default binning
            binning = (100, -400, +400)

        compare0 = hist.Hist(
            hist.axis.StrCategory(list(self.data.keys())[:1], name="group"),
            hist.axis.Regular(*binning, name="variable"),
            storage=hist.storage.Weight(),
        )
        compare1 = hist.Hist(
            hist.axis.StrCategory(list(self.data.keys())[1:], name="group"),
            hist.axis.Regular(*binning, name="variable"),
            storage=hist.storage.Weight(),
        )

        comparison, comparison_eventnr = None, None
        setdiffs, cleareddiffs, eventdiff = [], [], []
        for key, dat in self.data.items():
            mask = dat[mask_key].astype(bool)
            values = dat[feature_key][mask]
            eventnr = dat["eventnr_eventnr"][mask]
            if comparison is None:
                comparison = values
                comparison_eventnr = eventnr
                compare0.fill(group=key, variable=values)
            else:
                compare1.fill(group=key, variable=values)
                diff = len(np.setdiff1d(values, comparison))
                rel_setdiff = diff / len(comparison) if len(comparison) != 0 else 1e5
                setdiffs.append(rel_setdiff)
                _, m, comp_m = np.intersect1d(eventnr, comparison_eventnr, return_indices=True)
                cleareddiff = values[m] - comparison[comp_m]
                rel_cleareddiff = (
                    np.sum(cleareddiff != 0) / len(cleareddiff) if len(cleareddiff) != 0 else 1e5
                )
                cleareddiffs.append(rel_cleareddiff)
        metrics["setdiff"] = setdiffs
        metrics["cleareddiff"] = cleareddiffs

        # plotting of histograms
        with self.plot_silencer():
            self.plot(
                targets=[self.local_target(f"plots/{mask_key}/{feature_key}.pdf")],
                stack=compare0,
                lines=compare1,
                ratio="lines",
            )

        # statistical tests on histograms
        hnew = histutils.kstest(compare0, compare1, 0, -1)
        metrics["kstest"] = hnew.view()

        return keys, metrics
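For reference, the two per-category metrics compare each group against the first (reference) group: `setdiff` counts values that do not occur in the reference at all, while `cleareddiff` compares values event-by-event after aligning on the event number. A toy numpy sketch (hypothetical arrays):

```python
import numpy as np

ref_vals, ref_evt = np.array([10.0, 20.0, 30.0]), np.array([1, 2, 3])
vals, evt = np.array([10.0, 21.0, 40.0]), np.array([1, 2, 4])

# fraction of values never seen in the reference
setdiff = len(np.setdiff1d(vals, ref_vals)) / len(ref_vals)  # 2/3

# among events present in both groups, fraction with differing values
_, m, ref_m = np.intersect1d(evt, ref_evt, return_indices=True)
cleareddiff = np.sum(vals[m] - ref_vals[ref_m] != 0) / len(m)  # 1/2
```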
    def run(self):
-       statistical_tests = {}
-       for mask_key in self.mask_keys:
-           for feature_key in self.feature_keys:
-               # extract name + binning from variables
-               var_name = self.lut.get(feature_key, feature_key)
-               try:
-                   var = self.config_inst.get_variable(var_name)
-                   var_name = var.name
-                   binning = var.binning
-               except ValueError:
-                   binning = (100, -400, +400)
-               compare = hist.Hist(
-                   hist.axis.StrCategory(self.data.keys(), name="group"),
-                   hist.axis.Regular(*binning, name="variable"),
-                   storage=hist.storage.Weight(),
-               )
-               for key, dat in self.data.items():
-                   mask = dat[mask_key].astype(bool)
-                   values = dat[feature_key][mask]
-                   compare.fill(group=key, variable=values)
-               self.plot(
-                   targets=[self.local_target(f"plots/{mask_key}/{feature_key}.pdf")],
-                   stack=compare[:1, :],
-                   lines=compare[1:, :],
-                   ratio="lines",
-               )
-               # plot histograms
-               self.plot(compare)
-               # statistical tests
-               hnew = kstest(compare[:1, :], compare[1:, :], 0, -1)
-               statistical_tests[f"{mask_key}_{feature_key}"] = hnew.view()[0]
-       self.output()["statistical_tests"].dump(statistical_tests)
-
-
-class SyncSelectionWrapper(SyncTask):
-   def requires(self):
-       return {
-           "upload": SyncSelectionUpload.req(self),
-           "plots": SyncSelectionPlots.req(self),
-       }
        metrics = defaultdict(dict)

        # touch cached properties
        self.data
        self.keys
        self.mask_keys
        self.feature_keys

        work = [
            (m, v)
            for m in self.mask_keys
            for v in self.feature_keys
            if self.categories is None or m in self.categories
        ]
        for (mask_key, feature_key), metric in self.pmap(
            self.sync,
            work,
            unit="sync",
            unordered=True,
        ):
            metrics[mask_key][feature_key] = metric

        # print summary metrics to console
        table = Table(title="Maximum discrepancies")
        table.add_column("Category", justify="right")
        table.add_column("max setdiff")
        table.add_column("max cleareddiff")
        table.add_column("max kstest")
        for k, v in metrics.items():
            max_var_setdiff = max(v, key=lambda k: np.max(v[k]["setdiff"]))
            max_var_cleareddiff = max(v, key=lambda k: np.max(v[k]["cleareddiff"]))
            max_var_kstest = max(v, key=lambda k: np.max(v[k]["kstest"]))
            table.add_row(
                k,
                f"{max_var_setdiff}: {np.max(v[max_var_setdiff]['setdiff'])}",
                f"{max_var_cleareddiff}: {np.max(v[max_var_cleareddiff]['cleareddiff'])}",
                f"{max_var_kstest}: {np.max(v[max_var_kstest]['kstest'])}",
            )
        console = Console()
        console.print(table)

        self.output()["metrics"].dump(metrics, cls=NumpyEncoder)
        console.print(
            "For PScan on VISPA:",
            f"Directory: {self.output()['plots'].path}",
            r"Regex: (?P<category>.+)/(?P<variable>.+).pdf",
            sep="\n\t",
        )
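Since the metrics dict contains numpy arrays, dumping it as JSON requires a custom encoder; a sketch of the common pattern such an encoder follows (the repo's `utils.util.NumpyEncoder` may differ):

```python
import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        # convert numpy containers/scalars to plain Python equivalents
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)

json.dumps({"kstest": np.array([0.1, 0.2])}, cls=NumpyEncoder)
```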
@@ -588,12 +588,13 @@ class ArrayExporter(BaseProcessor):
        select_output = self.select(events, unc="nominal", shift=None)
        categories = self.categories(select_output)
        output = select_output["output"]
        weights = select_output["weights"]

        if categories:
            arrays = self.arrays(ChainMap({}, select_output))
            dataset = self.get_dataset(events)
-           weight = select_output["weights"].weight()
            weight = weights.weight()
            arrays.setdefault(
                "weight", np.stack([np.full_like(weight, dataset.id), weight], axis=-1)
            )
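The `weight` column stores the dataset id alongside the per-event weight, so a merged array can later be split back by dataset. A minimal sketch (hypothetical values):

```python
import numpy as np

weight = np.array([0.9, 1.1, 1.0])
dataset_id = 42  # hypothetical

col = np.stack([np.full_like(weight, dataset_id), weight], axis=-1)
# col[:, 0] is the dataset id, col[:, 1] the event weight; shape (n_events, 2)
```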
@@ -650,11 +651,32 @@ class TreeExporter(ArrayExporter):
    output = "sync.root"
    dtype = None
    groups = {}

    @classmethod
-   def look_up(cls, name):
-       for was, gets in cls.lut.items():
-           name = name.replace(was, gets)
-       return name
    def group_tree(cls, outtree):
        # add regrouped categories
        cats = [c for c in outtree.keys() if c.startswith("is_")]
        regroups = []
        for dst, pat in cls.groups.items():
            if dst is None:
                continue
            cats_new = set()
            for cat in cats:
                rem = re.sub(pat, dst, cat)
                if rem != cat:
                    assert rem not in cats, f"new category {rem} already existing"
                    cats_new.add(rem)
            regroups.append((dst, pat, cats.copy()))
            cats.extend(sorted(cats_new))
        for dst, pat, cats in tqdm(regroups, desc="regroup", leave=False):
            for cat in tqdm(cats, unit="category", leave=False):
                rem = re.sub(pat, dst, cat)
                if rem != cat:
                    outtree[rem] = np.logical_or(outtree.get(rem, 0), outtree[cat])
        return outtree
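`group_tree` ORs existing `is_*` category flags into new regrouped flags via regex substitution: every category whose name is changed by `re.sub(pat, dst, cat)` contributes to the destination category. A self-contained sketch with hypothetical flags and pattern:

```python
import re
import numpy as np

outtree = {
    "is_mu_2b": np.array([True, False, False]),
    "is_e_2b": np.array([False, True, False]),
}
# {destination template: source regex}, as in the `groups` class attribute
groups = {r"is_lep_\1": r"is_(?:mu|e)_(\db)"}

for dst, pat in groups.items():
    for cat in list(outtree):
        rem = re.sub(pat, dst, cat)
        if rem != cat:
            outtree[rem] = np.logical_or(outtree.get(rem, False), outtree[cat])

print(outtree["is_lep_2b"])  # [ True  True False]
```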
    @classmethod
    def arrays_to_tree(cls, arrays, target, vars, **kwargs):
@@ -663,28 +685,27 @@ class TreeExporter(ArrayExporter):
        with uproot.recreate(target.path) as file:
            tree = defaultdict(list)
            for cat, cat_array in arrays.items():
                # remove broken categories
                if re.search(cls.groups.get(None, "x^"), cat):
                    continue
                for var, _var_array in cat_array.items():
                    if var not in cls.tensors.fget(cls).keys():
                        continue
                    var_array = _var_array.value
-                   parts = [-1] if len(var_array.shape) == 2 else range(var_array.shape[1])
-                   for i in parts:
-                       if i == -1:
-                           name = var
-                           vals = var_array
-                       else:
-                           name = f"{var}{i}"
-                           vals = var_array[:, i]
-                       for j in range(vals.shape[-1]):
-                           var_name = f"{name}_{vars[var][2][j]}"
-                           v = vals[:, j]
-                           tree[var_name].append(v)
                    if vars[var][1] == 0:
                        name = f"{var}_" if "part" in vars[var][-1].get("groups", {}) else ""
                        for i, feat in enumerate(vars[var][2]):
                            tree[f"{name}{feat}"].append(var_array[:, i])
                    else:
                        for i in range(vars[var][1]):
                            for j, feat in enumerate(vars[var][2]):
                                tree[f"{var}{i}_{feat}"].append(var_array[:, i, j])
                for cat_flag in arrays.keys():
-                   tree[cat_flag].append(np.ones(var_array.shape[0]) * (cat == cat_flag))
                    tree[f"is_{cat_flag}"].append(np.ones(var_array.shape[0]) * (cat == cat_flag))
            outtree = {}
            for v, k in tree.items():
-               var_name = cls.look_up(v)
-               outtree[var_name] = np.concatenate(k, axis=-1)
                outtree[v] = np.concatenate(k, axis=-1)
            outtree = cls.group_tree(outtree)
            file["tree"] = uproot.newtree({n: v.dtype for n, v in outtree.items()})
            file["tree"].extend(outtree)
......
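The tree writing above (`uproot.recreate` plus `uproot.newtree`) is the uproot3 API; `newtree` takes a branch-name-to-dtype mapping and `extend` fills all branches at once. A standalone sketch, assuming `uproot` here is uproot3 and using hypothetical branch names:

```python
import numpy as np
import uproot3

outtree = {
    "jet0_pt": np.random.rand(10).astype("f4"),
    "is_signal": np.ones(10, dtype="f4"),
}
with uproot3.recreate("sync.root") as f:
    f["tree"] = uproot3.newtree({n: v.dtype for n, v in outtree.items()})
    f["tree"].extend(outtree)
```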
# coding: utf-8

import itertools

import numpy as np
import scipy.stats
import hist


def reduce_along_axis(func, axis, h):
    axes = np.array(h.axes, dtype=object)
    axes = np.delete(axes, axis)
    hnew = hist.Hist(*axes)
    hnew.view()[:] = np.apply_along_axis(func, axis, h.view())
    return hnew


def _kstest(arr):
    with np.errstate(invalid="ignore"):
        return scipy.stats.kstest(arr[0, :], arr[1, :]).statistic
-       return scipy.stats.chisquare(arr[0, :], arr[1, :]).statistic


def kstest(h1, h2, compare_axis, feature_axis):
    assert feature_axis == -1, "Other axes not implemented yet"
    axes1 = np.array(h1.axes, dtype=object)
    axes2 = np.array(h2.axes, dtype=object)
    # currently only works for StrCategory
    compared_axis = [
        f"{a}_{b}" for (a, b) in itertools.product(axes1[compare_axis], axes2[compare_axis])
    ]
    new_axes = np.copy(axes1)
    new_axes[compare_axis] = type(axes1[compare_axis])(compared_axis)
    new_axes = np.delete(new_axes, feature_axis)
    hnew = hist.Hist(*new_axes)
    a1, a2 = h1.view(), h2.view()
    c, d = np.broadcast_arrays(
        np.expand_dims(a1, compare_axis), np.expand_dims(a2, compare_axis + 1)
    )
    s = np.stack([c, d], axis=-2)
    # reshape
    o = np.array(s.shape)
    o[compare_axis] = o[compare_axis] * o[compare_axis + 1]
    o = np.delete(o, compare_axis + 1)
    s = np.reshape(s, o)
    # perform test
    kstest = np.vectorize(_kstest, signature="(2,n)->()")
    hnew.view()[:] = kstest(s["value"])
    return hnew
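A usage sketch of `kstest` with toy data (hypothetical group names): one reference group is compared against two others along the `StrCategory` axis, yielding one KS statistic per pair, with the feature axis reduced away:

```python
import numpy as np
import hist

rng = np.random.default_rng(42)

def make_hist(groups):
    h = hist.Hist(
        hist.axis.StrCategory(groups, name="group"),
        hist.axis.Regular(20, -4, 4, name="variable"),
        storage=hist.storage.Weight(),
    )
    for g in groups:
        h.fill(group=g, variable=rng.normal(size=1000))
    return h

h_ref = make_hist(["own"])
h_others = make_hist(["groupA", "groupB"])

h_ks = kstest(h_ref, h_others, compare_axis=0, feature_axis=-1)
print(list(h_ks.axes[0]), h_ks.view())  # ['own_groupA', 'own_groupB'] and two KS statistics
```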