From d48d9882e481b266b95c923e8e98e931d8ee2407 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 7 Oct 2025 17:42:30 +0200 Subject: [PATCH 1/2] producers for ml reweighting --- columnflow/calibration/cmsGhent/__init__.py | 6 + columnflow/calibration/cmsGhent/hdamp.py | 121 ++++++++++++++++ columnflow/calibration/cmsGhent/lepton_mva.py | 2 +- .../calibration/cmsGhent/ttbar_minnlo.py | 133 ++++++++++++++++++ law.cfg | 2 +- sandboxes/onnxruntime.txt | 8 ++ sandboxes/venv_onnxruntime.sh | 18 +++ sandboxes/venv_onnxruntime_dev.sh | 18 +++ 8 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 columnflow/calibration/cmsGhent/hdamp.py create mode 100644 columnflow/calibration/cmsGhent/ttbar_minnlo.py create mode 100644 sandboxes/onnxruntime.txt create mode 100644 sandboxes/venv_onnxruntime.sh create mode 100644 sandboxes/venv_onnxruntime_dev.sh diff --git a/columnflow/calibration/cmsGhent/__init__.py b/columnflow/calibration/cmsGhent/__init__.py index e69de29bb..f0588da23 100644 --- a/columnflow/calibration/cmsGhent/__init__.py +++ b/columnflow/calibration/cmsGhent/__init__.py @@ -0,0 +1,6 @@ +# coding: utf-8 +# flake8: noqa + +import columnflow.calibration.cmsGhent.lepton_mva +import columnflow.calibration.cmsGhent.hdamp +import columnflow.calibration.cmsGhent.ttbar_minnlo diff --git a/columnflow/calibration/cmsGhent/hdamp.py b/columnflow/calibration/cmsGhent/hdamp.py new file mode 100644 index 000000000..db0c138e4 --- /dev/null +++ b/columnflow/calibration/cmsGhent/hdamp.py @@ -0,0 +1,121 @@ +""" +Code to add lepton MVA to NanoAOD +""" + +import law + +from columnflow.calibration import Calibrator, calibrator +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column +from columnflow.columnar_util_Ghent import TetraVec +from columnflow.tasks.external import BundleExternalFiles + +np = maybe_import("numpy") +ak = maybe_import("awkward") +coffea = maybe_import("coffea") +maybe_import("coffea.nanoevents.methods.nanoaod") + + +@calibrator( + uses=four_vec("GenPart", "pdgId"), + produces={"hdamp_{up,down}"}, + sandbox="bash::$CF_BASE/sandboxes/venv_onnxruntime.sh", + maxM=243.9517, + default_hdamp=1.379, +) +def hdamp_reweighting_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + """ + Produces the hdamp reweighting scores. + Based on https://twiki.cern.ch/twiki/pub/CMS/MLReweighting/ImplementationCMSSW.pdf + Requires an external file in the config under ``hdamp``: + + .. code-block:: python + + cfg.x.external_files = DotDict.wrap({ + "hdamp": { + "up": f"YOURDIRECTORY/mymodel12_hdamp_up_13TeV.onnx", + "down": f"YOURDIRECTORY/mymodel12_hdamp_down_13TeV.onnx", + }, + }) + + The onnx files can be found on this twiki: + https://twiki.cern.ch/twiki/bin/view/CMS/MLReweighting + + Requires adding the environment venv_onnx which includes onnx to the analysis or config. E.g. + + analysis_inst.x.bash_sandboxes = [ + "$CF_BASE/sandboxes/cf.sh", + "$CF_BASE/sandboxes/venv_onnxruntime.sh", + ] + + """ + input = [] + sum_top = None + for pdgId in [6, -6]: + # get the initial top quarks + top = events.GenPart[events.GenPart.pdgId == pdgId][:, 0] + top = TetraVec(top) + + # sum top quarks + sum_top = top if sum_top is None else (sum_top + top) + + # + top_inp = np.array([ + np.log10(top.pt), + top.rapidity, + top.phi, + top.mass / self.maxM, + ] + [ + np.full(len(top), cst) + for cst in [ + {6: 0.1, -6: 0.2}[pdgId], + self.default_hdamp, + ] + ]) + input.append(top_inp) + + # if pt of sum larger then 1000, no reweighting + mask = sum_top.pt < 1000 + + # (2, 6, N) > (N, 2, 6) + input = np.rollaxis(np.array(input), -1)[mask] + for variation, model in self.models.items(): + label_name = model.get_outputs()[0].name + input_name = model.get_inputs()[0].name + pred = model.run([label_name], {input_name: input.astype(np.float32)})[0] + out = np.ones(len(events)) + out[mask] = pred[:, 0] / pred[:, 1] + events = set_ak_column(events, f"hdamp_{variation}", out) + + return events + + +@hdamp_reweighting_producer.requires +def hdamp_reweighting_producer_requires( + self: Calibrator, + task: law.Task, + reqs: dict, + **kwargs, +) -> None: + if "external_files" in reqs: + return + reqs["external_files"] = BundleExternalFiles.req(task) + + +@hdamp_reweighting_producer.setup +def hdamp_reweighting_producer_setup( + self: Calibrator, + task: law.Task, + reqs: dict, + inputs: dict, + reader_targets: law.util.InsertableDict, +) -> None: + bundle = reqs["external_files"] + + # create the xgboost predictor + import onnxruntime + + self.models = {} + for variation in ["up", "down"]: + file = bundle.files.hdamp[variation].path + self.models[variation] = onnxruntime.InferenceSession(file) diff --git a/columnflow/calibration/cmsGhent/lepton_mva.py b/columnflow/calibration/cmsGhent/lepton_mva.py index 95c4199ad..021a4b117 100644 --- a/columnflow/calibration/cmsGhent/lepton_mva.py +++ b/columnflow/calibration/cmsGhent/lepton_mva.py @@ -149,7 +149,7 @@ def lepton_mva_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Arra @lepton_mva_producer.requires -def lepton_mva_producer_requires(self: Calibrator, task: law.Task, reqs: dict) -> None: +def lepton_mva_producer_requires(self: Calibrator, task: law.Task, reqs: dict, **kwargs) -> None: if "external_files" in reqs: return reqs["external_files"] = BundleExternalFiles.req(task) diff --git a/columnflow/calibration/cmsGhent/ttbar_minnlo.py b/columnflow/calibration/cmsGhent/ttbar_minnlo.py new file mode 100644 index 000000000..d4f6108db --- /dev/null +++ b/columnflow/calibration/cmsGhent/ttbar_minnlo.py @@ -0,0 +1,133 @@ +""" +Code to add lepton MVA to NanoAOD +""" + +import law + +from columnflow.calibration import Calibrator, calibrator +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column +from columnflow.columnar_util_Ghent import TetraVec +from columnflow.tasks.external import BundleExternalFiles + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +def norm(X, mean, std, scale): + if scale == "log": + X = np.log(np.clip(X, a_min=1e-6, a_max=None)) + # recenter and renormalize + return (X - mean) / np.where(std > 1e-2, std, 1) + + +@calibrator( + uses=four_vec("GenPart", {"pdgId", "statusFlags"}), + produces={"ttbar_minnlo_weight"}, + sandbox="bash::$CF_BASE/sandboxes/venv_onnxruntime.sh", + input_norm={ + # keys: 0 = ttbar, (-)6 = (anti)top + # values: mean, std, scale + "pt": { + 0: (3.6520673599656903, 1.0123402362573612, "log"), + 6: (4.595855742518925, 0.7101176940989488, "log"), + -6: (4.5986175957604045, 0.7103218938891299, "log"), + }, + "rapidity": { + 0: (0.0001718810581680775, 1.0362455506718102, "linear"), + 6: (0.00022746366634849002, 1.213207643109532, "linear"), + -6: (0.00011712322394057398, 1.2076422016031159, "linear"), + }, + "phi": { + 0: (2.8943571877384285e-05, 1.8139038706413384, "linear"), + 6: (-0.00028213870737636996, 1.8136544140703632, "linear"), + -6: (0.0003628069129526392, 1.8139415747773364, "linear"), + }, + "mass": { + 0: (6.21729978047307, 0.2771419580231537, "log"), + 6: (171.93706459943778, 6.9652037622153, "linear"), + -6: (171.93691192651536, 6.9500586980501575, "linear"), + }, + }, +) +def ttbar_minnlo_reweighting_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + """ + Produces the HVQ to MiNNLO reweighting values. + Based on https://twiki.cern.ch/twiki/pub/CMS/MLReweighting/CMSSW_script_afterShower.py.txt + Requires an external file in the config under ``ttbar_minnlo``: + + .. code-block:: python + + cfg.x.external_files = DotDict.wrap({ + "ttbar_minnlo": f"YOURDIRECTORY/mymodel12_13TeV_MiNNLO_afterShower.onnx", }, + }) + + Requires adding the environment venv_onnx which includes onnx to the analysis or config. E.g. + + analysis_inst.x.bash_sandboxes = [ + "$CF_BASE/sandboxes/cf.sh", + "$CF_BASE/sandboxes/venv_onnxruntime.sh", + ] + + """ + input = [] + sum_top = None + for pdgId in [6, -6, 0]: + # get the initial top quarks + if not pdgId: + top = sum_top + else: + top = events.GenPart[events.GenPart.pdgId == pdgId] + top = top[top.hasFlags("isLastCopy")][:, 0] + top = TetraVec(top) + # sum top quarks + sum_top = top if sum_top is None else (sum_top + top) + + # inputs + top_inp = [ + norm(getattr(top, inp), *norms[pdgId]) + for inp, norms in self.input_norm.items() + ] + [np.full(len(top), pdgId / 10)] + input.append(np.array(top_inp)) + + # top, antitop, ttbar > ttbar, top, antitop + input = np.roll(input, 1, axis=0) + + # (3, 6, N) > (N, 3, 6) + input = np.rollaxis(input, -1) + + label_name = self.model.get_outputs()[0].name + input_name = self.model.get_inputs()[0].name + pred = self.model.run([label_name], {input_name: input.astype(np.float32)})[0] + events = set_ak_column(events, "ttbar_minnlo_weight", pred[:, 1] / pred[:, 0]) + + return events + + +@ttbar_minnlo_reweighting_producer.requires +def ttbar_minnlo_reweighting_producer_requires( + self: Calibrator, + task: law.Task, + reqs: dict, + **kwargs, +) -> None: + if "external_files" in reqs: + return + reqs["external_files"] = BundleExternalFiles.req(task) + + +@ttbar_minnlo_reweighting_producer.setup +def ttbar_minnlo_reweighting_producer_setup( + self: Calibrator, + task: law.Task, + reqs: dict, + inputs: dict, + reader_targets: law.util.InsertableDict, +) -> None: + bundle = reqs["external_files"] + + # create the xgboost predictor + import onnxruntime + + file = bundle.files.ttbar_minnlo.path + self.model = onnxruntime.InferenceSession(file) diff --git a/law.cfg b/law.cfg index 95010ce73..246ad81e7 100644 --- a/law.cfg +++ b/law.cfg @@ -23,7 +23,7 @@ default_config: run2_pp_2018 default_dataset: st_tchannel_t production_modules: columnflow.production.{categories,processes,normalization,veto} -calibration_modules: columnflow.calibration +calibration_modules: columnflow.calibration,columnflow.calibration.cmsGhent selection_modules: columnflow.selection.empty reduction_modules: columnflow.reduction.default categorization_modules: columnflow.categorization diff --git a/sandboxes/onnxruntime.txt b/sandboxes/onnxruntime.txt new file mode 100644 index 000000000..df4fc9be3 --- /dev/null +++ b/sandboxes/onnxruntime.txt @@ -0,0 +1,8 @@ +# version 11 + +# use packages from columnar sandbox as baseline +-r columnar.txt + +# add numpy and onnxruntime with exact version requirement +onnxruntime==1.15.1 +numpy==1.26.4 diff --git a/sandboxes/venv_onnxruntime.sh b/sandboxes/venv_onnxruntime.sh new file mode 100644 index 000000000..37206ee4f --- /dev/null +++ b/sandboxes/venv_onnxruntime.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_BASE. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. + +action() { + local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )" + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/onnxruntime.txt" + + source "${this_dir}/_setup_venv.sh" "$@" +} +action "$@" diff --git a/sandboxes/venv_onnxruntime_dev.sh b/sandboxes/venv_onnxruntime_dev.sh new file mode 100644 index 000000000..2b09a2240 --- /dev/null +++ b/sandboxes/venv_onnxruntime_dev.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_BASE. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. + +action() { + local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )" + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/onnxruntime.txt,${this_dir}/dev.txt" + + source "${this_dir}/_setup_venv.sh" "$@" +} +action "$@" From 60269fb43c3a8c12e194dd0f335c331fc8e84375 Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Mon, 2 Mar 2026 14:01:22 +0100 Subject: [PATCH 2/2] np.bool -> np.bool_ --- columnflow/calibration/cms/jets.py | 2 +- columnflow/production/cms/pileup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py index 709f253bc..504b843c0 100644 --- a/columnflow/calibration/cms/jets.py +++ b/columnflow/calibration/cms/jets.py @@ -723,7 +723,7 @@ def get_jer_config_default(self: Calibrator) -> DotDict: # whether gen jet matching should be performed relative to the nominal jet pt, or the jec varied values gen_jet_matching_nominal=False, # regions where stochastic smearing is applied - stochastic_smearing_mask=lambda self, jets: ak.ones_like(jets.pt, dtype=np.bool), + stochastic_smearing_mask=lambda self, jets: ak.ones_like(jets.pt, dtype=np.bool_), ) def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: """ diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py index 6675e8d04..d5d843b5f 100644 --- a/columnflow/production/cms/pileup.py +++ b/columnflow/production/cms/pileup.py @@ -55,7 +55,7 @@ def pu_weight( "NumTrueInteractions": events.Pileup.nTrueInt, } - any_outlier_mask = ak.zeros_like(events.Pileup.nTrueInt, dtype=np.bool) + any_outlier_mask = ak.zeros_like(events.Pileup.nTrueInt, dtype=np.bool_) for column_name, syst in ( ("pu_weight", "nominal"), ("pu_weight_minbias_xs_up", "up"),