From d48d9882e481b266b95c923e8e98e931d8ee2407 Mon Sep 17 00:00:00 2001
From: maadcoen <maarten.de.coen@cern.ch>
Date: Tue, 7 Oct 2025 17:42:30 +0200
Subject: [PATCH 1/2] producers for ml reweighting

---
 columnflow/calibration/cmsGhent/__init__.py   |   6 +
 columnflow/calibration/cmsGhent/hdamp.py      | 121 ++++++++++++++++
 columnflow/calibration/cmsGhent/lepton_mva.py |   2 +-
 .../calibration/cmsGhent/ttbar_minnlo.py      | 133 ++++++++++++++++++
 law.cfg                                       |   2 +-
 sandboxes/onnxruntime.txt                     |   8 ++
 sandboxes/venv_onnxruntime.sh                 |  18 +++
 sandboxes/venv_onnxruntime_dev.sh             |  18 +++
 8 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 columnflow/calibration/cmsGhent/hdamp.py
 create mode 100644 columnflow/calibration/cmsGhent/ttbar_minnlo.py
 create mode 100644 sandboxes/onnxruntime.txt
 create mode 100644 sandboxes/venv_onnxruntime.sh
 create mode 100644 sandboxes/venv_onnxruntime_dev.sh

diff --git a/columnflow/calibration/cmsGhent/__init__.py b/columnflow/calibration/cmsGhent/__init__.py
index e69de29bb..f0588da23 100644
--- a/columnflow/calibration/cmsGhent/__init__.py
+++ b/columnflow/calibration/cmsGhent/__init__.py
@@ -0,0 +1,6 @@
+# coding: utf-8
+# flake8: noqa
+
+import columnflow.calibration.cmsGhent.lepton_mva
+import columnflow.calibration.cmsGhent.hdamp
+import columnflow.calibration.cmsGhent.ttbar_minnlo
diff --git a/columnflow/calibration/cmsGhent/hdamp.py b/columnflow/calibration/cmsGhent/hdamp.py
new file mode 100644
index 000000000..db0c138e4
--- /dev/null
+++ b/columnflow/calibration/cmsGhent/hdamp.py
@@ -0,0 +1,121 @@
+"""
+Code to add lepton MVA to NanoAOD
+"""
+
+import law
+
+from columnflow.calibration import Calibrator, calibrator
+from columnflow.util import maybe_import, four_vec
+from columnflow.columnar_util import set_ak_column
+from columnflow.columnar_util_Ghent import TetraVec
+from columnflow.tasks.external import BundleExternalFiles
+
+np = maybe_import("numpy")
+ak = maybe_import("awkward")
+coffea = maybe_import("coffea")
+maybe_import("coffea.nanoevents.methods.nanoaod")
+
+
+@calibrator(
+    uses=four_vec("GenPart", "pdgId"),
+    produces={"hdamp_{up,down}"},
+    sandbox="bash::$CF_BASE/sandboxes/venv_onnxruntime.sh",
+    maxM=243.9517,
+    default_hdamp=1.379,
+)
+def hdamp_reweighting_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
+    """
+    Produces the hdamp reweighting scores.
+    Based on https://twiki.cern.ch/twiki/pub/CMS/MLReweighting/ImplementationCMSSW.pdf
+    Requires an external file in the config under ``hdamp``:
+
+    .. code-block:: python
+
+        cfg.x.external_files = DotDict.wrap({
+            "hdamp": {
+                "up": f"YOURDIRECTORY/mymodel12_hdamp_up_13TeV.onnx",
+                "down": f"YOURDIRECTORY/mymodel12_hdamp_down_13TeV.onnx",
+            },
+        })
+
+    The onnx files can be found on this twiki:
+    https://twiki.cern.ch/twiki/bin/view/CMS/MLReweighting
+
+    Requires adding the environment venv_onnx which includes onnx to the analysis or config. E.g.
+
+    analysis_inst.x.bash_sandboxes = [
+        "$CF_BASE/sandboxes/cf.sh",
+        "$CF_BASE/sandboxes/venv_onnxruntime.sh",
+    ]
+
+    """
+    input = []
+    sum_top = None
+    for pdgId in [6, -6]:
+        # get the initial top quarks
+        top = events.GenPart[events.GenPart.pdgId == pdgId][:, 0]
+        top = TetraVec(top)
+
+        # sum top quarks
+        sum_top = top if sum_top is None else (sum_top + top)
+
+        #
+        top_inp = np.array([
+            np.log10(top.pt),
+            top.rapidity,
+            top.phi,
+            top.mass / self.maxM,
+        ] + [
+            np.full(len(top), cst)
+            for cst in [
+                {6: 0.1, -6: 0.2}[pdgId],
+                self.default_hdamp,
+            ]
+        ])
+        input.append(top_inp)
+
+    # if pt of sum larger then 1000, no reweighting
+    mask = sum_top.pt < 1000
+
+    # (2, 6, N) > (N, 2, 6)
+    input = np.rollaxis(np.array(input), -1)[mask]
+    for variation, model in self.models.items():
+        label_name = model.get_outputs()[0].name
+        input_name = model.get_inputs()[0].name
+        pred = model.run([label_name], {input_name: input.astype(np.float32)})[0]
+        out = np.ones(len(events))
+        out[mask] = pred[:, 0] / pred[:, 1]
+        events = set_ak_column(events, f"hdamp_{variation}", out)
+
+    return events
+
+
+@hdamp_reweighting_producer.requires
+def hdamp_reweighting_producer_requires(
+    self: Calibrator,
+    task: law.Task,
+    reqs: dict,
+    **kwargs,
+) -> None:
+    if "external_files" in reqs:
+        return
+    reqs["external_files"] = BundleExternalFiles.req(task)
+
+
+@hdamp_reweighting_producer.setup
+def hdamp_reweighting_producer_setup(
+    self: Calibrator,
+    task: law.Task,
+    reqs: dict,
+    inputs: dict,
+    reader_targets: law.util.InsertableDict,
+) -> None:
+    bundle = reqs["external_files"]
+
+    # create the xgboost predictor
+    import onnxruntime
+
+    self.models = {}
+    for variation in ["up", "down"]:
+        file = bundle.files.hdamp[variation].path
+        self.models[variation] = onnxruntime.InferenceSession(file)
diff --git a/columnflow/calibration/cmsGhent/lepton_mva.py b/columnflow/calibration/cmsGhent/lepton_mva.py
index 95c4199ad..021a4b117 100644
--- a/columnflow/calibration/cmsGhent/lepton_mva.py
+++ b/columnflow/calibration/cmsGhent/lepton_mva.py
@@ -149,7 +149,7 @@ def lepton_mva_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Arra
 
 
 @lepton_mva_producer.requires
-def lepton_mva_producer_requires(self: Calibrator, task: law.Task, reqs: dict) -> None:
+def lepton_mva_producer_requires(self: Calibrator, task: law.Task, reqs: dict, **kwargs) -> None:
     if "external_files" in reqs:
         return
     reqs["external_files"] = BundleExternalFiles.req(task)
diff --git a/columnflow/calibration/cmsGhent/ttbar_minnlo.py b/columnflow/calibration/cmsGhent/ttbar_minnlo.py
new file mode 100644
index 000000000..d4f6108db
--- /dev/null
+++ b/columnflow/calibration/cmsGhent/ttbar_minnlo.py
@@ -0,0 +1,133 @@
+"""
+Code to add lepton MVA to NanoAOD
+"""
+
+import law
+
+from columnflow.calibration import Calibrator, calibrator
+from columnflow.util import maybe_import, four_vec
+from columnflow.columnar_util import set_ak_column
+from columnflow.columnar_util_Ghent import TetraVec
+from columnflow.tasks.external import BundleExternalFiles
+
+np = maybe_import("numpy")
+ak = maybe_import("awkward")
+
+
+def norm(X, mean, std, scale):
+    if scale == "log":
+        X = np.log(np.clip(X, a_min=1e-6, a_max=None))
+    # recenter and renormalize
+    return (X - mean) / np.where(std > 1e-2, std, 1)
+
+
+@calibrator(
+    uses=four_vec("GenPart", {"pdgId", "statusFlags"}),
+    produces={"ttbar_minnlo_weight"},
+    sandbox="bash::$CF_BASE/sandboxes/venv_onnxruntime.sh",
+    input_norm={
+        # keys: 0 = ttbar, (-)6 = (anti)top
+        # values: mean, std, scale
+        "pt": {
+            0: (3.6520673599656903, 1.0123402362573612, "log"),
+            6: (4.595855742518925, 0.7101176940989488, "log"),
+            -6: (4.5986175957604045, 0.7103218938891299, "log"),
+        },
+        "rapidity": {
+            0: (0.0001718810581680775, 1.0362455506718102, "linear"),
+            6: (0.00022746366634849002, 1.213207643109532, "linear"),
+            -6: (0.00011712322394057398, 1.2076422016031159, "linear"),
+        },
+        "phi": {
+            0: (2.8943571877384285e-05, 1.8139038706413384, "linear"),
+            6: (-0.00028213870737636996, 1.8136544140703632, "linear"),
+            -6: (0.0003628069129526392, 1.8139415747773364, "linear"),
+        },
+        "mass": {
+            0: (6.21729978047307, 0.2771419580231537, "log"),
+            6: (171.93706459943778, 6.9652037622153, "linear"),
+            -6: (171.93691192651536, 6.9500586980501575, "linear"),
+        },
+    },
+)
+def ttbar_minnlo_reweighting_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
+    """
+    Produces the HVQ to MiNNLO reweighting values.
+    Based on https://twiki.cern.ch/twiki/pub/CMS/MLReweighting/CMSSW_script_afterShower.py.txt
+    Requires an external file in the config under ``ttbar_minnlo``:
+
+    .. code-block:: python
+
+        cfg.x.external_files = DotDict.wrap({
+            "ttbar_minnlo":  f"YOURDIRECTORY/mymodel12_13TeV_MiNNLO_afterShower.onnx",            },
+        })
+
+    Requires adding the environment venv_onnx which includes onnx to the analysis or config. E.g.
+
+    analysis_inst.x.bash_sandboxes = [
+        "$CF_BASE/sandboxes/cf.sh",
+        "$CF_BASE/sandboxes/venv_onnxruntime.sh",
+    ]
+
+    """
+    input = []
+    sum_top = None
+    for pdgId in [6, -6, 0]:
+        # get the initial top quarks
+        if not pdgId:
+            top = sum_top
+        else:
+            top = events.GenPart[events.GenPart.pdgId == pdgId]
+            top = top[top.hasFlags("isLastCopy")][:, 0]
+            top = TetraVec(top)
+            # sum top quarks
+            sum_top = top if sum_top is None else (sum_top + top)
+
+        # inputs
+        top_inp = [
+            norm(getattr(top, inp), *norms[pdgId])
+            for inp, norms in self.input_norm.items()
+        ] + [np.full(len(top), pdgId / 10)]
+        input.append(np.array(top_inp))
+
+    # top, antitop, ttbar > ttbar, top, antitop
+    input = np.roll(input, 1, axis=0)
+
+    # (3, 6, N) > (N, 3, 6)
+    input = np.rollaxis(input, -1)
+
+    label_name = self.model.get_outputs()[0].name
+    input_name = self.model.get_inputs()[0].name
+    pred = self.model.run([label_name], {input_name: input.astype(np.float32)})[0]
+    events = set_ak_column(events, "ttbar_minnlo_weight", pred[:, 1] / pred[:, 0])
+
+    return events
+
+
+@ttbar_minnlo_reweighting_producer.requires
+def ttbar_minnlo_reweighting_producer_requires(
+    self: Calibrator,
+    task: law.Task,
+    reqs: dict,
+    **kwargs,
+) -> None:
+    if "external_files" in reqs:
+        return
+    reqs["external_files"] = BundleExternalFiles.req(task)
+
+
+@ttbar_minnlo_reweighting_producer.setup
+def ttbar_minnlo_reweighting_producer_setup(
+    self: Calibrator,
+    task: law.Task,
+    reqs: dict,
+    inputs: dict,
+    reader_targets: law.util.InsertableDict,
+) -> None:
+    bundle = reqs["external_files"]
+
+    # create the xgboost predictor
+    import onnxruntime
+
+    file = bundle.files.ttbar_minnlo.path
+    self.model = onnxruntime.InferenceSession(file)
diff --git a/law.cfg b/law.cfg
index 95010ce73..246ad81e7 100644
--- a/law.cfg
+++ b/law.cfg
@@ -23,7 +23,7 @@ default_config: run2_pp_2018
 default_dataset: st_tchannel_t
 
 production_modules: columnflow.production.{categories,processes,normalization,veto}
-calibration_modules: columnflow.calibration
+calibration_modules: columnflow.calibration,columnflow.calibration.cmsGhent
 selection_modules: columnflow.selection.empty
 reduction_modules: columnflow.reduction.default
 categorization_modules: columnflow.categorization
diff --git a/sandboxes/onnxruntime.txt b/sandboxes/onnxruntime.txt
new file mode 100644
index 000000000..df4fc9be3
--- /dev/null
+++ b/sandboxes/onnxruntime.txt
@@ -0,0 +1,8 @@
+# version 11
+
+# use packages from columnar sandbox as baseline
+-r columnar.txt
+
+# add numpy and onnxruntime with exact version requirement
+onnxruntime==1.15.1
+numpy==1.26.4
diff --git a/sandboxes/venv_onnxruntime.sh b/sandboxes/venv_onnxruntime.sh
new file mode 100644
index 000000000..37206ee4f
--- /dev/null
+++ b/sandboxes/venv_onnxruntime.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Script that sets up a virtual env in $CF_VENV_BASE.
+# For more info on functionality and parameters, see the generic setup script _setup_venv.sh.
+
+action() {
+    local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )"
+    local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )"
+    local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )"
+
+    # set variables and source the generic venv setup
+    export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}"
+    export CF_VENV_NAME="$( basename "${this_file%.sh}" )"
+    export CF_VENV_REQUIREMENTS="${this_dir}/onnxruntime.txt"
+
+    source "${this_dir}/_setup_venv.sh" "$@"
+}
+action "$@"
diff --git a/sandboxes/venv_onnxruntime_dev.sh b/sandboxes/venv_onnxruntime_dev.sh
new file mode 100644
index 000000000..2b09a2240
--- /dev/null
+++ b/sandboxes/venv_onnxruntime_dev.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Script that sets up a virtual env in $CF_VENV_BASE.
+# For more info on functionality and parameters, see the generic setup script _setup_venv.sh.
+
+action() {
+    local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )"
+    local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )"
+    local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )"
+
+    # set variables and source the generic venv setup
+    export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}"
+    export CF_VENV_NAME="$( basename "${this_file%.sh}" )"
+    export CF_VENV_REQUIREMENTS="${this_dir}/onnxruntime.txt,${this_dir}/dev.txt"
+
+    source "${this_dir}/_setup_venv.sh" "$@"
+}
+action "$@"

From 60269fb43c3a8c12e194dd0f335c331fc8e84375 Mon Sep 17 00:00:00 2001
From: Jan van der Linden <jan.vdlinden@cern.ch>
Date: Mon, 2 Mar 2026 14:01:22 +0100
Subject: [PATCH 2/2] np.bool -> np.bool_

---
 columnflow/calibration/cms/jets.py  | 2 +-
 columnflow/production/cms/pileup.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py
index 709f253bc..504b843c0 100644
--- a/columnflow/calibration/cms/jets.py
+++ b/columnflow/calibration/cms/jets.py
@@ -723,7 +723,7 @@ def get_jer_config_default(self: Calibrator) -> DotDict:
     # whether gen jet matching should be performed relative to the nominal jet pt, or the jec varied values
     gen_jet_matching_nominal=False,
     # regions where stochastic smearing is applied
-    stochastic_smearing_mask=lambda self, jets: ak.ones_like(jets.pt, dtype=np.bool),
+    stochastic_smearing_mask=lambda self, jets: ak.ones_like(jets.pt, dtype=np.bool_),
 )
 def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
     """
diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py
index 6675e8d04..d5d843b5f 100644
--- a/columnflow/production/cms/pileup.py
+++ b/columnflow/production/cms/pileup.py
@@ -55,7 +55,7 @@ def pu_weight(
         "NumTrueInteractions": events.Pileup.nTrueInt,
     }
 
-    any_outlier_mask = ak.zeros_like(events.Pileup.nTrueInt, dtype=np.bool)
+    any_outlier_mask = ak.zeros_like(events.Pileup.nTrueInt, dtype=np.bool_)
     for column_name, syst in (
         ("pu_weight", "nominal"),
         ("pu_weight_minbias_xs_up", "up"),