From 8e78169a3f4a4dbd1230c8990fc1230d4b2ef1dd Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 3 Feb 2025 12:29:54 +0100 Subject: [PATCH 01/26] Resolved the conflicts: mostly left local version of the code --- columnflow/plotting/plot_functions_1d.py | 103 ++++++++++++++++++++--- columnflow/production/cms/mc_weight.py | 9 +- columnflow/production/cms/pileup.py | 4 + columnflow/production/normalization.py | 9 +- columnflow/selection/stats.py | 2 +- sandboxes/cmssw_columnar.sh | 6 +- sandboxes/cmssw_default.sh | 4 +- 7 files changed, 116 insertions(+), 21 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index f73ceac9c..2dc19901c 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -33,7 +33,7 @@ plt = maybe_import("matplotlib.pyplot") mplhep = maybe_import("mplhep") od = maybe_import("order") - +import warnings def plot_variable_per_process( hists: OrderedDict, @@ -50,24 +50,62 @@ def plot_variable_per_process( **kwargs, ) -> plt.Figure: """ - TODO. + Plots histograms for multiple processes, ordering them by the total number of events in ascending order + and assigning specific colors to each process based on a predefined color map. """ remove_residual_axis(hists, "shift") - variable_inst = variable_insts[0] - blinding_threshold = kwargs.get("blinding_threshold", None) + # Define the color maps + color_maps = { + "6": ["#5790fc", "#7a21dd", "#964a8b", "#9c9ca1", "#e42536", "#f89c20"], + "8": ["#1845fb", "#578dff", "#656364", "#86c8dd", "#adad7d", "#c849a9", "#c91f16", "#ff5e02"], + "10": ["#3f90da", "#717581", "#832db6", "#92dadd", "#94a4a2", "#a96b59", "#b9ac70", "#bd1f01", "#e76300", "#ffa90e"], + } - if blinding_threshold: - hists = blind_sensitive_bins(hists, config_inst, blinding_threshold) - hists = apply_variable_settings(hists, variable_insts, variable_settings) - hists = apply_process_settings(hists, process_settings) - hists = apply_density_to_hists(hists, density) + # Basic colors for more than 24 processes + basic_colors = ["#FF0000", "#0000FF", "#00FF00", "#FFFF00", "#FF00FF", "#00FFFF", "#800000", "#808000"] + + # Calculate the total number of events for each process + total_events = {key: sum(hist.values()) for key, hist in hists.items()} + + # Sort processes by total number of events in ascending order + # sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]])) + # Sort processes by total number of events in descending order + sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + + variable_inst = variable_insts[0] + sorted_hists = apply_variable_settings(sorted_hists, variable_insts, variable_settings) + sorted_hists = apply_process_settings(sorted_hists, process_settings) + sorted_hists = apply_density_to_hists(sorted_hists, density) plot_config = prepare_plot_config( - hists, + sorted_hists, shape_norm=shape_norm, hide_errors=hide_errors, ) + + + if 'data' not in plot_config: + + # Determine the appropriate color map based on the number of processes + num_processes = len(sorted_hists) + if num_processes <= 6: + colors = color_maps["6"][:num_processes] + elif num_processes == 7: + colors = color_maps["8"][:num_processes] + elif num_processes <= 10: + colors = color_maps["8"][:num_processes] if num_processes == 8 else color_maps["10"][:num_processes] + elif num_processes <= 18: + colors = color_maps["10"] + color_maps["8"][:num_processes - 10] + elif num_processes <= 24: + 
colors = color_maps["10"] + color_maps["8"] + color_maps["6"][:num_processes - 18] + else: + warnings.warn("You are about to plot more than 24 processes together, please reconsider... (Colors not in the approved palette will be assigned)") + colors = color_maps["10"] + color_maps["8"] + color_maps["6"] + colors += basic_colors[:num_processes - 24] + plot_config["mc_stack"]["kwargs"]["color"] = colors[:num_processes] + + default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, @@ -80,6 +118,51 @@ def plot_variable_per_process( return plot_all(plot_config, style_config, **kwargs) +# def plot_variable_per_process( +# hists: OrderedDict, +# config_inst: od.Config, +# category_inst: od.Category, +# variable_insts: list[od.Variable], +# style_config: dict | None = None, +# density: bool | None = False, +# shape_norm: bool | None = False, +# yscale: str | None = "", +# hide_errors: bool | None = None, +# process_settings: dict | None = None, +# variable_settings: dict | None = None, +# **kwargs, +# ) -> plt.Figure: +# """ +# TODO. +# """ +# remove_residual_axis(hists, "shift") + +# variable_inst = variable_insts[0] +# blinding_threshold = kwargs.get("blinding_threshold", None) + +# if blinding_threshold: +# hists = blind_sensitive_bins(hists, config_inst, blinding_threshold) +# hists = apply_variable_settings(hists, variable_insts, variable_settings) +# hists = apply_process_settings(hists, process_settings) +# hists = apply_density_to_hists(hists, density) + +# plot_config = prepare_plot_config( +# hists, +# shape_norm=shape_norm, +# hide_errors=hide_errors, +# ) + +# default_style_config = prepare_style_config( +# config_inst, category_inst, variable_inst, density, shape_norm, yscale, +# ) + +# style_config = law.util.merge_dicts(default_style_config, style_config, deep=True) +# if shape_norm: +# style_config["ax_cfg"]["ylabel"] = r"$\Delta N/N$" + +# return plot_all(plot_config, style_config, **kwargs) + + def plot_variable_variants( hists: OrderedDict, config_inst: od.Config, diff --git a/columnflow/production/cms/mc_weight.py b/columnflow/production/cms/mc_weight.py index 9994c5b5a..e56b60b6e 100644 --- a/columnflow/production/cms/mc_weight.py +++ b/columnflow/production/cms/mc_weight.py @@ -31,11 +31,14 @@ def mc_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: [1] https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookNanoAOD?rev=99#Weigths """ + # # determine the mc_weight + # mc_weight = events.genWeight + # if has_ak_column(events, "LHEWeight.originalXWGTUP") and ak.all(events.genWeight == 1.0): + # mc_weight = events.LHEWeight.originalXWGTUP # determine the mc_weight - mc_weight = events.genWeight + mc_weight = np.sign(events.genWeight) if has_ak_column(events, "LHEWeight.originalXWGTUP") and ak.all(events.genWeight == 1.0): - mc_weight = events.LHEWeight.originalXWGTUP - + mc_weight = np.sign(events.LHEWeight.originalXWGTUP) # store the column events = set_ak_column(events, "mc_weight", mc_weight, value_type=np.float32) diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py index 5e025c120..438d43889 100644 --- a/columnflow/production/cms/pileup.py +++ b/columnflow/production/cms/pileup.py @@ -54,6 +54,10 @@ def pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: # evaluate and store the produced column pu_weight = self.pileup_corrector.evaluate(*inputs) + ##################################################### + ### Keeps the pu_weight lower then 300 + 
pu_weight[pu_weight > 300] = 0 + ##################################################### events = set_ak_column(events, column_name, pu_weight, value_type=np.float32) return events diff --git a/columnflow/production/normalization.py b/columnflow/production/normalization.py index 9c2dd296f..66616ac7e 100644 --- a/columnflow/production/normalization.py +++ b/columnflow/production/normalization.py @@ -207,10 +207,8 @@ def normalization_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Arra f"process_id field contains id(s) {invalid_ids} for which no cross sections were " f"found; process ids with cross sections: {self.xs_process_ids}", ) - # read the weight per process (defined as lumi * xsec / sum_weights) from the lookup table process_weight = np.squeeze(np.asarray(self.process_weight_table[0, process_id].todense())) - # compute the weight and store it norm_weight = events.mc_weight * process_weight events = set_ak_column(events, self.weight_name, norm_weight, value_type=np.float32) @@ -352,13 +350,18 @@ def normalization_weights_setup( f"energy of {self.config_inst.campaign.ecm}", ) sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] + #quick fix that need to be fixed + ################################ + #n_evt_per_file = /self.dataset_inst.n_files + sum_weights = self.dataset_inst.n_events + ################################ xsec = process_inst.get_xsec(self.config_inst.campaign.ecm).nominal process_weight_table[0, process_inst.id] = lumi * xsec / sum_weights + self.process_weight_table = process_weight_table self.xs_process_ids = set(self.process_weight_table.rows[0]) - @normalization_weights.init def normalization_weights_init(self: Producer) -> None: """ diff --git a/columnflow/selection/stats.py b/columnflow/selection/stats.py index 5038a6a03..8141fb957 100644 --- a/columnflow/selection/stats.py +++ b/columnflow/selection/stats.py @@ -145,7 +145,7 @@ def increment_stats( "'num' or 'sum'", ) - # interpret obj based on the aoperation to be applied + # interpret obj based on the operation to be applied weights = None weight_mask = Ellipsis if isinstance(obj, (tuple, list)): diff --git a/sandboxes/cmssw_columnar.sh b/sandboxes/cmssw_columnar.sh index 350a954be..f626eaccd 100644 --- a/sandboxes/cmssw_columnar.sh +++ b/sandboxes/cmssw_columnar.sh @@ -10,8 +10,10 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH="el9_amd64_gcc11" - export CF_CMSSW_VERSION="CMSSW_13_0_19" + # export CF_SCRAM_ARCH="$( [ "${os_version}" = "8" ] && echo "el8" || echo "slc7" )_amd64_gcc10" + # export CF_CMSSW_VERSION="CMSSW_12_6_2" + export CF_SCRAM_ARCH=el9_amd64_gcc12 + export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed diff --git a/sandboxes/cmssw_default.sh b/sandboxes/cmssw_default.sh index d2e31eb15..cbfd928f8 100644 --- a/sandboxes/cmssw_default.sh +++ b/sandboxes/cmssw_default.sh @@ -10,8 +10,8 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH="el9_amd64_gcc11" - export CF_CMSSW_VERSION="CMSSW_13_0_19" + export CF_SCRAM_ARCH=el9_amd64_gcc12 + export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed From d4f96f7be432264d9fd62c12bf5653f636e2a992 Mon Sep 17 00:00:00 2001 
From: Jacopo Malvaso Date: Tue, 26 Nov 2024 11:44:33 +0100 Subject: [PATCH 02/26] Plotting script with CAT colors --- columnflow/plotting/plot_functions_1d.py | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 2dc19901c..716ec5421 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -33,7 +33,8 @@ plt = maybe_import("matplotlib.pyplot") mplhep = maybe_import("mplhep") od = maybe_import("order") -import warnings + +logger = law.logger.get_logger(__name__) def plot_variable_per_process( hists: OrderedDict, @@ -50,8 +51,10 @@ def plot_variable_per_process( **kwargs, ) -> plt.Figure: """ - Plots histograms for multiple processes, ordering them by the total number of events in ascending order - and assigning specific colors to each process based on a predefined color map. + Plots histograms for multiple processes, ordering them by a custom order: + the process with the highest number of events first, followed by the others, + and the process with the second highest number of events last. + Handles cases with only one or two processes. """ remove_residual_axis(hists, "shift") @@ -68,10 +71,25 @@ def plot_variable_per_process( # Calculate the total number of events for each process total_events = {key: sum(hist.values()) for key, hist in hists.items()} - # Sort processes by total number of events in ascending order - # sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]])) # Sort processes by total number of events in descending order - sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + + # Get keys of sorted processes + sorted_keys = list(sorted_hists_desc.keys()) + + # Handle cases with 1 or 2 processes + if len(sorted_keys) == 1: + # Only one process, no special reordering needed + custom_order = sorted_keys + elif len(sorted_keys) == 2: + # Two processes, highest first, then second highest + custom_order = sorted_keys + else: + # More than two processes, custom order: highest, rest, then second highest + custom_order = [sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] + + # Reorder histograms based on custom order + sorted_hists = OrderedDict((key, sorted_hists_desc[key]) for key in custom_order) variable_inst = variable_insts[0] sorted_hists = apply_variable_settings(sorted_hists, variable_insts, variable_settings) @@ -83,7 +101,6 @@ def plot_variable_per_process( shape_norm=shape_norm, hide_errors=hide_errors, ) - if 'data' not in plot_config: @@ -100,13 +117,11 @@ def plot_variable_per_process( elif num_processes <= 24: colors = color_maps["10"] + color_maps["8"] + color_maps["6"][:num_processes - 18] else: - warnings.warn("You are about to plot more than 24 processes together, please reconsider... (Colors not in the approved palette will be assigned)") + logger.warning("You are about to plot more than 24 processes together, please reconsider... 
(Colors not in the approved palette will be assigned)") colors = color_maps["10"] + color_maps["8"] + color_maps["6"] colors += basic_colors[:num_processes - 24] plot_config["mc_stack"]["kwargs"]["color"] = colors[:num_processes] - - default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, ) @@ -118,6 +133,7 @@ def plot_variable_per_process( return plot_all(plot_config, style_config, **kwargs) + # def plot_variable_per_process( # hists: OrderedDict, # config_inst: od.Config, From 0ece10d03f590c9496619e92073900d7db8825fd Mon Sep 17 00:00:00 2001 From: Jacopo Malvaso Date: Tue, 26 Nov 2024 11:55:09 +0100 Subject: [PATCH 03/26] Normalization, pileup, job submission and other small updates --- columnflow/calibration/cms/jets.py | 228 ++++++++++++++-------------- columnflow/calibration/cms/met.py | 29 ++-- columnflow/calibration/util.py | 27 +++- columnflow/columnar_util.py | 62 ++++++++ columnflow/production/cms/pileup.py | 1 + columnflow/selection/cms/jets.py | 4 +- columnflow/tasks/plotting.py | 5 + sandboxes/cmssw_columnar.sh | 4 +- sandboxes/cmssw_default.sh | 4 +- 9 files changed, 231 insertions(+), 133 deletions(-) diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py index bd910264b..32c7c816b 100644 --- a/columnflow/calibration/cms/jets.py +++ b/columnflow/calibration/cms/jets.py @@ -3,6 +3,7 @@ """ Jet energy corrections and jet resolution smearing. """ +from pprint import pprint import functools @@ -29,6 +30,8 @@ set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) +import difflib + def get_evaluators( correction_set: correctionlib.highlevel.CorrectionSet, names: list[str], @@ -45,25 +48,32 @@ def get_evaluators( :return: List of compounded corrections, see :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` """ - # raise nice error if keys not found available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) - missing_keys = set(names) - available_keys - if missing_keys: - raise RuntimeError("corrections not found:" + "".join( - f"\n - {name}" for name in names if name in missing_keys - ) + "\navailable:" + "".join( - f"\n - {name}" for name in sorted(available_keys) - )) - - # retrieve the evaluators + corrected_names = [] + + for name in names: + if name not in available_keys: + # Find the closest match using difflib + closest_matches = difflib.get_close_matches(name, available_keys, n=1) + if closest_matches: + closest_match = closest_matches[0] + print( + f"Correction '{name}' not found. 
Using closest match: '{closest_match}'", + ) + corrected_names.append(closest_match) + else: + raise RuntimeError(f"Correction '{name}' not found and no close match available.") + else: + corrected_names.append(name) + + # Retrieve the evaluators return [ correction_set.compound[name] if name in correction_set.compound else correction_set[name] - for name in names + for name in corrected_names ] - def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: """ Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` @@ -240,13 +250,14 @@ def get_jec_config_default(self: Calibrator) -> DotDict: raw_met_name="RawMET", # custom uncertainty sources, defaults to config when empty uncertainty_sources=None, - # toggle for propagation to MET + # toggle for propagation to PuppiMET propagate_met=True, - # function to determine the correction file - get_jec_file=get_jerc_file_default, - # function to determine the jec configuration dict + # # function to determine the correction file + get_jec_file=get_jec_file_default, + # # function to determine the jec configuration dict get_jec_config=get_jec_config_default, ) + def jec( self: Calibrator, events: ak.Array, @@ -256,7 +267,7 @@ def jec( ) -> ak.Array: """Performs the jet energy corrections (JECs) and uncertainty shifts using the :external+correctionlib:doc:`index`, optionally - propagating the changes to the MET. + propagating the changes to the PuppiMET. The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate (default: ``Jet``, i.e. AK4 jets). @@ -313,16 +324,18 @@ def jec( :param events: awkward array containing events to process :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values - to the missing transverse energy (MET) using + to the missing transverse energy (PuppiMET) using :py:func:`~columnflow.calibration.util.propagate_met` for events where ``met.pt > *min_pt_met_prop*``. :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet - values to the missing transverse energy (MET) using + values to the missing transverse energy (PuppiMET) using :py:func:`~columnflow.calibration.util.propagate_met` for events where ``met.eta > *min_eta_met_prop*``. 
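    A minimal sketch of what the propagation does (following
    :py:func:`~columnflow.calibration.util.propagate_met`): the change of the summed jet
    momentum is subtracted from the input MET vector, i.e. in px/py components

    .. code-block:: python

        # illustrative variable names only
        met_px_new = met_pt * np.cos(met_phi) - (jet_px_corrected - jet_px_before)
        met_py_new = met_pt * np.sin(met_phi) - (jet_py_corrected - jet_py_before)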
""" # noqa - # use local variable for convenience - jet_name = self.jet_name + + # calculate uncorrected pt, mass + events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) + events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) # calculate uncorrected pt, mass events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) @@ -340,6 +353,8 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): # apply all correctors sequentially, updating the pt each time full_correction = ak.ones_like(pt, dtype=np.float32) + + for corrector in self.evaluators[evaluator_key]: # determine correct inputs (change depending on corrector) inputs = [ @@ -348,6 +363,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): ] correction = ak_evaluate(corrector, *inputs) # update pt for subsequent correctors + #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes variable_map["JetPt"] = variable_map["JetPt"] * correction full_correction = full_correction * correction @@ -361,7 +377,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): ) # correct jets with only a subset of correction levels - # (for calculating TypeI MET correction) + # (for calculating TypeI PuppiMET correction) if self.propagate_met: # get correction factors jec_factors_subset_type1_met = correct_jets( @@ -378,7 +394,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - # store pt and phi of the full jet system for MET propagation, including a selection in raw info + # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) jetsum = events[jet_name][met_prop_mask].sum(axis=1) @@ -408,18 +424,20 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): jetsum = events[jet_name][met_prop_mask].sum(axis=1) jetsum_pt_all_levels = jetsum.pt jetsum_phi_all_levels = jetsum.phi - # propagate changes to MET, starting from jets corrected with subset of JEC levels + + # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels # (recommendation is to propagate only L2 corrections and onwards) met_pt, met_phi = propagate_met( jetsum_pt_subset_type1_met, jetsum_phi_subset_type1_met, jetsum_pt_all_levels, jetsum_phi_all_levels, - events[self.raw_met_name].pt, - events[self.raw_met_name].phi, + events.RawPuppiMET.pt, + events.RawPuppiMET.phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt", met_pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi", met_phi) + + events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) + events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) # variable naming conventions variable_map = { @@ -447,7 +465,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), ) - # propagate shifts to MET + # propagate shifts to PuppiMET if self.propagate_met: jet_pt_up = 
events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] @@ -467,10 +485,10 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): met_pt, met_phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jec_{name}_up", met_pt_up) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jec_{name}_down", met_pt_down) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jec_{name}_up", met_phi_up) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jec_{name}_down", met_phi_down) + events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) + events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) + events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) + events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) return events @@ -497,14 +515,14 @@ def jec_init(self: Calibrator) -> None: for junc_dir in ("up", "down") } - # add MET variables + # add PuppiMET variables if self.propagate_met: - self.uses.add(f"{self.raw_met_name}.{{pt,phi}}") - self.produces.add(f"{self.met_name}.{{pt,phi}}") + self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} + self.produces |= {"PuppiMET.pt", "PuppiMET.phi"} - # add shifted MET variables + # add shifted PuppiMET variables self.produces |= { - f"{self.met_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" + f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" for shifted_var in ("pt", "phi") for junc_name in sources for junc_dir in ("up", "down") @@ -544,27 +562,25 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert .. code-block:: python cfg.x.jec = DotDict.wrap({ - "Jet": { - # campaign name for this JEC correctiono - "campaign": f"Summer19UL{year2}{jerc_postfix}", - # version of the corrections - "version": "V7", - # Type of jets that the corrections should be applied on - "jet_type": "AK4PFchs", - # relevant levels in the derivation process of the JEC - "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], - # relevant levels in the derivation process of the Type 1 MET JEC - "levels_for_type1_met": ["L1FastJet"], - # names of the uncertainties to be applied - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ], - }, + # campaign name for this JEC correctiono + "campaign": f"Summer19UL{year2}{jerc_postfix}", + # version of the corrections + "version": "V7", + # Type of jets that the corrections should be applied on + "jet_type": "AK4PFchs", + # relevant levels in the derivation process of the JEC + "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], + # relevant levels in the derivation process of the Type 1 PuppiMET JEC + "levels_for_type1_met": ["L1FastJet"], + # names of the uncertainties to be applied + "uncertainty_sources": [ + "Total", + "CorrelationGroupMPFInSitu", + "CorrelationGroupIntercalibration", + "CorrelationGroupbJES", + "CorrelationGroupFlavor", + "CorrelationGroupUncorrelated", + ], }) :param reqs: Requirement dictionary for this @@ -572,10 +588,12 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert :param inputs: Additional inputs, currently not used :param reader_targets: TODO: add documentation """ - bundle = reqs["external_files"] + bundle = reqs["external_files"] + # import the 
correction sets from the external file import correctionlib + correction_set = correctionlib.CorrectionSet.from_string( self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), ) @@ -585,6 +603,7 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): if is_data: + jec_era = self.dataset_inst.get_aux("jec_era", None) # if no special JEC era is specified, infer based on 'era' if jec_era is None: @@ -601,8 +620,11 @@ def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): sources = self.uncertainty_sources if sources is None: sources = jec_cfg.uncertainty_sources - - jec_keys = make_jme_keys(jec_cfg.levels) + + if self.dataset_inst.is_data : + jec_keys = make_jme_keys(jec_cfg.levels_DATA) + else : + jec_keys = make_jme_keys(jec_cfg.levels_MC) jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys @@ -617,14 +639,8 @@ def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): # custom jec calibrator that only runs nominal correction jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) -# explicit calibrators for standard jet collections -jec_ak4 = jec.derive("jec_ak4", cls_dict={"jet_name": "Jet"}) -jec_ak8 = jec.derive("jec_ak8", cls_dict={"jet_name": "FatJet", "propagate_met": False}) -jec_ak4_nominal = jec_ak4.derive("jec_ak4", cls_dict={"uncertainty_sources": []}) -jec_ak8_nominal = jec_ak8.derive("jec_ak8", cls_dict={"uncertainty_sources": []}) - - -def get_jer_config_default(self: Calibrator) -> DotDict: +# define default functions for jec calibrator +def get_jer_file(self, external_files: DotDict) -> str: """ Load config relevant to the jet energy resolution (JER) smearing. 
@@ -679,15 +695,18 @@ def get_jer_config_default(self: Calibrator) -> DotDict: uses={ optional("Rho.fixedGridRhoFastjetAll"), optional("fixedGridRhoFastjetAll"), + "GenJet.pt", "GenJet.eta", "GenJet.phi", + "PuppiMET.pt", "PuppiMET.phi", attach_coffea_behavior, }, - # name of the jet collection to smear - jet_name="Jet", - # name of the associated gen jet collection - gen_jet_name="GenJet", - # name of the associated MET collection - met_name="MET", - # toggle for propagation to MET + produces={ + "Jet.pt", "Jet.mass", + "Jet.pt_unsmeared", "Jet.mass_unsmeared", + "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", + "PuppiMET.pt", "PuppiMET.phi", + "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", + }, + # toggle for propagation to PuppiMET propagate_met=True, # only run on mc mc_only=True, @@ -875,27 +894,27 @@ def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: if self.propagate_met: # save unsmeared quantities - events = set_ak_column_f32(events, f"{self.met_name}.pt_unsmeared", events[self.met_name].pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi_unsmeared", events[self.met_name].phi) + events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) + events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) # get pt and phi of all jets after correcting jetsum = events[jet_name].sum(axis=1) jetsum_pt_after = jetsum.pt jetsum_phi_after = jetsum.phi - # propagate changes to MET + # propagate changes to PuppiMET met_pt, met_phi = propagate_met( jetsum_pt_before, jetsum_phi_before, jetsum_pt_after, jetsum_phi_after, - events[self.met_name].pt, - events[self.met_name].phi, + events.PuppiMET.pt, + events.PuppiMET.phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt", met_pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi", met_phi) + events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) + events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - # syst variations on top of corrected MET + # syst variations on top of corrected PuppiMET met_pt_up, met_phi_up = propagate_met( jetsum_pt_after, jetsum_phi_after, @@ -912,10 +931,10 @@ def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: met_pt, met_phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jer_up", met_pt_up) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jer_down", met_pt_down) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jer_up", met_phi_up) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jer_down", met_phi_down) + events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) + events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) + events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) + events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) return events @@ -926,22 +945,13 @@ def jer_init(self: Calibrator) -> None: lower_first = lambda s: s[0].lower() + s[1:] if s else s self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" - # register used jet columns - self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,{self.gen_jet_idx_column}}}") - - # register used gen jet columns - self.uses.add(f"{self.gen_jet_name}.{{pt,eta,phi}}") - - # register produced jet columns - self.produces.add(f"{self.jet_name}.{{pt,mass}}{{,_unsmeared,_jer_up,_jer_down}}") - - # register produced MET columns - if self.propagate_met: - # register used MET columns - 
self.uses.add(f"{self.met_name}.{{pt,phi}}") - - # register produced MET columns - self.produces.add(f"{self.met_name}.{{pt,phi}}{{,_jer_up,_jer_down,_unsmeared}}") + self.uses |= { + "PuppiMET.pt", "PuppiMET.phi", + } + self.produces |= { + "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", + "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", + } @jer.requires @@ -994,7 +1004,7 @@ def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert correction_set = correctionlib.CorrectionSet.from_string( self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), ) - + # compute JER keys from config information jer_cfg = self.get_jer_config() jer_keys = { @@ -1032,11 +1042,7 @@ def deterministic_normal(loc, scale, seed): @calibrator( uses={jec, jer}, produces={jec, jer}, - # name of the jet collection to smear - jet_name="Jet", - # name of the associated gen jet collection (for JER smearing) - gen_jet_name="GenJet", - # toggle for propagation to MET + # toggle for propagation to PuppiMET propagate_met=None, # functions to determine configs and files get_jec_file=None, diff --git a/columnflow/calibration/cms/met.py b/columnflow/calibration/cms/met.py index 01b6ea9ef..aec3ca73b 100644 --- a/columnflow/calibration/cms/met.py +++ b/columnflow/calibration/cms/met.py @@ -1,7 +1,7 @@ # coding: utf-8 """ -MET corrections. +PuppiMET corrections. """ from columnflow.calibration import Calibrator, calibrator @@ -13,9 +13,8 @@ @calibrator( - uses={"run", "PV.npvs"}, - # name of the MET collection to calibrate - met_name="MET", + uses={"run", "PV.npvs", "PuppiMET.pt", "PuppiMET.phi"}, + produces={"PuppiMET.pt", "PuppiMET.phi"}, # function to determine the correction file get_met_file=(lambda self, external_files: external_files.met_phi_corr), # function to determine met correction config @@ -23,9 +22,9 @@ ) def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: """ - Performs the MET phi (type II) correction using the + Performs the PuppiMET phi (type II) correction using the :external+correctionlib:doc:`index` for events there the - uncorrected MET pt is below the beam energy (extracted from ``config_inst.campaign.ecm * 0.5``). + uncorrected PuppiMET pt is below the beam energy (extracted from ``config_inst.campaign.ecm * 0.5``). Requires an external file in the config under ``met_phi_corr``: .. 
code-block:: python @@ -54,16 +53,16 @@ def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: met = events[self.met_name] # copy the intial pt and phi values - corr_pt = np.array(met.pt, dtype=np.float32) - corr_phi = np.array(met.phi, dtype=np.float32) + corr_pt = np.array(events.PuppiMET.pt, dtype=np.float32) + corr_phi = np.array(events.PuppiMET.phi, dtype=np.float32) - # select only events where MET pt is below the expected beam energy - mask = met.pt < (0.5 * self.config_inst.campaign.ecm) + # select only events where PuppiMET pt is below the expected beam energy + mask = events.PuppiMET.pt < (0.5 * self.config_inst.campaign.ecm) # arguments for evaluation args = ( - met.pt[mask], - met.phi[mask], + events.PuppiMET.pt[mask], + events.PuppiMET.phi[mask], ak.values_astype(events.PV.npvs[mask], np.float32), ak.values_astype(events.run[mask], np.float32), ) @@ -73,8 +72,8 @@ def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: corr_phi[mask] = self.met_phi_corrector.evaluate(*args) # save the corrected values - events = set_ak_column(events, f"{self.met_name}.pt", corr_pt, value_type=np.float32) - events = set_ak_column(events, f"{self.met_name}.phi", corr_phi, value_type=np.float32) + events = set_ak_column(events, "PuppiMET.pt", corr_pt, value_type=np.float32) + events = set_ak_column(events, "PuppiMET.phi", corr_phi, value_type=np.float32) return events @@ -110,7 +109,7 @@ def met_phi_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: di :param reader_targets: Additional targets, currently not used. """ bundle = reqs["external_files"] - + # create the pt and phi correctors import correctionlib correction_set = correctionlib.CorrectionSet.from_string( diff --git a/columnflow/calibration/util.py b/columnflow/calibration/util.py index ac20de9bb..af9955836 100644 --- a/columnflow/calibration/util.py +++ b/columnflow/calibration/util.py @@ -13,6 +13,9 @@ np = maybe_import("numpy") ak = maybe_import("awkward") +import law + +logger = law.logger.get_logger(__name__) # https://github.com/scikit-hep/awkward/issues/489\#issuecomment-711090923 def ak_random(*args, rand_func: Callable) -> ak.Array: @@ -91,7 +94,29 @@ def propagate_met( if jet_pt2.ndim > 1: jet_px2 = ak.sum(jet_px2, axis=1) jet_py2 = ak.sum(jet_py2, axis=1) - + + # RawPuppiMET sanity check + + crazy_PuppiMET_values_mask = met_pt1 > 14*10**3 + + crazy_PuppiMET_values = met_pt1[crazy_PuppiMET_values_mask] + + # Get the indices of the infinite values + crazy_PuppiMET_indices = np.where(crazy_PuppiMET_values_mask)[0] + + # Count the number of infinite values + crazy_PuppiMET_count = ak.sum(crazy_PuppiMET_values_mask) + + if crazy_PuppiMET_count > 0: + # Replace infinite values with 0 + met_pt1 = ak.where(~crazy_PuppiMET_values_mask, met_pt1, 1000) + + # Raise a warning about the replacement + logger.warning( + f"Warning: Found and replaced {crazy_PuppiMET_count} crazy value(s) {crazy_PuppiMET_values.tolist()} in 'RawPuppiMET.pt' with 1000.\n" + f"Indices in the chuck: {crazy_PuppiMET_indices.tolist()}\n" + f"We will get rid of these events in the selection step") + # propagate to met met_px2 = met_pt1 * np.cos(met_phi1) - (jet_px2 - jet_px1) met_py2 = met_pt1 * np.sin(met_phi1) - (jet_py2 - jet_py1) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 7651675f5..9057b3faf 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -1354,6 +1354,68 @@ def ak_copy(ak_array: ak.Array) -> ak.Array: return 
layout_ak_array(np.array(ak.flatten(ak_array)), ak_array) +def fill_hist( + h: hist.Hist, + data: ak.Array | np.array | dict[str, ak.Array | np.array], + *, + last_edge_inclusive: bool | None = None, + fill_kwargs: dict[str, Any] | None = None, +) -> None: + """ + Fills a histogram *h* with data from an awkward array, numpy array or nested dictionary *data*. + The data is assumed to be structured in the same way as the histogram axes. If + *last_edge_inclusive* is *True*, values that would land exactly on the upper-most bin edge of an + axis are shifted into the last bin. If it is *None*, the behavior is determined automatically + and depends on the variable axis type. In this case, shifting is applied to all continuous, + non-circular axes. + """ + if fill_kwargs is None: + fill_kwargs = {} + + # helper to decide whether the variable axis qualifies for shifting the last bin + def allows_shift(ax) -> bool: + return ax.traits.continuous and not ax.traits.circular + + # determine the axis names, figure out which which axes the last bin correction should be done + axis_names = [] + correct_last_bin_axes = [] + for ax in h.axes: + axis_names.append(ax.name) + # include values hitting last edge? + if not len(ax.widths) or not isinstance(ax, hist.axis.Variable): + continue + if (last_edge_inclusive is None and allows_shift(ax)) or last_edge_inclusive: + correct_last_bin_axes.append(ax) + + # check data + if not isinstance(data, dict): + if len(axis_names) != 1: + raise ValueError("got multi-dimensional hist but only one dimensional data") + data = {axis_names[0]: data} + else: + for name in axis_names: + if name not in data and name not in fill_kwargs: + raise ValueError(f"missing data for histogram axis '{name}'") + + # correct last bin values + for ax in correct_last_bin_axes: + right_egde_mask = ak.flatten(data[ax.name], axis=None) == ax.edges[-1] + if np.any(right_egde_mask): + data[ax.name] = ak.copy(data[ax.name]) + flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 + + # fill + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + + class RouteFilter(object): """ Shallow helper class that handles removal of routes in an awkward array that do not match those diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py index 438d43889..346be3125 100644 --- a/columnflow/production/cms/pileup.py +++ b/columnflow/production/cms/pileup.py @@ -58,6 +58,7 @@ def pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: ### Keeps the pu_weight lower then 300 pu_weight[pu_weight > 300] = 0 ##################################################### + events = set_ak_column(events, column_name, pu_weight, value_type=np.float32) return events diff --git a/columnflow/selection/cms/jets.py b/columnflow/selection/cms/jets.py index 945ed1b1e..89be152c3 100644 --- a/columnflow/selection/cms/jets.py +++ b/columnflow/selection/cms/jets.py @@ -22,7 +22,7 @@ @selector( uses={ - "Jet.{pt,eta,phi,mass,jetId,chEmEF}", optional("Jet.puId"), + "Jet.{pt,eta,phi,mass,jetId,chEmEF}", "Muon.{pt,eta,phi,mass,isPFcand}", }, produces={"Jet.veto_map_mask"}, @@ -59,7 +59,7 @@ def jet_veto_map( # loose jet selection jet_mask = ( (jet.pt > 15) & - (jet.jetId >= 2) & # tight id + (jet.jetId >= 2) & # tight id (jet.chEmEF < 0.9) & 
ak.all(events.Jet.metric_table(muon) >= 0.2, axis=2) ) diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index b922684a8..6709d3fbb 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -213,6 +213,11 @@ def create_branch_map(self): def workflow_requires(self): reqs = super().workflow_requires() + + # no need to require merged histograms since each branch already requires them as a workflow + # if self.workflow == "local": + # reqs.pop("merged_hists", None) + return reqs def requires(self): diff --git a/sandboxes/cmssw_columnar.sh b/sandboxes/cmssw_columnar.sh index f626eaccd..5dd283d28 100644 --- a/sandboxes/cmssw_columnar.sh +++ b/sandboxes/cmssw_columnar.sh @@ -10,10 +10,10 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_SCRAM_ARCH="el9_amd64_gcc12" + export CF_CMSSW_VERSION="CMSSW_14_1_0_pre4" # export CF_SCRAM_ARCH="$( [ "${os_version}" = "8" ] && echo "el8" || echo "slc7" )_amd64_gcc10" # export CF_CMSSW_VERSION="CMSSW_12_6_2" - export CF_SCRAM_ARCH=el9_amd64_gcc12 - export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed diff --git a/sandboxes/cmssw_default.sh b/sandboxes/cmssw_default.sh index cbfd928f8..95dcaf592 100644 --- a/sandboxes/cmssw_default.sh +++ b/sandboxes/cmssw_default.sh @@ -10,8 +10,8 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH=el9_amd64_gcc12 - export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 + export CF_SCRAM_ARCH="el9_amd64_gcc12" + export CF_CMSSW_VERSION="CMSSW_14_1_0_pre4" export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed From c83375288715e459a3ce8663c77fd14ad1dfb19e Mon Sep 17 00:00:00 2001 From: Jacopo Malvaso Date: Tue, 3 Dec 2024 15:07:17 +0100 Subject: [PATCH 04/26] Modification to the legend position and columns --- columnflow/plotting/plot_all.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/plot_all.py b/columnflow/plotting/plot_all.py index 60207a301..a30db1e58 100644 --- a/columnflow/plotting/plot_all.py +++ b/columnflow/plotting/plot_all.py @@ -273,9 +273,15 @@ def plot_all( if not skip_legend: # resolve legend kwargs legend_kwargs = { - "ncols": 1, - "loc": "upper right", + "ncol": 2, + "loc": "center left", + "bbox_to_anchor": (0.35, 0.8), # Position the legend outside the plot + # Moves the legend to the right side of the plot. + # The first value (1) controls the horizontal position, + # and the second value (0.95) controls the vertical position. 
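+            # (with the default bbox_transform this tuple is in axes coordinates, so
+            # (0.35, 0.8) anchors the legend at 35% of the x range and 80% of the y range)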
+ "fontsize": 16, } + legend_kwargs.update(style_config.get("legend_cfg", {})) # retrieve the legend handles and their labels From 394805c0f3c5cb34d155d9eb54f42941f66786f6 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 10 Dec 2024 15:50:42 +0100 Subject: [PATCH 05/26] Fake factor estimation code: initial commit --- columnflow/tasks/data_driven_methods.py | 731 ++++++++++++++++++++++++ law.cfg | 1 + 2 files changed, 732 insertions(+) create mode 100644 columnflow/tasks/data_driven_methods.py diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py new file mode 100644 index 000000000..0f47638e6 --- /dev/null +++ b/columnflow/tasks/data_driven_methods.py @@ -0,0 +1,731 @@ +# # coding: utf-8 + +# """ +# Tasks to plot different types of histograms. +# """ + +# from collections import OrderedDict +# from abc import abstractmethod + +# import law +# import luigi + +# from columnflow.tasks.framework.base import Requirements, ShiftTask +# from columnflow.tasks.framework.mixins import ( +# CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, +# CategoriesMixin, ShiftSourcesMixin, +# ) +# from columnflow.tasks.framework.plotting import ( +# PlotBase, PlotBase2D, ProcessPlotSettingMixin, VariablePlotSettingMixin, +# ) +# from columnflow.tasks.framework.decorators import view_output_plots +# from columnflow.tasks.framework.remote import RemoteWorkflow +# from columnflow.tasks.histograms import MergeHistograms +# from columnflow.util import DotDict, dev_sandbox, dict_add_strict + + +# class DataDrivenEstimationBase( +# VariablePlotSettingMixin, +# ProcessPlotSettingMixin, +# CategoriesMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) +# """sandbox to use for this task. Defaults to *default_columnar_sandbox* from +# analysis config. 
+# """ + +# exclude_index = True + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) +# """Set upstream requirements, in this case :py:class:`~columnflow.tasks.histograms.MergeHistograms` +# """ + +# def store_parts(self): +# parts = super().store_parts() +# parts.insert_before("version", "plot", f"datasets_{self.datasets_repr}") +# return parts + +# def create_branch_map(self): +# return [ +# DotDict({"category": cat_name, "variable": var_name}) +# for cat_name in sorted(self.categories) +# for var_name in sorted(self.variables) +# ] + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# reqs["merged_hists"] = self.requires_from_branch() + +# return reqs + +# @abstractmethod +# def get_plot_shifts(self): +# return + +# @law.decorator.log +# @view_output_plots +# def run(self): +# import hist +# import numpy as np +# from cmsdb.processes.qcd import qcd + +# # get the shifts to extract and plot +# plot_shifts = law.util.make_list(self.get_plot_shifts()) + +# # prepare config objects +# variable_tuple = self.variable_tuples[self.branch_data.variable] +# variable_insts = [ +# self.config_inst.get_variable(var_name) +# for var_name in variable_tuple +# ] +# category_inst = self.config_inst.get_category(self.branch_data.category) +# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] +# process_insts = list(map(self.config_inst.get_process, self.processes)) +# sub_process_insts = { +# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] +# for proc in process_insts +# } + +# # histogram data per process +# hists = {} +# if 'ff_control_reg' in category_inst.name : +# with self.publish_step(f"estimating qcd for {self.branch_data.variable} in {category_inst.name}"): +# for dataset, inp in self.input().items(): +# dataset_inst = self.config_inst.get_dataset(dataset) +# h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") + +# # loop and extract one histogram per process +# for process_inst in process_insts: +# # skip when the dataset is already known to not contain any sub process +# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): +# continue +# # work on a copy +# h = h_in.copy() +# # axis selections +# h = h[{ +# "process": [ +# hist.loc(p.id) +# for p in sub_process_insts[process_inst] +# if p.id in h.axes["process"] +# ], +# "category": [ +# hist.loc(c.id) +# for c in leaf_category_insts +# if c.id in h.axes["category"] +# ], +# "shift": [ +# hist.loc(s.id) +# for s in plot_shifts +# if s.id in h.axes["shift"] +# ], +# }] + +# # axis reductions +# h = h[{"process": sum, "category": sum}] + +# # add the histogram +# if process_inst in hists: +# hists[process_inst] += h +# else: +# hists[process_inst] = h + +# # there should be hists to plot +# if not hists: +# raise Exception( +# "no histograms found to plot; possible reasons:\n" + +# " - requested variable requires columns that were missing during histogramming\n" + +# " - selected --processes did not match any value on the process axis of the input histogram", +# ) + +# # sort hists by process order +# hists = OrderedDict( +# (process_inst.copy_shallow(), hists[process_inst]) +# for process_inst in sorted(hists, key=process_insts.index) +# ) + +# qcd_hist = None +# qcd_hist_values = None +# for process_inst, h in hists.items(): +# hist_np , _ , _ = h.to_numpy(flow=True) +# if qcd_hist is None: +# qcd_hist = h.copy() +# qcd_hist_values = 
np.zeros_like(hist_np) +# if process_inst.is_data: qcd_hist_values += hist_np +# else: qcd_hist_values -= hist_np + +# #if the array contains negative values, set them to zero +# qcd_hist_values = np.where(qcd_hist_values > 0, qcd_hist_values, 0) +# qcd_hist.view(flow=True).value[:] = qcd_hist_values +# qcd_hist.view(flow=True).variance[:] = np.zeros_like(qcd_hist_values) +# qcd_hist +# #register a new datased at the hlist +# hists[qcd] = qcd_hist +# #save qcd estimation histogram and plots only for control region + +# self.output()["qcd_hists"][self.branch_data.variable].dump(qcd_hist, formatter="pickle") +# # call the plot function +# fig, _ = self.call_plot_func( +# self.plot_function, +# hists=hists, +# config_inst=self.config_inst, +# category_inst=category_inst.copy_shallow(), +# variable_insts=[var_inst.copy_shallow() for var_inst in variable_insts], +# **self.get_plot_parameters(), +# ) +# # save the plot +# for outp in self.output()["plots"]: +# outp.dump(fig, formatter="mpl") +# else: +# self.publish_step(f"Category: {category_inst.name} isn't used to estimate QCD, skipping this task.") + + +# class DataDrivenEstimationSingleShift( +# DataDrivenEstimationBase, +# ShiftTask, +# ): +# exclude_index = True + +# # upstream requirements +# reqs = Requirements( +# DataDrivenEstimationBase.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# def create_branch_map(self): +# return [ +# DotDict({"category": cat_name, "variable": var_name}) +# for var_name in sorted(self.variables) +# for cat_name in sorted(self.categories) +# ] + +# def requires(self): +# return { +# d: self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# branch=-1, +# _exclude={"branches"}, +# _prefer_cli={"variables"}, +# ) +# for d in self.datasets +# } + +# def output(self): +# b = self.branch_data +# return {"plots": [ +# self.target(name) +# for name in self.get_plot_names(f"plot__proc_{self.processes_repr}__cat_{b.category}__var_{b.variable}") +# ], +# "qcd_hists": law.SiblingFileCollection({ +# variable_name: self.target(f"qcd_histogram__{b.category}_{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# def get_plot_shifts(self): +# return [self.global_shift_inst] + + +# class DataDrivenEstimation( +# DataDrivenEstimationSingleShift, +# DataDrivenEstimationBase, +# ): +# plot_function = PlotBase.plot_function.copy( +# default="columnflow.plotting.plot_functions_1d.plot_variable_per_process", +# add_default_to_description=True, +# ) + + + + +# coding: utf-8 + +""" +Task to produce and merge histograms. 
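(Adapted here as a starting point for the fake factor estimation tasks, see
:py:class:`CreateFakeFactorHistograms` below.)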
+""" + +from __future__ import annotations + +import luigi +import law + +from columnflow.tasks.framework.base import Requirements, AnalysisTask, DatasetTask, wrapper_factory +from columnflow.tasks.framework.mixins import ( + CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, VariablesMixin, + ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, +) +from columnflow.tasks.framework.remote import RemoteWorkflow +from columnflow.tasks.framework.parameters import last_edge_inclusive_inst +from columnflow.tasks.reduction import ReducedEventsUser +from columnflow.tasks.production import ProduceColumns +from columnflow.tasks.ml import MLEvaluation +from columnflow.util import dev_sandbox + + +class CreateFakeFactorHistograms( + VariablesMixin, + WeightProducerMixin, + ProducersMixin, + ReducedEventsUser, + ChunkedIOMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + last_edge_inclusive = last_edge_inclusive_inst + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + ReducedEventsUser.reqs, + RemoteWorkflow.reqs, + ProduceColumns=ProduceColumns, + ) + + # strategy for handling missing source columns when adding aliases on event chunks + missing_column_alias_strategy = "original" + + # names of columns that contain category ids + # (might become a parameter at some point) + category_id_columns = {"category_ids"} + + # register sandbox and shifts found in the chosen weight producer to this task + register_weight_producer_sandbox = True + register_weight_producer_shifts = True + + @law.util.classproperty + def mandatory_columns(cls) -> set[str]: + return set(cls.category_id_columns) | {"process_id"} + + def workflow_requires(self): + reqs = super().workflow_requires() + + # require the full merge forest + reqs["events"] = self.reqs.ProvideReducedEvents.req(self) + + if not self.pilot: + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + + # add weight_producer dependent requirements + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + + return reqs + + def requires(self): + reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + + # add weight_producer dependent requirements + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + + return reqs + + workflow_condition = ReducedEventsUser.workflow_condition.copy() + + @workflow_condition.output + def output(self): + return {"hists": self.target(f"fake_factor__{self.branch}.pickle")} + + @law.decorator.log + @law.decorator.localize(input=True, output=False) + @law.decorator.safe_output + def run(self): + import hist + import numpy as np + import awkward as ak + from columnflow.columnar_util import ( + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + ) + + # prepare inputs + inputs = self.input() + + # declare output: dict of histograms + histograms = {} + + # run the weight_producer setup + producer_reqs = self.weight_producer_inst.run_requires() + reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + + # create a temp 
dir for saving intermediate files + tmp_dir = law.LocalDirectoryTarget(is_tmp=True) + tmp_dir.touch() + + # get shift dependent aliases + aliases = self.local_shift_inst.x("column_aliases", {}) + + # define columns that need to be read + read_columns = {Route("process_id")} + read_columns |= set(map(Route, self.category_id_columns)) + read_columns |= set(self.weight_producer_inst.used_columns) + read_columns |= set(map(Route, aliases.values())) + read_columns |= { + Route(the_var) for the_var in self.config_inst.x.fake_factor_method.vars.keys() + } + from IPython import embed; embed() + # empty float array to use when input files have no entries + empty_f32 = ak.Array(np.array([], dtype=np.float32)) + + # iterate over chunks of events and diffs + file_targets = [inputs["events"]["events"]] + if self.producer_insts: + file_targets.extend([inp["columns"] for inp in inputs["producers"]]) + + # prepare inputs for localization + with law.localize_file_targets( + [*file_targets, *reader_targets.values()], + mode="r", + ) as inps: + for (events, *columns), pos in self.iter_chunked_io( + [inp.path for inp in inps], + source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), + read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], + chunk_size=self.weight_producer_inst.get_min_chunk_size(), + ): + # optional check for overlapping inputs + if self.check_overlapping_inputs: + self.raise_if_overlapping([events] + list(columns)) + + # add additional columns + events = update_ak_array(events, *columns) + + # add aliases + events = add_ak_aliases( + events, + aliases, + remove_src=True, + missing_strategy=self.missing_column_alias_strategy, + ) + + # build the full event weight + if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): + events, weight = self.weight_producer_inst(events) + else: + weight = ak.Array(np.ones(len(events), dtype=np.float32)) + # define and fill histograms, taking into account multiple axes + + h = (hist.Hist.new + .IntCat([], name="category", growth=True) + .IntCat([], name="process", growth=True) + .IntCat([], name="shift", growth=True)) + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.vars.items(): + h = eval(f'h.{var_axis}') + + histograms['fake_factor'] = h.Weight() + + category_ids = ak.concatenate( + [Route(c).apply(events) for c in self.category_id_columns], + axis=-1, + ) + # broadcast arrays so that each event can be filled for all its categories + fill_data = { + "category": category_ids, + "process": events.process_id, + "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + "weight": weight, + } + # for variable_inst in self.config_inst.x.fake_factor_method.vars.: + # # prepare the expression + # expr = variable_inst.expression + # if isinstance(expr, str): + # route = Route(expr) + # def expr(events, *args, **kwargs): + # if len(events) == 0 and not has_ak_column(events, route): + # return empty_f32 + # return route.apply(events, null_value=variable_inst.null_value) + # fill_data[variable_inst.name] = expr(events) + from IPython import embed; embed() + # for var_key, var_names in self.variable_tuples.items(): + # variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + # # create the histogram if not present yet + # if var_key not in histograms: + # h = ( + # hist.Hist.new + # .IntCat([], name="category", growth=True) + # .IntCat([], name="process", growth=True) + # .IntCat([], name="shift", growth=True) + 
# ) + # # add variable axes + # for variable_inst in variable_insts: + # h = h.Var( + # variable_inst.bin_edges, + # name=variable_inst.name, + # label=variable_inst.get_full_x_title(), + # ) + # # enable weights and store it + # histograms[var_key] = h.Weight() + + # # merge category ids + # category_ids = ak.concatenate( + # [Route(c).apply(events) for c in self.category_id_columns], + # axis=-1, + # ) + + # broadcast arrays so that each event can be filled for all its categories + # fill_data = { + # "category": category_ids, + # "process": events.process_id, + # "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + # "weight": weight, + # } + # for variable_inst in variable_insts: + # # prepare the expression + # expr = variable_inst.expression + # if isinstance(expr, str): + # route = Route(expr) + # def expr(events, *args, **kwargs): + # if len(events) == 0 and not has_ak_column(events, route): + # return empty_f32 + # return route.apply(events, null_value=variable_inst.null_value) + # # apply it + # fill_data[variable_inst.name] = expr(events) + + # # fill it + # fill_hist( + # histograms[var_key], + # fill_data, + # last_edge_inclusive=self.last_edge_inclusive, + # ) + + # merge output files + self.output()["hists"].dump(histograms, formatter="pickle") + + +# overwrite class defaults +check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +CreateFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( + default=CreateFakeFactorHistograms.task_family in check_overlap_tasks, + add_default_to_description=True, +) + + +CreateFakeFactorHistogramsWrapper = wrapper_factory( + base_cls=AnalysisTask, + require_cls=CreateFakeFactorHistograms, + enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +) + + +# class MergeHistograms( +# VariablesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# DatasetTask, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# only_missing = luigi.BoolParameter( +# default=False, +# description="when True, identify missing variables first and only require histograms of " +# "missing ones; default: False", +# ) +# remove_previous = luigi.BoolParameter( +# default=False, +# significant=False, +# description="when True, remove particlar input histograms after merging; default: False", +# ) + +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# CreateHistograms=CreateHistograms, +# ) + +# @classmethod +# def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: +# _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} +# kwargs["_prefer_cli"] = _prefer_cli +# return super().req_params(inst, **kwargs) + +# def create_branch_map(self): +# # create a dummy branch map so that this task could be submitted as a job +# return {0: None} + +# def _get_variables(self): +# if self.is_workflow(): +# return self.as_branch()._get_variables() + +# variables = self.variables + +# # optional dynamic behavior: determine not yet created variables and require only those +# if self.only_missing: +# missing = self.output().count(existing=False, keys=True)[1] +# variables = sorted(missing, key=variables.index) + +# return variables + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# if not self.pilot: +# variables = 
self._get_variables() +# if variables: +# reqs["hists"] = self.reqs.CreateHistograms.req_different_branching( +# self, +# branch=-1, +# variables=tuple(variables), +# ) + +# return reqs + +# def requires(self): +# variables = self._get_variables() +# if not variables: +# return [] + +# return self.reqs.CreateHistograms.req_different_branching( +# self, +# branch=-1, +# variables=tuple(variables), +# workflow="local", +# ) + +# def output(self): +# return {"hists": law.SiblingFileCollection({ +# variable_name: self.target(f"hist__{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# @law.decorator.log +# def run(self): +# # preare inputs and outputs +# inputs = self.input()["collection"] +# outputs = self.output() + +# # load input histograms +# hists = [ +# inp["hists"].load(formatter="pickle") +# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) +# ] + +# # create a separate file per output variable +# variable_names = list(hists[0].keys()) +# for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): +# self.publish_message(f"merging histograms for '{variable_name}'") + +# variable_hists = [h[variable_name] for h in hists] +# merged = sum(variable_hists[1:], variable_hists[0].copy()) +# outputs["hists"][variable_name].dump(merged, formatter="pickle") + +# # optionally remove inputs +# if self.remove_previous: +# inputs.remove() + + +# MergeHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=MergeHistograms, +# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +# ) + + +# class MergeShiftedHistograms( +# VariablesMixin, +# ShiftSourcesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# DatasetTask, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # disable the shift parameter +# shift = None +# effective_shift = None +# allow_empty_shift = True + +# # allow only running on nominal +# allow_empty_shift_sources = True + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# def create_branch_map(self): +# # create a dummy branch map so that this task could as a job +# return {0: None} + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# # add nominal and both directions per shift source +# for shift in ["nominal"] + self.shifts: +# reqs[shift] = self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) + +# return reqs + +# def requires(self): +# return { +# shift: self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) +# for shift in ["nominal"] + self.shifts +# } + +# def store_parts(self): +# parts = super().store_parts() +# parts.insert_after("dataset", "shift_sources", f"shifts_{self.shift_sources_repr}") +# return parts + +# def output(self): +# return {"hists": law.SiblingFileCollection({ +# variable_name: self.target(f"shifted_hist__{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# @law.decorator.log +# def run(self): +# # preare inputs and outputs +# inputs = self.input() +# outputs = self.output()["hists"].targets + +# for variable_name, outp in self.iter_progress(outputs.items(), len(outputs)): +# self.publish_message(f"merging histograms for '{variable_name}'") + +# # load hists +# variable_hists = [ +# 
coll["hists"].targets[variable_name].load(formatter="pickle") +# for coll in inputs.values() +# ] + +# # merge and write the output +# merged = sum(variable_hists[1:], variable_hists[0].copy()) +# outp.dump(merged, formatter="pickle") + + +# MergeShiftedHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=MergeShiftedHistograms, +# enable=["configs", "skip_configs", "datasets", "skip_datasets"], +# ) diff --git a/law.cfg b/law.cfg index 86b667a76..0d6ae338f 100644 --- a/law.cfg +++ b/law.cfg @@ -8,6 +8,7 @@ columnflow.tasks.reduction columnflow.tasks.production columnflow.tasks.ml columnflow.tasks.union +columnflow.tasks.data_driven_methods columnflow.tasks.histograms columnflow.tasks.plotting columnflow.tasks.yields From 2a8894ac53ac3bacef3fd2d2a1181ad45a9ace8c Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 20 Jan 2025 20:18:00 +0100 Subject: [PATCH 06/26] Fake factor method, work in progress --- columnflow/tasks/data_driven_methods.py | 689 ++++++------------------ 1 file changed, 166 insertions(+), 523 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 0f47638e6..28082ad77 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -1,254 +1,3 @@ -# # coding: utf-8 - -# """ -# Tasks to plot different types of histograms. -# """ - -# from collections import OrderedDict -# from abc import abstractmethod - -# import law -# import luigi - -# from columnflow.tasks.framework.base import Requirements, ShiftTask -# from columnflow.tasks.framework.mixins import ( -# CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, -# CategoriesMixin, ShiftSourcesMixin, -# ) -# from columnflow.tasks.framework.plotting import ( -# PlotBase, PlotBase2D, ProcessPlotSettingMixin, VariablePlotSettingMixin, -# ) -# from columnflow.tasks.framework.decorators import view_output_plots -# from columnflow.tasks.framework.remote import RemoteWorkflow -# from columnflow.tasks.histograms import MergeHistograms -# from columnflow.util import DotDict, dev_sandbox, dict_add_strict - - -# class DataDrivenEstimationBase( -# VariablePlotSettingMixin, -# ProcessPlotSettingMixin, -# CategoriesMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) -# """sandbox to use for this task. Defaults to *default_columnar_sandbox* from -# analysis config. 
-# """ - -# exclude_index = True - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# MergeHistograms=MergeHistograms, -# ) -# """Set upstream requirements, in this case :py:class:`~columnflow.tasks.histograms.MergeHistograms` -# """ - -# def store_parts(self): -# parts = super().store_parts() -# parts.insert_before("version", "plot", f"datasets_{self.datasets_repr}") -# return parts - -# def create_branch_map(self): -# return [ -# DotDict({"category": cat_name, "variable": var_name}) -# for cat_name in sorted(self.categories) -# for var_name in sorted(self.variables) -# ] - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# reqs["merged_hists"] = self.requires_from_branch() - -# return reqs - -# @abstractmethod -# def get_plot_shifts(self): -# return - -# @law.decorator.log -# @view_output_plots -# def run(self): -# import hist -# import numpy as np -# from cmsdb.processes.qcd import qcd - -# # get the shifts to extract and plot -# plot_shifts = law.util.make_list(self.get_plot_shifts()) - -# # prepare config objects -# variable_tuple = self.variable_tuples[self.branch_data.variable] -# variable_insts = [ -# self.config_inst.get_variable(var_name) -# for var_name in variable_tuple -# ] -# category_inst = self.config_inst.get_category(self.branch_data.category) -# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] -# process_insts = list(map(self.config_inst.get_process, self.processes)) -# sub_process_insts = { -# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] -# for proc in process_insts -# } - -# # histogram data per process -# hists = {} -# if 'ff_control_reg' in category_inst.name : -# with self.publish_step(f"estimating qcd for {self.branch_data.variable} in {category_inst.name}"): -# for dataset, inp in self.input().items(): -# dataset_inst = self.config_inst.get_dataset(dataset) -# h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") - -# # loop and extract one histogram per process -# for process_inst in process_insts: -# # skip when the dataset is already known to not contain any sub process -# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): -# continue -# # work on a copy -# h = h_in.copy() -# # axis selections -# h = h[{ -# "process": [ -# hist.loc(p.id) -# for p in sub_process_insts[process_inst] -# if p.id in h.axes["process"] -# ], -# "category": [ -# hist.loc(c.id) -# for c in leaf_category_insts -# if c.id in h.axes["category"] -# ], -# "shift": [ -# hist.loc(s.id) -# for s in plot_shifts -# if s.id in h.axes["shift"] -# ], -# }] - -# # axis reductions -# h = h[{"process": sum, "category": sum}] - -# # add the histogram -# if process_inst in hists: -# hists[process_inst] += h -# else: -# hists[process_inst] = h - -# # there should be hists to plot -# if not hists: -# raise Exception( -# "no histograms found to plot; possible reasons:\n" + -# " - requested variable requires columns that were missing during histogramming\n" + -# " - selected --processes did not match any value on the process axis of the input histogram", -# ) - -# # sort hists by process order -# hists = OrderedDict( -# (process_inst.copy_shallow(), hists[process_inst]) -# for process_inst in sorted(hists, key=process_insts.index) -# ) - -# qcd_hist = None -# qcd_hist_values = None -# for process_inst, h in hists.items(): -# hist_np , _ , _ = h.to_numpy(flow=True) -# if qcd_hist is None: -# qcd_hist = h.copy() -# qcd_hist_values = 
np.zeros_like(hist_np) -# if process_inst.is_data: qcd_hist_values += hist_np -# else: qcd_hist_values -= hist_np - -# #if the array contains negative values, set them to zero -# qcd_hist_values = np.where(qcd_hist_values > 0, qcd_hist_values, 0) -# qcd_hist.view(flow=True).value[:] = qcd_hist_values -# qcd_hist.view(flow=True).variance[:] = np.zeros_like(qcd_hist_values) -# qcd_hist -# #register a new datased at the hlist -# hists[qcd] = qcd_hist -# #save qcd estimation histogram and plots only for control region - -# self.output()["qcd_hists"][self.branch_data.variable].dump(qcd_hist, formatter="pickle") -# # call the plot function -# fig, _ = self.call_plot_func( -# self.plot_function, -# hists=hists, -# config_inst=self.config_inst, -# category_inst=category_inst.copy_shallow(), -# variable_insts=[var_inst.copy_shallow() for var_inst in variable_insts], -# **self.get_plot_parameters(), -# ) -# # save the plot -# for outp in self.output()["plots"]: -# outp.dump(fig, formatter="mpl") -# else: -# self.publish_step(f"Category: {category_inst.name} isn't used to estimate QCD, skipping this task.") - - -# class DataDrivenEstimationSingleShift( -# DataDrivenEstimationBase, -# ShiftTask, -# ): -# exclude_index = True - -# # upstream requirements -# reqs = Requirements( -# DataDrivenEstimationBase.reqs, -# MergeHistograms=MergeHistograms, -# ) - -# def create_branch_map(self): -# return [ -# DotDict({"category": cat_name, "variable": var_name}) -# for var_name in sorted(self.variables) -# for cat_name in sorted(self.categories) -# ] - -# def requires(self): -# return { -# d: self.reqs.MergeHistograms.req( -# self, -# dataset=d, -# branch=-1, -# _exclude={"branches"}, -# _prefer_cli={"variables"}, -# ) -# for d in self.datasets -# } - -# def output(self): -# b = self.branch_data -# return {"plots": [ -# self.target(name) -# for name in self.get_plot_names(f"plot__proc_{self.processes_repr}__cat_{b.category}__var_{b.variable}") -# ], -# "qcd_hists": law.SiblingFileCollection({ -# variable_name: self.target(f"qcd_histogram__{b.category}_{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# def get_plot_shifts(self): -# return [self.global_shift_inst] - - -# class DataDrivenEstimation( -# DataDrivenEstimationSingleShift, -# DataDrivenEstimationBase, -# ): -# plot_function = PlotBase.plot_function.copy( -# default="columnflow.plotting.plot_functions_1d.plot_variable_per_process", -# add_default_to_description=True, -# ) - - - - -# coding: utf-8 """ Task to produce and merge histograms. 
@@ -262,14 +11,16 @@ from columnflow.tasks.framework.base import Requirements, AnalysisTask, DatasetTask, wrapper_factory from columnflow.tasks.framework.mixins import ( CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, VariablesMixin, - ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, + ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, DatasetsProcessesMixin, CategoriesMixin ) +from columnflow.tasks.framework.plotting import ProcessPlotSettingMixin + from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.framework.parameters import last_edge_inclusive_inst from columnflow.tasks.reduction import ReducedEventsUser from columnflow.tasks.production import ProduceColumns from columnflow.tasks.ml import MLEvaluation -from columnflow.util import dev_sandbox +from columnflow.util import dev_sandbox, DotDict class CreateFakeFactorHistograms( @@ -355,7 +106,7 @@ def run(self): import numpy as np import awkward as ak from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, EMPTY_FLOAT ) # prepare inputs @@ -381,9 +132,8 @@ def run(self): read_columns |= set(self.weight_producer_inst.used_columns) read_columns |= set(map(Route, aliases.values())) read_columns |= { - Route(the_var) for the_var in self.config_inst.x.fake_factor_method.vars.keys() + Route(the_ax.var_route) for the_ax in self.config_inst.x.fake_factor_method.axes.values() } - from IPython import embed; embed() # empty float array to use when input files have no entries empty_f32 = ak.Array(np.array([], dtype=np.float32)) @@ -429,10 +179,10 @@ def run(self): .IntCat([], name="category", growth=True) .IntCat([], name="process", growth=True) .IntCat([], name="shift", growth=True)) - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.vars.items(): - h = eval(f'h.{var_axis}') + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') - histograms['fake_factor'] = h.Weight() + histograms['fake_factors'] = h.Weight() category_ids = ak.concatenate( [Route(c).apply(events) for c in self.category_id_columns], @@ -440,75 +190,24 @@ def run(self): ) # broadcast arrays so that each event can be filled for all its categories fill_data = { - "category": category_ids, - "process": events.process_id, - "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + "category" : category_ids, + "process" : events.process_id, + "shift" : np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, "weight": weight, } - # for variable_inst in self.config_inst.x.fake_factor_method.vars.: - # # prepare the expression - # expr = variable_inst.expression - # if isinstance(expr, str): - # route = Route(expr) - # def expr(events, *args, **kwargs): - # if len(events) == 0 and not has_ak_column(events, route): - # return empty_f32 - # return route.apply(events, null_value=variable_inst.null_value) - # fill_data[variable_inst.name] = expr(events) - from IPython import embed; embed() - # for var_key, var_names in self.variable_tuples.items(): - # variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - # # create the histogram if not present yet - # if var_key not in histograms: - # h = ( - # hist.Hist.new - # .IntCat([], name="category", growth=True) - # .IntCat([], name="process", growth=True) - # .IntCat([], name="shift", growth=True) - # ) - # # add variable axes - # 
for variable_inst in variable_insts: - # h = h.Var( - # variable_inst.bin_edges, - # name=variable_inst.name, - # label=variable_inst.get_full_x_title(), - # ) - # # enable weights and store it - # histograms[var_key] = h.Weight() - - # # merge category ids - # category_ids = ak.concatenate( - # [Route(c).apply(events) for c in self.category_id_columns], - # axis=-1, - # ) - - # broadcast arrays so that each event can be filled for all its categories - # fill_data = { - # "category": category_ids, - # "process": events.process_id, - # "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, - # "weight": weight, - # } - # for variable_inst in variable_insts: - # # prepare the expression - # expr = variable_inst.expression - # if isinstance(expr, str): - # route = Route(expr) - # def expr(events, *args, **kwargs): - # if len(events) == 0 and not has_ak_column(events, route): - # return empty_f32 - # return route.apply(events, null_value=variable_inst.null_value) - # # apply it - # fill_data[variable_inst.name] = expr(events) - - # # fill it - # fill_hist( - # histograms[var_key], - # fill_data, - # last_edge_inclusive=self.last_edge_inclusive, - # ) - + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + route = Route(var_axis.var_route) + if len(events) == 0 and not has_ak_column(events, route): + values = empty_f32 + else: + values = ak.fill_none(ak.firsts(route.apply(events),axis=1), EMPTY_FLOAT) + if 'IntCategory' in var_axis.ax_str: values = ak.values_astype(values, np.int64) + fill_data[var_name] = values + # fill it + fill_hist( + histograms['fake_factors'], + fill_data, + ) # merge output files self.output()["hists"].dump(histograms, formatter="pickle") @@ -527,205 +226,149 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) +class MergeFakeFactors( + VariablesMixin, + DatasetsProcessesMixin, + CategoriesMixin, + WeightProducerMixin, + ProducersMixin, + SelectorStepsMixin, + CalibratorsMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) -# class MergeHistograms( -# VariablesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# DatasetTask, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# only_missing = luigi.BoolParameter( -# default=False, -# description="when True, identify missing variables first and only require histograms of " -# "missing ones; default: False", -# ) -# remove_previous = luigi.BoolParameter( -# default=False, -# significant=False, -# description="when True, remove particlar input histograms after merging; default: False", -# ) - -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# CreateHistograms=CreateHistograms, -# ) - -# @classmethod -# def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: -# _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} -# kwargs["_prefer_cli"] = _prefer_cli -# return super().req_params(inst, **kwargs) - -# def create_branch_map(self): -# # create a dummy branch map so that this task could be submitted as a job -# return {0: None} - -# def _get_variables(self): -# if self.is_workflow(): -# return self.as_branch()._get_variables() - -# variables = self.variables - -# # optional dynamic behavior: determine not yet created variables and require only those -# 
if self.only_missing: -# missing = self.output().count(existing=False, keys=True)[1] -# variables = sorted(missing, key=variables.index) - -# return variables - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# if not self.pilot: -# variables = self._get_variables() -# if variables: -# reqs["hists"] = self.reqs.CreateHistograms.req_different_branching( -# self, -# branch=-1, -# variables=tuple(variables), -# ) - -# return reqs - -# def requires(self): -# variables = self._get_variables() -# if not variables: -# return [] - -# return self.reqs.CreateHistograms.req_different_branching( -# self, -# branch=-1, -# variables=tuple(variables), -# workflow="local", -# ) - -# def output(self): -# return {"hists": law.SiblingFileCollection({ -# variable_name: self.target(f"hist__{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# @law.decorator.log -# def run(self): -# # preare inputs and outputs -# inputs = self.input()["collection"] -# outputs = self.output() - -# # load input histograms -# hists = [ -# inp["hists"].load(formatter="pickle") -# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) -# ] - -# # create a separate file per output variable -# variable_names = list(hists[0].keys()) -# for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): -# self.publish_message(f"merging histograms for '{variable_name}'") - -# variable_hists = [h[variable_name] for h in hists] -# merged = sum(variable_hists[1:], variable_hists[0].copy()) -# outputs["hists"][variable_name].dump(merged, formatter="pickle") - -# # optionally remove inputs -# if self.remove_previous: -# inputs.remove() - - -# MergeHistogramsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=MergeHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) + only_missing = luigi.BoolParameter( + default=False, + description="when True, identify missing variables first and only require histograms of " + "missing ones; default: False", + ) + remove_previous = luigi.BoolParameter( + default=False, + significant=False, + description="when True, remove particlar input histograms after merging; default: False", + ) + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + CreateFakeFactorHistograms=CreateFakeFactorHistograms, + ) + + def store_parts(self): + parts = super().store_parts() + parts.insert_before("version", "datasets" )#, f"datasets_{self.datasets_repr}") + return parts + + @classmethod + def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: + _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} + kwargs["_prefer_cli"] = _prefer_cli + return super().req_params(inst, **kwargs) + + def create_branch_map(self): + return [ + DotDict({"category": cat_name}) + for cat_name in sorted(self.categories) + ] + + def _get_variables(self): + if self.is_workflow(): + return self.as_branch()._get_variables() + + variables = self.variables + + # optional dynamic behavior: determine not yet created variables and require only those + if self.only_missing: + missing = self.output().count(existing=False, keys=True)[1] + variables = sorted(missing, key=variables.index) + + return variables + + def workflow_requires(self): + reqs = super().workflow_requires() + if not self.pilot: + variables = self._get_variables() + if variables: + reqs["ff_method"] = self.reqs.CreateFakeFactorHistograms.req_different_branching( + self, + 
branch=-1, + variables=tuple(variables), + ) + + return reqs + def requires(self): + return { + d: self.reqs.CreateFakeFactorHistograms.req( + self, + dataset=d, + branch=-1, + ) + for d in self.datasets + } + def output(self): + return {"hists": self.target(f"fake_factors.pickle")} + + @law.decorator.log + def run(self): + import hist + import numpy as np + import matplotlib.pyplot as plt + # preare inputs and outputs + inputs = self.input() + outputs = self.output() + merged_per_dataset = {} + projected_hists = [] + for (dataset_name, dataset) in inputs.items(): + files = dataset['collection'] + # load input histograms per dataset + hists = [ + inp['hists'].load(formatter="pickle")['fake_factors'] + for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) + ] + self.publish_message(f"merging Fake factor histograms for {dataset_name}") + the_hist = sum(hists[1:], hists[0].copy()) + merged_per_dataset[dataset_name] = the_hist + #Get axes names excluding 'process'. This is needed to merge hists for different processes + ax_names = [ax_name for ax_name in the_hist.axes.name if ax_name != 'process'] + #Remove 'process' axis by projecting hist on the remaining axes + projected_hists.append(the_hist.project(*ax_names)) + merged_hist = sum(projected_hists[1:], projected_hists[0].copy()) + + cat_SR = self.config_inst.get_category(self.branch_data.category) + cat_DR_den = self.config_inst.get_category(cat_SR.x.DR_den) + cat_DR_num = self.config_inst.get_category(cat_SR.x.DR_num) + + def get_hist (h, category): + return h[{"category": hist.loc(category.id)}] + + h_DR_num = get_hist(merged_hist,cat_DR_num).values() + h_DR_den = get_hist(merged_hist,cat_DR_den).values() + + ff_values = np.where((h_DR_num > 0) & (h_DR_den > 0), + h_DR_num / np.maximum(h_DR_den, 1), + 0.0, + ) + + #For the control: make 2d hists and plot them: + hist2d = merged_hist.project('tau_pt','tau_dm_pnet') + ff_hist = hist.Hist(*hist2d.axes, data=ff_values[0]) + fig, ax = plt.subplots(figsize=(12, 8)) + ff_hist.plot2d(ax=ax) + plt.savefig('fake_factors.pdf') + from IPython import embed; embed() + #outputs["hists"][variable_name].dump(merged, formatter="pickle")F -# class MergeShiftedHistograms( -# VariablesMixin, -# ShiftSourcesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# DatasetTask, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # disable the shift parameter -# shift = None -# effective_shift = None -# allow_empty_shift = True - -# # allow only running on nominal -# allow_empty_shift_sources = True - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# MergeHistograms=MergeHistograms, -# ) - -# def create_branch_map(self): -# # create a dummy branch map so that this task could as a job -# return {0: None} - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# # add nominal and both directions per shift source -# for shift in ["nominal"] + self.shifts: -# reqs[shift] = self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) - -# return reqs - -# def requires(self): -# return { -# shift: self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) -# for shift in ["nominal"] + self.shifts -# } - -# def store_parts(self): -# parts = super().store_parts() -# parts.insert_after("dataset", "shift_sources", f"shifts_{self.shift_sources_repr}") -# return parts - -# def 
output(self): -# return {"hists": law.SiblingFileCollection({ -# variable_name: self.target(f"shifted_hist__{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# @law.decorator.log -# def run(self): -# # preare inputs and outputs -# inputs = self.input() -# outputs = self.output()["hists"].targets - -# for variable_name, outp in self.iter_progress(outputs.items(), len(outputs)): -# self.publish_message(f"merging histograms for '{variable_name}'") - -# # load hists -# variable_hists = [ -# coll["hists"].targets[variable_name].load(formatter="pickle") -# for coll in inputs.values() -# ] - -# # merge and write the output -# merged = sum(variable_hists[1:], variable_hists[0].copy()) -# outp.dump(merged, formatter="pickle") - - -# MergeShiftedHistogramsWrapper = wrapper_factory( + # optionally remove inputs + if self.remove_previous: + inputs.remove() + + +# MergeFakeFactorsWrapper = wrapper_factory( # base_cls=AnalysisTask, -# require_cls=MergeShiftedHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets"], +# require_cls=MergeFakeFactors, +# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], # ) + From c4b62497b644855168f591164babb14251cbaea2 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 20 Jan 2025 20:20:18 +0100 Subject: [PATCH 07/26] Some plotting aestetics --- columnflow/plotting/plot_functions_1d.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 716ec5421..0c60ff0fb 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -72,7 +72,8 @@ def plot_variable_per_process( total_events = {key: sum(hist.values()) for key, hist in hists.items()} # Sort processes by total number of events in descending order - sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + #sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + sorted_hists_desc = OrderedDict(hists.items()) # Get keys of sorted processes sorted_keys = list(sorted_hists_desc.keys()) @@ -86,7 +87,7 @@ def plot_variable_per_process( custom_order = sorted_keys else: # More than two processes, custom order: highest, rest, then second highest - custom_order = [sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] + custom_order = sorted_keys #[sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] # Reorder histograms based on custom order sorted_hists = OrderedDict((key, sorted_hists_desc[key]) for key in custom_order) From 21ab46bcba50d53045d9ebfd831a66c69d4306be Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 28 Jan 2025 10:45:28 +0100 Subject: [PATCH 08/26] Developed a task to calculate fake factors for WJ and QCD --- columnflow/tasks/data_driven_methods.py | 172 ++++++++++++++++-------- 1 file changed, 119 insertions(+), 53 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 28082ad77..d12da8645 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -23,7 +23,7 @@ from columnflow.util import dev_sandbox, DotDict -class CreateFakeFactorHistograms( +class PrepareFakeFactorHistograms( VariablesMixin, WeightProducerMixin, ProducersMixin, @@ -177,8 +177,7 @@ def run(self): h = (hist.Hist.new .IntCat([], name="category", growth=True) - .IntCat([], name="process", 
growth=True) - .IntCat([], name="shift", growth=True)) + .IntCat([], name="process", growth=True)) for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -189,11 +188,11 @@ def run(self): axis=-1, ) # broadcast arrays so that each event can be filled for all its categories + fill_data = { "category" : category_ids, "process" : events.process_id, - "shift" : np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, - "weight": weight, + "weight" : weight, } for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): route = Route(var_axis.var_route) @@ -214,19 +213,19 @@ def run(self): # overwrite class defaults check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -CreateFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( - default=CreateFakeFactorHistograms.task_family in check_overlap_tasks, +PrepareFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( + default=PrepareFakeFactorHistograms.task_family in check_overlap_tasks, add_default_to_description=True, ) -CreateFakeFactorHistogramsWrapper = wrapper_factory( +PrepareFakeFactorHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, - require_cls=CreateFakeFactorHistograms, + require_cls=PrepareFakeFactorHistograms, enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) -class MergeFakeFactors( +class ComputeFakeFactors( VariablesMixin, DatasetsProcessesMixin, CategoriesMixin, @@ -253,12 +252,12 @@ class MergeFakeFactors( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - CreateFakeFactorHistograms=CreateFakeFactorHistograms, + PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, ) def store_parts(self): parts = super().store_parts() - parts.insert_before("version", "datasets" )#, f"datasets_{self.datasets_repr}") + parts.insert_before("version", "datasets", f"datasets_{self.datasets_repr}") return parts @classmethod @@ -291,7 +290,7 @@ def workflow_requires(self): if not self.pilot: variables = self._get_variables() if variables: - reqs["ff_method"] = self.reqs.CreateFakeFactorHistograms.req_different_branching( + reqs["ff_method"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( self, branch=-1, variables=tuple(variables), @@ -301,7 +300,7 @@ def workflow_requires(self): def requires(self): return { - d: self.reqs.CreateFakeFactorHistograms.req( + d: self.reqs.PrepareFakeFactorHistograms.req( self, dataset=d, branch=-1, @@ -309,66 +308,133 @@ def requires(self): for d in self.datasets } def output(self): - return {"hists": self.target(f"fake_factors.pickle")} + return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, + "plots": {syst: self.target(f"fake_factor_syst_{syst}.png") for syst in ['nominal', 'up', 'down']},} @law.decorator.log def run(self): import hist import numpy as np import matplotlib.pyplot as plt + import correctionlib.convert as cl_convert # preare inputs and outputs inputs = self.input() outputs = self.output() merged_per_dataset = {} projected_hists = [] + hists_by_dataset = [] for (dataset_name, dataset) in inputs.items(): files = dataset['collection'] # load input histograms per dataset - hists = [ + hists_per_ds = [ inp['hists'].load(formatter="pickle")['fake_factors'] for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) ] 
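            # the per-file 'fake_factors' histograms of each dataset are summed into a single
            # histogram per dataset; below, these are split by process id into data and
            # (non-signal) MC sums, and the fake factor per determination region is computed
            # as (data - MC) in the numerator region over (data - MC) in the denominator region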
self.publish_message(f"merging Fake factor histograms for {dataset_name}") - the_hist = sum(hists[1:], hists[0].copy()) - merged_per_dataset[dataset_name] = the_hist - #Get axes names excluding 'process'. This is needed to merge hists for different processes - ax_names = [ax_name for ax_name in the_hist.axes.name if ax_name != 'process'] - #Remove 'process' axis by projecting hist on the remaining axes - projected_hists.append(the_hist.project(*ax_names)) - merged_hist = sum(projected_hists[1:], projected_hists[0].copy()) + ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) + hists_by_dataset.append(ds_single_hist) + + hists_by_proc = {} + for proc_name in self.config_inst.processes.names(): + proc = self.config_inst.processes.get(proc_name) + self.publish_message(f"merging Fake factor histograms for process: {proc.name}") + for the_hist in hists_by_dataset: + + if proc.id in the_hist.axes["process"]: + h = the_hist.copy() + h = h[{"process": hist.loc(proc.id)}] + # add the histogram + if proc in hists_by_proc: + hists_by_proc[proc] += h + else: + hists_by_proc[proc] = h - cat_SR = self.config_inst.get_category(self.branch_data.category) - cat_DR_den = self.config_inst.get_category(cat_SR.x.DR_den) - cat_DR_num = self.config_inst.get_category(cat_SR.x.DR_num) + mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")] + data_hists = [h for p, h in hists_by_proc.items() if p.is_data] - def get_hist (h, category): - return h[{"category": hist.loc(category.id)}] + mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + data_hists = sum(data_hists[1:], data_hists[0].copy()) - h_DR_num = get_hist(merged_hist,cat_DR_num).values() - h_DR_den = get_hist(merged_hist,cat_DR_den).values() + dr_names = ['dr_num_wj','dr_den_wj','dr_num_qcd','dr_den_qcd'] + + def get_hist(h, category): + return h[{"category": hist.loc(category.id)}] - ff_values = np.where((h_DR_num > 0) & (h_DR_den > 0), - h_DR_num / np.maximum(h_DR_den, 1), - 0.0, - ) - #For the control: make 2d hists and plot them: - hist2d = merged_hist.project('tau_pt','tau_dm_pnet') - ff_hist = hist.Hist(*hist2d.axes, data=ff_values[0]) - fig, ax = plt.subplots(figsize=(12, 8)) - ff_hist.plot2d(ax=ax) - plt.savefig('fake_factors.pdf') - from IPython import embed; embed() - #outputs["hists"][variable_name].dump(merged, formatter="pickle")F - - # optionally remove inputs - if self.remove_previous: - inputs.remove() - - -# MergeFakeFactorsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=MergeFakeFactors, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) + #Create two dictionaries that contain histograms for different determination regions + data_h_cat ={} + mc_h_cat = {} + for dr_name in dr_names: + cat = self.config_inst.get_category(self.branch_data.category.replace('sr',dr_name)) + data_h_cat[dr_name] = get_hist(data_hists, cat) + mc_h_cat[dr_name] = get_hist(mc_hists, cat) + + + def get_ff_corr(self, h_data, h_mc, num_cat, den_cat, name='ff_hist', label='ff_hist'): + num = h_data[num_cat].values() - h_mc[num_cat].values() + den = h_data[den_cat].values() - h_mc[den_cat].values() + ff_val = np.where((num > 0) & (den > 0), + num / np.maximum(den, 1), + 1) + def rel_err(x): + return x.variances()/np.maximum(x.values()**2, 1) + ff_err2 = np.where((num > 0) & (den > 0), + np.sqrt(rel_err(h_data[num_cat]) + + + rel_err(h_mc[den_cat]) + + + rel_err(h_data[num_cat]) + + + rel_err(h_mc[den_cat])) * ff_val**2, + 0.5* np.ones_like(ff_val)) + h = 
hist.Hist.new + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') + h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') + ff_hist= h.Weight() + ff_hist.view().value[...,0] = ff_val + ff_hist.view().value[...,1] = ff_val + np.sqrt(ff_err2) + ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) + ff_hist.name = name + ff_hist.label = label + ff_corr = cl_convert.from_histogram(ff_hist) #temporary correction without systematic axis + ff_corr.data.flow = "clamp" + return ff_corr, ff_hist + + import rich + + wj_corr, wj_h = get_ff_corr(self, + data_h_cat, + mc_h_cat, + num_cat = 'dr_num_wj', + den_cat = 'dr_den_wj', + name='ff_wjets', + label='Fake factor W+jets') + + qcd_corr, qcd_h = get_ff_corr(self, + data_h_cat, + mc_h_cat, + num_cat = 'dr_num_qcd', + den_cat = 'dr_den_qcd', + name='ff_qcd', + label='Fake factor QCD') + + for h_name in ['wj', 'qcd']: + the_hist = eval(f'{h_name}_h') + + for syst in ['nominal','up','down']: + fig, ax = plt.subplots(figsize=(12, 8)) + the_hist[...,syst].plot2d(ax=ax) + self.output()['plots'][syst].dump(fig, formatter="mpl") + + + self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") + self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") + + + + + + + + + + From b9d4b3f1a7d150a4a5bbdb169a7eefce267f635b Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 28 Jan 2025 14:05:57 +0100 Subject: [PATCH 09/26] Fake factor method: work in progress --- columnflow/tasks/data_driven_methods.py | 70 ++++++++++++------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index d12da8645..931f507df 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -309,7 +309,9 @@ def requires(self): } def output(self): return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, - "plots": {syst: self.target(f"fake_factor_syst_{syst}.png") for syst in ['nominal', 'up', 'down']},} + "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") + for syst in ['nominal', 'up', 'down'] + for ff_type in ['qcd','wj']},} @law.decorator.log def run(self): @@ -333,7 +335,7 @@ def run(self): self.publish_message(f"merging Fake factor histograms for {dataset_name}") ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) hists_by_dataset.append(ds_single_hist) - + #Create a dict of histograms indexed by the process hists_by_proc = {} for proc_name in self.config_inst.processes.names(): proc = self.config_inst.processes.get(proc_name) @@ -349,40 +351,39 @@ def run(self): else: hists_by_proc[proc] = h + #Divide histograms to data and bkg mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")] data_hists = [h for p, h in hists_by_proc.items() if p.is_data] + #Merge histograms to get a joint data and mc histogram mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) data_hists = sum(data_hists[1:], data_hists[0].copy()) - dr_names = ['dr_num_wj','dr_den_wj','dr_num_qcd','dr_den_qcd'] - - def get_hist(h, category): - return h[{"category": hist.loc(category.id)}] - - - #Create two dictionaries that contain histograms for different determination regions - data_h_cat ={} - mc_h_cat = {} - for dr_name in dr_names: - cat = 
self.config_inst.get_category(self.branch_data.category.replace('sr',dr_name)) - data_h_cat[dr_name] = get_hist(data_hists, cat) - mc_h_cat[dr_name] = get_hist(mc_hists, cat) + #Function that performs the calculation of th + def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): - - def get_ff_corr(self, h_data, h_mc, num_cat, den_cat, name='ff_hist', label='ff_hist'): - num = h_data[num_cat].values() - h_mc[num_cat].values() - den = h_data[den_cat].values() - h_mc[den_cat].values() + def get_dr_hist(self, h, det_reg): + cat = self.config_inst.get_category(self.branch_data.category.replace('sr',det_reg)) + return h[{"category": hist.loc(cat.id)}] + + data_num = get_dr_hist(self, h_data, num_reg) + data_den = get_dr_hist(self, h_data, den_reg) + mc_num = get_dr_hist(self, h_mc, num_reg) + mc_den = get_dr_hist(self, h_mc, den_reg) + + num = data_num.values() - mc_num.values() + den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), 1) def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) + ff_err2 = np.where((num > 0) & (den > 0), - np.sqrt(rel_err(h_data[num_cat]) + - + rel_err(h_mc[den_cat]) + - + rel_err(h_data[num_cat]) + - + rel_err(h_mc[den_cat])) * ff_val**2, + np.sqrt(rel_err(data_num) + + + rel_err(data_den) + + + rel_err(mc_num) + + + rel_err(mc_den)) * ff_val**2, 0.5* np.ones_like(ff_val)) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): @@ -394,25 +395,23 @@ def rel_err(x): ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) ff_hist.name = name ff_hist.label = label - ff_corr = cl_convert.from_histogram(ff_hist) #temporary correction without systematic axis + ff_corr = cl_convert.from_histogram(ff_hist) ff_corr.data.flow = "clamp" return ff_corr, ff_hist - import rich - wj_corr, wj_h = get_ff_corr(self, - data_h_cat, - mc_h_cat, - num_cat = 'dr_num_wj', - den_cat = 'dr_den_wj', + data_hists, + mc_hists, + num_reg = 'dr_num_wj', + den_reg = 'dr_den_wj', name='ff_wjets', label='Fake factor W+jets') qcd_corr, qcd_h = get_ff_corr(self, - data_h_cat, - mc_h_cat, - num_cat = 'dr_num_qcd', - den_cat = 'dr_den_qcd', + data_hists, + mc_hists, + num_reg = 'dr_num_qcd', + den_reg = 'dr_den_qcd', name='ff_qcd', label='Fake factor QCD') @@ -422,9 +421,8 @@ def rel_err(x): for syst in ['nominal','up','down']: fig, ax = plt.subplots(figsize=(12, 8)) the_hist[...,syst].plot2d(ax=ax) - self.output()['plots'][syst].dump(fig, formatter="mpl") + self.output()['plots']['_'.join((h_name,syst))].dump(fig, formatter="mpl") - self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") From c76efd3ca63b4ea3a209319402d884afba940bb5 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 3 Feb 2025 12:24:08 +0100 Subject: [PATCH 10/26] FF method work in progress. 
Trying to slove problems of framework freezing while executing Compute fake factors task --- columnflow/columnar_util.py | 18 +- columnflow/tasks/data_driven_methods.py | 254 ++++++++++++++++++++---- 2 files changed, 224 insertions(+), 48 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 9057b3faf..a7b3c5ebe 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -1405,15 +1405,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - if 'event' in data.keys(): - arrays = {} - for ax_name in axis_names: - if ax_name in data.keys(): - arrays[ax_name] = data[ax_name] - h.fill(**fill_kwargs, **arrays) - else: - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + # if 'event' in data.keys(): + # arrays = {} + # for ax_name in axis_names: + # if ax_name in data.keys(): + # arrays[ax_name] = data[ax_name] + # h.fill(**fill_kwargs, **arrays) + # else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) class RouteFilter(object): diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 931f507df..7f58ad75a 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -174,9 +174,9 @@ def run(self): else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) # define and fill histograms, taking into account multiple axes - + categories = self.config_inst.categories.ids() h = (hist.Hist.new - .IntCat([], name="category", growth=True) + .IntCat(categories , name="category", growth=True) .IntCat([], name="process", growth=True)) for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -231,10 +231,6 @@ class ComputeFakeFactors( CategoriesMixin, WeightProducerMixin, ProducersMixin, - SelectorStepsMixin, - CalibratorsMixin, - law.LocalWorkflow, - RemoteWorkflow, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -266,25 +262,6 @@ def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: kwargs["_prefer_cli"] = _prefer_cli return super().req_params(inst, **kwargs) - def create_branch_map(self): - return [ - DotDict({"category": cat_name}) - for cat_name in sorted(self.categories) - ] - - def _get_variables(self): - if self.is_workflow(): - return self.as_branch()._get_variables() - - variables = self.variables - - # optional dynamic behavior: determine not yet created variables and require only those - if self.only_missing: - missing = self.output().count(existing=False, keys=True)[1] - variables = sorted(missing, key=variables.index) - - return variables - def workflow_requires(self): reqs = super().workflow_requires() if not self.pilot: @@ -356,14 +333,15 @@ def run(self): data_hists = [h for p, h in hists_by_proc.items() if p.is_data] #Merge histograms to get a joint data and mc histogram - mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) - data_hists = sum(data_hists[1:], data_hists[0].copy()) + if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy()) #Function that performs the calculation of th def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): - def get_dr_hist(self, h, det_reg): - cat = 
self.config_inst.get_category(self.branch_data.category.replace('sr',det_reg)) + cat_name = self.categories[0] + from IPython import embed; embed() + cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] data_num = get_dr_hist(self, h_data, num_reg) @@ -425,14 +403,212 @@ def rel_err(x): self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") - - - - - - - - - - + + + +class CreateDataDrivenHistograms( + VariablesMixin, + WeightProducerMixin, + ProducersMixin, + ReducedEventsUser, + ChunkedIOMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + ReducedEventsUser.reqs, + RemoteWorkflow.reqs, + ComputeFakeFactors=ComputeFakeFactors, + ProduceColumns=ProduceColumns, + ) + + def requires(self): + reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + from IPython import embed; embed() + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + reqs['ff_json'] = self.reqs.ComputeFakeFactors.req(self) + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + return reqs + + def output(self): + return {"hists": self.target(f"histograms__vars_{self.variables_repr}__{self.branch}.pickle")} + + @law.decorator.log + @law.decorator.localize(input=True, output=False) + @law.decorator.safe_output + def run(self): + import hist + import numpy as np + import awkward as ak + from columnflow.columnar_util import ( + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + ) + + # prepare inputs + inputs = self.input() + from IPython import embed; embed() + # declare output: dict of histograms + histograms = {} + +# # run the weight_producer setup +# producer_reqs = self.weight_producer_inst.run_requires() +# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + +# # create a temp dir for saving intermediate files +# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) +# tmp_dir.touch() + +# # get shift dependent aliases +# aliases = self.local_shift_inst.x("column_aliases", {}) + +# # define columns that need to be read +# read_columns = {Route("process_id")} +# read_columns |= set(map(Route, self.category_id_columns)) +# read_columns |= set(self.weight_producer_inst.used_columns) +# read_columns |= set(map(Route, aliases.values())) +# read_columns |= { +# Route(inp) +# for variable_inst in ( +# self.config_inst.get_variable(var_name) +# for var_name in law.util.flatten(self.variable_tuples.values()) +# ) +# for inp in (( +# {variable_inst.expression} +# if isinstance(variable_inst.expression, str) +# # for variable_inst with custom expressions, read columns declared via aux key +# else set(variable_inst.x("inputs", [])) +# ) | ( +# # for variable_inst with selection, read columns declared via aux key +# set(variable_inst.x("inputs", [])) +# if variable_inst.selection != "1" +# else set() +# )) +# } + +# # empty float array to use when input files have no entries +# empty_f32 = ak.Array(np.array([], dtype=np.float32)) + +# # iterate over chunks of events and diffs +# file_targets = [inputs["events"]["events"]] +# if self.producer_insts: +# 
file_targets.extend([inp["columns"] for inp in inputs["producers"]]) +# # if self.ml_model_insts: +# # file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) + +# # prepare inputs for localization +# with law.localize_file_targets( +# [*file_targets, *reader_targets.values()], +# mode="r", +# ) as inps: +# for (events, *columns), pos in self.iter_chunked_io( +# [inp.abspath for inp in inps], +# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), +# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], +# chunk_size=self.weight_producer_inst.get_min_chunk_size(), +# ): +# # optional check for overlapping inputs +# if self.check_overlapping_inputs: +# self.raise_if_overlapping([events] + list(columns)) + +# # add additional columns +# events = update_ak_array(events, *columns) + +# # add aliases +# events = add_ak_aliases( +# events, +# aliases, +# remove_src=True, +# missing_strategy=self.missing_column_alias_strategy, +# ) + +# # build the full event weight without fake factors +# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): +# events, weight = self.weight_producer_inst(events) +# else: +# weight = ak.Array(np.ones(len(events), dtype=np.float32)) + +# # define and fill histograms, taking into account multiple axes +# for var_key, var_names in self.variable_tuples.items(): +# # get variable instances +# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + +# # create the histogram if not present yet +# if var_key not in histograms: +# for reg_key in ['ar_wj','ar_wj','ar_yields']: +# h = ( +# hist.Hist.new +# .IntCat([], name="process", growth=True) +# .IntCat([], name="shift", growth=True) +# ) +# # add variable axes +# for variable_inst in variable_insts: +# h = h.Var( +# variable_inst.bin_edges, +# name='_'.join((variable_inst.name, reg_key)) +# label=variable_inst.get_full_x_title(), +# ) +# # enable weights and store it +# histograms[var_key] = h.Weight() + +# # merge category ids +# category_ids = ak.concatenate( +# [Route(c).apply(events) for c in self.category_id_columns], +# axis=-1, +# ) + +# # broadcast arrays so that each event can be filled for all its categories +# fill_data = { +# "category": category_ids, +# "process": events.process_id, +# "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, +# "weight": weight, +# } +# for variable_inst in variable_insts: +# # prepare the expression +# expr = variable_inst.expression +# if isinstance(expr, str): +# route = Route(expr) +# def expr(events, *args, **kwargs): +# if len(events) == 0 and not has_ak_column(events, route): +# return empty_f32 +# return route.apply(events, null_value=variable_inst.null_value) +# # apply it +# fill_data[variable_inst.name] = expr(masked_events) + +# # fill it +# fill_hist( +# histograms[var_key], +# fill_data, +# last_edge_inclusive=self.last_edge_inclusive, +# ) + +# # merge output files +# self.output()["hists"].dump(histograms, formatter="pickle") + + +# # overwrite class defaults +# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( +# default=CreateHistograms.task_family in check_overlap_tasks, +# add_default_to_description=True, +# ) + + +# CreateHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=CreateHistograms, +# enable=["configs", "skip_configs", 
"datasets", "skip_datasets", "shifts", "skip_shifts"], +# ) + + From 9bf8095d918a09fa58bec89c2c6289e1ee91c978 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 12 Feb 2025 11:23:08 +0100 Subject: [PATCH 11/26] Fake Factor method: minimal working version --- columnflow/tasks/data_driven_methods.py | 282 ++++++---------- columnflow/tasks/histograms.py | 423 ++++++++++++++++++++---- columnflow/tasks/plotting.py | 67 ++-- 3 files changed, 492 insertions(+), 280 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 7f58ad75a..4a7367521 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -24,8 +24,8 @@ class PrepareFakeFactorHistograms( - VariablesMixin, WeightProducerMixin, + MLModelsMixin, ProducersMixin, ReducedEventsUser, ChunkedIOMixin, @@ -226,7 +226,6 @@ def run(self): ) class ComputeFakeFactors( - VariablesMixin, DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, @@ -285,16 +284,21 @@ def requires(self): for d in self.datasets } def output(self): - return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, + return {"ff_json": self.target(f"fake_factors.json"), "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") for syst in ['nominal', 'up', 'down'] - for ff_type in ['qcd','wj']},} + for ff_type in ['qcd','wj']}, + "plots1d": {'_'.join((ff_type,str(dm))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}.png") + for ff_type in ['qcd','wj'] + for dm in [0,1,2,10,11]}} @law.decorator.log def run(self): import hist import numpy as np + from scipy.optimize import curve_fit import matplotlib.pyplot as plt + import correctionlib import correctionlib.convert as cl_convert # preare inputs and outputs inputs = self.input() @@ -309,14 +313,12 @@ def run(self): inp['hists'].load(formatter="pickle")['fake_factors'] for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) ] - self.publish_message(f"merging Fake factor histograms for {dataset_name}") ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) hists_by_dataset.append(ds_single_hist) #Create a dict of histograms indexed by the process hists_by_proc = {} for proc_name in self.config_inst.processes.names(): proc = self.config_inst.processes.get(proc_name) - self.publish_message(f"merging Fake factor histograms for process: {proc.name}") for the_hist in hists_by_dataset: if proc.id in the_hist.axes["process"]: @@ -334,13 +336,14 @@ def run(self): #Merge histograms to get a joint data and mc histogram if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + else: mc_hists = mc_hists[0].copy() if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy()) + else: data_hists = data_hists[0].copy() #Function that performs the calculation of th def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): def get_dr_hist(self, h, det_reg): cat_name = self.categories[0] - from IPython import embed; embed() cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] @@ -353,31 +356,54 @@ def get_dr_hist(self, h, det_reg): den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), - 1) + -1) def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) - ff_err2 = np.where((num > 0) & (den > 0), - np.sqrt(rel_err(data_num) + - + 
rel_err(data_den) + - + rel_err(mc_num) + - + rel_err(mc_den)) * ff_val**2, - 0.5* np.ones_like(ff_val)) + ff_err2 = np.abs(1./den) * (data_num.variances()**0.5 + mc_num.variances()**0.5) + np.abs(num)/(den**2) * (data_den.variances()**0.5 + mc_den.variances()**0.5) + + def fitf(x, a, b): + return a + b * x + #make interpolation of the ff values + ipt_range = ff_val.shape[0] + x = data_num.axes[0].centers + + ff_fit = np.zeros((*np.shape(ff_val),3)) + for idm in range(ff_val.shape[1]): + mask = ff_val[:,idm] > 0 + y = ff_val[mask,idm] + y_err = ff_err2[mask,idm] + x_masked = x[mask] + popt, pcov = curve_fit(fitf, + x_masked, + y, + sigma=y_err, + absolute_sigma=True) + ff_fit[:,idm,0] = fitf(x, *popt) + ff_fit[:,idm,1] = fitf(x, *popt + np.sqrt(np.diag(pcov))) + ff_fit[:,idm,2] = fitf(x, *popt - np.sqrt(np.diag(pcov))) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') - ff_hist= h.Weight() - ff_hist.view().value[...,0] = ff_val - ff_hist.view().value[...,1] = ff_val + np.sqrt(ff_err2) - ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) - ff_hist.name = name - ff_hist.label = label - ff_corr = cl_convert.from_histogram(ff_hist) - ff_corr.data.flow = "clamp" - return ff_corr, ff_hist + ff_fitted = h.Weight() + + ff_fitted.view().value = ff_fit + ff_fitted.name = name + ff_fitted.label = label + + ff_raw = ff_fitted.copy().reset() + ff_raw.view().value[...,0] = ff_val + ff_raw.view().variance[...,0] = ff_err2 + ff_raw.name = name + '_raw' + ff_raw.label = label + '_raw' + + + + + return ff_raw, ff_fitted - wj_corr, wj_h = get_ff_corr(self, + wj_raw, wj_fitted = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_wj', @@ -385,7 +411,7 @@ def rel_err(x): name='ff_wjets', label='Fake factor W+jets') - qcd_corr, qcd_h = get_ff_corr(self, + qcd_raw, qcd_fitted = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_qcd', @@ -393,16 +419,52 @@ def rel_err(x): name='ff_qcd', label='Fake factor QCD') + corr_list = [] + for h in [wj_raw, wj_fitted, qcd_raw, qcd_fitted]: + corr = cl_convert.from_histogram(h) + corr.data.flow = "clamp" + corr.version = 2 + corr_list.append(corr) + cset = correctionlib.schemav2.CorrectionSet( + schema_version=2, + description="Fake factors", + corrections=corr_list + ) + self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") for h_name in ['wj', 'qcd']: - the_hist = eval(f'{h_name}_h') + h_raw = eval(f'{h_name}_raw') + h_fitted = eval(f'{h_name}_fitted') - for syst in ['nominal','up','down']: - fig, ax = plt.subplots(figsize=(12, 8)) - the_hist[...,syst].plot2d(ax=ax) - self.output()['plots']['_'.join((h_name,syst))].dump(fig, formatter="mpl") + fig, ax = plt.subplots(figsize=(12, 8)) + h_raw[...,'nominal'].plot2d(ax=ax) + self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") + + dm_axis = h_raw.axes['tau_dm_pnet'] + for dm in dm_axis: + h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), + 'syst': hist.loc('nominal')}] + + hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") - self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") + fig, ax = plt.subplots(figsize=(8, 6)) + mask = h1d.counts() > 0 + x = h1d.axes[0].centers[mask] + y = h1d.counts()[mask] + xerr = 
(np.diff(h1d.axes[0]).flatten()/2.)[mask], + yerr = np.sqrt(h1d.variances()).flatten()[mask], + ax.errorbar(x, y, xerr = xerr, yerr = yerr, + label=f"PNet decay mode = {dm}", + marker='o', + fmt='o', + line=None, color='#2478B7', capsize=4) + ax.plot(hfit.axes[0].centers, + hfit[:,0].counts(), + color='#FF867B') + ax.fill_between(hfit.axes[0].centers, hfit[:,2].counts(), hfit[:,1].counts(), color='#83d55f', alpha=0.5) + ax.set_ylabel('Fake Factor') + ax.set_xlabel('Tau pT [GeV]') + ax.set_title(f'Jet Fake Factors (Tau PNet Decay Mode {(dm)}') + self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") @@ -457,158 +519,4 @@ def run(self): inputs = self.input() from IPython import embed; embed() # declare output: dict of histograms - histograms = {} - -# # run the weight_producer setup -# producer_reqs = self.weight_producer_inst.run_requires() -# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) - -# # create a temp dir for saving intermediate files -# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) -# tmp_dir.touch() - -# # get shift dependent aliases -# aliases = self.local_shift_inst.x("column_aliases", {}) - -# # define columns that need to be read -# read_columns = {Route("process_id")} -# read_columns |= set(map(Route, self.category_id_columns)) -# read_columns |= set(self.weight_producer_inst.used_columns) -# read_columns |= set(map(Route, aliases.values())) -# read_columns |= { -# Route(inp) -# for variable_inst in ( -# self.config_inst.get_variable(var_name) -# for var_name in law.util.flatten(self.variable_tuples.values()) -# ) -# for inp in (( -# {variable_inst.expression} -# if isinstance(variable_inst.expression, str) -# # for variable_inst with custom expressions, read columns declared via aux key -# else set(variable_inst.x("inputs", [])) -# ) | ( -# # for variable_inst with selection, read columns declared via aux key -# set(variable_inst.x("inputs", [])) -# if variable_inst.selection != "1" -# else set() -# )) -# } - -# # empty float array to use when input files have no entries -# empty_f32 = ak.Array(np.array([], dtype=np.float32)) - -# # iterate over chunks of events and diffs -# file_targets = [inputs["events"]["events"]] -# if self.producer_insts: -# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) -# # if self.ml_model_insts: -# # file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) - -# # prepare inputs for localization -# with law.localize_file_targets( -# [*file_targets, *reader_targets.values()], -# mode="r", -# ) as inps: -# for (events, *columns), pos in self.iter_chunked_io( -# [inp.abspath for inp in inps], -# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), -# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], -# chunk_size=self.weight_producer_inst.get_min_chunk_size(), -# ): -# # optional check for overlapping inputs -# if self.check_overlapping_inputs: -# self.raise_if_overlapping([events] + list(columns)) - -# # add additional columns -# events = update_ak_array(events, *columns) - -# # add aliases -# events = add_ak_aliases( -# events, -# aliases, -# remove_src=True, -# missing_strategy=self.missing_column_alias_strategy, -# ) - -# # build the full event weight without fake factors -# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): -# events, weight = self.weight_producer_inst(events) -# else: -# weight = ak.Array(np.ones(len(events), 
dtype=np.float32)) - -# # define and fill histograms, taking into account multiple axes -# for var_key, var_names in self.variable_tuples.items(): -# # get variable instances -# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - -# # create the histogram if not present yet -# if var_key not in histograms: -# for reg_key in ['ar_wj','ar_wj','ar_yields']: -# h = ( -# hist.Hist.new -# .IntCat([], name="process", growth=True) -# .IntCat([], name="shift", growth=True) -# ) -# # add variable axes -# for variable_inst in variable_insts: -# h = h.Var( -# variable_inst.bin_edges, -# name='_'.join((variable_inst.name, reg_key)) -# label=variable_inst.get_full_x_title(), -# ) -# # enable weights and store it -# histograms[var_key] = h.Weight() - -# # merge category ids -# category_ids = ak.concatenate( -# [Route(c).apply(events) for c in self.category_id_columns], -# axis=-1, -# ) - -# # broadcast arrays so that each event can be filled for all its categories -# fill_data = { -# "category": category_ids, -# "process": events.process_id, -# "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, -# "weight": weight, -# } -# for variable_inst in variable_insts: -# # prepare the expression -# expr = variable_inst.expression -# if isinstance(expr, str): -# route = Route(expr) -# def expr(events, *args, **kwargs): -# if len(events) == 0 and not has_ak_column(events, route): -# return empty_f32 -# return route.apply(events, null_value=variable_inst.null_value) -# # apply it -# fill_data[variable_inst.name] = expr(masked_events) - -# # fill it -# fill_hist( -# histograms[var_key], -# fill_data, -# last_edge_inclusive=self.last_edge_inclusive, -# ) - -# # merge output files -# self.output()["hists"].dump(histograms, formatter="pickle") - - -# # overwrite class defaults -# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( -# default=CreateHistograms.task_family in check_overlap_tasks, -# add_default_to_description=True, -# ) - - -# CreateHistogramsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=CreateHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) - - - + histograms = {} \ No newline at end of file diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index 070e9c49d..bfc316e9e 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -22,7 +22,6 @@ from columnflow.util import dev_sandbox from columnflow.hist_util import create_hist_from_variables - class CreateHistograms( VariablesMixin, WeightProducerMixin, @@ -143,6 +142,7 @@ def run(self): read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) + read_columns |= set(map(Route, [n +'*' for n in self.config_inst.x.fake_factor_method.columns])) read_columns |= set(map(Route, aliases.values())) read_columns |= { Route(inp) @@ -201,72 +201,85 @@ def run(self): # attach coffea behavior aiding functional variable expressions events = attach_coffea_behavior(events) - + # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) + categories = 
self.config_inst.categories.names() + sig_regs = [the_cat for the_cat in categories if 'sr' in the_cat] # define and fill histograms, taking into account multiple axes - for var_key, var_names in self.variable_tuples.items(): - # get variable instances - variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - if var_key not in histograms: - # create the histogram in the first chunk - histograms[var_key] = create_hist_from_variables( - *variable_insts, - int_cat_axes=("category", "process", "shift"), - ) - - # mask events and weights when selection expressions are found - masked_events = events - masked_weights = weight - for variable_inst in variable_insts: - sel = variable_inst.selection - if sel == "1": - continue - if not callable(sel): - raise ValueError( - f"invalid selection '{sel}', for now only callables are supported", + for sig_reg in sig_regs: + #iterate over the regions needed for calculation of the ff_method + for region in ["sr", "ar_wj", "ar_qcd", "ar_yields"]: + #by accessing the list of categories we check if the category with this name exists + cat = self.config_inst.get_category(sig_reg.replace('sr',region)) + if cat.name not in histograms.keys(): histograms[cat.name] = {} + for var_key, var_names in self.variable_tuples.items(): + # get variable instances + variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + if var_key not in histograms[cat.name].keys(): + # create the histogram in the first chunk + histograms[cat.name][var_key] = create_hist_from_variables( + *variable_insts, + int_cat_axes=("category", "process", "shift"), + ) + # mask events and weights when selection expressions are found + masked_events = events + + if region == 'ar_wj': + masked_weights = weight * events.ff_weight_wj_nominal + elif region == 'ar_qcd': + masked_weights = weight * events.ff_weight_qcd_nominal + else: + masked_weights = weight + for variable_inst in variable_insts: + sel = variable_inst.selection + if sel == "1": + continue + if not callable(sel): + raise ValueError( + f"invalid selection '{sel}', for now only callables are supported", + ) + mask = sel(masked_events) + #select only one category per histogram + masked_events = masked_events[mask] + masked_weights = masked_weights[mask] + + # merge category ids + category_ids = ak.concatenate( + [Route(c).apply(masked_events) for c in self.category_id_columns], + axis=-1, ) - mask = sel(masked_events) - masked_events = masked_events[mask] - masked_weights = masked_weights[mask] - - # merge category ids - category_ids = ak.concatenate( - [Route(c).apply(masked_events) for c in self.category_id_columns], - axis=-1, - ) - - # broadcast arrays so that each event can be filled for all its categories - fill_data = { - "category": category_ids, - "process": masked_events.process_id, - "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, - "weight": masked_weights, - } - for variable_inst in variable_insts: - # prepare the expression - expr = variable_inst.expression - if isinstance(expr, str): - route = Route(expr) - def expr(events, *args, **kwargs): - if len(events) == 0 and not has_ak_column(events, route): - return empty_f32 - return route.apply(events, null_value=variable_inst.null_value) - # apply it - fill_data[variable_inst.name] = expr(masked_events) - - # fill it - fill_hist( - histograms[var_key], - fill_data, - last_edge_inclusive=self.last_edge_inclusive, - ) + # broadcast arrays so that each event can be filled for all its 
categories + fill_data = { + "category": category_ids, + "process": masked_events.process_id, + "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, + "weight": masked_weights, + } + for variable_inst in variable_insts: + # prepare the expression + expr = variable_inst.expression + if isinstance(expr, str): + route = Route(expr) + def expr(events, *args, **kwargs): + if len(events) == 0 and not has_ak_column(events, route): + return empty_f32 + return route.apply(events, null_value=variable_inst.null_value) + # apply it + fill_data[variable_inst.name] = expr(masked_events) + # fill it + + fill_hist( + histograms[cat.name][var_key], + fill_data, + last_edge_inclusive=self.last_edge_inclusive, + ) # merge output files self.output()["hists"].dump(histograms, formatter="pickle") @@ -278,6 +291,261 @@ def expr(events, *args, **kwargs): add_default_to_description=True, ) +# class CreateHistograms( +# VariablesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# ReducedEventsUser, +# ChunkedIOMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# last_edge_inclusive = last_edge_inclusive_inst + +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # upstream requirements +# reqs = Requirements( +# ReducedEventsUser.reqs, +# RemoteWorkflow.reqs, +# ProduceColumns=ProduceColumns, +# MLEvaluation=MLEvaluation, +# ) + +# # strategy for handling missing source columns when adding aliases on event chunks +# missing_column_alias_strategy = "original" + +# # names of columns that contain category ids +# # (might become a parameter at some point) +# category_id_columns = {"category_ids"} + +# # register sandbox and shifts found in the chosen weight producer to this task +# register_weight_producer_sandbox = True +# register_weight_producer_shifts = True + +# @law.util.classproperty +# def mandatory_columns(cls) -> set[str]: +# return set(cls.category_id_columns) | {"process_id"} + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# # require the full merge forest +# reqs["events"] = self.reqs.ProvideReducedEvents.req(self) + +# if not self.pilot: +# if self.producer_insts: +# reqs["producers"] = [ +# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) +# for producer_inst in self.producer_insts +# if producer_inst.produced_columns +# ] +# if self.ml_model_insts: +# reqs["ml"] = [ +# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) +# for ml_model_inst in self.ml_model_insts +# ] + +# # add weight_producer dependent requirements +# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + +# return reqs + +# def requires(self): +# reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + +# if self.producer_insts: +# reqs["producers"] = [ +# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) +# for producer_inst in self.producer_insts +# if producer_inst.produced_columns +# ] +# if self.ml_model_insts: +# reqs["ml"] = [ +# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) +# for ml_model_inst in self.ml_model_insts +# ] + +# # add weight_producer dependent requirements +# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + +# return reqs + +# workflow_condition = ReducedEventsUser.workflow_condition.copy() + +# @workflow_condition.output +# def output(self): +# return {"hists": 
self.target(f"hist__vars_{self.variables_repr}__{self.branch}.pickle")} + +# @law.decorator.notify +# @law.decorator.log +# @law.decorator.localize(input=True, output=False) +# @law.decorator.safe_output +# def run(self): +# import numpy as np +# import awkward as ak +# from columnflow.columnar_util import ( +# Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, +# ) +# from columnflow.hist_util import fill_hist + +# # prepare inputs +# inputs = self.input() + +# # declare output: dict of histograms +# histograms = {} + +# # run the weight_producer setup +# producer_reqs = self.weight_producer_inst.run_requires() +# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + +# # create a temp dir for saving intermediate files +# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) +# tmp_dir.touch() + +# # get shift dependent aliases +# aliases = self.local_shift_inst.x("column_aliases", {}) + +# # define columns that need to be read +# read_columns = {Route("process_id")} +# read_columns |= set(map(Route, self.category_id_columns)) +# read_columns |= set(self.weight_producer_inst.used_columns) +# read_columns |= set(map(Route, aliases.values())) +# read_columns |= { +# Route(inp) +# for variable_inst in ( +# self.config_inst.get_variable(var_name) +# for var_name in law.util.flatten(self.variable_tuples.values()) +# ) +# for inp in (( +# {variable_inst.expression} +# if isinstance(variable_inst.expression, str) +# # for variable_inst with custom expressions, read columns declared via aux key +# else set(variable_inst.x("inputs", [])) +# ) | ( +# # for variable_inst with selection, read columns declared via aux key +# set(variable_inst.x("inputs", [])) +# if variable_inst.selection != "1" +# else set() +# )) +# } + +# # empty float array to use when input files have no entries +# empty_f32 = ak.Array(np.array([], dtype=np.float32)) + +# # iterate over chunks of events and diffs +# file_targets = [inputs["events"]["events"]] +# if self.producer_insts: +# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) +# if self.ml_model_insts: +# file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) + +# # prepare inputs for localization +# with law.localize_file_targets( +# [*file_targets, *reader_targets.values()], +# mode="r", +# ) as inps: +# for (events, *columns), pos in self.iter_chunked_io( +# [inp.abspath for inp in inps], +# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), +# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], +# chunk_size=self.weight_producer_inst.get_min_chunk_size(), +# ): +# # optional check for overlapping inputs +# if self.check_overlapping_inputs: +# self.raise_if_overlapping([events] + list(columns)) + +# # add additional columns +# events = update_ak_array(events, *columns) + +# # add aliases +# events = add_ak_aliases( +# events, +# aliases, +# remove_src=True, +# missing_strategy=self.missing_column_alias_strategy, +# ) + +# # attach coffea behavior aiding functional variable expressions +# events = attach_coffea_behavior(events) + +# # build the full event weight +# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): +# events, weight = self.weight_producer_inst(events) +# else: +# weight = ak.Array(np.ones(len(events), dtype=np.float32)) + +# # define and fill histograms, taking into account multiple axes +# for var_key, var_names in self.variable_tuples.items(): +# # 
get variable instances +# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + +# if var_key not in histograms: +# # create the histogram in the first chunk +# histograms[var_key] = create_hist_from_variables( +# *variable_insts, +# int_cat_axes=("category", "process", "shift"), +# ) + +# # mask events and weights when selection expressions are found +# masked_events = events +# masked_weights = weight +# for variable_inst in variable_insts: +# sel = variable_inst.selection +# if sel == "1": +# continue +# if not callable(sel): +# raise ValueError( +# f"invalid selection '{sel}', for now only callables are supported", +# ) +# mask = sel(masked_events) +# masked_events = masked_events[mask] +# masked_weights = masked_weights[mask] + +# # merge category ids +# category_ids = ak.concatenate( +# [Route(c).apply(masked_events) for c in self.category_id_columns], +# axis=-1, +# ) + +# # broadcast arrays so that each event can be filled for all its categories +# fill_data = { +# "category": category_ids, +# "process": masked_events.process_id, +# "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, +# "weight": masked_weights, +# } +# for variable_inst in variable_insts: +# # prepare the expression +# expr = variable_inst.expression +# if isinstance(expr, str): +# route = Route(expr) +# def expr(events, *args, **kwargs): +# if len(events) == 0 and not has_ak_column(events, route): +# return empty_f32 +# return route.apply(events, null_value=variable_inst.null_value) +# # apply it +# fill_data[variable_inst.name] = expr(masked_events) + +# # fill it +# fill_hist( +# histograms[var_key], +# fill_data, +# last_edge_inclusive=self.last_edge_inclusive, +# ) + +# # merge output files +# self.output()["hists"].dump(histograms, formatter="pickle") + + +# # overwrite class defaults +# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( +# default=CreateHistograms.task_family in check_overlap_tasks, +# add_default_to_description=True, +# ) + CreateHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -384,18 +652,45 @@ def run(self): for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) ] + cats = list(hists[0].keys()) + variable_names = list(hists[0][cats[0]].keys()) + get_hists = lambda hists, cat, var : [h[cat][var] for h in hists] # create a separate file per output variable - variable_names = list(hists[0].keys()) for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): - self.publish_message(f"merging histograms for '{variable_name}'") - - variable_hists = [h[variable_name] for h in hists] - merged = sum(variable_hists[1:], variable_hists[0].copy()) - outputs["hists"][variable_name].dump(merged, formatter="pickle") + merged_hists = {} + for the_cat in cats: + self.publish_message(f"merging histograms for {variable_name}, category: {the_cat}") + variable_hists = get_hists(hists, the_cat, variable_name) + merged_hists[the_cat] = sum(variable_hists[1:], variable_hists[0].copy()) + outputs["hists"][variable_name].dump(merged_hists, formatter="pickle") # optionally remove inputs if self.remove_previous: inputs.remove() + + # def run(self): + # # preare inputs and outputs + # inputs = self.input()["collection"] + # outputs = self.output() + + # # load input histograms + # hists = [ + # inp["hists"].load(formatter="pickle") + 
# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) + # ] + + # # create a separate file per output variable + # variable_names = list(hists[0].keys()) + # for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): + # self.publish_message(f"merging histograms for '{variable_name}'") + + # variable_hists = [h[variable_name] for h in hists] + # merged = sum(variable_hists[1:], variable_hists[0].copy()) + # outputs["hists"][variable_name].dump(merged, formatter="pickle") + + # # optionally remove inputs + # if self.remove_previous: + # inputs.remove() MergeHistogramsWrapper = wrapper_factory( diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index 6709d3fbb..d15a18cec 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -111,43 +111,52 @@ def run(self): for dataset, inp in self.input().items(): dataset_inst = self.config_inst.get_dataset(dataset) h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") - + # loop and extract one histogram per process - for process_inst in process_insts: - # skip when the dataset is already known to not contain any sub process - if not any( - dataset_inst.has_process(sub_process_inst.name) - for sub_process_inst in sub_process_insts[process_inst] - ): - continue - - # select processes and reduce axis - h = h_in.copy() - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts[process_inst] - if p.id in h.axes["process"] - ], - }] - h = h[{"process": sum}] - - # add the histogram - if process_inst in hists: - hists[process_inst] += h - else: - hists[process_inst] = h - + for region in h_in.keys(): + if region not in hists: hists[region] = {} + for process_inst in process_insts: + # skip when the dataset is already known to not contain any sub process + if not any( + dataset_inst.has_process(sub_process_inst.name) + for sub_process_inst in sub_process_insts[process_inst] + ): + continue + + # select processes and reduce axis + h = h_in[region].copy() + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts[process_inst] + if p.id in h.axes["process"] + ], + }] + h = h[{"process": sum}] + + # add the histogram + if process_inst in hists[region]: + hists[region][process_inst] += h + else: + hists[region][process_inst] = h + + # there should be hists to plot + if not hists: raise Exception( "no histograms found to plot; possible reasons:\n" " - requested variable requires columns that were missing during histogramming\n" " - selected --processes did not match any value on the process axis of the input histogram", ) - - # update histograms using custom hooks - hists = self.invoke_hist_hooks(hists) + + if 'sr' in category_inst.name: + hists = self.invoke_hist_hooks(hists) + else: + if category_inst.name in hists.keys(): + hists = hists[category_inst.name] + else: + hists[list(hists.keys())[0]] # add new processes to the end of the list for process_inst in hists: From e2e2ef4838647bc5988cacc44115d0d057378f87 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 12 Feb 2025 12:45:36 +0100 Subject: [PATCH 12/26] Minor code updates --- columnflow/columnar_util.py | 20 +++++++++++--------- columnflow/hist_util.py | 11 +++++++++-- columnflow/production/normalization.py | 8 +++++--- columnflow/tasks/yields.py | 2 +- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index a7b3c5ebe..171ab3661 100644 --- 
a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -14,6 +14,7 @@ import math import time import enum + import inspect import threading import multiprocessing @@ -40,6 +41,7 @@ maybe_import("coffea.nanoevents.methods.base") maybe_import("coffea.nanoevents.methods.nanoaod") pq = maybe_import("pyarrow.parquet") +hist = maybe_import("hist") # loggers @@ -1405,15 +1407,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - # if 'event' in data.keys(): - # arrays = {} - # for ax_name in axis_names: - # if ax_name in data.keys(): - # arrays[ax_name] = data[ax_name] - # h.fill(**fill_kwargs, **arrays) - # else: - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) class RouteFilter(object): diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py index 3c2b60ca6..92a9ed42a 100644 --- a/columnflow/hist_util.py +++ b/columnflow/hist_util.py @@ -72,8 +72,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) def add_hist_axis(histogram: hist.Hist, variable_inst: od.Variable) -> hist.Hist: diff --git a/columnflow/production/normalization.py b/columnflow/production/normalization.py index 66616ac7e..2144a52be 100644 --- a/columnflow/production/normalization.py +++ b/columnflow/production/normalization.py @@ -339,7 +339,8 @@ def normalization_weights_setup( # fill the process weight table for proc_id, br in branching_ratios.items(): - sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(proc_id)] + #sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(proc_id)] + sum_weights = self.dataset_inst.n_events process_weight_table[0, proc_id] = lumi * inclusive_xsec * br / sum_weights else: # fill the process weight table with per-process cross sections @@ -349,10 +350,9 @@ def normalization_weights_setup( f"no cross section registered for process {process_inst} for center-of-mass " f"energy of {self.config_inst.campaign.ecm}", ) - sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] + #sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] #quick fix that need to be fixed ################################ - #n_evt_per_file = /self.dataset_inst.n_files sum_weights = self.dataset_inst.n_events ################################ xsec = process_inst.get_xsec(self.config_inst.campaign.ecm).nominal @@ -401,3 +401,5 @@ def normalization_weights_init(self: Producer) -> None: "get_xsecs_from_inclusive_dataset": False, }, ) + + diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index e7d26ca57..9de6a31cc 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -143,7 +143,7 @@ def 
run(self): dataset_inst = self.config_inst.get_dataset(dataset) # load the histogram of the variable named "event" - h_in = inp["hists"]["event"].load(formatter="pickle") + input_hists = inp["hists"]["event"].load(formatter="pickle") # loop and extract one histogram per process for process_inst in process_insts: From b3cc09f3ecf8f418ee82464c4465654bb516c506 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Fri, 14 Feb 2025 11:14:36 +0100 Subject: [PATCH 13/26] Fake_factor method update --- columnflow/tasks/data_driven_methods.py | 187 +++++++++++++++++------- 1 file changed, 138 insertions(+), 49 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 4a7367521..b8228b361 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -225,11 +225,23 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) +class dict_creator(): + def init_dict(self, ax_list): + if not ax_list: + return -1. + else: + ax = ax_list[0] + updated_ax = ax_list[1:] + get_ax_dict = lambda ax, ax_list, func : {ax.bin(i): func(ax_list) for i in range(ax.size)} + return get_ax_dict(ax,updated_ax, self.init_dict) + + class ComputeFakeFactors( DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, ProducersMixin, + dict_creator, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -279,7 +291,7 @@ def requires(self): d: self.reqs.PrepareFakeFactorHistograms.req( self, dataset=d, - branch=-1, + branch=-1 ) for d in self.datasets } @@ -297,9 +309,15 @@ def run(self): import hist import numpy as np from scipy.optimize import curve_fit + from scipy.special import erf import matplotlib.pyplot as plt - import correctionlib - import correctionlib.convert as cl_convert + import correctionlib.schemav2 as cs + plt.figure(dpi=200) + plt.rcParams.update({ + "text.usetex": True, + "font.family": "monospace", + "font.monospace": 'Computer Modern Typewriter' + }) # preare inputs and outputs inputs = self.input() outputs = self.output() @@ -346,6 +364,8 @@ def get_dr_hist(self, h, det_reg): cat_name = self.categories[0] cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] + + get_id = lambda ax, key: [i in enumerate(ax.keys)] data_num = get_dr_hist(self, h_data, num_reg) data_den = get_dr_hist(self, h_data, den_reg) @@ -360,50 +380,72 @@ def get_dr_hist(self, h, det_reg): def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) - ff_err2 = np.abs(1./den) * (data_num.variances()**0.5 + mc_num.variances()**0.5) + np.abs(num)/(den**2) * (data_den.variances()**0.5 + mc_den.variances()**0.5) - - def fitf(x, a, b): - return a + b * x - #make interpolation of the ff values - ipt_range = ff_val.shape[0] - x = data_num.axes[0].centers + ff_err = ff_val * ((data_num.variances() + mc_num.variances())**0.5 / np.abs(num) + (data_den.variances() + mc_den.variances())**0.5 / np.abs(den)) + - ff_fit = np.zeros((*np.shape(ff_val),3)) - for idm in range(ff_val.shape[1]): - mask = ff_val[:,idm] > 0 - y = ff_val[mask,idm] - y_err = ff_err2[mask,idm] - x_masked = x[mask] - popt, pcov = curve_fit(fitf, - x_masked, - y, - sigma=y_err, - absolute_sigma=True) - ff_fit[:,idm,0] = fitf(x, *popt) - ff_fit[:,idm,1] = fitf(x, *popt + np.sqrt(np.diag(pcov))) - ff_fit[:,idm,2] = fitf(x, *popt - np.sqrt(np.diag(pcov))) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h 
= eval(f'h.{var_axis.ax_str}') h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') - ff_fitted = h.Weight() - - ff_fitted.view().value = ff_fit - ff_fitted.name = name - ff_fitted.label = label - - ff_raw = ff_fitted.copy().reset() + ff_raw = h.Weight() ff_raw.view().value[...,0] = ff_val - ff_raw.view().variance[...,0] = ff_err2 + ff_raw.view().variance[...,0] = ff_err**2 ff_raw.name = name + '_raw' ff_raw.label = label + '_raw' + #Make an approximation of tau pt dependance + formula_str = 'p0 + p1*x+p2*x*x' + def fitf(x, p0, p1, p2): + return eval(formula_str) + def jac(x): + from numpy import array + out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) + return out + + def eval_formula(formula_str, popt): + for i,p in enumerate(popt): + formula_str = formula_str.replace(f'p{i}',str(popt[i])) + return formula_str + + ff_fitted = ff_raw.copy().reset() + ff_fitted.name = name + ff_fitted.label = label + fitres = {} - + axes = list(ff_raw.axes[1:2]) + fitres = {} + dc = dict_creator() + for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str']: + fitres[the_field]= dc.init_dict(axes) - return ff_raw, ff_fitted + dm_axis = ff_raw.axes['tau_dm_pnet'] + for dm in dm_axis: + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), + 'syst': hist.loc('nominal')}] + mask = h1d.values() > 0 + y = h1d.values()[mask] + y_err = (h1d.variances()[mask])**0.5 + x = h1d.axes[0].centers[mask] + popt, pcov = curve_fit(fitf,x,y, + sigma=y_err, + absolute_sigma=True, + ) + fitres['chi2'][dm] = sum(((y - fitf(x, *popt))/y_err)**2) + fitres['ndf'][dm] = len(y) - len(popt) + fitres['popt'][dm] = popt + fitres['pcov'][dm] = pcov + + fitres['fitf_str'][dm] = eval_formula(formula_str,popt) + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 + ff_fitted.view().value[:, + ff_fitted.axes[1].index(dm), + ff_fitted.axes[2].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) + fitres['name'] = name + fitres['jac'] = jac + fitres['fitf'] = fitf + return ff_raw, ff_fitted, fitres - wj_raw, wj_fitted = get_ff_corr(self, + wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_wj', @@ -411,7 +453,7 @@ def fitf(x, a, b): name='ff_wjets', label='Fake factor W+jets') - qcd_raw, qcd_fitted = get_ff_corr(self, + qcd_raw, qcd_fitted, qcd_fitres = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_qcd', @@ -420,17 +462,46 @@ def fitf(x, a, b): label='Fake factor QCD') corr_list = [] - for h in [wj_raw, wj_fitted, qcd_raw, qcd_fitted]: - corr = cl_convert.from_histogram(h) - corr.data.flow = "clamp" - corr.version = 2 - corr_list.append(corr) - cset = correctionlib.schemav2.CorrectionSet( + for fitres in [wj_fitres, qcd_fitres]: + formula_str = fitres['fitf_str'] + dm_bins = [] + for (dm, the_formula) in formula_str.items(): + x_max = 100 + last_val = fitres['fitf'](x_max,* fitres['popt'][dm]) + + dm_bins.append(cs.CategoryItem( + key=dm, + value=cs.Formula( + nodetype="formula", + variables=["tau_pt"], + parser="TFormula", + expression=f'({the_formula})/(1. + exp(10.*(x-{x_max}))) + ({last_val})/(1. 
+ exp(-10.*(x-{x_max})))', + ))) + corr_list.append(cs.Correction( + name=fitres['name'], + description=f"fake factor correcton for {fitres['name'].split('_')[1]}", + version=2, + inputs=[ + cs.Variable(name="tau_pt", type="real",description="pt of tau"), + cs.Variable(name="tau_dm_pnet", type="int", description="PNet decay mode of tau"), + ], + output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), + data=cs.Category( + nodetype="category", + input="tau_dm_pnet", + content=dm_bins,) + )) + + cset = cs.CorrectionSet( schema_version=2, description="Fake factors", corrections=corr_list ) self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") + + + + #Plot fake factors: for h_name in ['wj', 'qcd']: h_raw = eval(f'{h_name}_raw') h_fitted = eval(f'{h_name}_fitted') @@ -438,14 +509,12 @@ def fitf(x, a, b): fig, ax = plt.subplots(figsize=(12, 8)) h_raw[...,'nominal'].plot2d(ax=ax) self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") - + fitres = wj_fitres if h_name == 'wj' else qcd_fitres dm_axis = h_raw.axes['tau_dm_pnet'] for dm in dm_axis: h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), 'syst': hist.loc('nominal')}] - hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - fig, ax = plt.subplots(figsize=(8, 6)) mask = h1d.counts() > 0 x = h1d.axes[0].centers[mask] @@ -457,13 +526,33 @@ def fitf(x, a, b): marker='o', fmt='o', line=None, color='#2478B7', capsize=4) - ax.plot(hfit.axes[0].centers, - hfit[:,0].counts(), + x_fine = np.linspace(x[0],x[-1],num=100) + popt = fitres['popt'][dm] + pcov = fitres['pcov'][dm] + jac = fitres['jac'] + def err(x,jac,pcov): + from numpy import sqrt,einsum + return sqrt(einsum('ij,ij',jac(x),pcov)) + + import functools + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + + y_fitf = fitres['fitf'](x_fine,*popt) + y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y + y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y + + ax.plot(x_fine, + y_fitf, color='#FF867B') - ax.fill_between(hfit.axes[0].centers, hfit[:,2].counts(), hfit[:,1].counts(), color='#83d55f', alpha=0.5) + ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) ax.set_ylabel('Fake Factor') ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors (Tau PNet Decay Mode {(dm)}') + ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm],2)}}}{{{fitres['ndf'][dm]}}}$", + (0.8, 0.9), + xycoords='axes fraction', + fontsize=20) + self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") From 766350c32d2308c3cb8e9820519129e271b98533 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Thu, 27 Feb 2025 09:45:00 +0100 Subject: [PATCH 14/26] Update on the fake factor method and plotting, jets.py form columnflow didn't work, so I commented them out --- columnflow/calibration/cms/jets.py | 2188 +++++++++++----------- columnflow/plotting/plot_functions_1d.py | 2 +- columnflow/tasks/data_driven_methods.py | 8 +- columnflow/tasks/framework/mixins.py | 4 +- columnflow/tasks/histograms.py | 345 +--- columnflow/tasks/plotting.py | 19 +- 6 files changed, 1155 insertions(+), 1411 deletions(-) diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py index 32c7c816b..20e600fa3 100644 --- a/columnflow/calibration/cms/jets.py +++ b/columnflow/calibration/cms/jets.py @@ -1,1091 +1,1109 @@ -# coding: utf-8 - -""" -Jet energy corrections and jet resolution smearing. 
-""" -from pprint import pprint - -import functools - -import law - -from columnflow.types import Any -from columnflow.calibration import Calibrator, calibrator -from columnflow.calibration.util import ak_random, propagate_met -from columnflow.production.util import attach_coffea_behavior -from columnflow.util import maybe_import, InsertableDict, DotDict -from columnflow.columnar_util import set_ak_column, layout_ak_array, optional_column as optional - -np = maybe_import("numpy") -ak = maybe_import("awkward") -correctionlib = maybe_import("correctionlib") - -logger = law.logger.get_logger(__name__) - - -# -# helper functions -# - -set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) - - -import difflib - -def get_evaluators( - correction_set: correctionlib.highlevel.CorrectionSet, - names: list[str], -) -> list[Any]: - """ - Helper function to get a list of correction evaluators from a - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` object given - a list of *names*. The *names* can refer to either simple or compound - corrections. - - :param correction_set: evaluator provided by :external+correctionlib:doc:`index` - :param names: List of names of corrections to be applied - :raises RuntimeError: If a requested correction in *names* is not available - :return: List of compounded corrections, see - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - """ - available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) - corrected_names = [] - - for name in names: - if name not in available_keys: - # Find the closest match using difflib - closest_matches = difflib.get_close_matches(name, available_keys, n=1) - if closest_matches: - closest_match = closest_matches[0] - print( - f"Correction '{name}' not found. Using closest match: '{closest_match}'", - ) - corrected_names.append(closest_match) - else: - raise RuntimeError(f"Correction '{name}' not found and no close match available.") - else: - corrected_names.append(name) - - # Retrieve the evaluators - return [ - correction_set.compound[name] - if name in correction_set.compound - else correction_set[name] - for name in corrected_names - ] - -def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: - """ - Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` - using one or more :external+ak:py:class:`awkward arrays ` as inputs. 
- - :param evaluator: Evaluator instance - :raises ValueError: If no :external+ak:py:class:`awkward arrays ` are provided - :return: The correction factor derived from the input arrays - """ - # fail if no arguments - if not args: - raise ValueError("Expected at least one argument.") - - # collect arguments that are awkward arrays - ak_args = [ - arg for arg in args if isinstance(arg, ak.Array) - ] - - # broadcast akward arrays together and flatten - if ak_args: - bc_args = ak.broadcast_arrays(*ak_args) - flat_args = ( - np.asarray(ak.flatten(bc_arg, axis=None)) - for bc_arg in bc_args - ) - output_layout_array = bc_args[0] - else: - flat_args = iter(()) - output_layout_array = None - - # multiplex flattened and non-awkward inputs - all_flat_args = [ - next(flat_args) if isinstance(arg, ak.Array) else arg - for arg in args - ] - - # apply evaluator to flattened/multiplexed inputs - result = evaluator.evaluate(*all_flat_args) - - # apply broadcasted layout to result - if output_layout_array is not None: - result = layout_ak_array(result, output_layout_array) - - return result - - -# -# jet energy corrections -# - -# define default functions for jec calibrator -def get_jerc_file_default(self: Calibrator, external_files: DotDict) -> str: - """ - Function to obtain external correction files for JEC and/or JER. - - By default, this function extracts the location of the jec correction - files from the current config instance *config_inst*. The key of the - external file depends on the jet collection. For ``Jet`` (AK4 jets), this - resolves to ``jet_jerc``, and for ``FatJet`` it is resolved to - ``fat_jet_jerc``. - - .. code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - :param external_files: Dictionary containing the information about the file location - :return: path or url to correction file(s) - """ # noqa - - # get config - try_attrs = ("get_jec_config", "get_jer_config") - jerc_config = None - for try_attr in try_attrs: - try: - jerc_config = getattr(self, try_attr)() - except AttributeError: - continue - else: - break - - # fail if not found - if jerc_config is None: - raise ValueError( - "could not retrieve jer/jec config, none of the following methods " - f"were found: {try_attrs}", - ) - - # first check config for user-supplied `external_file_key` - ext_file_key = jerc_config.get("external_file_key", None) - if ext_file_key is not None: - return external_files[ext_file_key] - - # if not found, try to resolve from jet collection name and fail if not standard NanoAOD - if self.jet_name not in get_jerc_file_default.map_jet_name_file_key: - available_keys = ", ".join(sorted(get_jerc_file_default.map_jet_name_file_key)) - raise ValueError( - f"could not determine external file key for jet collection '{self.jet_name}', " - f"name is not one of standard NanoAOD jet collections: {available_keys}", - ) - - # return external file - ext_file_key = get_jerc_file_default.map_jet_name_file_key[self.jet_name] - return external_files[ext_file_key] - - -# default external file keys for known jet collections -get_jerc_file_default.map_jet_name_file_key = { - "Jet": "jet_jerc", - "FatJet": "fat_jet_jerc", -} - - -def get_jec_config_default(self: Calibrator) -> DotDict: - """ - Load config relevant to the jet energy corrections (JEC). 
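# Hedged illustration of the auxiliary config entries the helpers above read,
# assuming "cfg" is the analysis' order.Config instance; all paths and the
# "my_jerc" key are placeholders. A per-collection "external_file_key"
# overrides the default jet_jerc / fat_jet_jerc file lookup.
from columnflow.util import DotDict

cfg.x.external_files = DotDict.wrap({
    "jet_jerc": "/path/to/jsonpog-integration/POG/JME/2017_UL/jet_jerc.json.gz",  # placeholder
    "my_jerc": "/path/to/custom_corrections.json.gz",                             # placeholder
})
cfg.x.jec = {
    "Jet": {
        "campaign": "Summer19UL17",  # placeholder values
        "version": "V5",
        "jet_type": "AK4PFchs",
        "levels": ["L1L2L3Res"],
        # optional: point this collection at a non-default external file
        "external_file_key": "my_jerc",
    },
}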
- - By default, this is extracted from the current *config_inst*, - assuming the JEC configurations are stored under the 'jec' - aux key. Separate configurations should be specified for each - jet collection, using the collection name as a key. For example, - the configuration for the default jet collection ``Jet`` will - be retrieved from the following config entry: - - .. code-block:: python - - self.config_inst.x.jec.Jet - - Used in :py:meth:`~.jec.setup_func`. - - :return: Dictionary containing configuration for jet energy calibration - """ - jec_cfg = self.config_inst.x.jec - - # check for old-style config - if self.jet_name not in jec_cfg: - # if jet collection is `Jet`, issue deprecation warning - if self.jet_name == "Jet": - logger.warning_once( - f"{id(self)}_depr_jec_config", - "config aux 'jec' does not contain key for input jet " - f"collection '{self.jet_name}'. This may be due to " - "an outdated config. Continuing under the assumption that " - "the entire 'jec' entry refers to this jet collection. " - "This assumption will be removed in future versions of " - "columnflow, so please adapt the config according to the " - "documentation to remove this warning and ensure future " - "compatibility of the code.", - ) - return jec_cfg - - # otherwise raise exception - raise ValueError( - "config aux 'jec' does not contain key for input jet " - f"collection '{self.jet_name}'.", - ) - - return jec_cfg[self.jet_name] - - -@calibrator( - uses={ - optional("fixedGridRhoFastjetAll"), - optional("Rho.fixedGridRhoFastjetAll"), - attach_coffea_behavior, - }, - # name of the jet collection to calibrate - jet_name="Jet", - # name of the associated MET collection - met_name="MET", - # name of the associated Raw MET collection - raw_met_name="RawMET", - # custom uncertainty sources, defaults to config when empty - uncertainty_sources=None, - # toggle for propagation to PuppiMET - propagate_met=True, - # # function to determine the correction file - get_jec_file=get_jec_file_default, - # # function to determine the jec configuration dict - get_jec_config=get_jec_config_default, -) - -def jec( - self: Calibrator, - events: ak.Array, - min_pt_met_prop: float = 15.0, - max_eta_met_prop: float = 5.2, - **kwargs, -) -> ak.Array: - """Performs the jet energy corrections (JECs) and uncertainty shifts using the - :external+correctionlib:doc:`index`, optionally - propagating the changes to the PuppiMET. - - The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate - (default: ``Jet``, i.e. AK4 jets). - - Requires an external file in the config pointing to the JSON files containing the JECs. - The file key can be specified via an optional ``external_file_key`` in the ``jec`` config entry. - If not given, the file key will be determined automatically based on the jet collection name: - ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files - can be specified as: - - .. 
code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - For more file-grained control, the *get_jec_file* can be adapted in a subclass in case it is stored - differently in the external files - - The JEC configuration should be an auxiliary entry in the config, specifying the correction - details under "jec". Separate configs should be given for each jet collection to calibrate, - using the jet collection name as a subkey. An example of a valid configuration for correction - AK4 jets with JEC is: - - .. code-block:: python - - cfg.x.jec = { - "Jet": { - "campaign": "Summer19UL17", - "version": "V5", - "jet_type": "AK4PFchs", - "levels": ["L1L2L3Res"], # or individual correction levels - "levels_for_type1_met": ["L1FastJet"], - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ] - }, - } - - *get_jec_config* can be adapted in a subclass in case it is stored differently in the config. - - If running on data, the datasets must have an auxiliary field *jec_era* defined, e.g. "RunF", - or an auxiliary field *era*, e.g. "F". - - This instance of :py:class:`~columnflow.calibration.Calibrator` is - initialized with the following parameters by default: - - :param events: awkward array containing events to process - - :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values - to the missing transverse energy (PuppiMET) using - :py:func:`~columnflow.calibration.util.propagate_met` for events where - ``met.pt > *min_pt_met_prop*``. - :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet - values to the missing transverse energy (PuppiMET) using - :py:func:`~columnflow.calibration.util.propagate_met` for events where - ``met.eta > *min_eta_met_prop*``. - """ # noqa +# # coding: utf-8 + +# """ +# Jet energy corrections and jet resolution smearing. +# """ +# from pprint import pprint + +# import functools + +# import law + +# from columnflow.types import Any +# from columnflow.calibration import Calibrator, calibrator +# from columnflow.calibration.util import ak_random, propagate_met +# from columnflow.production.util import attach_coffea_behavior +# from columnflow.util import maybe_import, InsertableDict, DotDict +# from columnflow.columnar_util import set_ak_column, layout_ak_array, optional_column as optional + +# np = maybe_import("numpy") +# ak = maybe_import("awkward") +# correctionlib = maybe_import("correctionlib") + +# logger = law.logger.get_logger(__name__) + + +# # +# # helper functions +# # + +# set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) + + +# import difflib + +# def get_evaluators( +# correction_set: correctionlib.highlevel.CorrectionSet, +# names: list[str], +# ) -> list[Any]: +# """ +# Helper function to get a list of correction evaluators from a +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` object given +# a list of *names*. The *names* can refer to either simple or compound +# corrections. 
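# A hedged, standalone sketch of the simple-vs-compound lookup described above,
# with a difflib-based hint when the requested name is unknown. The file name
# is a placeholder; the patched helper in this series instead substitutes the
# closest match and continues.
import difflib
import correctionlib

cset = correctionlib.CorrectionSet.from_file("jet_jerc.json.gz")  # placeholder path

def lookup(name: str):
    available = list(cset.keys()) + list(cset.compound.keys())
    if name not in available:
        close = difflib.get_close_matches(name, available, n=1)
        hint = f", closest match: '{close[0]}'" if close else ""
        raise RuntimeError(f"correction '{name}' not found{hint}")
    # compound corrections (e.g. chained JEC levels) live in a separate mapping
    return cset.compound[name] if name in cset.compound else cset[name]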
+ +# :param correction_set: evaluator provided by :external+correctionlib:doc:`index` +# :param names: List of names of corrections to be applied +# :raises RuntimeError: If a requested correction in *names* is not available +# :return: List of compounded corrections, see +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# """ +# available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) +# corrected_names = [] + +# for name in names: +# if name not in available_keys: +# # Find the closest match using difflib +# closest_matches = difflib.get_close_matches(name, available_keys, n=1) +# if closest_matches: +# closest_match = closest_matches[0] +# print( +# f"Correction '{name}' not found. Using closest match: '{closest_match}'", +# ) +# corrected_names.append(closest_match) +# else: +# raise RuntimeError(f"Correction '{name}' not found and no close match available.") +# else: +# corrected_names.append(name) + +# # Retrieve the evaluators +# return [ +# correction_set.compound[name] +# if name in correction_set.compound +# else correction_set[name] +# for name in corrected_names +# ] + +# def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: +# """ +# Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` +# using one or more :external+ak:py:class:`awkward arrays ` as inputs. + +# :param evaluator: Evaluator instance +# :raises ValueError: If no :external+ak:py:class:`awkward arrays ` are provided +# :return: The correction factor derived from the input arrays +# """ +# # fail if no arguments +# if not args: +# raise ValueError("Expected at least one argument.") + +# # collect arguments that are awkward arrays +# ak_args = [ +# arg for arg in args if isinstance(arg, ak.Array) +# ] + +# # broadcast akward arrays together and flatten +# if ak_args: +# bc_args = ak.broadcast_arrays(*ak_args) +# flat_args = ( +# np.asarray(ak.flatten(bc_arg, axis=None)) +# for bc_arg in bc_args +# ) +# output_layout_array = bc_args[0] +# else: +# flat_args = iter(()) +# output_layout_array = None + +# # multiplex flattened and non-awkward inputs +# all_flat_args = [ +# next(flat_args) if isinstance(arg, ak.Array) else arg +# for arg in args +# ] + +# # apply evaluator to flattened/multiplexed inputs +# result = evaluator.evaluate(*all_flat_args) + +# # apply broadcasted layout to result +# if output_layout_array is not None: +# result = layout_ak_array(result, output_layout_array) + +# return result + + +# # +# # jet energy corrections +# # +# def get_jec_file_default(self, external_files: DotDict) -> str: +# """ +# Function to obtain external jec files. + +# By default, this function extracts the location of the jec correction +# files from the current config instance *config_inst*: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# }) + +# :param external_files: Dictionary containing the information about the file location +# :return: path or url to correction file(s) +# """ # noqa +# return external_files.jet_jerc + + +# # define default functions for jec calibrator +# def get_jerc_file_default(self: Calibrator, external_files: DotDict) -> str: +# """ +# Function to obtain external correction files for JEC and/or JER. + +# By default, this function extracts the location of the jec correction +# files from the current config instance *config_inst*. 
The key of the +# external file depends on the jet collection. For ``Jet`` (AK4 jets), this +# resolves to ``jet_jerc``, and for ``FatJet`` it is resolved to +# ``fat_jet_jerc``. + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# :param external_files: Dictionary containing the information about the file location +# :return: path or url to correction file(s) +# """ # noqa + +# # get config +# try_attrs = ("get_jec_config", "get_jer_config") +# jerc_config = None +# for try_attr in try_attrs: +# try: +# jerc_config = getattr(self, try_attr)() +# except AttributeError: +# continue +# else: +# break + +# # fail if not found +# if jerc_config is None: +# raise ValueError( +# "could not retrieve jer/jec config, none of the following methods " +# f"were found: {try_attrs}", +# ) + +# # first check config for user-supplied `external_file_key` +# ext_file_key = jerc_config.get("external_file_key", None) +# if ext_file_key is not None: +# return external_files[ext_file_key] + +# # if not found, try to resolve from jet collection name and fail if not standard NanoAOD +# if self.jet_name not in get_jerc_file_default.map_jet_name_file_key: +# available_keys = ", ".join(sorted(get_jerc_file_default.map_jet_name_file_key)) +# raise ValueError( +# f"could not determine external file key for jet collection '{self.jet_name}', " +# f"name is not one of standard NanoAOD jet collections: {available_keys}", +# ) + +# # return external file +# ext_file_key = get_jerc_file_default.map_jet_name_file_key[self.jet_name] +# return external_files[ext_file_key] + + +# # default external file keys for known jet collections +# get_jerc_file_default.map_jet_name_file_key = { +# "Jet": "jet_jerc", +# "FatJet": "fat_jet_jerc", +# } + + +# def get_jec_config_default(self: Calibrator) -> DotDict: +# """ +# Load config relevant to the jet energy corrections (JEC). + +# By default, this is extracted from the current *config_inst*, +# assuming the JEC configurations are stored under the 'jec' +# aux key. Separate configurations should be specified for each +# jet collection, using the collection name as a key. For example, +# the configuration for the default jet collection ``Jet`` will +# be retrieved from the following config entry: + +# .. code-block:: python + +# self.config_inst.x.jec.Jet + +# Used in :py:meth:`~.jec.setup_func`. + +# :return: Dictionary containing configuration for jet energy calibration +# """ +# jec_cfg = self.config_inst.x.jec + +# # check for old-style config +# if self.jet_name not in jec_cfg: +# # if jet collection is `Jet`, issue deprecation warning +# if self.jet_name == "Jet": +# logger.warning_once( +# f"{id(self)}_depr_jec_config", +# "config aux 'jec' does not contain key for input jet " +# f"collection '{self.jet_name}'. This may be due to " +# "an outdated config. Continuing under the assumption that " +# "the entire 'jec' entry refers to this jet collection. 
" +# "This assumption will be removed in future versions of " +# "columnflow, so please adapt the config according to the " +# "documentation to remove this warning and ensure future " +# "compatibility of the code.", +# ) +# return jec_cfg + +# # otherwise raise exception +# raise ValueError( +# "config aux 'jec' does not contain key for input jet " +# f"collection '{self.jet_name}'.", +# ) + +# return jec_cfg[self.jet_name] + + +# @calibrator( +# uses={ +# optional("fixedGridRhoFastjetAll"), +# optional("Rho.fixedGridRhoFastjetAll"), +# attach_coffea_behavior, +# }, +# # name of the jet collection to calibrate +# jet_name="Jet", +# # name of the associated MET collection +# met_name="MET", +# # name of the associated Raw MET collection +# raw_met_name="RawMET", +# # custom uncertainty sources, defaults to config when empty +# uncertainty_sources=None, +# # toggle for propagation to PuppiMET +# propagate_met=True, +# # # function to determine the correction file +# get_jec_file=get_jec_file_default, +# # # function to determine the jec configuration dict +# get_jec_config=get_jec_config_default, +# ) + +# def jec( +# self: Calibrator, +# events: ak.Array, +# min_pt_met_prop: float = 15.0, +# max_eta_met_prop: float = 5.2, +# **kwargs, +# ) -> ak.Array: +# """Performs the jet energy corrections (JECs) and uncertainty shifts using the +# :external+correctionlib:doc:`index`, optionally +# propagating the changes to the PuppiMET. + +# The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate +# (default: ``Jet``, i.e. AK4 jets). + +# Requires an external file in the config pointing to the JSON files containing the JECs. +# The file key can be specified via an optional ``external_file_key`` in the ``jec`` config entry. +# If not given, the file key will be determined automatically based on the jet collection name: +# ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files +# can be specified as: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# For more file-grained control, the *get_jec_file* can be adapted in a subclass in case it is stored +# differently in the external files + +# The JEC configuration should be an auxiliary entry in the config, specifying the correction +# details under "jec". Separate configs should be given for each jet collection to calibrate, +# using the jet collection name as a subkey. An example of a valid configuration for correction +# AK4 jets with JEC is: + +# .. code-block:: python + +# cfg.x.jec = { +# "Jet": { +# "campaign": "Summer19UL17", +# "version": "V5", +# "jet_type": "AK4PFchs", +# "levels": ["L1L2L3Res"], # or individual correction levels +# "levels_for_type1_met": ["L1FastJet"], +# "uncertainty_sources": [ +# "Total", +# "CorrelationGroupMPFInSitu", +# "CorrelationGroupIntercalibration", +# "CorrelationGroupbJES", +# "CorrelationGroupFlavor", +# "CorrelationGroupUncorrelated", +# ] +# }, +# } + +# *get_jec_config* can be adapted in a subclass in case it is stored differently in the config. + +# If running on data, the datasets must have an auxiliary field *jec_era* defined, e.g. "RunF", +# or an auxiliary field *era*, e.g. "F". 
+ +# This instance of :py:class:`~columnflow.calibration.Calibrator` is +# initialized with the following parameters by default: + +# :param events: awkward array containing events to process + +# :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values +# to the missing transverse energy (PuppiMET) using +# :py:func:`~columnflow.calibration.util.propagate_met` for events where +# ``met.pt > *min_pt_met_prop*``. +# :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet +# values to the missing transverse energy (PuppiMET) using +# :py:func:`~columnflow.calibration.util.propagate_met` for events where +# ``met.eta > *min_eta_met_prop*``. +# """ # noqa - # calculate uncorrected pt, mass - events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) - events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) - - # calculate uncorrected pt, mass - events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) - events = set_ak_column_f32(events, f"{jet_name}.mass_raw", events[jet_name].mass * (1 - events[jet_name].rawFactor)) - - def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): - # variable naming convention - variable_map = { - "JetA": area, - "JetEta": eta, - "JetPt": pt, - "JetPhi": phi, - "Rho": ak.values_astype(rho, np.float32), - } - - # apply all correctors sequentially, updating the pt each time - full_correction = ak.ones_like(pt, dtype=np.float32) +# # calculate uncorrected pt, mass +# events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) +# events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) + +# # calculate uncorrected pt, mass +# events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) +# events = set_ak_column_f32(events, f"{jet_name}.mass_raw", events[jet_name].mass * (1 - events[jet_name].rawFactor)) + +# def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): +# # variable naming convention +# variable_map = { +# "JetA": area, +# "JetEta": eta, +# "JetPt": pt, +# "JetPhi": phi, +# "Rho": ak.values_astype(rho, np.float32), +# } + +# # apply all correctors sequentially, updating the pt each time +# full_correction = ak.ones_like(pt, dtype=np.float32) - for corrector in self.evaluators[evaluator_key]: - # determine correct inputs (change depending on corrector) - inputs = [ - variable_map[inp.name] - for inp in corrector.inputs - ] - correction = ak_evaluate(corrector, *inputs) - # update pt for subsequent correctors - #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes - variable_map["JetPt"] = variable_map["JetPt"] * correction - full_correction = full_correction * correction - - return full_correction - - # obtain rho, which might be located at different routes, depending on the nano version - rho = ( - events.fixedGridRhoFastjetAll - if "fixedGridRhoFastjetAll" in events.fields - else events.Rho.fixedGridRhoFastjetAll - ) - - # correct jets with only a subset of correction levels - # (for calculating TypeI PuppiMET correction) - if self.propagate_met: - # get correction factors - jec_factors_subset_type1_met = correct_jets( - pt=events[jet_name].pt_raw, - eta=events[jet_name].eta, - phi=events[jet_name].phi, - area=events[jet_name].area, - rho=rho, - evaluator_key="jec_subset_type1_met", - 
) - - # temporarily apply the new factors with only subset of corrections - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors_subset_type1_met) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info - # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts - met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) - jetsum = events[jet_name][met_prop_mask].sum(axis=1) - jetsum_pt_subset_type1_met = jetsum.pt - jetsum_phi_subset_type1_met = jetsum.phi - - # factors for full jet correction with all levels - jec_factors = correct_jets( - pt=events[jet_name].pt_raw, - eta=events[jet_name].eta, - phi=events[jet_name].phi, - area=events[jet_name].area, - rho=rho, - evaluator_key="jec", - ) - - # apply full jet correction - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors) - rawFactor = ak.nan_to_num(1 - events[jet_name].pt_raw / events[jet_name].pt, nan=0.0) - events = set_ak_column_f32(events, f"{jet_name}.rawFactor", rawFactor) - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # nominal met propagation - if self.propagate_met: - # get pt and phi of all jets after correcting - jetsum = events[jet_name][met_prop_mask].sum(axis=1) - jetsum_pt_all_levels = jetsum.pt - jetsum_phi_all_levels = jetsum.phi - - # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels - # (recommendation is to propagate only L2 corrections and onwards) - met_pt, met_phi = propagate_met( - jetsum_pt_subset_type1_met, - jetsum_phi_subset_type1_met, - jetsum_pt_all_levels, - jetsum_phi_all_levels, - events.RawPuppiMET.pt, - events.RawPuppiMET.phi, - ) - - events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) - events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - - # variable naming conventions - variable_map = { - "JetEta": events[jet_name].eta, - "JetPt": events[jet_name].pt_raw, - } - - # jet energy uncertainty components - for name, evaluator in self.evaluators["junc"].items(): - # get uncertainty - inputs = [variable_map[inp.name] for inp in evaluator.inputs] - jec_uncertainty = ak_evaluate(evaluator, *inputs) - - # apply jet uncertainty shifts - events = set_ak_column_f32( - events, f"{jet_name}.pt_jec_{name}_up", events[jet_name].pt * (1.0 + jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.pt_jec_{name}_down", events[jet_name].pt * (1.0 - jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.mass_jec_{name}_up", events[jet_name].mass * (1.0 + jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), - ) - - # propagate shifts to PuppiMET - if self.propagate_met: - jet_pt_up = events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] - jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] - met_pt_up, met_phi_up = propagate_met( - jetsum_pt_all_levels, - jetsum_phi_all_levels, - jet_pt_up, - events[jet_name][met_prop_mask].phi, - met_pt, - met_phi, - ) - met_pt_down, 
met_phi_down = propagate_met( - jetsum_pt_all_levels, - jetsum_phi_all_levels, - jet_pt_down, - events[jet_name][met_prop_mask].phi, - met_pt, - met_phi, - ) - events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) - events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) - events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) - events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) - - return events - - -@jec.init -def jec_init(self: Calibrator) -> None: - jec_cfg = self.get_jec_config() - - sources = self.uncertainty_sources - if sources is None: - sources = jec_cfg.uncertainty_sources - - # register used jet columns - self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,area,rawFactor}}") - - # register produced jet columns - self.produces.add(f"{self.jet_name}.{{pt,mass,rawFactor}}") - - # add shifted jet variables - self.produces |= { - f"{self.jet_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" - for shifted_var in ("pt", "mass") - for junc_name in sources - for junc_dir in ("up", "down") - } - - # add PuppiMET variables - if self.propagate_met: - self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} - self.produces |= {"PuppiMET.pt", "PuppiMET.phi"} - - # add shifted PuppiMET variables - self.produces |= { - f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" - for shifted_var in ("pt", "phi") - for junc_name in sources - for junc_dir in ("up", "down") - } - - -@jec.requires -def jec_requires(self: Calibrator, reqs: dict) -> None: - if "external_files" in reqs: - return - - from columnflow.tasks.external import BundleExternalFiles - reqs["external_files"] = BundleExternalFiles.req(self.task) - - -@jec.setup -def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: - """ - Load the correct jec files using the :py:func:`from_string` method of the - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - function and apply the corrections as needed. - - The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - instance are extracted with the :py:meth:`~.jec.get_jec_file`. - - Uses the member function :py:meth:`~.jec.get_jec_config` to construct the - required keys, which are based on the following information about the JEC: - - - levels - - campaign - - version - - jet_type - - A corresponding example snippet wihtin the *config_inst* could like something - like this: - - .. 
code-block:: python - - cfg.x.jec = DotDict.wrap({ - # campaign name for this JEC correctiono - "campaign": f"Summer19UL{year2}{jerc_postfix}", - # version of the corrections - "version": "V7", - # Type of jets that the corrections should be applied on - "jet_type": "AK4PFchs", - # relevant levels in the derivation process of the JEC - "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], - # relevant levels in the derivation process of the Type 1 PuppiMET JEC - "levels_for_type1_met": ["L1FastJet"], - # names of the uncertainties to be applied - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ], - }) - - :param reqs: Requirement dictionary for this - :py:class:`~columnflow.calibration.Calibrator` instance - :param inputs: Additional inputs, currently not used - :param reader_targets: TODO: add documentation - """ - - bundle = reqs["external_files"] +# for corrector in self.evaluators[evaluator_key]: +# # determine correct inputs (change depending on corrector) +# inputs = [ +# variable_map[inp.name] +# for inp in corrector.inputs +# ] +# correction = ak_evaluate(corrector, *inputs) +# # update pt for subsequent correctors +# #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes +# variable_map["JetPt"] = variable_map["JetPt"] * correction +# full_correction = full_correction * correction + +# return full_correction + +# # obtain rho, which might be located at different routes, depending on the nano version +# rho = ( +# events.fixedGridRhoFastjetAll +# if "fixedGridRhoFastjetAll" in events.fields +# else events.Rho.fixedGridRhoFastjetAll +# ) + +# # correct jets with only a subset of correction levels +# # (for calculating TypeI PuppiMET correction) +# if self.propagate_met: +# # get correction factors +# jec_factors_subset_type1_met = correct_jets( +# pt=events[jet_name].pt_raw, +# eta=events[jet_name].eta, +# phi=events[jet_name].phi, +# area=events[jet_name].area, +# rho=rho, +# evaluator_key="jec_subset_type1_met", +# ) + +# # temporarily apply the new factors with only subset of corrections +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors_subset_type1_met) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info +# # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts +# met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) +# jetsum = events[jet_name][met_prop_mask].sum(axis=1) +# jetsum_pt_subset_type1_met = jetsum.pt +# jetsum_phi_subset_type1_met = jetsum.phi + +# # factors for full jet correction with all levels +# jec_factors = correct_jets( +# pt=events[jet_name].pt_raw, +# eta=events[jet_name].eta, +# phi=events[jet_name].phi, +# area=events[jet_name].area, +# rho=rho, +# evaluator_key="jec", +# ) + +# # apply full jet correction +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors) +# rawFactor = ak.nan_to_num(1 - events[jet_name].pt_raw / 
events[jet_name].pt, nan=0.0) +# events = set_ak_column_f32(events, f"{jet_name}.rawFactor", rawFactor) +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # nominal met propagation +# if self.propagate_met: +# # get pt and phi of all jets after correcting +# jetsum = events[jet_name][met_prop_mask].sum(axis=1) +# jetsum_pt_all_levels = jetsum.pt +# jetsum_phi_all_levels = jetsum.phi + +# # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels +# # (recommendation is to propagate only L2 corrections and onwards) +# met_pt, met_phi = propagate_met( +# jetsum_pt_subset_type1_met, +# jetsum_phi_subset_type1_met, +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# events.RawPuppiMET.pt, +# events.RawPuppiMET.phi, +# ) + +# events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) +# events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) + +# # variable naming conventions +# variable_map = { +# "JetEta": events[jet_name].eta, +# "JetPt": events[jet_name].pt_raw, +# } + +# # jet energy uncertainty components +# for name, evaluator in self.evaluators["junc"].items(): +# # get uncertainty +# inputs = [variable_map[inp.name] for inp in evaluator.inputs] +# jec_uncertainty = ak_evaluate(evaluator, *inputs) + +# # apply jet uncertainty shifts +# events = set_ak_column_f32( +# events, f"{jet_name}.pt_jec_{name}_up", events[jet_name].pt * (1.0 + jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.pt_jec_{name}_down", events[jet_name].pt * (1.0 - jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.mass_jec_{name}_up", events[jet_name].mass * (1.0 + jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), +# ) + +# # propagate shifts to PuppiMET +# if self.propagate_met: +# jet_pt_up = events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] +# jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] +# met_pt_up, met_phi_up = propagate_met( +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# jet_pt_up, +# events[jet_name][met_prop_mask].phi, +# met_pt, +# met_phi, +# ) +# met_pt_down, met_phi_down = propagate_met( +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# jet_pt_down, +# events[jet_name][met_prop_mask].phi, +# met_pt, +# met_phi, +# ) +# events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) +# events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) +# events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) +# events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) + +# return events + + +# @jec.init +# def jec_init(self: Calibrator) -> None: +# jec_cfg = self.get_jec_config() + +# sources = self.uncertainty_sources +# if sources is None: +# sources = jec_cfg.uncertainty_sources + +# # register used jet columns +# self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,area,rawFactor}}") + +# # register produced jet columns +# self.produces.add(f"{self.jet_name}.{{pt,mass,rawFactor}}") + +# # add shifted jet variables +# self.produces |= { +# f"{self.jet_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" +# for shifted_var in ("pt", "mass") +# for junc_name in sources +# for junc_dir in ("up", "down") +# } + +# # add PuppiMET variables +# if self.propagate_met: +# self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} +# self.produces |= {"PuppiMET.pt", 
"PuppiMET.phi"} + +# # add shifted PuppiMET variables +# self.produces |= { +# f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" +# for shifted_var in ("pt", "phi") +# for junc_name in sources +# for junc_dir in ("up", "down") +# } + + +# @jec.requires +# def jec_requires(self: Calibrator, reqs: dict) -> None: +# if "external_files" in reqs: +# return + +# from columnflow.tasks.external import BundleExternalFiles +# reqs["external_files"] = BundleExternalFiles.req(self.task) + + +# @jec.setup +# def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: +# """ +# Load the correct jec files using the :py:func:`from_string` method of the +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# function and apply the corrections as needed. + +# The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# instance are extracted with the :py:meth:`~.jec.get_jec_file`. + +# Uses the member function :py:meth:`~.jec.get_jec_config` to construct the +# required keys, which are based on the following information about the JEC: + +# - levels +# - campaign +# - version +# - jet_type + +# A corresponding example snippet wihtin the *config_inst* could like something +# like this: + +# .. code-block:: python + +# cfg.x.jec = DotDict.wrap({ +# # campaign name for this JEC correctiono +# "campaign": f"Summer19UL{year2}{jerc_postfix}", +# # version of the corrections +# "version": "V7", +# # Type of jets that the corrections should be applied on +# "jet_type": "AK4PFchs", +# # relevant levels in the derivation process of the JEC +# "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], +# # relevant levels in the derivation process of the Type 1 PuppiMET JEC +# "levels_for_type1_met": ["L1FastJet"], +# # names of the uncertainties to be applied +# "uncertainty_sources": [ +# "Total", +# "CorrelationGroupMPFInSitu", +# "CorrelationGroupIntercalibration", +# "CorrelationGroupbJES", +# "CorrelationGroupFlavor", +# "CorrelationGroupUncorrelated", +# ], +# }) + +# :param reqs: Requirement dictionary for this +# :py:class:`~columnflow.calibration.Calibrator` instance +# :param inputs: Additional inputs, currently not used +# :param reader_targets: TODO: add documentation +# """ + +# bundle = reqs["external_files"] - # import the correction sets from the external file - import correctionlib - - correction_set = correctionlib.CorrectionSet.from_string( - self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), - ) - - # compute JEC keys from config information - jec_cfg = self.get_jec_config() - - def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): - if is_data: - - jec_era = self.dataset_inst.get_aux("jec_era", None) - # if no special JEC era is specified, infer based on 'era' - if jec_era is None: - jec_era = "Run" + self.dataset_inst.get_aux("era") - - return [ - f"{jec.campaign}_{jec_era}_{jec.version}_DATA_{name}_{jec.jet_type}" - if is_data else - f"{jec.campaign}_{jec.version}_MC_{name}_{jec.jet_type}" - for name in names - ] - - # take sources from constructor or config - sources = self.uncertainty_sources - if sources is None: - sources = jec_cfg.uncertainty_sources +# # import the correction sets from the external file +# import correctionlib + +# correction_set = correctionlib.CorrectionSet.from_string( +# self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), +# ) + +# # compute JEC keys from config information +# jec_cfg = 
self.get_jec_config() + +# def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): +# if is_data: + +# jec_era = self.dataset_inst.get_aux("jec_era", None) +# # if no special JEC era is specified, infer based on 'era' +# if jec_era is None: +# jec_era = "Run" + self.dataset_inst.get_aux("era") + +# return [ +# f"{jec.campaign}_{jec_era}_{jec.version}_DATA_{name}_{jec.jet_type}" +# if is_data else +# f"{jec.campaign}_{jec.version}_MC_{name}_{jec.jet_type}" +# for name in names +# ] + +# # take sources from constructor or config +# sources = self.uncertainty_sources +# if sources is None: +# sources = jec_cfg.uncertainty_sources - if self.dataset_inst.is_data : - jec_keys = make_jme_keys(jec_cfg.levels_DATA) - else : - jec_keys = make_jme_keys(jec_cfg.levels_MC) - jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) - junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys - - # store the evaluators - self.evaluators = { - "jec": get_evaluators(correction_set, jec_keys), - "jec_subset_type1_met": get_evaluators(correction_set, jec_keys_subset_type1_met), - "junc": dict(zip(sources, get_evaluators(correction_set, junc_keys))), - } - - -# custom jec calibrator that only runs nominal correction -jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) - -# define default functions for jec calibrator -def get_jer_file(self, external_files: DotDict) -> str: - """ - Load config relevant to the jet energy resolution (JER) smearing. - - By default, this is extracted from the current *config_inst*, - assuming the JER configurations are stored under the 'jer' - aux key. Separate configurations should be specified for each - jet collection, using the collection name as a key. For example, - the configuration for the default jet collection ``Jet`` will - be retrieved from the following config entry: - - .. code-block:: python - - self.config_inst.x.jer.Jet - - Used in :py:meth:`~.jer.setup_func`. - - :return: Dictionary containing configuration for JER smearing - """ - jer_cfg = self.config_inst.x.jer - - # check for old-style config - if self.jet_name not in jer_cfg: - # if jet collection is `Jet`, issue deprecation warning - if self.jet_name == "Jet": - logger.warning_once( - f"{id(self)}_depr_jer_config", - "config aux 'jer' does not contain key for input jet " - f"collection '{self.jet_name}'. This may be due to " - "an outdated config. Continuing under the assumption that " - "the entire 'jer' entry refers to this jet collection. 
" - "This assumption will be removed in future versions of " - "columnflow, so please adapt the config according to the " - "documentation to remove this warning and ensure future " - "compatibility of the code.", - ) - return jer_cfg - - # otherwise raise exception - raise ValueError( - "config aux 'jer' does not contain key for input jet " - f"collection '{self.jet_name}'.", - ) - - return jer_cfg[self.jet_name] - - -# -# jet energy resolution smearing -# - -@calibrator( - uses={ - optional("Rho.fixedGridRhoFastjetAll"), - optional("fixedGridRhoFastjetAll"), - "GenJet.pt", "GenJet.eta", "GenJet.phi", - "PuppiMET.pt", "PuppiMET.phi", - attach_coffea_behavior, - }, - produces={ - "Jet.pt", "Jet.mass", - "Jet.pt_unsmeared", "Jet.mass_unsmeared", - "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", - "PuppiMET.pt", "PuppiMET.phi", - "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", - }, - # toggle for propagation to PuppiMET - propagate_met=True, - # only run on mc - mc_only=True, - # use deterministic seeds for random smearing and - # take the "index"-th random number per seed when not -1 - deterministic_seed_index=-1, - # function to determine the correction file - get_jer_file=get_jerc_file_default, - # function to determine the jer configuration dict - get_jer_config=get_jer_config_default, -) -def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: - """ - Applies the jet energy resolution smearing in MC and calculates the associated uncertainty - shifts using the :external+correctionlib:doc:`index`, following the recommendations given in - https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetResolution. - - The *jet_name* and *gen_jet_name* should be set to the name of the NanoAOD jet and gen jet - collections to use as an input for JER smearing (default: ``Jet`` and ``GenJet``, respectively, - i.e. AK4 jets). - - Requires an external file in the config pointing to the JSON files containing the JER information. - The file key can be specified via an optional ``external_file_key`` in the ``jer`` config entry. - If not given, the file key will be determined automatically based on the jet collection name: - ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files - can be specified as: - - .. code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - For more fine-grained control, the *get_jer_file* can be adapted in a subclass in case it is stored - differently in the external files. - - The JER smearing configuration should be an auxiliary entry in the config, specifying the input - JER to use under "jer". Separate configs should be given for each jet collection to smear, using - the jet collection name as a subkey. An example of a valid configuration for smearing - AK4 jets with JER is: - - .. code-block:: python - - cfg.x.jer = { - "Jet": { - "campaign": "Summer19UL17", - "version": "JRV2", - "jet_type": "AK4PFchs", - }, - } - - *get_jer_config* can be adapted in a subclass in case it is stored differently in the config. - - Throws an error if running on data. 
- - :param events: awkward array containing events to process - """ # noqa - # use local variables for convenience - jet_name = self.jet_name - gen_jet_name = self.gen_jet_name - - # fail when running on data - if self.dataset_inst.is_data: - raise ValueError("attempt to apply jet energy resolution smearing in data") - - # save the unsmeared properties in case they are needed later - events = set_ak_column_f32(events, f"{jet_name}.pt_unsmeared", events[jet_name].pt) - events = set_ak_column_f32(events, f"{jet_name}.mass_unsmeared", events[jet_name].mass) - - # obtain rho, which might be located at different routes, depending on the nano version - rho = ( - events.fixedGridRhoFastjetAll - if "fixedGridRhoFastjetAll" in events.fields else - events.Rho.fixedGridRhoFastjetAll - ) - - # variable naming convention - variable_map = { - "JetEta": events[jet_name].eta, - "JetPt": events[jet_name].pt, - "Rho": rho, - } - - # pt resolution - inputs = [variable_map[inp.name] for inp in self.evaluators["jer"].inputs] - jer = ak_evaluate(self.evaluators["jer"], *inputs) - - # JER scale factors and systematic variations - jersf = {} - for syst in ("nom", "up", "down"): - variable_map_syst = dict(variable_map, systematic=syst) - inputs = [variable_map_syst[inp.name] for inp in self.evaluators["sf"].inputs] - jersf[syst] = ak_evaluate(self.evaluators["sf"], *inputs) - - # array with all JER scale factor variations as an additional axis - # (note: axis needs to be regular for broadcasting to work correctly) - jersf = ak.concatenate( - [jersf[syst][..., None] for syst in ("nom", "up", "down")], - axis=-1, - ) - - # -- stochastic smearing - # normally distributed random numbers according to JER - jer_random_normal = ( - ak_random(0, jer, events[jet_name].deterministic_seed, rand_func=self.deterministic_normal) - if self.deterministic_seed_index >= 0 - else ak_random(0, jer, rand_func=np.random.Generator( - np.random.SFC64(events.event.to_list())).normal, - ) - ) - - # scale random numbers according to JER SF - jersf2_m1 = jersf ** 2 - 1 - add_smear = np.sqrt(ak.where(jersf2_m1 < 0, 0, jersf2_m1)) - - # broadcast over JER SF variations - jer_random_normal, jersf_z = ak.broadcast_arrays(jer_random_normal, add_smear) - - # compute smearing factors (stochastic method) - smear_factors_stochastic = 1.0 + jer_random_normal * add_smear - - # -- scaling method (using gen match) - - # mask negative gen jet indices (= no gen match) - gen_jet_idx = events[jet_name][self.gen_jet_idx_column] - valid_gen_jet_idxs = ak.mask(gen_jet_idx, gen_jet_idx >= 0) - - # pad list of gen jets to prevent index error on match lookup - max_gen_jet_idx = ak.max(valid_gen_jet_idxs) - padded_gen_jets = ak.pad_none( - events[gen_jet_name], - 0 if max_gen_jet_idx is None else (max_gen_jet_idx + 1), - ) - - # gen jets that match the reconstructed jets - matched_gen_jets = padded_gen_jets[valid_gen_jet_idxs] - - # compute the relative (reco - gen) pt difference - pt_relative_diff = (events[jet_name].pt - matched_gen_jets.pt) / events[jet_name].pt - - # test if matched gen jets are within 3 * resolution - is_matched_pt = np.abs(pt_relative_diff) < 3 * jer - is_matched_pt = ak.fill_none(is_matched_pt, False) # masked values = no gen match - - # (no check for Delta-R matching criterion; we assume this was done during - # nanoAOD production to get the `genJetIdx`) - - # broadcast over JER SF variations - pt_relative_diff, jersf = ak.broadcast_arrays(pt_relative_diff, jersf) - - # compute smearing factors (scaling method) - smear_factors_scaling = 1.0 
+ (jersf - 1.0) * pt_relative_diff - - # -- hybrid smearing: take smear factors from scaling if there was a match, - # otherwise take the stochastic ones - smear_factors = ak.where( - is_matched_pt[:, :, None], - smear_factors_scaling, - smear_factors_stochastic, - ) - - # ensure array is not nullable (avoid ambiguity on Arrow/Parquet conversion) - smear_factors = ak.fill_none(smear_factors, 0.0) - - # store pt and phi of the full jet system - if self.propagate_met: - jetsum = events[jet_name].sum(axis=1) - jetsum_pt_before = jetsum.pt - jetsum_phi_before = jetsum.phi - - # apply the smearing factors to the pt and mass - # (note: apply variations first since they refer to the original pt) - events = set_ak_column_f32(events, f"{jet_name}.pt_jer_up", events[jet_name].pt * smear_factors[:, :, 1]) - events = set_ak_column_f32(events, f"{jet_name}.mass_jer_up", events[jet_name].mass * smear_factors[:, :, 1]) - events = set_ak_column_f32(events, f"{jet_name}.pt_jer_down", events[jet_name].pt * smear_factors[:, :, 2]) - events = set_ak_column_f32(events, f"{jet_name}.mass_jer_down", events[jet_name].mass * smear_factors[:, :, 2]) - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt * smear_factors[:, :, 0]) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass * smear_factors[:, :, 0]) - - # recover coffea behavior - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # met propagation - if self.propagate_met: - - # save unsmeared quantities - events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) - events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) - - # get pt and phi of all jets after correcting - jetsum = events[jet_name].sum(axis=1) - jetsum_pt_after = jetsum.pt - jetsum_phi_after = jetsum.phi - - # propagate changes to PuppiMET - met_pt, met_phi = propagate_met( - jetsum_pt_before, - jetsum_phi_before, - jetsum_pt_after, - jetsum_phi_after, - events.PuppiMET.pt, - events.PuppiMET.phi, - ) - events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) - events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - - # syst variations on top of corrected PuppiMET - met_pt_up, met_phi_up = propagate_met( - jetsum_pt_after, - jetsum_phi_after, - events[jet_name].pt_jer_up, - events[jet_name].phi, - met_pt, - met_phi, - ) - met_pt_down, met_phi_down = propagate_met( - jetsum_pt_after, - jetsum_phi_after, - events[jet_name].pt_jer_down, - events[jet_name].phi, - met_pt, - met_phi, - ) - events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) - events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) - events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) - events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) - - return events - - -@jer.init -def jer_init(self: Calibrator) -> None: - # determine gen-level jet index column - lower_first = lambda s: s[0].lower() + s[1:] if s else s - self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" - - self.uses |= { - "PuppiMET.pt", "PuppiMET.phi", - } - self.produces |= { - "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", - "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", - } - - -@jer.requires -def jer_requires(self: Calibrator, reqs: dict) -> None: - if "external_files" in reqs: - return - - from columnflow.tasks.external import BundleExternalFiles - reqs["external_files"] = 
BundleExternalFiles.req(self.task) - - -@jer.setup -def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: - """ - Load the correct jer files using the :py:func:`from_string` method of the - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` function and apply the - corrections as needed. - - The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - instance are extracted with the :py:meth:`~.jer.get_jer_file`. - - Uses the member function :py:meth:`~.jer.get_jer_config` to construct the required keys, which - are based on the following information about the JER: - - - campaign - - version - - jet_type - - A corresponding example snippet within the *config_inst* could like something like this: - - .. code-block:: python - - cfg.x.jer = DotDict.wrap({ - "Jet": { - "campaign": f"Summer19UL{year2}{jerc_postfix}", - "version": "JRV3", - "jet_type": "AK4PFchs", - }, - }) - - :param reqs: Requirement dictionary for this :py:class:`~columnflow.calibration.Calibrator` - instance. - :param inputs: Additional inputs, currently not used. - :param reader_targets: TODO: add documentation. - """ - bundle = reqs["external_files"] - - # import the correction sets from the external file - import correctionlib - correction_set = correctionlib.CorrectionSet.from_string( - self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), - ) +# if self.dataset_inst.is_data : +# jec_keys = make_jme_keys(jec_cfg.levels_DATA) +# else : +# jec_keys = make_jme_keys(jec_cfg.levels_MC) +# jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) +# junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys + +# # store the evaluators +# self.evaluators = { +# "jec": get_evaluators(correction_set, jec_keys), +# "jec_subset_type1_met": get_evaluators(correction_set, jec_keys_subset_type1_met), +# "junc": dict(zip(sources, get_evaluators(correction_set, junc_keys))), +# } + + +# # custom jec calibrator that only runs nominal correction +# jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) + +# # define default functions for jec calibrator +# def get_jer_file(self, external_files: DotDict) -> str: +# """ +# Load config relevant to the jet energy resolution (JER) smearing. + +# By default, this is extracted from the current *config_inst*, +# assuming the JER configurations are stored under the 'jer' +# aux key. Separate configurations should be specified for each +# jet collection, using the collection name as a key. For example, +# the configuration for the default jet collection ``Jet`` will +# be retrieved from the following config entry: + +# .. code-block:: python + +# self.config_inst.x.jer.Jet + +# Used in :py:meth:`~.jer.setup_func`. + +# :return: Dictionary containing configuration for JER smearing +# """ +# jer_cfg = self.config_inst.x.jer + +# # check for old-style config +# if self.jet_name not in jer_cfg: +# # if jet collection is `Jet`, issue deprecation warning +# if self.jet_name == "Jet": +# logger.warning_once( +# f"{id(self)}_depr_jer_config", +# "config aux 'jer' does not contain key for input jet " +# f"collection '{self.jet_name}'. This may be due to " +# "an outdated config. Continuing under the assumption that " +# "the entire 'jer' entry refers to this jet collection. 
" +# "This assumption will be removed in future versions of " +# "columnflow, so please adapt the config according to the " +# "documentation to remove this warning and ensure future " +# "compatibility of the code.", +# ) +# return jer_cfg + +# # otherwise raise exception +# raise ValueError( +# "config aux 'jer' does not contain key for input jet " +# f"collection '{self.jet_name}'.", +# ) + +# return jer_cfg[self.jet_name] + + +# # +# # jet energy resolution smearing +# # + +# @calibrator( +# uses={ +# optional("Rho.fixedGridRhoFastjetAll"), +# optional("fixedGridRhoFastjetAll"), +# "GenJet.pt", "GenJet.eta", "GenJet.phi", +# "PuppiMET.pt", "PuppiMET.phi", +# attach_coffea_behavior, +# }, +# produces={ +# "Jet.pt", "Jet.mass", +# "Jet.pt_unsmeared", "Jet.mass_unsmeared", +# "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", +# "PuppiMET.pt", "PuppiMET.phi", +# "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", +# }, +# # toggle for propagation to PuppiMET +# propagate_met=True, +# # only run on mc +# mc_only=True, +# # use deterministic seeds for random smearing and +# # take the "index"-th random number per seed when not -1 +# deterministic_seed_index=-1, +# # function to determine the correction file +# get_jer_file=get_jerc_file_default, +# # function to determine the jer configuration dict +# get_jer_config=get_jer_config_default, +# ) +# def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: +# """ +# Applies the jet energy resolution smearing in MC and calculates the associated uncertainty +# shifts using the :external+correctionlib:doc:`index`, following the recommendations given in +# https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetResolution. + +# The *jet_name* and *gen_jet_name* should be set to the name of the NanoAOD jet and gen jet +# collections to use as an input for JER smearing (default: ``Jet`` and ``GenJet``, respectively, +# i.e. AK4 jets). + +# Requires an external file in the config pointing to the JSON files containing the JER information. +# The file key can be specified via an optional ``external_file_key`` in the ``jer`` config entry. +# If not given, the file key will be determined automatically based on the jet collection name: +# ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files +# can be specified as: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# For more fine-grained control, the *get_jer_file* can be adapted in a subclass in case it is stored +# differently in the external files. + +# The JER smearing configuration should be an auxiliary entry in the config, specifying the input +# JER to use under "jer". Separate configs should be given for each jet collection to smear, using +# the jet collection name as a subkey. An example of a valid configuration for smearing +# AK4 jets with JER is: + +# .. code-block:: python + +# cfg.x.jer = { +# "Jet": { +# "campaign": "Summer19UL17", +# "version": "JRV2", +# "jet_type": "AK4PFchs", +# }, +# } + +# *get_jer_config* can be adapted in a subclass in case it is stored differently in the config. + +# Throws an error if running on data. 
+ +# :param events: awkward array containing events to process +# """ # noqa +# # use local variables for convenience +# jet_name = self.jet_name +# gen_jet_name = self.gen_jet_name + +# # fail when running on data +# if self.dataset_inst.is_data: +# raise ValueError("attempt to apply jet energy resolution smearing in data") + +# # save the unsmeared properties in case they are needed later +# events = set_ak_column_f32(events, f"{jet_name}.pt_unsmeared", events[jet_name].pt) +# events = set_ak_column_f32(events, f"{jet_name}.mass_unsmeared", events[jet_name].mass) + +# # obtain rho, which might be located at different routes, depending on the nano version +# rho = ( +# events.fixedGridRhoFastjetAll +# if "fixedGridRhoFastjetAll" in events.fields else +# events.Rho.fixedGridRhoFastjetAll +# ) + +# # variable naming convention +# variable_map = { +# "JetEta": events[jet_name].eta, +# "JetPt": events[jet_name].pt, +# "Rho": rho, +# } + +# # pt resolution +# inputs = [variable_map[inp.name] for inp in self.evaluators["jer"].inputs] +# jer = ak_evaluate(self.evaluators["jer"], *inputs) + +# # JER scale factors and systematic variations +# jersf = {} +# for syst in ("nom", "up", "down"): +# variable_map_syst = dict(variable_map, systematic=syst) +# inputs = [variable_map_syst[inp.name] for inp in self.evaluators["sf"].inputs] +# jersf[syst] = ak_evaluate(self.evaluators["sf"], *inputs) + +# # array with all JER scale factor variations as an additional axis +# # (note: axis needs to be regular for broadcasting to work correctly) +# jersf = ak.concatenate( +# [jersf[syst][..., None] for syst in ("nom", "up", "down")], +# axis=-1, +# ) + +# # -- stochastic smearing +# # normally distributed random numbers according to JER +# jer_random_normal = ( +# ak_random(0, jer, events[jet_name].deterministic_seed, rand_func=self.deterministic_normal) +# if self.deterministic_seed_index >= 0 +# else ak_random(0, jer, rand_func=np.random.Generator( +# np.random.SFC64(events.event.to_list())).normal, +# ) +# ) + +# # scale random numbers according to JER SF +# jersf2_m1 = jersf ** 2 - 1 +# add_smear = np.sqrt(ak.where(jersf2_m1 < 0, 0, jersf2_m1)) + +# # broadcast over JER SF variations +# jer_random_normal, jersf_z = ak.broadcast_arrays(jer_random_normal, add_smear) + +# # compute smearing factors (stochastic method) +# smear_factors_stochastic = 1.0 + jer_random_normal * add_smear + +# # -- scaling method (using gen match) + +# # mask negative gen jet indices (= no gen match) +# gen_jet_idx = events[jet_name][self.gen_jet_idx_column] +# valid_gen_jet_idxs = ak.mask(gen_jet_idx, gen_jet_idx >= 0) + +# # pad list of gen jets to prevent index error on match lookup +# max_gen_jet_idx = ak.max(valid_gen_jet_idxs) +# padded_gen_jets = ak.pad_none( +# events[gen_jet_name], +# 0 if max_gen_jet_idx is None else (max_gen_jet_idx + 1), +# ) + +# # gen jets that match the reconstructed jets +# matched_gen_jets = padded_gen_jets[valid_gen_jet_idxs] + +# # compute the relative (reco - gen) pt difference +# pt_relative_diff = (events[jet_name].pt - matched_gen_jets.pt) / events[jet_name].pt + +# # test if matched gen jets are within 3 * resolution +# is_matched_pt = np.abs(pt_relative_diff) < 3 * jer +# is_matched_pt = ak.fill_none(is_matched_pt, False) # masked values = no gen match + +# # (no check for Delta-R matching criterion; we assume this was done during +# # nanoAOD production to get the `genJetIdx`) + +# # broadcast over JER SF variations +# pt_relative_diff, jersf = ak.broadcast_arrays(pt_relative_diff, jersf) + 
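+# # (for reference: the scaling method below applies s = 1 + (SF - 1) * (pt_reco - pt_gen) / pt_reco,
+# # while the stochastic method above used s = 1 + N(0, sigma_JER) * sqrt(max(SF**2 - 1, 0)))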
+# # compute smearing factors (scaling method) +# smear_factors_scaling = 1.0 + (jersf - 1.0) * pt_relative_diff + +# # -- hybrid smearing: take smear factors from scaling if there was a match, +# # otherwise take the stochastic ones +# smear_factors = ak.where( +# is_matched_pt[:, :, None], +# smear_factors_scaling, +# smear_factors_stochastic, +# ) + +# # ensure array is not nullable (avoid ambiguity on Arrow/Parquet conversion) +# smear_factors = ak.fill_none(smear_factors, 0.0) + +# # store pt and phi of the full jet system +# if self.propagate_met: +# jetsum = events[jet_name].sum(axis=1) +# jetsum_pt_before = jetsum.pt +# jetsum_phi_before = jetsum.phi + +# # apply the smearing factors to the pt and mass +# # (note: apply variations first since they refer to the original pt) +# events = set_ak_column_f32(events, f"{jet_name}.pt_jer_up", events[jet_name].pt * smear_factors[:, :, 1]) +# events = set_ak_column_f32(events, f"{jet_name}.mass_jer_up", events[jet_name].mass * smear_factors[:, :, 1]) +# events = set_ak_column_f32(events, f"{jet_name}.pt_jer_down", events[jet_name].pt * smear_factors[:, :, 2]) +# events = set_ak_column_f32(events, f"{jet_name}.mass_jer_down", events[jet_name].mass * smear_factors[:, :, 2]) +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt * smear_factors[:, :, 0]) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass * smear_factors[:, :, 0]) + +# # recover coffea behavior +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # met propagation +# if self.propagate_met: + +# # save unsmeared quantities +# events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) +# events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) + +# # get pt and phi of all jets after correcting +# jetsum = events[jet_name].sum(axis=1) +# jetsum_pt_after = jetsum.pt +# jetsum_phi_after = jetsum.phi + +# # propagate changes to PuppiMET +# met_pt, met_phi = propagate_met( +# jetsum_pt_before, +# jetsum_phi_before, +# jetsum_pt_after, +# jetsum_phi_after, +# events.PuppiMET.pt, +# events.PuppiMET.phi, +# ) +# events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) +# events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) + +# # syst variations on top of corrected PuppiMET +# met_pt_up, met_phi_up = propagate_met( +# jetsum_pt_after, +# jetsum_phi_after, +# events[jet_name].pt_jer_up, +# events[jet_name].phi, +# met_pt, +# met_phi, +# ) +# met_pt_down, met_phi_down = propagate_met( +# jetsum_pt_after, +# jetsum_phi_after, +# events[jet_name].pt_jer_down, +# events[jet_name].phi, +# met_pt, +# met_phi, +# ) +# events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) +# events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) +# events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) +# events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) + +# return events + + +# @jer.init +# def jer_init(self: Calibrator) -> None: +# # determine gen-level jet index column +# lower_first = lambda s: s[0].lower() + s[1:] if s else s +# self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" + +# self.uses |= { +# "PuppiMET.pt", "PuppiMET.phi", +# } +# self.produces |= { +# "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", +# "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", +# } + + +# @jer.requires +# def jer_requires(self: 
Calibrator, reqs: dict) -> None: +# if "external_files" in reqs: +# return + +# from columnflow.tasks.external import BundleExternalFiles +# reqs["external_files"] = BundleExternalFiles.req(self.task) + + +# @jer.setup +# def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: +# """ +# Load the correct jer files using the :py:func:`from_string` method of the +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` function and apply the +# corrections as needed. + +# The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# instance are extracted with the :py:meth:`~.jer.get_jer_file`. + +# Uses the member function :py:meth:`~.jer.get_jer_config` to construct the required keys, which +# are based on the following information about the JER: + +# - campaign +# - version +# - jet_type + +# A corresponding example snippet within the *config_inst* could like something like this: + +# .. code-block:: python + +# cfg.x.jer = DotDict.wrap({ +# "Jet": { +# "campaign": f"Summer19UL{year2}{jerc_postfix}", +# "version": "JRV3", +# "jet_type": "AK4PFchs", +# }, +# }) + +# :param reqs: Requirement dictionary for this :py:class:`~columnflow.calibration.Calibrator` +# instance. +# :param inputs: Additional inputs, currently not used. +# :param reader_targets: TODO: add documentation. +# """ +# bundle = reqs["external_files"] + +# # import the correction sets from the external file +# import correctionlib +# correction_set = correctionlib.CorrectionSet.from_string( +# self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), +# ) - # compute JER keys from config information - jer_cfg = self.get_jer_config() - jer_keys = { - "jer": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_PtResolution_{jer_cfg.jet_type}", - "sf": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_ScaleFactor_{jer_cfg.jet_type}", - } - - # store the evaluators - self.evaluators = { - name: get_evaluators(correction_set, [key])[0] - for name, key in jer_keys.items() - } - - # use deterministic seeds for random smearing if requested - if self.deterministic_seed_index >= 0: - idx = self.deterministic_seed_index - bit_generator = np.random.SFC64 - def deterministic_normal(loc, scale, seed): - return np.asarray([ - np.random.Generator(bit_generator(_seed)).normal(_loc, _scale, size=idx + 1)[-1] - for _loc, _scale, _seed in zip(loc, scale, seed) - ]) - self.deterministic_normal = deterministic_normal - - -# explicit calibrators for standard jet collections -jer_ak4 = jer.derive("jer_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) -jer_ak8 = jer.derive("jer_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8", "propagate_met": False}) - - -# -# single calibrator for doing both JEC and JER smearing -# - -@calibrator( - uses={jec, jer}, - produces={jec, jer}, - # toggle for propagation to PuppiMET - propagate_met=None, - # functions to determine configs and files - get_jec_file=None, - get_jec_config=None, - get_jer_file=None, - get_jer_config=None, -) -def jets(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: - """ - Instance of :py:class:`~columnflow.calibration.Calibrator` that does all relevant calibrations - for jets, i.e. JEC and JER. For more information, see :py:func:`~.jec` and :py:func:`~.jer`. 
- - :param events: awkward array containing events to process - """ - # apply jet energy corrections - events = self[jec](events, **kwargs) - - # apply jer smearing on MC only - if self.dataset_inst.is_mc: - events = self[jer](events, **kwargs) - - return events - - -@jets.init -def jets_init(self: Calibrator) -> None: - # forward argument to the producers - self.deps_kwargs[jec]["jet_name"] = self.jet_name - self.deps_kwargs[jer]["jet_name"] = self.jet_name - self.deps_kwargs[jer]["gen_jet_name"] = self.gen_jet_name - if self.propagate_met is not None: - self.deps_kwargs[jec]["propagate_met"] = self.propagate_met - self.deps_kwargs[jer]["propagate_met"] = self.propagate_met - if self.get_jec_file is not None: - self.deps_kwargs[jec]["get_jec_file"] = self.get_jec_file - if self.get_jec_config is not None: - self.deps_kwargs[jec]["get_jec_config"] = self.get_jec_config - if self.get_jer_file is not None: - self.deps_kwargs[jer]["get_jer_file"] = self.get_jer_file - if self.get_jer_config is not None: - self.deps_kwargs[jer]["get_jer_config"] = self.get_jer_config - - -# explicit calibrators for standard jet collections -jets_ak4 = jets.derive("jets_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) -jets_ak8 = jets.derive("jets_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8"}) +# # compute JER keys from config information +# jer_cfg = self.get_jer_config() +# jer_keys = { +# "jer": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_PtResolution_{jer_cfg.jet_type}", +# "sf": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_ScaleFactor_{jer_cfg.jet_type}", +# } + +# # store the evaluators +# self.evaluators = { +# name: get_evaluators(correction_set, [key])[0] +# for name, key in jer_keys.items() +# } + +# # use deterministic seeds for random smearing if requested +# if self.deterministic_seed_index >= 0: +# idx = self.deterministic_seed_index +# bit_generator = np.random.SFC64 +# def deterministic_normal(loc, scale, seed): +# return np.asarray([ +# np.random.Generator(bit_generator(_seed)).normal(_loc, _scale, size=idx + 1)[-1] +# for _loc, _scale, _seed in zip(loc, scale, seed) +# ]) +# self.deterministic_normal = deterministic_normal + + +# # explicit calibrators for standard jet collections +# jer_ak4 = jer.derive("jer_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) +# jer_ak8 = jer.derive("jer_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8", "propagate_met": False}) + + +# # +# # single calibrator for doing both JEC and JER smearing +# # + +# @calibrator( +# uses={jec, jer}, +# produces={jec, jer}, +# # toggle for propagation to PuppiMET +# propagate_met=None, +# # functions to determine configs and files +# get_jec_file=None, +# get_jec_config=None, +# get_jer_file=None, +# get_jer_config=None, +# ) +# def jets(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: +# """ +# Instance of :py:class:`~columnflow.calibration.Calibrator` that does all relevant calibrations +# for jets, i.e. JEC and JER. For more information, see :py:func:`~.jec` and :py:func:`~.jer`. 
+ +# :param events: awkward array containing events to process +# """ +# # apply jet energy corrections +# events = self[jec](events, **kwargs) + +# # apply jer smearing on MC only +# if self.dataset_inst.is_mc: +# events = self[jer](events, **kwargs) + +# return events + + +# @jets.init +# def jets_init(self: Calibrator) -> None: +# # forward argument to the producers +# self.deps_kwargs[jec]["jet_name"] = self.jet_name +# self.deps_kwargs[jer]["jet_name"] = self.jet_name +# self.deps_kwargs[jer]["gen_jet_name"] = self.gen_jet_name +# if self.propagate_met is not None: +# self.deps_kwargs[jec]["propagate_met"] = self.propagate_met +# self.deps_kwargs[jer]["propagate_met"] = self.propagate_met +# if self.get_jec_file is not None: +# self.deps_kwargs[jec]["get_jec_file"] = self.get_jec_file +# if self.get_jec_config is not None: +# self.deps_kwargs[jec]["get_jec_config"] = self.get_jec_config +# if self.get_jer_file is not None: +# self.deps_kwargs[jer]["get_jer_file"] = self.get_jer_file +# if self.get_jer_config is not None: +# self.deps_kwargs[jer]["get_jer_config"] = self.get_jer_config + + +# # explicit calibrators for standard jet collections +# jets_ak4 = jets.derive("jets_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) +# jets_ak8 = jets.derive("jets_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8"}) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 0c60ff0fb..ceeccb986 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -316,7 +316,7 @@ def plot_shifted_variable( default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, ) - default_style_config["rax_cfg"]["ylim"] = (0.25, 1.75) + default_style_config["rax_cfg"]["ylim"] = (0.75, 1.25) default_style_config["rax_cfg"]["ylabel"] = "Ratio" if legend_title: default_style_config["legend_cfg"]["title"] = legend_title diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index b8228b361..e7bc90763 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -425,12 +425,14 @@ def eval_formula(formula_str, popt): mask = h1d.values() > 0 y = h1d.values()[mask] y_err = (h1d.variances()[mask])**0.5 - x = h1d.axes[0].centers[mask] - popt, pcov = curve_fit(fitf,x,y, + x = h1d.axes[0].centers + x_masked = x[mask] + + popt, pcov = curve_fit(fitf,x_masked,y, sigma=y_err, absolute_sigma=True, ) - fitres['chi2'][dm] = sum(((y - fitf(x, *popt))/y_err)**2) + fitres['chi2'][dm] = sum(((y - fitf(x_masked, *popt))/y_err)**2) fitres['ndf'][dm] = len(y) - len(popt) fitres['popt'][dm] = popt fitres['pcov'][dm] = pcov diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py index 0de908b80..2bc75c005 100644 --- a/columnflow/tasks/framework/mixins.py +++ b/columnflow/tasks/framework/mixins.py @@ -2447,7 +2447,7 @@ class HistHookMixin(ConfigTask): "default: empty", ) - def invoke_hist_hooks(self, hists: dict) -> dict: + def invoke_hist_hooks(self, hists: dict, category_inst: od.Category) -> dict: """ Invoke hooks to update histograms before plotting. 
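        With this change, each configured hook additionally receives the category
        instance that is currently being processed, i.e. hook callables are invoked as
        ``func(task, hists, category_inst)`` and must return the (possibly updated)
        histograms. A minimal sketch of such a hook; the hook name and the aux key it
        checks are hypothetical and only meant for illustration:

        .. code-block:: python

            def example_hook(task, hists, category_inst):
                # leave categories without fake-factor aux information untouched
                if "ff_regs" not in category_inst.aux:
                    return hists
                # ... derive or modify histograms for this category here ...
                return hists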
""" @@ -2470,7 +2470,7 @@ def invoke_hist_hooks(self, hists: dict) -> dict: # invoke it self.publish_message(f"invoking hist hook '{hook}'") - hists = func(self, hists) + hists = func(self, hists, category_inst) return hists diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index bfc316e9e..f1d9c7e61 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -209,13 +209,19 @@ def run(self): weight = ak.Array(np.ones(len(events), dtype=np.float32)) categories = self.config_inst.categories.names() - sig_regs = [the_cat for the_cat in categories if 'sr' in the_cat] + sr_names = [the_cat for the_cat in categories if 'sr' in the_cat] # define and fill histograms, taking into account multiple axes - for sig_reg in sig_regs: + for sr_name in sr_names: #iterate over the regions needed for calculation of the ff_method - for region in ["sr", "ar_wj", "ar_qcd", "ar_yields"]: + the_sr = self.config_inst.get_category(sr_name) + regions = [sr_name] + if the_sr.aux: + for the_key in the_sr.aux.keys(): + if (the_key == 'abcd_regs') or (the_key == 'ff_regs'): + regions += list(the_sr.aux[the_key].values()) + for region in regions: #by accessing the list of categories we check if the category with this name exists - cat = self.config_inst.get_category(sig_reg.replace('sr',region)) + cat = self.config_inst.get_category(region) if cat.name not in histograms.keys(): histograms[cat.name] = {} for var_key, var_names in self.variable_tuples.items(): # get variable instances @@ -225,39 +231,38 @@ def run(self): # create the histogram in the first chunk histograms[cat.name][var_key] = create_hist_from_variables( *variable_insts, - int_cat_axes=("category", "process", "shift"), + int_cat_axes=("process", "shift"), ) # mask events and weights when selection expressions are found masked_events = events - if region == 'ar_wj': + if 'ar_wj' in region: masked_weights = weight * events.ff_weight_wj_nominal - elif region == 'ar_qcd': + elif 'ar_qcd' in region: masked_weights = weight * events.ff_weight_qcd_nominal else: masked_weights = weight - for variable_inst in variable_insts: - sel = variable_inst.selection - if sel == "1": - continue - if not callable(sel): - raise ValueError( - f"invalid selection '{sel}', for now only callables are supported", - ) - mask = sel(masked_events) - #select only one category per histogram - masked_events = masked_events[mask] - masked_weights = masked_weights[mask] - - # merge category ids + + # for variable_inst in variable_insts: + # sel = variable_inst.selection + # if sel == "1": + # continue + # if not callable(sel): + # raise ValueError( + # f"invalid selection '{sel}', for now only callables are supported", + # ) + # mask = sel(masked_events) + # #select only one category per histogram + # merge category ids category_ids = ak.concatenate( [Route(c).apply(masked_events) for c in self.category_id_columns], axis=-1, ) - + mask = ak.any(category_ids == cat.id, axis = 1) + masked_events = masked_events[mask] + masked_weights = masked_weights[mask] # broadcast arrays so that each event can be filled for all its categories fill_data = { - "category": category_ids, "process": masked_events.process_id, "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, "weight": masked_weights, @@ -274,7 +279,6 @@ def expr(events, *args, **kwargs): # apply it fill_data[variable_inst.name] = expr(masked_events) # fill it - fill_hist( histograms[cat.name][var_key], fill_data, @@ -291,261 +295,6 @@ def expr(events, *args, 
**kwargs): add_default_to_description=True, ) -# class CreateHistograms( -# VariablesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# ReducedEventsUser, -# ChunkedIOMixin, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# last_edge_inclusive = last_edge_inclusive_inst - -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # upstream requirements -# reqs = Requirements( -# ReducedEventsUser.reqs, -# RemoteWorkflow.reqs, -# ProduceColumns=ProduceColumns, -# MLEvaluation=MLEvaluation, -# ) - -# # strategy for handling missing source columns when adding aliases on event chunks -# missing_column_alias_strategy = "original" - -# # names of columns that contain category ids -# # (might become a parameter at some point) -# category_id_columns = {"category_ids"} - -# # register sandbox and shifts found in the chosen weight producer to this task -# register_weight_producer_sandbox = True -# register_weight_producer_shifts = True - -# @law.util.classproperty -# def mandatory_columns(cls) -> set[str]: -# return set(cls.category_id_columns) | {"process_id"} - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# # require the full merge forest -# reqs["events"] = self.reqs.ProvideReducedEvents.req(self) - -# if not self.pilot: -# if self.producer_insts: -# reqs["producers"] = [ -# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) -# for producer_inst in self.producer_insts -# if producer_inst.produced_columns -# ] -# if self.ml_model_insts: -# reqs["ml"] = [ -# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) -# for ml_model_inst in self.ml_model_insts -# ] - -# # add weight_producer dependent requirements -# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - -# return reqs - -# def requires(self): -# reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} - -# if self.producer_insts: -# reqs["producers"] = [ -# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) -# for producer_inst in self.producer_insts -# if producer_inst.produced_columns -# ] -# if self.ml_model_insts: -# reqs["ml"] = [ -# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) -# for ml_model_inst in self.ml_model_insts -# ] - -# # add weight_producer dependent requirements -# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - -# return reqs - -# workflow_condition = ReducedEventsUser.workflow_condition.copy() - -# @workflow_condition.output -# def output(self): -# return {"hists": self.target(f"hist__vars_{self.variables_repr}__{self.branch}.pickle")} - -# @law.decorator.notify -# @law.decorator.log -# @law.decorator.localize(input=True, output=False) -# @law.decorator.safe_output -# def run(self): -# import numpy as np -# import awkward as ak -# from columnflow.columnar_util import ( -# Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, -# ) -# from columnflow.hist_util import fill_hist - -# # prepare inputs -# inputs = self.input() - -# # declare output: dict of histograms -# histograms = {} - -# # run the weight_producer setup -# producer_reqs = self.weight_producer_inst.run_requires() -# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) - -# # create a temp dir for saving intermediate files -# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) -# tmp_dir.touch() - -# # get shift dependent 
aliases -# aliases = self.local_shift_inst.x("column_aliases", {}) - -# # define columns that need to be read -# read_columns = {Route("process_id")} -# read_columns |= set(map(Route, self.category_id_columns)) -# read_columns |= set(self.weight_producer_inst.used_columns) -# read_columns |= set(map(Route, aliases.values())) -# read_columns |= { -# Route(inp) -# for variable_inst in ( -# self.config_inst.get_variable(var_name) -# for var_name in law.util.flatten(self.variable_tuples.values()) -# ) -# for inp in (( -# {variable_inst.expression} -# if isinstance(variable_inst.expression, str) -# # for variable_inst with custom expressions, read columns declared via aux key -# else set(variable_inst.x("inputs", [])) -# ) | ( -# # for variable_inst with selection, read columns declared via aux key -# set(variable_inst.x("inputs", [])) -# if variable_inst.selection != "1" -# else set() -# )) -# } - -# # empty float array to use when input files have no entries -# empty_f32 = ak.Array(np.array([], dtype=np.float32)) - -# # iterate over chunks of events and diffs -# file_targets = [inputs["events"]["events"]] -# if self.producer_insts: -# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) -# if self.ml_model_insts: -# file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) - -# # prepare inputs for localization -# with law.localize_file_targets( -# [*file_targets, *reader_targets.values()], -# mode="r", -# ) as inps: -# for (events, *columns), pos in self.iter_chunked_io( -# [inp.abspath for inp in inps], -# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), -# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], -# chunk_size=self.weight_producer_inst.get_min_chunk_size(), -# ): -# # optional check for overlapping inputs -# if self.check_overlapping_inputs: -# self.raise_if_overlapping([events] + list(columns)) - -# # add additional columns -# events = update_ak_array(events, *columns) - -# # add aliases -# events = add_ak_aliases( -# events, -# aliases, -# remove_src=True, -# missing_strategy=self.missing_column_alias_strategy, -# ) - -# # attach coffea behavior aiding functional variable expressions -# events = attach_coffea_behavior(events) - -# # build the full event weight -# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): -# events, weight = self.weight_producer_inst(events) -# else: -# weight = ak.Array(np.ones(len(events), dtype=np.float32)) - -# # define and fill histograms, taking into account multiple axes -# for var_key, var_names in self.variable_tuples.items(): -# # get variable instances -# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - -# if var_key not in histograms: -# # create the histogram in the first chunk -# histograms[var_key] = create_hist_from_variables( -# *variable_insts, -# int_cat_axes=("category", "process", "shift"), -# ) - -# # mask events and weights when selection expressions are found -# masked_events = events -# masked_weights = weight -# for variable_inst in variable_insts: -# sel = variable_inst.selection -# if sel == "1": -# continue -# if not callable(sel): -# raise ValueError( -# f"invalid selection '{sel}', for now only callables are supported", -# ) -# mask = sel(masked_events) -# masked_events = masked_events[mask] -# masked_weights = masked_weights[mask] - -# # merge category ids -# category_ids = ak.concatenate( -# [Route(c).apply(masked_events) for c in self.category_id_columns], -# 
axis=-1, -# ) - -# # broadcast arrays so that each event can be filled for all its categories -# fill_data = { -# "category": category_ids, -# "process": masked_events.process_id, -# "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, -# "weight": masked_weights, -# } -# for variable_inst in variable_insts: -# # prepare the expression -# expr = variable_inst.expression -# if isinstance(expr, str): -# route = Route(expr) -# def expr(events, *args, **kwargs): -# if len(events) == 0 and not has_ak_column(events, route): -# return empty_f32 -# return route.apply(events, null_value=variable_inst.null_value) -# # apply it -# fill_data[variable_inst.name] = expr(masked_events) - -# # fill it -# fill_hist( -# histograms[var_key], -# fill_data, -# last_edge_inclusive=self.last_edge_inclusive, -# ) - -# # merge output files -# self.output()["hists"].dump(histograms, formatter="pickle") - - -# # overwrite class defaults -# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( -# default=CreateHistograms.task_family in check_overlap_tasks, -# add_default_to_description=True, -# ) - CreateHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -651,7 +400,6 @@ def run(self): inp["hists"].load(formatter="pickle") for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) ] - cats = list(hists[0].keys()) variable_names = list(hists[0][cats[0]].keys()) get_hists = lambda hists, cat, var : [h[cat][var] for h in hists] @@ -663,35 +411,9 @@ def run(self): variable_hists = get_hists(hists, the_cat, variable_name) merged_hists[the_cat] = sum(variable_hists[1:], variable_hists[0].copy()) outputs["hists"][variable_name].dump(merged_hists, formatter="pickle") - # optionally remove inputs if self.remove_previous: inputs.remove() - - # def run(self): - # # preare inputs and outputs - # inputs = self.input()["collection"] - # outputs = self.output() - - # # load input histograms - # hists = [ - # inp["hists"].load(formatter="pickle") - # for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) - # ] - - # # create a separate file per output variable - # variable_names = list(hists[0].keys()) - # for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): - # self.publish_message(f"merging histograms for '{variable_name}'") - - # variable_hists = [h[variable_name] for h in hists] - # merged = sum(variable_hists[1:], variable_hists[0].copy()) - # outputs["hists"][variable_name].dump(merged, formatter="pickle") - - # # optionally remove inputs - # if self.remove_previous: - # inputs.remove() - MergeHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -769,13 +491,18 @@ def run(self): self.publish_message(f"merging histograms for '{variable_name}'") # load hists + + variable_hists = [ coll["hists"].targets[variable_name].load(formatter="pickle") for coll in inputs.values() ] - - # merge and write the output - merged = sum(variable_hists[1:], variable_hists[0].copy()) + merged = {} + get_hists = lambda hists, cat : [h[cat] for h in hists] + for the_cat in variable_hists[0].keys(): + single_cat_hists = get_hists(variable_hists, the_cat) + merged[the_cat] = sum(single_cat_hists[1:], single_cat_hists[0].copy()) + outp.dump(merged, formatter="pickle") diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index d15a18cec..8ac757ed0 
100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -149,14 +149,18 @@ def run(self): " - requested variable requires columns that were missing during histogramming\n" " - selected --processes did not match any value on the process axis of the input histogram", ) - - if 'sr' in category_inst.name: - hists = self.invoke_hist_hooks(hists) + if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + if self.hist_hooks: + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] else: if category_inst.name in hists.keys(): hists = hists[category_inst.name] else: - hists[list(hists.keys())[0]] + raise Exception( + f"no histograms found to plot for {category_inst.name}" + ) # add new processes to the end of the list for process_inst in hists: @@ -169,11 +173,6 @@ def run(self): h = hists[process_inst] # selections h = h[{ - "category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ], "shift": [ hist.loc(s.id) for s in plot_shifts @@ -181,11 +180,9 @@ def run(self): ], }] # reductions - h = h[{"category": sum}] # store _hists[process_inst] = h hists = _hists - # call the plot function fig, _ = self.call_plot_func( self.plot_function, From 46c51ee600f42af344121a1dca061686de3bdc97 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 24 Mar 2025 16:28:08 +0100 Subject: [PATCH 15/26] Updated fake factor method: fixed bugs with chunked io --- columnflow/tasks/data_driven_methods.py | 666 +++++++++++++++--------- columnflow/tasks/plotting.py | 5 +- 2 files changed, 415 insertions(+), 256 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index e7bc90763..759152614 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -24,6 +24,7 @@ class PrepareFakeFactorHistograms( + CategoriesMixin, WeightProducerMixin, MLModelsMixin, ProducersMixin, @@ -58,6 +59,10 @@ class PrepareFakeFactorHistograms( def mandatory_columns(cls) -> set[str]: return set(cls.category_id_columns) | {"process_id"} + # def create_branch_map(self): + # # create a dummy branch map so that this task could be submitted as a job + # return {0: None} + def workflow_requires(self): reqs = super().workflow_requires() @@ -96,8 +101,8 @@ def requires(self): @workflow_condition.output def output(self): - return {"hists": self.target(f"fake_factor__{self.branch}.pickle")} - + return {"hists": self.target(f"ff_hist_{self.branch}.pickle")} + @law.decorator.notify @law.decorator.log @law.decorator.localize(input=True, output=False) @law.decorator.safe_output @@ -106,9 +111,9 @@ def run(self): import numpy as np import awkward as ak from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, EMPTY_FLOAT + Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, EMPTY_FLOAT ) - + from columnflow.hist_util import fill_hist # prepare inputs inputs = self.input() @@ -125,15 +130,14 @@ def run(self): # get shift dependent aliases aliases = self.local_shift_inst.x("column_aliases", {}) - + ff_variables = [var.var_route for var in self.config_inst.x.fake_factor_method.axes.values()] # define columns that need to be read + read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) read_columns |= 
set(map(Route, aliases.values())) - read_columns |= { - Route(the_ax.var_route) for the_ax in self.config_inst.x.fake_factor_method.axes.values() - } + read_columns |= set(map(Route, ff_variables)) # empty float array to use when input files have no entries empty_f32 = ak.Array(np.array([], dtype=np.float32)) @@ -141,14 +145,15 @@ def run(self): file_targets = [inputs["events"]["events"]] if self.producer_insts: file_targets.extend([inp["columns"] for inp in inputs["producers"]]) - + # prepare inputs for localization with law.localize_file_targets( [*file_targets, *reader_targets.values()], mode="r", ) as inps: + for (events, *columns), pos in self.iter_chunked_io( - [inp.path for inp in inps], + [inp.abspath for inp in inps], source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], chunk_size=self.weight_producer_inst.get_min_chunk_size(), @@ -156,10 +161,8 @@ def run(self): # optional check for overlapping inputs if self.check_overlapping_inputs: self.raise_if_overlapping([events] + list(columns)) - # add additional columns events = update_ak_array(events, *columns) - # add aliases events = add_ak_aliases( events, @@ -168,47 +171,77 @@ def run(self): missing_strategy=self.missing_column_alias_strategy, ) + # attach coffea behavior aiding functional variable expressions + events = attach_coffea_behavior(events) # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) # define and fill histograms, taking into account multiple axes - categories = self.config_inst.categories.ids() - h = (hist.Hist.new - .IntCat(categories , name="category", growth=True) - .IntCat([], name="process", growth=True)) - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): - h = eval(f'h.{var_axis.ax_str}') - - histograms['fake_factors'] = h.Weight() - category_ids = ak.concatenate( [Route(c).apply(events) for c in self.category_id_columns], - axis=-1, - ) - # broadcast arrays so that each event can be filled for all its categories - - fill_data = { - "category" : category_ids, - "process" : events.process_id, - "weight" : weight, - } - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): - route = Route(var_axis.var_route) - if len(events) == 0 and not has_ak_column(events, route): - values = empty_f32 + axis=-1,) + sr_names = self.categories + for sr_name in sr_names: + the_sr = self.config_inst.get_category(sr_name) + regions = [sr_name] + if the_sr.aux: + for the_key in the_sr.aux.keys(): + if (the_key == 'abcd_regs') or (the_key == 'ff_regs'): + regions += list(the_sr.aux[the_key].values()) else: - values = ak.fill_none(ak.firsts(route.apply(events),axis=1), EMPTY_FLOAT) - if 'IntCategory' in var_axis.ax_str: values = ak.values_astype(values, np.int64) - fill_data[var_name] = values - # fill it - fill_hist( - histograms['fake_factors'], - fill_data, - ) + raise KeyError(f"Application and determination regions are not found for {the_sr}. 
\n Check aux field of the category map!") + + for region in regions: + #by accessing the list of categories we check if the category with this name exists + cat = self.config_inst.get_category(region) + + # get variable instances + mask = ak.any(category_ids == cat.id, axis = 1) + masked_events = events[mask] + masked_weight = weight[mask] + + h = (hist.Hist.new.IntCat([], name="process", growth=True)) + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') + + h = h.Weight() + # broadcast arrays so that each event can be filled for all its categories + + fill_data = { + "process": masked_events.process_id, + "weight" : masked_weight, + } + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + route = Route(var_axis.var_route) + if len(masked_events) == 0 and not has_ak_column(masked_events, route): + values = empty_f32 + else: + values = route.apply(masked_events) + if values.ndim != 1: values = ak.firsts(values,axis=1) + values = ak.fill_none(values, EMPTY_FLOAT) + + if var_name == 'n_jets': values = ak.where (values > 2, + 2 * ak.ones_like(values), + values) + + if 'Int' in var_axis.ax_str: values = ak.values_astype(values, np.int64) + fill_data[var_name] = values + # fill it + fill_hist( + h, + fill_data, + ) + if cat.name not in histograms.keys(): + histograms[cat.name] = h + else: + histograms[cat.name] +=h + # merge output files self.output()["hists"].dump(histograms, formatter="pickle") + + # overwrite class defaults @@ -225,6 +258,121 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) + +class MergeFakeFactorHistograms( + #VariablesMixin, + #WeightProducerMixin, + #MLModelsMixin, + #ProducersMixin, + #SelectorStepsMixin, + #CalibratorsMixin, + DatasetTask, + law.LocalWorkflow, + RemoteWorkflow, +): + only_missing = luigi.BoolParameter( + default=False, + description="when True, identify missing variables first and only require histograms of " + "missing ones; default: False", + ) + remove_previous = luigi.BoolParameter( + default=False, + significant=False, + description="when True, remove particlar input histograms after merging; default: False", + ) + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, + ) + + @classmethod + def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: + _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} + kwargs["_prefer_cli"] = _prefer_cli + return super().req_params(inst, **kwargs) + + def create_branch_map(self): + # create a dummy branch map so that this task could be submitted as a job + return {0: None} + + # def _get_variables(self): + # if self.is_workflow(): + # return self.as_branch()._get_variables() + + # variables = self.variables + + # # optional dynamic behavior: determine not yet created variables and require only those + # if self.only_missing: + # missing = self.output().count(existing=False, keys=True)[1] + # variables = sorted(missing, key=variables.index) + + # return variables + + def workflow_requires(self): + reqs = super().workflow_requires() + + if not self.pilot: + #variables = self._get_variables() + #if variables: + reqs["hists"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( + self, + branch=-1, + #variables=tuple(variables), + ) + + return reqs + + def requires(self): 
+ #variables = self._get_variables() + #if not variables: + # return [] + + return self.reqs.PrepareFakeFactorHistograms.req_different_branching( + self, + branch=-1, + #variables=tuple(variables), + workflow="local", + ) + + def output(self): + return {"hists": self.target(f"merged_ff_hist.pickle")} + + @law.decorator.notify + @law.decorator.log + def run(self): + # preare inputs and outputs + inputs = self.input()["collection"] + outputs = self.output() + + # load input histograms + hists = [ + inp["hists"].load(formatter="pickle") + for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) + ] + cats = list(hists[0].keys()) + get_hists = lambda hists, cat : [h[cat] for h in hists] + # create a separate file per output variable + merged_hists = {} + self.publish_message(f"merging {len(hists)} histograms for {self.dataset}") + for the_cat in cats: + h = get_hists(hists, the_cat) + merged_hists[the_cat] = sum(h[1:], h[0].copy()) + outputs["hists"].dump(merged_hists, formatter="pickle") + # optionally remove inputs + if self.remove_previous: + inputs.remove() + +MergeFakeFactorHistogramsWrapper = wrapper_factory( + base_cls=AnalysisTask, + require_cls=MergeFakeFactorHistograms, + enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +) + + class dict_creator(): def init_dict(self, ax_list): if not ax_list: @@ -259,7 +407,7 @@ class ComputeFakeFactors( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, + MergeFakeFactorHistograms=MergeFakeFactorHistograms, ) def store_parts(self): @@ -272,37 +420,43 @@ def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} kwargs["_prefer_cli"] = _prefer_cli return super().req_params(inst, **kwargs) - - def workflow_requires(self): - reqs = super().workflow_requires() - if not self.pilot: - variables = self._get_variables() - if variables: - reqs["ff_method"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( - self, - branch=-1, - variables=tuple(variables), - ) + + def create_branch_map(self): + # create a dummy branch map so that this task could be submitted as a job + return {0: None} return reqs - def requires(self): return { - d: self.reqs.PrepareFakeFactorHistograms.req( + d: self.reqs.MergeFakeFactorHistograms.req_different_branching( self, + branch=-1, dataset=d, - branch=-1 + workflow="local", ) for d in self.datasets } + def output(self): - return {"ff_json": self.target(f"fake_factors.json"), - "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") + year = self.config_inst.campaign.aux['year'] + tag = self.config_inst.campaign.aux['tag'] + channel = self.config_inst.channels.get_first().name + return {"ff_json": self.target('_'.join(('fake_factors', + channel, + str(year), + tag)) + '.json'), + "plots": {'_'.join((ff_type, + syst, + f'n_jets_{str(nj)}')): self.target(f"fake_factor_{ff_type}_{syst}_njets_{str(nj)}.png") for syst in ['nominal', 'up', 'down'] - for ff_type in ['qcd','wj']}, - "plots1d": {'_'.join((ff_type,str(dm))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}.png") for ff_type in ['qcd','wj'] - for dm in [0,1,2,10,11]}} + for nj in [0,1,2]}, + "plots1d": {'_'.join((ff_type, + str(dm), + str(nj))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}_njets_{str(nj)}.png") + for ff_type in ['qcd','wj'] + for dm in [0,1,2,10,11] + for nj in [0,1,2]}} 
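# A minimal sketch of how the fake-factor JSON written by this task could be read back
# downstream with correctionlib. The file name and the example values are hypothetical;
# the correction names ("ff_qcd", "ff_wjets") and the input order
# (tau_pt, tau_dm_pnet, n_jets) follow the CorrectionSet that is built in run() below.
import correctionlib

cset = correctionlib.CorrectionSet.from_file("fake_factors_etau_2022_preEE.json")
# QCD fake factor for a tau with pt = 35 GeV, PNet decay mode 1 and 0 jets
ff_qcd = cset["ff_qcd"].evaluate(35.0, 1, 0)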
     @law.decorator.log
     def run(self):
@@ -321,57 +475,59 @@ def run(self):
 
         # prepare inputs and outputs
         inputs = self.input()
         outputs = self.output()
-        merged_per_dataset = {}
-        projected_hists = []
+        hists_by_dataset = []
+        merged_hists = {}
         for (dataset_name, dataset) in inputs.items():
-            files = dataset['collection']
+            files = dataset['collection'][0]
+
             # load input histograms per dataset
-            hists_per_ds = [
-                inp['hists'].load(formatter="pickle")['fake_factors']
-                for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50))
-            ]
-            ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy())
-            hists_by_dataset.append(ds_single_hist)
-        #Create a dict of histograms indexed by the process
-        hists_by_proc = {}
-        for proc_name in self.config_inst.processes.names():
-            proc = self.config_inst.processes.get(proc_name)
-            for the_hist in hists_by_dataset:
-
-                if proc.id in the_hist.axes["process"]:
-                    h = the_hist.copy()
-                    h = h[{"process": hist.loc(proc.id)}]
-                    # add the histogram
-                    if proc in hists_by_proc:
-                        hists_by_proc[proc] += h
+            input_chunked_hists = []
+            input_chunked_hists = [f.load(formatter='pickle') for f in files.values()]
+
+            for hists in input_chunked_hists:
+                for the_cat, the_hist in hists.items():
+                    if the_cat not in merged_hists.keys():
+                        merged_hists[the_cat] = []
+                        merged_hists[the_cat].append(the_hist)
                     else:
-                        hists_by_proc[proc] = h
-
-        #Divide histograms to data and bkg
-        mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")]
-        data_hists = [h for p, h in hists_by_proc.items() if p.is_data]
-
-        #Merge histograms to get a joint data and mc histogram
-        if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy())
-        else: mc_hists = mc_hists[0].copy()
-        if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy())
-        else: data_hists = data_hists[0].copy()
+                        merged_hists[the_cat].append(the_hist)
 
-        #Function that performs the calculation of th
-        def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'):
-            def get_dr_hist(self, h, det_reg):
-                cat_name = self.categories[0]
-                cat = self.config_inst.get_category(cat_name.replace('sr',det_reg))
-                return h[{"category": hist.loc(cat.id)}]
-
-            get_id = lambda ax, key: [i in enumerate(ax.keys)]
-
-            data_num = get_dr_hist(self, h_data, num_reg)
-            data_den = get_dr_hist(self, h_data, den_reg)
-            mc_num = get_dr_hist(self, h_mc, num_reg)
-            mc_den = get_dr_hist(self, h_mc, den_reg)
+        # merge histograms
+        mc_hists = {}
+        data_hists = {}
+        # divide between data and mc
+        for the_cat, h_list in merged_hists.items():
+            for the_hist in h_list:
+                for proc_name in self.config_inst.processes.names():
+                    proc = self.config_inst.processes.get(proc_name)
+                    if proc.id in the_hist.axes["process"]:
+                        h = the_hist.copy()
+                        h = h[{"process": hist.loc(proc.id)}]
+                        if proc.is_mc and not proc.has_tag("signal"):
+                            if the_cat in mc_hists: mc_hists[the_cat] += h
+                            else: mc_hists[the_cat] = h
+                        if proc.is_data:
+                            if the_cat in data_hists: data_hists[the_cat] += h
+                            else: data_hists[the_cat] = h
+        # Function that performs the calculation of the fake factors
+        def get_ff_corr(self, h_data, h_mc, dr_num, dr_den, name='ff_hist', label='ff_hist'):
+            def get_single_cat(self, h, reg_name):
+                cat_name = self.config_inst.get_category(self.categories[0]).aux['ff_regs'][reg_name]
+                return h[cat_name]
+            data_num = get_single_cat(self, h_data, dr_num)
+            data_den = get_single_cat(self, h_data, dr_den)
+            mc_num = get_single_cat(self, h_mc, dr_num)
+            mc_den = 
get_single_cat(self, h_mc, dr_den) + print(name) + for nj in [0,1,2]: + for dm in [0,1,2,10,11]: + print(f'DM {dm} Nj {nj}') + print(f"data_num: {data_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"data_den: {data_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"mc_num: {mc_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"mc_den: {mc_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") num = data_num.values() - mc_num.values() den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), @@ -386,6 +542,7 @@ def rel_err(x): h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') + axes = list(h.axes[1:]) h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') ff_raw = h.Weight() ff_raw.view().value[...,0] = ff_val @@ -395,90 +552,116 @@ def rel_err(x): #Make an approximation of tau pt dependance formula_str = 'p0 + p1*x+p2*x*x' + #formula_str = 'p0 + p1*x' def fitf(x, p0, p1, p2): return eval(formula_str) + def jac(x): from numpy import array out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) + #out = array([[ 1., x],[x, x**2]]) return out def eval_formula(formula_str, popt): for i,p in enumerate(popt): - formula_str = formula_str.replace(f'p{i}',str(popt[i])) + par = round(popt[i],6) + formula_str = formula_str.replace(f'p{i}',str(par)) return formula_str ff_fitted = ff_raw.copy().reset() ff_fitted.name = name ff_fitted.label = label - fitres = {} - axes = list(ff_raw.axes[1:2]) + fitres = {} dc = dict_creator() - for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str']: + for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str','x_max']: fitres[the_field]= dc.init_dict(axes) dm_axis = ff_raw.axes['tau_dm_pnet'] - for dm in dm_axis: - h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), - 'syst': hist.loc('nominal')}] - mask = h1d.values() > 0 - y = h1d.values()[mask] - y_err = (h1d.variances()[mask])**0.5 - x = h1d.axes[0].centers - x_masked = x[mask] + n_jets_axis = ff_raw.axes['n_jets'] + for nj in n_jets_axis: + for dm in dm_axis: + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + mask = h1d.values() > 0 + x = h1d.axes[0].centers + if np.sum(mask) < 3: + #if np.sum(mask) < 2: + y = np.zeros_like(x) + y_err = np.ones_like(x) + x_masked = x + else: + y = h1d.values()[mask] + y_err = (h1d.variances()[mask])**0.5 + x_masked = x[mask] + popt, pcov = curve_fit(fitf, + x_masked, + y, + sigma=y_err, + absolute_sigma=True, + ) + fitres['chi2'][dm][nj] = sum(((y - fitf(x_masked, *popt))/y_err)**2) + fitres['ndf'][dm][nj] = len(y) - len(popt) + fitres['popt'][dm][nj] = popt + fitres['pcov'][dm][nj] = pcov + fitres['x_max'][dm][nj] = np.max(x_masked) - popt, pcov = curve_fit(fitf,x_masked,y, - sigma=y_err, - absolute_sigma=True, - ) - fitres['chi2'][dm] = sum(((y - fitf(x_masked, *popt))/y_err)**2) - fitres['ndf'][dm] = len(y) - len(popt) - fitres['popt'][dm] = popt - fitres['pcov'][dm] = pcov - - fitres['fitf_str'][dm] = eval_formula(formula_str,popt) - for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 - ff_fitted.view().value[:, - ff_fitted.axes[1].index(dm), - ff_fitted.axes[2].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) - fitres['name'] = name - fitres['jac'] = jac - fitres['fitf'] = fitf + 
fitres['fitf_str'][dm][nj] = eval_formula(formula_str,popt) + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 + ff_fitted.view().value[:, + ff_fitted.axes[1].index(dm), + ff_fitted.axes[2].index(nj), + ff_fitted.axes[3].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) + fitres['name'] = name + fitres['jac'] = jac + fitres['fitf'] = fitf return ff_raw, ff_fitted, fitres wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, data_hists, mc_hists, - num_reg = 'dr_num_wj', - den_reg = 'dr_den_wj', + dr_num = 'dr_num_wj', + dr_den = 'dr_den_wj', name='ff_wjets', label='Fake factor W+jets') qcd_raw, qcd_fitted, qcd_fitres = get_ff_corr(self, data_hists, mc_hists, - num_reg = 'dr_num_qcd', - den_reg = 'dr_den_qcd', + dr_num = 'dr_num_qcd', + dr_den = 'dr_den_qcd', name='ff_qcd', label='Fake factor QCD') corr_list = [] + + corr_list = [] for fitres in [wj_fitres, qcd_fitres]: formula_str = fitres['fitf_str'] - dm_bins = [] - for (dm, the_formula) in formula_str.items(): - x_max = 100 - last_val = fitres['fitf'](x_max,* fitres['popt'][dm]) - - dm_bins.append(cs.CategoryItem( - key=dm, - value=cs.Formula( - nodetype="formula", - variables=["tau_pt"], - parser="TFormula", - expression=f'({the_formula})/(1. + exp(10.*(x-{x_max}))) + ({last_val})/(1. + exp(-10.*(x-{x_max})))', - ))) + dm_cats = [] + for dm in formula_str.keys(): + formula_str_njet_binned = formula_str[dm] + single_dm = [] + for nj, the_formula in formula_str_njet_binned.items(): + x_max = fitres['x_max'][dm][nj] + fx_max = np.maximum(fitres['fitf'](x_max,* fitres['popt'][dm][nj]),0) + single_dm.append(cs.CategoryItem( + key=nj, + value=cs.Formula( + nodetype="formula", + variables=["tau_pt"], + parser="TFormula", + expression=f'({the_formula})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', + ))) + dm_cats.append(cs.CategoryItem( + key=dm, + value=cs.Category( + nodetype="category", + input="n_jets", + content=single_dm, + ))) corr_list.append(cs.Correction( name=fitres['name'], description=f"fake factor correcton for {fitres['name'].split('_')[1]}", @@ -486,14 +669,16 @@ def eval_formula(formula_str, popt): inputs=[ cs.Variable(name="tau_pt", type="real",description="pt of tau"), cs.Variable(name="tau_dm_pnet", type="int", description="PNet decay mode of tau"), + cs.Variable(name="n_jets", type="int", description="Number of jets with pt > 20 GeV and eta < 4.7"), ], output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), data=cs.Category( nodetype="category", input="tau_dm_pnet", - content=dm_bins,) + content=dm_cats, + ) )) - + cset = cs.CorrectionSet( schema_version=2, description="Fake factors", @@ -508,106 +693,79 @@ def eval_formula(formula_str, popt): h_raw = eval(f'{h_name}_raw') h_fitted = eval(f'{h_name}_fitted') - fig, ax = plt.subplots(figsize=(12, 8)) - h_raw[...,'nominal'].plot2d(ax=ax) - self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") + fitres = wj_fitres if h_name == 'wj' else qcd_fitres dm_axis = h_raw.axes['tau_dm_pnet'] - for dm in dm_axis: - h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), - 'syst': hist.loc('nominal')}] - hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - fig, ax = plt.subplots(figsize=(8, 6)) - mask = h1d.counts() > 0 - x = h1d.axes[0].centers[mask] - y = h1d.counts()[mask] - xerr = (np.diff(h1d.axes[0]).flatten()/2.)[mask], - yerr = np.sqrt(h1d.variances()).flatten()[mask], - ax.errorbar(x, y, xerr = xerr, yerr = yerr, - label=f"PNet decay mode = {dm}", - marker='o', - 
fmt='o', - line=None, color='#2478B7', capsize=4) - x_fine = np.linspace(x[0],x[-1],num=100) - popt = fitres['popt'][dm] - pcov = fitres['pcov'][dm] - jac = fitres['jac'] - def err(x,jac,pcov): - from numpy import sqrt,einsum - return sqrt(einsum('ij,ij',jac(x),pcov)) - - import functools - err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + nj_axis = h_raw.axes['n_jets'] + for nj in nj_axis: + print(f"Plotting 2d map for n jets = {nj}") + fig, ax = plt.subplots(figsize=(12, 8)) - y_fitf = fitres['fitf'](x_fine,*popt) - y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y - y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y - - ax.plot(x_fine, - y_fitf, - color='#FF867B') - ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) - ax.set_ylabel('Fake Factor') - ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') - ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm],2)}}}{{{fitres['ndf'][dm]}}}$", - (0.8, 0.9), - xycoords='axes fraction', - fontsize=20) + single2d_h = h_raw[{'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + pcm = ax.pcolormesh(*np.meshgrid(*single2d_h.axes.edges), single2d_h.view().value.T, cmap="viridis", vmin=0, vmax=0.5) + ax.set_yticks(dm_axis.centers, labels=list(map(dm_axis.bin, range(dm_axis.size)))) + plt.colorbar(pcm, ax=ax) + plt.xlabel(single2d_h.axes.label[0]) + plt.ylabel(single2d_h.axes.label[1]) + plt.title(single2d_h.label) + + self.output()['plots']['_'.join((h_name,'nominal',f'n_jets_{str(nj)}'))].dump(fig, formatter="mpl") + for dm in dm_axis: + print(f"Plotting 1d plot for n jets = {nj}, dm = {dm}") + h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj),}] + fig, ax = plt.subplots(figsize=(8, 6)) + mask = h1d.counts() > 0 + if np.sum(mask) > 0: + x = h1d.axes[0].centers[mask] + y = h1d.counts()[mask] + xerr = (np.diff(h1d.axes[0]).flatten()/2.)[mask], + yerr = np.sqrt(h1d.variances()).flatten()[mask], + else: + x = h1d.axes[0].centers + y = np.zeros_like(x) + xerr = (np.diff(h1d.axes[0]).flatten()/2.) 
+ yerr = np.ones_like(y), + + ax.errorbar(x, y, xerr = xerr, yerr = yerr, + label=f"PNet decay mode = {dm}", + marker='o', + fmt='o', + line=None, color='#2478B7', capsize=4) + x_fine = np.linspace(x[0],x[-1],num=100) + popt = fitres['popt'][dm][nj] + pcov = fitres['pcov'][dm][nj] + jac = fitres['jac'] + def err(x,jac,pcov): + from numpy import sqrt,einsum + return sqrt(einsum('ij,ij',jac(x),pcov)) + + import functools + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + + y_fitf = fitres['fitf'](x_fine,*popt) + y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y + y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y - self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") - - - -class CreateDataDrivenHistograms( - VariablesMixin, - WeightProducerMixin, - ProducersMixin, - ReducedEventsUser, - ChunkedIOMixin, - law.LocalWorkflow, - RemoteWorkflow, -): - - sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - - # upstream requirements - reqs = Requirements( - ReducedEventsUser.reqs, - RemoteWorkflow.reqs, - ComputeFakeFactors=ComputeFakeFactors, - ProduceColumns=ProduceColumns, - ) - - def requires(self): - reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} - from IPython import embed; embed() - if self.producer_insts: - reqs["producers"] = [ - self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) - for producer_inst in self.producer_insts - if producer_inst.produced_columns - ] - reqs['ff_json'] = self.reqs.ComputeFakeFactors.req(self) - reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - return reqs - - def output(self): - return {"hists": self.target(f"histograms__vars_{self.variables_repr}__{self.branch}.pickle")} - - @law.decorator.log - @law.decorator.localize(input=True, output=False) - @law.decorator.safe_output - def run(self): - import hist - import numpy as np - import awkward as ak - from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, - ) - - # prepare inputs - inputs = self.input() - from IPython import embed; embed() - # declare output: dict of histograms - histograms = {} \ No newline at end of file + ax.plot(x_fine, + y_fitf, + color='#FF867B') + ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) + ax.set_ylabel('Fake Factor') + ax.set_xlabel('Tau pT [GeV]') + ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm][nj],2)}}}{{{fitres['ndf'][dm][nj]}}}$", + (0.8, 0.9), + xycoords='axes fraction', + fontsize=20) + print(str(fitres['fitf_str'][dm][nj])) + ax.annotate('y=' + str(fitres['fitf_str'][dm][nj]), + (0.1, 0.9), + xycoords='axes fraction', + fontsize=12) + + self.output()['plots1d']['_'.join((h_name,str(dm),str(nj)))].dump(fig, formatter="mpl") \ No newline at end of file diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index 8ac757ed0..71cbc5f27 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -155,13 +155,14 @@ def run(self): else: hists = hists[category_inst.name] else: - if category_inst.name in hists.keys(): + if 'dr' in category_inst.name: + hists = self.invoke_hist_hooks(hists,category_inst) + elif category_inst.name in hists.keys(): hists = hists[category_inst.name] else: raise Exception( f"no histograms found to plot for {category_inst.name}" ) - # add new processes to the end of the list for 
process_inst in hists:
             if process_inst not in process_insts:

From 02429bdf98f2442bf19e320aa8146ce416e7701d Mon Sep 17 00:00:00 2001
From: zakharov-binp
Date: Mon, 24 Mar 2025 16:37:58 +0100
Subject: [PATCH 16/26] Fixed the long-standing issue with flattening of the arrays at the stage of Creating the histograms

---
 columnflow/hist_util.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py
index 92a9ed42a..7929bbbd7 100644
--- a/columnflow/hist_util.py
+++ b/columnflow/hist_util.py
@@ -72,15 +72,12 @@ def allows_shift(ax) -> bool:
         flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5
 
     # fill
-    if 'event' in data.keys():
-        arrays = {}
-        for ax_name in axis_names:
-            if ax_name in data.keys():
-                arrays[ax_name] = data[ax_name]
-        h.fill(**fill_kwargs, **arrays)
-    else:
-        arrays = ak.flatten(ak.cartesian(data))
-        h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields})
+    flat_data = {}
+    for key, arr in data.items():
+        if arr.ndim != 1: flat_data[key] = ak.flatten(arr)
+        else: flat_data[key] = arr
+    h.fill(**fill_kwargs, **flat_data)
+
 
 
 def add_hist_axis(histogram: hist.Hist, variable_inst: od.Variable) -> hist.Hist:

From fdb30eb69082f4e10c2f180fee50d5de57bef999 Mon Sep 17 00:00:00 2001
From: zakharov-binp
Date: Mon, 24 Mar 2025 16:42:21 +0100
Subject: [PATCH 17/26] Updated framework tasks according to a new approach of storing different categories

---
 columnflow/tasks/framework/mixins.py |   2 +-
 columnflow/tasks/histograms.py       |  44 ++-
 columnflow/tasks/yields.py           | 399 ++++++++++++++++++++-------
 law.cfg                              |   2 +-
 4 files changed, 323 insertions(+), 124 deletions(-)

diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py
index 2bc75c005..35549393b 100644
--- a/columnflow/tasks/framework/mixins.py
+++ b/columnflow/tasks/framework/mixins.py
@@ -2452,7 +2452,7 @@ def invoke_hist_hooks(self, hists: dict, category_inst: od.Category) -> dict:
         Invoke hooks to update histograms before plotting.
""" if not self.hist_hooks: - return hists + return hists[category_inst.name] for hook in self.hist_hooks: if hook in (None, "", law.NO_STR): diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index f1d9c7e61..d7603112c 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -57,7 +57,7 @@ class CreateHistograms( @law.util.classproperty def mandatory_columns(cls) -> set[str]: - return set(cls.category_id_columns) | {"process_id"} + return set(cls.category_id_columns) | {"process_id", "ff_weight*"} def workflow_requires(self): reqs = super().workflow_requires() @@ -142,7 +142,9 @@ def run(self): read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) - read_columns |= set(map(Route, [n +'*' for n in self.config_inst.x.fake_factor_method.columns])) + read_columns |= set(map(Route, ['_'.join((the_name,the_shift)) + for the_name in self.config_inst.x.fake_factor_method.columns + for the_shift in self.config_inst.x.fake_factor_method.shifts])) read_columns |= set(map(Route, aliases.values())) read_columns |= { Route(inp) @@ -201,7 +203,6 @@ def run(self): # attach coffea behavior aiding functional variable expressions events = attach_coffea_behavior(events) - # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) @@ -236,24 +237,18 @@ def run(self): # mask events and weights when selection expressions are found masked_events = events - if 'ar_wj' in region: - masked_weights = weight * events.ff_weight_wj_nominal - elif 'ar_qcd' in region: - masked_weights = weight * events.ff_weight_qcd_nominal + if 'apply_ff' in cat.aux.keys(): + if cat.aux['apply_ff'] == 'wj': + self.publish_message(f"applying FF weights: ff_weight_wj_nominal, category: {cat.name}") + masked_weights = weight * events.ff_weight_wj_nominal + elif cat.aux['apply_ff'] == 'qcd': + self.publish_message(f"applying FF weights: ff_weight_qcd_nominal, category: {cat.name}") + masked_weights = weight * events.ff_weight_qcd_nominal + else: + masked_weights = weight else: masked_weights = weight - - # for variable_inst in variable_insts: - # sel = variable_inst.selection - # if sel == "1": - # continue - # if not callable(sel): - # raise ValueError( - # f"invalid selection '{sel}', for now only callables are supported", - # ) - # mask = sel(masked_events) - # #select only one category per histogram - # merge category ids + category_ids = ak.concatenate( [Route(c).apply(masked_events) for c in self.category_id_columns], axis=-1, @@ -272,12 +267,15 @@ def run(self): expr = variable_inst.expression if isinstance(expr, str): route = Route(expr) - def expr(events, *args, **kwargs): - if len(events) == 0 and not has_ak_column(events, route): + def expr(masked_events, *args, **kwargs): + if len(masked_events) == 0 and not has_ak_column(masked_events, route): return empty_f32 - return route.apply(events, null_value=variable_inst.null_value) + return route.apply(masked_events, null_value=variable_inst.null_value) # apply it - fill_data[variable_inst.name] = expr(masked_events) + if variable_inst.name == "event": + fill_data[variable_inst.name] = np.sign(masked_events.event) + else: + fill_data[variable_inst.name] = expr(masked_events) # fill it fill_hist( histograms[cat.name][var_key], diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index 
9de6a31cc..3abab6e15 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -21,6 +21,245 @@ from columnflow.util import dev_sandbox, try_int +# class CreateYieldTable( +# DatasetsProcessesMixin, +# CategoriesMixin, +# WeightProducerMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# table_format = luigi.Parameter( +# default="fancy_grid", +# significant=False, +# description="format of the yield table; accepts all formats of the tabulate package; " +# "default: fancy_grid", +# ) +# number_format = luigi.Parameter( +# default="pdg", +# significant=False, +# description="rounding format of each number in the yield table; accepts all formats " +# "understood by scinum.Number.str(), e.g. 'pdg', 'publication', '%.1f' or an integer " +# "(number of signficant digits); default: pdg", +# ) +# skip_uncertainties = luigi.BoolParameter( +# default=False, +# significant=False, +# description="when True, uncertainties are not displayed in the table; default: False", +# ) +# normalize_yields = luigi.ChoiceParameter( +# choices=(law.NO_STR, "per_process", "per_category", "all"), +# default=law.NO_STR, +# significant=False, +# description="string parameter to define the normalization of the yields; " +# "choices: '', per_process, per_category, all; empty default", +# ) +# output_suffix = luigi.Parameter( +# default=law.NO_STR, +# description="Adds a suffix to the output name of the yields table; empty default", +# ) + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# # dummy branch map +# def create_branch_map(self): +# return [0] + +# def requires(self): +# return { +# d: self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# variables=("event",), +# _prefer_cli={"variables"}, +# ) +# for d in self.datasets +# } + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# reqs["merged_hists"] = [ +# self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# variables=("event",), +# _exclude={"branches"}, +# ) +# for d in self.datasets +# ] + +# return reqs + +# @classmethod +# def resolve_param_values(cls, params): +# params = super().resolve_param_values(params) + +# if "number_format" in params and try_int(params["number_format"]): +# # convert 'number_format' in integer if possible +# params["number_format"] = int(params["number_format"]) + +# return params + +# def output(self): +# suffix = "" +# if self.output_suffix and self.output_suffix != law.NO_STR: +# suffix = f"__{self.output_suffix}" + +# return { +# "table": self.target(f"table__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.txt"), +# "yields": self.target(f"yields__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.json"), +# } + +# @law.decorator.notify +# @law.decorator.log +# def run(self): +# import hist +# from tabulate import tabulate + +# inputs = self.input() +# outputs = self.output() + +# category_insts = list(map(self.config_inst.get_category, self.categories)) +# process_insts = list(map(self.config_inst.get_process, self.processes)) +# sub_process_insts = { +# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] +# for proc in process_insts +# } + +# # histogram data per process +# hists = {} + +# with self.publish_step(f"Creating yields for processes {self.processes}, categories {self.categories}"): +# for dataset, inp in 
inputs.items(): +# dataset_inst = self.config_inst.get_dataset(dataset) + +# # load the histogram of the variable named "event" +# input_hists = inp["hists"]["event"].load(formatter="pickle") + +# # loop and extract one histogram per process +# for process_inst in process_insts: +# # skip when the dataset is already known to not contain any sub process +# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): +# continue + +# # work on a copy +# h = h_in.copy() + +# # axis selections +# h = h[{ +# "process": [ +# hist.loc(p.id) +# for p in sub_process_insts[process_inst] +# if p.id in h.axes["process"] +# ], +# }] + +# # axis reductions +# h = h[{"process": sum, "shift": sum, "event": sum}] + +# # add the histogram +# if process_inst in hists: +# hists[process_inst] += h +# else: +# hists[process_inst] = h + +# # there should be hists to plot +# if not hists: +# raise Exception("no histograms found to plot") + +# # sort hists by process order +# hists = OrderedDict( +# (process_inst, hists[process_inst]) +# for process_inst in sorted(hists, key=process_insts.index) +# ) + +# yields, processes = defaultdict(list), [] + +# # read out yields per category and per process +# for process_inst, h in hists.items(): +# processes.append(process_inst) + +# for category_inst in category_insts: +# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] + +# h_cat = h[{"category": [ +# hist.loc(c.id) +# for c in leaf_category_insts +# if c.id in h.axes["category"] +# ]}] +# h_cat = h_cat[{"category": sum}] + +# value = Number(h_cat.value) +# if not self.skip_uncertainties: +# # set a unique uncertainty name for correct propagation below +# value.set_uncertainty( +# f"mcstat_{process_inst.name}_{category_inst.name}", +# math.sqrt(h_cat.variance), +# ) +# yields[category_inst].append(value) + +# # obtain normalizaton factors +# norm_factors = 1 +# if self.normalize_yields == "all": +# norm_factors = sum( +# sum(category_yields) +# for category_yields in yields.values() +# ) +# elif self.normalize_yields == "per_process": +# norm_factors = [ +# sum(yields[category][i] for category in yields.keys()) +# for i in range(len(yields[category_insts[0]])) +# ] +# elif self.normalize_yields == "per_category": +# norm_factors = { +# category: sum(category_yields) +# for category, category_yields in yields.items() +# } + +# # initialize dicts +# yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]}) +# raw_yields = defaultdict(dict, {}) + +# # apply normalization and format +# for category, category_yields in yields.items(): +# for i, value in enumerate(category_yields): +# # get correct norm factor per category and process +# if self.normalize_yields == "per_process": +# norm_factor = norm_factors[i] +# elif self.normalize_yields == "per_category": +# norm_factor = norm_factors[category] +# else: +# norm_factor = norm_factors + +# raw_yield = (value / norm_factor).nominal +# raw_yields[category.name][processes[i].name] = raw_yield + +# # format yields into strings +# yield_str = (value / norm_factor).str( +# combine_uncs="all", +# format=self.number_format, +# style="latex" if "latex" in self.table_format else "plain", +# ) +# if "latex" in self.table_format: +# yield_str = f"${yield_str}$" +# yields_str[category.label].append(yield_str) + +# # create, print and save the yield table +# yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format) +# self.publish_message(yield_table) + +# outputs["table"].dump(yield_table, 
formatter="text") +# outputs["yields"].dump(raw_yields, formatter="json") + class CreateYieldTable( DatasetsProcessesMixin, CategoriesMixin, @@ -136,123 +375,85 @@ def run(self): } # histogram data per process - hists = {} - + merged_hists = {} with self.publish_step(f"Creating yields for processes {self.processes}, categories {self.categories}"): for dataset, inp in inputs.items(): dataset_inst = self.config_inst.get_dataset(dataset) # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - - # loop and extract one histogram per process - for process_inst in process_insts: - # skip when the dataset is already known to not contain any sub process - if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): - continue - - # work on a copy - h = h_in.copy() - - # axis selections - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts[process_inst] - if p.id in h.axes["process"] - ], - }] - - # axis reductions - h = h[{"process": sum, "shift": sum, "event": sum}] - - # add the histogram - if process_inst in hists: - hists[process_inst] += h + + + for the_cat, the_hist in input_hists.items(): + if the_cat not in merged_hists.keys(): + merged_hists[the_cat] = [] else: - hists[process_inst] = h - + merged_hists[the_cat].append(the_hist) + #merge histograms + merged_hists_ = {the_cat: sum(h[1:],h[0].copy()) for the_cat, h in merged_hists.items()} + hists_per_proc = {} + for the_cat, the_hist in merged_hists_.items(): + hists_per_proc[the_cat] = {} + for proc in process_insts: + leaf_procs = proc.get_leaf_processes() + if len(leaf_procs) == 0 : leaf_procs = [proc] + for leaf_proc in leaf_procs: + if leaf_proc.id in the_hist.axes["process"]: + h = the_hist.copy() + h = h[{"process": hist.loc(leaf_proc.id)}] + + if proc in hists_per_proc[the_cat]: + hists_per_proc[the_cat][proc] +=h + else: + hists_per_proc[the_cat][proc] = h + # there should be hists to plot - if not hists: + if not hists_per_proc: raise Exception("no histograms found to plot") - # sort hists by process order - hists = OrderedDict( - (process_inst, hists[process_inst]) - for process_inst in sorted(hists, key=process_insts.index) + hists = {} + for the_cat in hists_per_proc.keys(): + single_cat_hists = hists_per_proc[the_cat] + hists[the_cat] = OrderedDict( + (process_inst, single_cat_hists[process_inst]) + for process_inst in sorted(single_cat_hists, key=process_insts.index) ) - - yields, processes = defaultdict(list), [] - - # read out yields per category and per process - for process_inst, h in hists.items(): - processes.append(process_inst) - - for category_inst in category_insts: - leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] - - h_cat = h[{"category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ]}] - h_cat = h_cat[{"category": sum}] - - value = Number(h_cat.value) - if not self.skip_uncertainties: + #Calculate yields + yields = {} + for the_cat in hists.keys(): + tmp = {} + for the_proc in hists[the_cat].keys(): + val = Number(hists[the_cat][the_proc].sum().value) + + if not self.skip_uncertainties and not the_proc.is_data: # set a unique uncertainty name for correct propagation below - value.set_uncertainty( - f"mcstat_{process_inst.name}_{category_inst.name}", - math.sqrt(h_cat.variance), + val.set_uncertainty( + f"mcstat_{the_proc.name}_{the_cat}", + math.sqrt(hists[the_cat][the_proc].sum().variance), ) - yields[category_inst].append(value) - - # obtain normalizaton 
factors - norm_factors = 1 - if self.normalize_yields == "all": - norm_factors = sum( - sum(category_yields) - for category_yields in yields.values() - ) - elif self.normalize_yields == "per_process": - norm_factors = [ - sum(yields[category][i] for category in yields.keys()) - for i in range(len(yields[category_insts[0]])) - ] - elif self.normalize_yields == "per_category": - norm_factors = { - category: sum(category_yields) - for category, category_yields in yields.items() - } - + tmp[the_proc]=val + yields[the_cat] = OrderedDict(tmp) # initialize dicts - yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]}) + yields_str = defaultdict(list, {"Process" : [proc.label for proc in process_insts]}) raw_yields = defaultdict(dict, {}) - # apply normalization and format - for category, category_yields in yields.items(): - for i, value in enumerate(category_yields): - # get correct norm factor per category and process - if self.normalize_yields == "per_process": - norm_factor = norm_factors[i] - elif self.normalize_yields == "per_category": - norm_factor = norm_factors[category] - else: - norm_factor = norm_factors - - raw_yield = (value / norm_factor).nominal - raw_yields[category.name][processes[i].name] = raw_yield - - # format yields into strings - yield_str = (value / norm_factor).str( - combine_uncs="all", - format=self.number_format, - style="latex" if "latex" in self.table_format else "plain", - ) + for cat in yields.keys(): + yields_per_cat = yields[cat] + for proc in process_insts: + if proc in yields_per_cat: + raw_yield = yields_per_cat[proc].nominal + yield_str = (yields_per_cat[proc]).str( + combine_uncs="all", + format=self.number_format, + style="latex" if "latex" in self.table_format else "plain", + ) + else: + raw_yield = Number(-1).nominal + yield_str = str(-1) + raw_yields[cat][proc.name] = raw_yield if "latex" in self.table_format: yield_str = f"${yield_str}$" - yields_str[category.label].append(yield_str) - + yields_str[cat].append(yield_str) # create, print and save the yield table yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format) self.publish_message(yield_table) diff --git a/law.cfg b/law.cfg index 0d6ae338f..5d01d5d05 100644 --- a/law.cfg +++ b/law.cfg @@ -60,7 +60,7 @@ slurm_flavor: $CF_SLURM_FLAVOR slurm_partition: $CF_SLURM_PARTITION # ChunkedIOHandler defaults -chunked_io_chunk_size: 100000 +chunked_io_chunk_size: 50000 chunked_io_pool_size: 2 chunked_io_debug: False From 000ee8b1e14bd945413a4b63ecea985d35f66a47 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 7 Apr 2025 14:00:31 +0200 Subject: [PATCH 18/26] Updated code for the Fake Factor calculation --- columnflow/tasks/data_driven_methods.py | 220 ++++++++++++++---------- 1 file changed, 126 insertions(+), 94 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 759152614..7c4b3f375 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -372,24 +372,11 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) - -class dict_creator(): - def init_dict(self, ax_list): - if not ax_list: - return -1. 
- else: - ax = ax_list[0] - updated_ax = ax_list[1:] - get_ax_dict = lambda ax, ax_list, func : {ax.bin(i): func(ax_list) for i in range(ax.size)} - return get_ax_dict(ax,updated_ax, self.init_dict) - - class ComputeFakeFactors( DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, ProducersMixin, - dict_creator, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -456,7 +443,12 @@ def output(self): str(nj))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}_njets_{str(nj)}.png") for ff_type in ['qcd','wj'] for dm in [0,1,2,10,11] - for nj in [0,1,2]}} + for nj in [0,1,2]}, + "fitres": self.target('_'.join(('fitres', + channel, + str(year), + tag)) + '.json'), + } @law.decorator.log def run(self): @@ -466,6 +458,7 @@ def run(self): from scipy.special import erf import matplotlib.pyplot as plt import correctionlib.schemav2 as cs + from numpy import exp plt.figure(dpi=200) plt.rcParams.update({ "text.usetex": True, @@ -510,6 +503,15 @@ def run(self): if proc.is_data: if the_cat in data_hists: data_hists[the_cat] += h else: data_hists[the_cat] = h + + def eval_formula(formula_str, popt,make_rounding=False): + for i,p in enumerate(popt): + if make_rounding: + formula_str = formula_str.replace(f'p{i}', '{:.3e}'.format(p)) + else: + formula_str = formula_str.replace(f'p{i}',str(p)) + return formula_str + #Function that performs the calculation of t def get_ff_corr(self, h_data, h_mc, dr_num, dr_den, name='ff_hist', label='ff_hist'): @@ -529,6 +531,7 @@ def get_single_cat(self, h, reg_name): print(f"mc_num: {mc_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") print(f"mc_den: {mc_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") num = data_num.values() - mc_num.values() + den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), @@ -538,7 +541,7 @@ def rel_err(x): ff_err = ff_val * ((data_num.variances() + mc_num.variances())**0.5 / np.abs(num) + (data_den.variances() + mc_den.variances())**0.5 / np.abs(den)) - + ff_err[ff_val < 0] = 1 h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -550,45 +553,54 @@ def rel_err(x): ff_raw.name = name + '_raw' ff_raw.label = label + '_raw' - #Make an approximation of tau pt dependance - formula_str = 'p0 + p1*x+p2*x*x' - #formula_str = 'p0 + p1*x' - def fitf(x, p0, p1, p2): - return eval(formula_str) - - def jac(x): - from numpy import array - out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) - #out = array([[ 1., x],[x, x**2]]) - return out - - def eval_formula(formula_str, popt): - for i,p in enumerate(popt): - par = round(popt[i],6) - formula_str = formula_str.replace(f'p{i}',str(par)) - return formula_str + def get_fitf(dm): + if dm==0: + formula_str = 'p0+p1*x+p2*x*x' + def fitf(x,p0,p1,p2): + return eval(formula_str) + else: + formula_str = 'p0+p1*exp(-p2*x)' + def fitf(x,p0,p1,p2): + from numpy import exp + return eval(formula_str) + return fitf, formula_str + + def get_jac(dm): + if dm==0: + def jac(x,p): + from numpy import array + return array([ 1., x, x**2]) + else: + def jac(x,p): + from numpy import array,exp,outer + ders=array([ 1., + exp(-p[2]*x), + -1*p[1]*x*exp(-p[2]*x)]) + return ders + return jac ff_fitted = ff_raw.copy().reset() ff_fitted.name = name ff_fitted.label = label - fitres = {} - dc = dict_creator() - for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str','x_max']: - fitres[the_field]= 
dc.init_dict(axes) - dm_axis = ff_raw.axes['tau_dm_pnet'] n_jets_axis = ff_raw.axes['n_jets'] + for nj in n_jets_axis: + if nj not in fitres.keys(): fitres[nj] = {} for dm in dm_axis: + if dm not in fitres[nj].keys(): fitres[nj][dm] = {} + + + + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj), 'syst': hist.loc('nominal')}] mask = h1d.values() > 0 x = h1d.axes[0].centers - if np.sum(mask) < 3: - #if np.sum(mask) < 2: + if np.sum(mask) < 2: y = np.zeros_like(x) y_err = np.ones_like(x) x_masked = x @@ -596,27 +608,37 @@ def eval_formula(formula_str, popt): y = h1d.values()[mask] y_err = (h1d.variances()[mask])**0.5 x_masked = x[mask] - popt, pcov = curve_fit(fitf, + + fitf, formula_str = get_fitf(dm) + if dm==0: + the_bounds = ([-10,-5,-1],[10,5,1]) + else: + the_bounds = ([-0.5, -1, 0],[0.5,1,0.1]) + popt, pcov, infodict, mesg, ier = curve_fit(fitf, x_masked, y, sigma=y_err, + bounds=the_bounds, absolute_sigma=True, + full_output=True ) - fitres['chi2'][dm][nj] = sum(((y - fitf(x_masked, *popt))/y_err)**2) - fitres['ndf'][dm][nj] = len(y) - len(popt) - fitres['popt'][dm][nj] = popt - fitres['pcov'][dm][nj] = pcov - fitres['x_max'][dm][nj] = np.max(x_masked) - - fitres['fitf_str'][dm][nj] = eval_formula(formula_str,popt) + fitres[nj][dm]['chi2'] = sum((infodict['fvec'])**2) + fitres[nj][dm]['ndf'] = len(y) - len(popt) + fitres[nj][dm]['popt'] = popt + fitres[nj][dm]['pcov'] = pcov + fitres[nj][dm]['x_max'] = np.max(x_masked) + + fitres[nj][dm]['jac'] = get_jac(dm) + fitres[nj][dm]['name'] = name + fitres[nj][dm]['fitf'] = fitf + fitres[nj][dm]['fitf_str'] = formula_str + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 ff_fitted.view().value[:, ff_fitted.axes[1].index(dm), ff_fitted.axes[2].index(nj), ff_fitted.axes[3].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) - fitres['name'] = name - fitres['jac'] = jac - fitres['fitf'] = fitf + return ff_raw, ff_fitted, fitres wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, @@ -635,36 +657,36 @@ def eval_formula(formula_str, popt): name='ff_qcd', label='Fake factor QCD') - corr_list = [] - + corr_list = [] - for fitres in [wj_fitres, qcd_fitres]: - formula_str = fitres['fitf_str'] - dm_cats = [] - for dm in formula_str.keys(): - formula_str_njet_binned = formula_str[dm] - single_dm = [] - for nj, the_formula in formula_str_njet_binned.items(): - x_max = fitres['x_max'][dm][nj] - fx_max = np.maximum(fitres['fitf'](x_max,* fitres['popt'][dm][nj]),0) - single_dm.append(cs.CategoryItem( - key=nj, + for fitres_per_proc in [wj_fitres, qcd_fitres]: + nj_categories = [] + for nj, fitres_per_nj in fitres_per_proc.items(): + single_nj = [] + for dm, fitres in fitres_per_nj.items(): + x_max = fitres['x_max'] + fitf = fitres['fitf'] + popt = fitres['popt'] + fitf_str = eval_formula(fitres['fitf_str'], popt) + fx_max = np.maximum(fitf(x_max,*popt),0) + single_nj.append(cs.CategoryItem( + key=dm, value=cs.Formula( nodetype="formula", variables=["tau_pt"], parser="TFormula", - expression=f'({the_formula})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', + expression=f'({fitf_str})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', ))) - dm_cats.append(cs.CategoryItem( - key=dm, + nj_categories.append(cs.CategoryItem( + key=nj, value=cs.Category( nodetype="category", - input="n_jets", - content=single_dm, + input="tau_dm_pnet", + content=single_nj, ))) corr_list.append(cs.Correction( - name=fitres['name'], - description=f"fake factor correcton for 
{fitres['name'].split('_')[1]}", + name=fitres_per_proc[0][0]['name'], + description=f"fake factor correcton for {fitres_per_proc[0][0]['name'].split('_')[1]}", version=2, inputs=[ cs.Variable(name="tau_pt", type="real",description="pt of tau"), @@ -674,11 +696,10 @@ def eval_formula(formula_str, popt): output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), data=cs.Category( nodetype="category", - input="tau_dm_pnet", - content=dm_cats, + input="n_jets", + content=nj_categories, ) )) - cset = cs.CorrectionSet( schema_version=2, description="Fake factors", @@ -686,17 +707,25 @@ def eval_formula(formula_str, popt): ) self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") - + chi2_string = 'type nj dm chi2 ndf,' + for fitres_per_proc in [wj_fitres, qcd_fitres]: + for dm, fitres_per_dm in fitres_per_proc.items(): + for nj, fitres in fitres_per_dm.items(): + chi2_string += ' '.join((fitres['name'], + str(nj), + str(dm), + str(fitres['chi2']), + str(fitres['ndf']))) + chi2_string += ',' + self.output()['fitres'].dump(chi2_string, formatter="json") #Plot fake factors: for h_name in ['wj', 'qcd']: - h_raw = eval(f'{h_name}_raw') - h_fitted = eval(f'{h_name}_fitted') - - - fitres = wj_fitres if h_name == 'wj' else qcd_fitres - dm_axis = h_raw.axes['tau_dm_pnet'] - nj_axis = h_raw.axes['n_jets'] + h_raw = eval(f'{h_name}_raw') + h_fitted = eval(f'{h_name}_fitted') + fitres_dict = eval(f'{h_name}_fitres') + dm_axis = h_raw.axes['tau_dm_pnet'] + nj_axis = h_raw.axes['n_jets'] for nj in nj_axis: print(f"Plotting 2d map for n jets = {nj}") fig, ax = plt.subplots(figsize=(12, 8)) @@ -736,16 +765,17 @@ def eval_formula(formula_str, popt): marker='o', fmt='o', line=None, color='#2478B7', capsize=4) - x_fine = np.linspace(x[0],x[-1],num=100) - popt = fitres['popt'][dm][nj] - pcov = fitres['pcov'][dm][nj] + x_fine = np.linspace(x[0],x[-1],num=30) + fitres = fitres_dict[nj][dm] + popt = fitres['popt'] + pcov = fitres['pcov'] jac = fitres['jac'] - def err(x,jac,pcov): - from numpy import sqrt,einsum - return sqrt(einsum('ij,ij',jac(x),pcov)) + def err(x,jac,pcov,popt): + from numpy import sqrt,einsum,abs + return sqrt(abs(einsum('i,ij,j',jac(x,popt).T,pcov,jac(x,popt)))) import functools - err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov,popt=popt), x_fine)) y_fitf = fitres['fitf'](x_fine,*popt) y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y @@ -757,14 +787,16 @@ def err(x,jac,pcov): ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) ax.set_ylabel('Fake Factor') ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') - ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm][nj],2)}}}{{{fitres['ndf'][dm][nj]}}}$", - (0.8, 0.9), + ax.set_title(f'Jet Fake Factors : Tau PNet Decay Mode {dm}, Njets {nj}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'],2)}}}{{{fitres['ndf']}}}$", + (0.8, 0.75), xycoords='axes fraction', fontsize=20) - print(str(fitres['fitf_str'][dm][nj])) - ax.annotate('y=' + str(fitres['fitf_str'][dm][nj]), - (0.1, 0.9), + + formula_str = eval_formula(fitres['fitf_str'],popt, make_rounding=True) + + ax.annotate('y=' + formula_str, + (0.01, 0.95), xycoords='axes fraction', fontsize=12) From 13d104d714ab704a70433f90811955d0aa740dbe Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Fri, 25 Apr 2025 13:30:50 +0200 Subject: [PATCH 19/26] Small update 
in the histogram filling process and boundaries of the fake factor fit --- columnflow/hist_util.py | 17 ++++++++++++++++- columnflow/tasks/data_driven_methods.py | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py index 7929bbbd7..ff44709d8 100644 --- a/columnflow/hist_util.py +++ b/columnflow/hist_util.py @@ -72,9 +72,24 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill + flat_data = {} + arr_shape = None for key, arr in data.items(): - if arr.ndim != 1: flat_data[key] = ak.flatten(arr) + if arr.ndim > 1: + logger.warning( + f"Found axis {key} that is not 1-dimensional: trying to broadcast all other axes:" + ) + arr_shape = ak.local_index(arr) + + for key, arr in data.items(): + if arr_shape is not None: + if arr.ndim == 1: + _, br_arr = ak.broadcast_arrays(arr_shape, arr) + flat_data[key] = ak.flatten(br_arr) + else: + flat_data[key] = ak.flatten(arr) + else: flat_data[key] = arr h.fill(**fill_kwargs, **flat_data) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 7c4b3f375..80162dacb 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -613,7 +613,7 @@ def jac(x,p): if dm==0: the_bounds = ([-10,-5,-1],[10,5,1]) else: - the_bounds = ([-0.5, -1, 0],[0.5,1,0.1]) + the_bounds = ([-0.5, -3, 0],[0.5,3,0.1]) popt, pcov, infodict, mesg, ier = curve_fit(fitf, x_masked, y, From 44033d33a106b33639533613faed29d44d34b100 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 28 Apr 2025 14:45:59 +0200 Subject: [PATCH 20/26] Bug fix: while creating cutflow histogram from a set of files, histograms from the first file were missing --- columnflow/tasks/yields.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index 3abab6e15..ecf73c85f 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -383,14 +383,20 @@ def run(self): # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - for the_cat, the_hist in input_hists.items(): if the_cat not in merged_hists.keys(): merged_hists[the_cat] = [] + merged_hists[the_cat].append(the_hist) else: merged_hists[the_cat].append(the_hist) #merge histograms - merged_hists_ = {the_cat: sum(h[1:],h[0].copy()) for the_cat, h in merged_hists.items()} + + merged_hists_ = {} + for the_cat, h in merged_hists.items(): + if len(h) > 1: merged_hists_[the_cat] = sum(h[1:],h[0].copy()) + else: + merged_hists_[the_cat] = h[0].copy() + hists_per_proc = {} for the_cat, the_hist in merged_hists_.items(): hists_per_proc[the_cat] = {} @@ -403,7 +409,7 @@ def run(self): h = h[{"process": hist.loc(leaf_proc.id)}] if proc in hists_per_proc[the_cat]: - hists_per_proc[the_cat][proc] +=h + hists_per_proc[the_cat][proc] += h else: hists_per_proc[the_cat][proc] = h From 2a7e90403777995d7ed123b510974b0c07a91535 Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Mon, 19 May 2025 17:19:05 +0200 Subject: [PATCH 21/26] first version of datacards --- bin/cf_sandbox_file_hash | 2 +- columnflow/tasks/cms/inference.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bin/cf_sandbox_file_hash b/bin/cf_sandbox_file_hash index 18f846c35..bf3ae5387 100755 --- a/bin/cf_sandbox_file_hash +++ b/bin/cf_sandbox_file_hash @@ -11,6 +11,6 @@ action() { setopt globdots fi - python "${this_dir}/$( 
basename "${this_file}" ).py" "$@" + python3 "${this_dir}/$( basename "${this_file}" ).py" "$@" } action "$@" diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 9386a47f6..06d7a051d 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -211,24 +211,19 @@ def run(self): continue # open the histogram and work on a copy - h = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - + h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() + h = h_dict[cat_obj.name].copy() # axis selections h = h[{ "process": [ hist.loc(p.id) for p in sub_process_insts if p.id in h.axes["process"] - ], - "category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ], + ] }] # axis reductions - h = h[{"process": sum, "category": sum}] + h = h[{"process": sum}] # add the histogram for this dataset if h_proc is None: From 3eed85b9a8e82a149f23bf474ab56a29095fd15b Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Tue, 20 May 2025 15:37:38 +0200 Subject: [PATCH 22/26] working version with abcd method --- columnflow/tasks/cms/inference.py | 118 +++++++++++++++--------------- 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 06d7a051d..b5a391fbb 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -10,7 +10,7 @@ from columnflow.tasks.framework.base import Requirements, AnalysisTask, wrapper_factory from columnflow.tasks.framework.mixins import ( - CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, + CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, HistHookMixin ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.histograms import MergeHistograms, MergeShiftedHistograms @@ -19,6 +19,7 @@ class CreateDatacards( + HistHookMixin, InferenceModelMixin, MLModelsMixin, ProducersMixin, @@ -183,82 +184,83 @@ def run(self): category_inst = self.config_inst.get_category(cat_obj.config_category) variable_inst = self.config_inst.get_variable(cat_obj.config_variable) leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] - + # histogram data per process hists = OrderedDict() - + process_insts = [] + #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - else: + elif proc_obj_name != "qcd" and proc_obj_name != "wj": proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) process_inst = self.config_inst.get_process(proc_obj.config_process) + else: + continue sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] - + process_insts.append(process_inst) h_proc = None for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) - - # skip when the dataset is already known to not contain any sub process - if not any(map(dataset_inst.has_process, sub_process_insts)): - self.logger.warning( - f"dataset '{dataset}' does not contain process '{process_inst.name}' " - "or any of its subprocesses which indicates a misconfiguration in the " - f"inference model '{self.inference_model}'", - ) - continue - - # open the histogram and 
work on a copy h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - h = h_dict[cat_obj.name].copy() - # axis selections - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts - if p.id in h.axes["process"] - ] - }] - - # axis reductions - h = h[{"process": sum}] - - # add the histogram for this dataset - if h_proc is None: - h_proc = h - else: - h_proc += h - - # there must be a histogram - if h_proc is None: - raise Exception(f"no histograms found for process '{process_inst.name}'") - - # create the nominal hist - hists[proc_obj_name] = OrderedDict() - nominal_shift_inst = self.config_inst.get_shift("nominal") - hists[proc_obj_name]["nominal"] = h_proc[ - {"shift": hist.loc(nominal_shift_inst.id)} - ] - - # per shift - if proc_obj: - for param_obj in proc_obj.parameters: - # skip the parameter when varied hists are not needed - if not self.inference_model_inst.require_shapes_for_parameter(param_obj): + + for region in h_dict.keys(): + if region not in hists: hists[region] = {} + # skip when the dataset is already known to not contain any sub process + if not any(map(dataset_inst.has_process, sub_process_insts)): + self.logger.warning( + f"dataset '{dataset}' does not contain process '{process_inst.name}' " + "or any of its subprocesses which indicates a misconfiguration in the " + f"inference model '{self.inference_model}'", + ) continue - # store the varied hists - hists[proc_obj_name][param_obj.name] = {} - for d in ["up", "down"]: - shift_inst = self.config_inst.get_shift(f"{param_obj.config_shift_source}_{d}") - hists[proc_obj_name][param_obj.name][d] = h_proc[ - {"shift": hist.loc(shift_inst.id)} + # open the histogram and work on a copy + h = h_dict[region] + # axis selections + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts + if p.id in h.axes["process"] ] + }] + + # axis reductions + h = h[{"process": sum}] + if process_inst in hists[region]: + hists[region][process_inst] += h + else: + hists[region][process_inst] = h + + # there must be a histogra + if hists[region][process_inst] is None: + raise Exception(f"no histograms found for process '{process_inst.name}'") + + + + if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + if self.hist_hooks: + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] + for process_inst in hists: + if process_inst not in process_insts: + process_insts.append(process_inst) + else: # get the histogram for the pro + hists = hists[category_inst.name] + datacard_hists = OrderedDict() + for process_inst in process_insts: + # get the histogram for the process + datacard_hists[process_inst.name] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + # add the histogram to the datacard + datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] # forward objects to the datacard writer outputs = self.output() - writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: hists}) + writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: datacard_hists}) with outputs["card"].localize("w") as tmp_card, outputs["shapes"].localize("w") as tmp_shapes: writer.write(tmp_card.abspath, tmp_shapes.abspath, shapes_path_ref=outputs["shapes"].basename) From fb06da518e874d5b706e1ff8d790f04437c6bcf9 Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Wed, 21 May 2025 
10:16:30 +0200 Subject: [PATCH 23/26] working version for the ff method --- columnflow/tasks/cms/inference.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index b5a391fbb..d091f11bb 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -194,14 +194,12 @@ def run(self): if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif proc_obj_name != "qcd" and proc_obj_name != "wj": + elif proc_obj_name != "qcd" and proc_obj_name != "jet_fakes": proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) process_inst = self.config_inst.get_process(proc_obj.config_process) else: continue sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] - process_insts.append(process_inst) - h_proc = None for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() @@ -238,24 +236,16 @@ def run(self): if hists[region][process_inst] is None: raise Exception(f"no histograms found for process '{process_inst.name}'") - - - if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions - if self.hist_hooks: - hists = self.invoke_hist_hooks(hists,category_inst) - else: - hists = hists[category_inst.name] - for process_inst in hists: - if process_inst not in process_insts: - process_insts.append(process_inst) - else: # get the histogram for the pro - hists = hists[category_inst.name] + if self.hist_hooks and category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] + # prepare the hists to be used in the datacard writer datacard_hists = OrderedDict() - for process_inst in process_insts: + for process_inst in hists.keys(): # get the histogram for the process datacard_hists[process_inst.name] = OrderedDict() nominal_shift_inst = self.config_inst.get_shift("nominal") - # add the histogram to the datacard datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] # forward objects to the datacard writer From 4e94516c0990e34ae65ab4dc8f8a80c544a3389c Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Wed, 21 May 2025 11:17:19 +0200 Subject: [PATCH 24/26] adding data_driven flag --- columnflow/inference/__init__.py | 3 +++ columnflow/tasks/cms/inference.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/columnflow/inference/__init__.py b/columnflow/inference/__init__.py index 7926a9f78..d60fd87c4 100644 --- a/columnflow/inference/__init__.py +++ b/columnflow/inference/__init__.py @@ -325,6 +325,7 @@ def process_spec( name: str, config_process: str | None = None, is_signal: bool = False, + data_driven: bool = False, config_mc_datasets: Sequence[str] | None = None, scale: float | int = 1.0, ) -> DotDict: @@ -333,6 +334,7 @@ def process_spec( - *name*: The name of the process in the model. - *is_signal*: A boolean flag deciding whether this process describes signal. + - *data_driven*: A boolean flag deciding whether this process is data driven. - *config_process*: The name of the source process in the config to use. 
- *config_mc_datasets*: List of names or patterns of MC datasets in the config to use. - *scale*: A float value to scale the process, defaulting to 1.0. @@ -340,6 +342,7 @@ def process_spec( return DotDict([ ("name", str(name)), ("is_signal", bool(is_signal)), + ("data_driven", bool(data_driven)), ("config_process", str(config_process) if config_process else None), ("config_mc_datasets", list(map(str, config_mc_datasets or []))), ("scale", float(scale)), diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index d091f11bb..949b88cff 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -191,11 +191,11 @@ def run(self): #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): + proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif proc_obj_name != "qcd" and proc_obj_name != "jet_fakes": - proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) + elif not proc_obj.data_driven : # data driven processes will be added later with invoke_hist_hooks process_inst = self.config_inst.get_process(proc_obj.config_process) else: continue From 0099c46a810a069dc58405c6a70ffc50e48492d5 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 16 Jul 2025 16:12:06 +0200 Subject: [PATCH 25/26] Changes to make datacard production working and some cosmetics --- columnflow/columnar_util.py | 13 +++++++ columnflow/plotting/plot_functions_1d.py | 43 ++++++++++++++++++++---- columnflow/tasks/cms/inference.py | 41 ++++++++++++---------- columnflow/tasks/yields.py | 5 +-- sandboxes/dev.txt | 4 ++- 5 files changed, 79 insertions(+), 27 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 171ab3661..24be344f2 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -2465,6 +2465,19 @@ def setup(cls, func: Callable[[dict], None]) -> None: """ cls.setup_func = func + @classmethod + def teardown(cls, func: Callable[[dict], None]) -> None: + """ + Decorator to wrap a function *func* that should be registered as :py:meth:`teardown_func` + which is used to perform a custom teardown of objects at the end of processing. The function + should accept one argument: + + - *task*, the invoking task instance. + + The decorator does not return the wrapped function. 
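A hypothetical usage sketch, mirroring how the existing setup decorator is typically attached (the decorated producer name and the bound first argument are assumptions, not taken from this patch):

    @my_producer.teardown
    def my_producer_teardown(self, task) -> None:
        # release resources acquired during setup, e.g. close files or drop caches
        self.cached_corrections = None
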
+ """ + cls.teardown_func = func + def __init__( self, *args, diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index ceeccb986..839418c67 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -271,20 +271,36 @@ def plot_shifted_variable( plot_config = {} colors = { "nominal": "black", - "up": "red", - "down": "blue", + "up": "blue", + "down": "red", } + shift_names = { + "nominal": "max mixing", + "ts_up": "CP-odd", + "ts_down": "CP-even", + } + + hist_up = None + hist_down = None + hist_up_err = None + hist_down_err = None for i, shift_id in enumerate(h_sum.axes["shift"]): shift_inst = config_inst.get_shift(shift_id) - + h = h_sum[{"shift": hist.loc(shift_id)}] + if "up" in shift_inst.label: + hist_up = h.values() + hist_up_err = h.variances() + elif "down" in shift_inst.label: + hist_down = h.values() + hist_down_err = h.variances() # assuming `nominal` always has shift id 0 ratio_norm = h_sum[{"shift": hist.loc(0)}].values() diff = sum(h.values()) / sum(ratio_norm) - 1 - label = shift_inst.label + label = shift_names[shift_inst.label] if not shift_inst.is_nominal: - label += " ({0:+.2f}%)".format(diff * 100) + pass #label += " ({0:+.2f}%)".format(diff * 100) plot_config[shift_inst.name] = plot_cfg = { "method": "draw_hist", @@ -302,8 +318,18 @@ def plot_shifted_variable( if hide_errors: for key in ("kwargs", "ratio_kwargs"): if key in plot_cfg: - plot_cfg[key]["yerr"] = None - + plot_cfg[key]["yerr"] = False + h_sum = (hist_up + hist_down) + mask = (h_sum > 0) + asym_hist = np.where(mask, + np.abs(hist_up - hist_down)/h_sum, + 0) + herr_num = np.sqrt(hist_up_err + hist_down_err) + herr_den = np.sqrt(hist_up_err + hist_down_err) + dA = np.average(np.sqrt( (herr_num/h_sum)**2 + (herr_den*np.abs(hist_up - hist_down)/h_sum/h_sum)**2)) + + A = np.average(asym_hist) + # legend title setting if not legend_title and len(hists) == 1: # use process label as default if 1 process @@ -318,6 +344,9 @@ def plot_shifted_variable( ) default_style_config["rax_cfg"]["ylim"] = (0.75, 1.25) default_style_config["rax_cfg"]["ylabel"] = "Ratio" + + default_style_config["annotate_cfg"]["text"] = f'A={A:1.3f}$\pm${dA:1.3f}' + default_style_config["annotate_cfg"]["fontsize"] = 22 if legend_title: default_style_config["legend_cfg"]["title"] = legend_title diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 949b88cff..24abc829d 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -92,6 +92,7 @@ def workflow_requires(self): for cat_obj in self.branch_map.values(): for proc_obj in cat_obj.processes: + if proc_obj.data_driven: continue for dataset in self.get_mc_datasets(proc_obj): # add all required variables and shifts per dataset mc_dataset_params[dataset]["variables"].add(cat_obj.config_variable) @@ -100,10 +101,8 @@ def workflow_requires(self): for param_obj in proc_obj.parameters if self.inference_model_inst.require_shapes_for_parameter(param_obj) ) - for dataset in self.get_data_datasets(cat_obj): data_dataset_params[dataset]["variables"].add(cat_obj.config_variable) - # set workflow requirements per mc dataset reqs["merged_hists"] = set( self.reqs.MergeShiftedHistograms.req_different_branching( @@ -129,6 +128,7 @@ def workflow_requires(self): def requires(self): cat_obj = self.branch_data + processes = [proc_obj for proc_obj in cat_obj.processes if not proc_obj.data_driven] reqs = { proc_obj.name: { dataset: 
self.reqs.MergeShiftedHistograms.req_different_branching( @@ -143,9 +143,9 @@ def requires(self): branch=-1, workflow="local", ) - for dataset in self.get_mc_datasets(proc_obj) + for dataset in self.get_mc_datasets(proc_obj) } - for proc_obj in cat_obj.processes + for proc_obj in processes } if cat_obj.config_data_datasets: reqs["data"] = { @@ -191,19 +191,20 @@ def run(self): #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): - proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif not proc_obj.data_driven : # data driven processes will be added later with invoke_hist_hooks - process_inst = self.config_inst.get_process(proc_obj.config_process) - else: - continue + else: + proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) + if not proc_obj.data_driven: # data driven processes will be added later with invoke_hist_hooks + process_inst = self.config_inst.get_process(proc_obj.config_process) + else: + pass + sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - for region in h_dict.keys(): if region not in hists: hists[region] = {} # skip when the dataset is already known to not contain any sub process @@ -235,19 +236,25 @@ def run(self): # there must be a histogra if hists[region][process_inst] is None: raise Exception(f"no histograms found for process '{process_inst.name}'") - if self.hist_hooks and category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions hists = self.invoke_hist_hooks(hists,category_inst) else: hists = hists[category_inst.name] # prepare the hists to be used in the datacard writer datacard_hists = OrderedDict() - for process_inst in hists.keys(): - # get the histogram for the process - datacard_hists[process_inst.name] = OrderedDict() - nominal_shift_inst = self.config_inst.get_shift("nominal") - datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] - + for combine_proc, proc_name in self.inference_model_inst.proc_map.items(): + process_inst = [the_proc for the_proc in hists.keys() if the_proc.name == proc_name] + if len(process_inst) and not (hists[process_inst[0]].empty()): + # get the histogram for the process + datacard_hists[combine_proc] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + datacard_hists[combine_proc]["nominal"] = hists[process_inst[0]][{"shift": hist.loc(nominal_shift_inst.id)}] + # add data: + data_proc = [the_proc for the_proc in hists.keys() if the_proc.name == 'data'] + datacard_hists['data'] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + datacard_hists['data']["nominal"] = hists[data_proc[0]][{"shift": hist.loc(nominal_shift_inst.id)}] + # forward objects to the datacard writer outputs = self.output() writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: datacard_hists}) diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index ecf73c85f..01ba92079 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -367,7 
+367,7 @@ def run(self): inputs = self.input() outputs = self.output() - category_insts = list(map(self.config_inst.get_category, self.categories)) + category_insts = list(self.categories) process_insts = list(map(self.config_inst.get_process, self.processes)) sub_process_insts = { proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] @@ -383,7 +383,8 @@ def run(self): # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - for the_cat, the_hist in input_hists.items(): + for the_cat in category_insts: + the_hist = input_hists[the_cat] if the_cat not in merged_hists.keys(): merged_hists[the_cat] = [] merged_hists[the_cat].append(the_hist) diff --git a/sandboxes/dev.txt b/sandboxes/dev.txt index cc5455448..3be3f1914 100644 --- a/sandboxes/dev.txt +++ b/sandboxes/dev.txt @@ -1,4 +1,4 @@ -# version 10 +# version 11 # last version to support python 3.9 ipython~=8.18.1 @@ -10,3 +10,5 @@ flake8-quotes~=3.4.0 pipdeptree~=2.23.4 pymarkdownlnt~=0.9.25 uniplot~=0.15.1 +xgboost~=2.1.4 +scikit-learn From 518a10262e2031004394e2ead1048b2df200e7f1 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Thu, 31 Jul 2025 13:39:36 +0200 Subject: [PATCH 26/26] returned dev.txt to the original state --- sandboxes/dev.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sandboxes/dev.txt b/sandboxes/dev.txt index 3be3f1914..cc5455448 100644 --- a/sandboxes/dev.txt +++ b/sandboxes/dev.txt @@ -1,4 +1,4 @@ -# version 11 +# version 10 # last version to support python 3.9 ipython~=8.18.1 @@ -10,5 +10,3 @@ flake8-quotes~=3.4.0 pipdeptree~=2.23.4 pymarkdownlnt~=0.9.25 uniplot~=0.15.1 -xgboost~=2.1.4 -scikit-learn
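
As a closing illustration, the fake-factor payload written by ComputeFakeFactors into the 'ff_json' target could be consumed downstream with correctionlib roughly as sketched below; the file name, the correction key and the argument order are assumptions (the order must follow the inputs declared in the correction, which are only partially visible above):

    import correctionlib

    # load the correction set dumped by the task (path is a placeholder)
    cset = correctionlib.CorrectionSet.from_file("fake_factors.json")

    # evaluate one fake-factor weight; arguments are assumed to follow the
    # declared inputs, e.g. tau_pt, tau_dm_pnet, n_jets
    ff_weight = cset["ff_qcd"].evaluate(45.0, 1, 0)
    print(ff_weight)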