From 5044e7436a2f1a54de5bc41be0bc0a5736a9d7b9 Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:24:58 +0100
Subject: [PATCH 01/23] Add config parameter to set external path to the sample
 database

---
 helper/functions.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/helper/functions.py b/helper/functions.py
index 3709073..c09fed7 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -22,6 +22,10 @@
 from XRootD import client
 
 
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+TAU_FAKE_FACTORS_DIR = os.path.dirname(THIS_DIR)
+
+
 class CachingKeyHelper:
     @staticmethod
     def make_hashable(obj: Union[Dict, List, Tuple, Any]) -> Union[Dict, Tuple, bytes, Any]:
@@ -411,7 +415,21 @@ def load_config(config_file: str) -> Dict:
     else:
         print("No common config file found!")
 
-    config = {}
+    # Container of the loaded configuration
+    # 
+    # Some default values are pre-defined in the config dict that is going to contain the loaded
+    # configuration. These values are overwritten if they are explicitly set in the common config file.
+    #
+    # The variables, for which defaults are set, are:
+    #
+    # - 'sample_database`: Path to the sample database directory. Usuallly, this path is set to the
+    #   `datasets` submodule of the `TauFakeFactors` module. Users can set a custom path, e.g.,
+    #   to an external path to a working version of their sample database.
+    config = {
+        "sample_database": os.path.join(TAU_FAKE_FACTORS_DIR, "datasets"),
+    }
+
+    # Update the config with common settings, applying to all steps
     with open(common_config_file, "r") as file:
         config.update(configured_yaml.load(file))
 

From 4853d30590c2c4e80638eed8f80b769e738649da Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:26:22 +0100
Subject: [PATCH 02/23] Add function to define new columns from expressions,
 provided in the preselection configuration

---
 helper/functions.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/helper/functions.py b/helper/functions.py
index c09fed7..4330f16 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -712,6 +712,49 @@ def rename_boosted_variables(rdf: Any, channel: str) -> Any:
 
     return rdf
 
+def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
+    """
+    Customizer function to define additional columns in the ntuples.
+     
+    The `column_definitions` dictionary is usually provided with the preselection configuration
+    file. The keys of the dictionary correspond to the columns to be created. The values are
+    dictionaries which contain the information for the column information. The keys of these inner
+    dictionaries have the following meaning:
+
+    - `expression`: The expression string which is used to define the new column.
+
+    - `exclude_processes` (_optional_): A list of process names for which the definition should be
+      skipped. If `process` is in this list, the definition is not applied.
+
+    Note that the new column names must not exist in the ntuples, otherwise an error is raised.
+
+    Args:
+        rdf: root DataFrame
+        column_definitions: Dictionary mapping new column names (keys) to definition dicts (values)
+        process: Name of the current process
+
+    Return:
+        root DataFrame with the newly defined columns
+    """
+
+    # Ensure that the new column names are not already present in the ntuple
+    rdf_columns = set(rdf.GetColumnNames())
+    new_columns = set(column_definitions.keys())
+    intersection = rdf_columns.intersection(new_columns)
+    if intersection:
+        raise ValueError(
+            f"The following new column names already exist in the ntuple: {intersection}"
+        )
+
+    # Perform the define declarations on the RDataFrame object
+    for new_column, define_dict in column_definitions.items():
+        expression = define_dict["expression"]
+        exclude_processes = define_dict.get("exclude_processes", [])
+        if process in exclude_processes:
+            continue
+        rdf = rdf.Define(new_column, expression)
+
+    return rdf
 
 def get_samples(config: Dict[str, Union[str, Dict, List]]) -> List[str]:
     """

From 8d62c4db88621e654234c227cd1f80d6653ce66c Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:27:52 +0100
Subject: [PATCH 03/23] Add column definitions to sample preselection

---
 preselection.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/preselection.py b/preselection.py
index ce8be61..f7e1e4f 100644
--- a/preselection.py
+++ b/preselection.py
@@ -51,7 +51,7 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
     Return:
         A tuple with the tau gen. level mode and the name of the output file
     """
-    process, config, output_path, ncores, sample, tau_gen_mode = args
+    process, config, output_path, ncores, sample, tau_gen_mode, column_definitions = args
     log = logging.getLogger(f"preselection.{process}")
     ROOT.EnableImplicitMT(ncores)
 
@@ -80,6 +80,10 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
         log.info(f"WARNING: Sample {sample} is empty. Skipping...")
         return ()
 
+    # Declare column definitions on the RDataFrame
+    if column_definitions:
+        rdf = func.define_columns(rdf, column_definitions, process)
+
     # apply analysis specific event filters
     selection_conf = config["event_selection"]
     for cut in selection_conf:
@@ -209,9 +213,12 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
         f"Considered samples for process {process}: {config['processes'][process]['samples']}"
     )
 
+    # get definitions of additional columns to declare on the RDataFrame
+    column_definitions = config.get("column_definitions", {})
+
     # going through all contributing samples for the process
     args_list = [
-        (process, config, output_path, ncores, sample, tau_gen_mode) for tau_gen_mode in config["processes"][process]["tau_gen_modes"] for sample in config["processes"][process]["samples"]
+        (process, config, output_path, ncores, sample, tau_gen_mode, column_definitions) for tau_gen_mode in config["processes"][process]["tau_gen_modes"] for sample in config["processes"][process]["samples"]
     ]
 
     results = func.optional_process_pool(

From d470ec63dd5c79716e148a5acce35f46c73416d2 Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:28:47 +0100
Subject: [PATCH 04/23] Add 2022 and 2023 luminosity weights

---
 helper/weights.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/helper/weights.py b/helper/weights.py
index 7472cd3..5c61f0a 100644
--- a/helper/weights.py
+++ b/helper/weights.py
@@ -94,6 +94,14 @@ def lumi_weight(rdf: Any, era: str) -> Any:
         rdf = rdf.Redefine("weight", "weight * 41.48 * 1000.")
     elif era == "2018":
         rdf = rdf.Redefine("weight", "weight * 59.83 * 1000.")
+    elif era == "2022preEE":
+        rdf = rdf.Redefine("weight", "weight * 7.9804 * 1000.")
+    elif era == "2022postEE":
+        rdf = rdf.Redefine("weight", "weight * 26.6717 * 1000.")
+    elif era == "2023preBPix":
+        rdf = rdf.Redefine("weight", "weight * 18.063 * 1000.")
+    elif era == "2023postBPix":
+        rdf = rdf.Redefine("weight", "weight * 9.693 * 1000.")
     else:
         raise ValueError(f"Weight calc: lumi: Era is not defined: {era}")
 

From eb8ff407e4cc4e739d1a55a33d1d8e61acfc868d Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:31:15 +0100
Subject: [PATCH 05/23] Add selection for lepton flavor in Run 3 DY samples

---
 preselection.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/preselection.py b/preselection.py
index f7e1e4f..5337464 100644
--- a/preselection.py
+++ b/preselection.py
@@ -89,6 +89,13 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
     for cut in selection_conf:
         rdf = rdf.Filter(f"({selection_conf[cut]})", f"cut on {cut}")
 
+    # Run 3 DY events are collected from two samples, each restricted to its own lepton flavors:
+    # DYto2L samples keep only e/mu decays, DYto2Tau samples keep only tau decays
+    if sample.startswith("DYto2L"):
+        rdf = rdf.Filter("lhe_drell_yan_decay_flavor == 11 || lhe_drell_yan_decay_flavor == 13", "DY e/mu selection")
+    if sample.startswith("DYto2Tau"):
+        rdf = rdf.Filter("lhe_drell_yan_decay_flavor == 15", "DY tau selection")
+
     if process == "embedding":
         rdf = filters.emb_tau_gen_match(rdf=rdf, channel=config["channel"])
 

From 467b0daa7816ffce053f0f017de12d6eb754f379 Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Mon, 22 Dec 2025 10:32:42 +0100
Subject: [PATCH 06/23] Set path to datasets file according to sample database
 path in config.

---
 preselection.py         | 6 +++++-
 preselection_boosted.py | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/preselection.py b/preselection.py
index 5337464..299088e 100644
--- a/preselection.py
+++ b/preselection.py
@@ -275,7 +275,11 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
     config = func.load_config(args.config_file)
 
     # loading general dataset info file for xsec and event number
-    with open(f"datasets/{config['nanoAOD_version']}/datasets.json", "r") as file:
+    datasets_file = os.path.join(
+        config["sample_database"], config["nanoAOD_version"], "datasets.json"
+    )
+    print(f"Loading sample database from {datasets_file}")
+    with open(datasets_file, "r") as file:
         datasets = json.load(file)
 
     # define output path for the preselected samples
diff --git a/preselection_boosted.py b/preselection_boosted.py
index 6b38216..663a046 100644
--- a/preselection_boosted.py
+++ b/preselection_boosted.py
@@ -266,7 +266,11 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
     config = func.load_config(args.config_file)
 
     # loading general dataset info file for xsec and event number
-    with open(f"datasets/{config['nanoAOD_version']}/datasets.json", "r") as file:
+    datasets_file = os.path.join(
+        config["sample_database"], config["nanoAOD_version"], "datasets.json"
+    )
+    print(f"Loading sample database from {datasets_file}")
+    with open(datasets_file, "r") as file:
         datasets = json.load(file)
 
     # define output path for the preselected samples

From 392149ab070fcc65aa0dfd94c155673c69ccad27 Mon Sep 17 00:00:00 2001
From: Moritz Molch <moritz.molch@cern.ch>
Date: Tue, 23 Dec 2025 15:10:49 +0100
Subject: [PATCH 07/23] Add weight to rescale tt contributions estimated from
 simulation

---
 preselection.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/preselection.py b/preselection.py
index 299088e..288daca 100644
--- a/preselection.py
+++ b/preselection.py
@@ -133,6 +133,15 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
                     rdf = rdf.Redefine(
                         "weight", f"weight * ({mc_weight_conf[weight]})"
                     )
+            elif weight == "ttbar_norm_weight":
+                if process == "ttbar" and tau_gen_mode in ["L", "T"]:
+                    # This function applies an additional normalization weight to tt backgrounds
+                    # obtained from simulation. The factor corrects for a mismodelling of the
+                    # normalization of tt compared to data and is extracted in an e mu control
+                    # region.
+                    rdf = rdf.Redefine(
+                        "weight", f"weight * ({mc_weight_conf[weight]})"
+                    )
             else:
                 rdf = rdf.Redefine("weight", f"weight * ({mc_weight_conf[weight]})")
 

From d0ee4dca957c385c3b4d295c50cab6a55c560656 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 10:29:13 +0100
Subject: [PATCH 08/23] Introduce normalization weight for ttbar backgrounds

---
 preselection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preselection.py b/preselection.py
index 288daca..aada3e2 100644
--- a/preselection.py
+++ b/preselection.py
@@ -134,7 +134,7 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
                         "weight", f"weight * ({mc_weight_conf[weight]})"
                     )
             elif weight == "ttbar_norm_weight":
-                if process == "ttbar" and tau_gen_mode in ["L", "T"]:
+                if process == "ttbar" and tau_gen_mode in ["L", "J", "T"]:
                     # This function applies an additional normalization weight to tt backgrounds
                     # obtained from simulation. The factor corrects for a mismodelling of the
                     # normalization of tt compared to data and is extracted in an e mu control

From 81134152f93e93e3ec6979d383c07efe84b22096 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 10:37:16 +0100
Subject: [PATCH 09/23] Use logger instead of print for echoing the sample
 database path

---
 preselection.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/preselection.py b/preselection.py
index aada3e2..5f1463f 100644
--- a/preselection.py
+++ b/preselection.py
@@ -283,26 +283,28 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
     # loading of the chosen config file
     config = func.load_config(args.config_file)
 
-    # loading general dataset info file for xsec and event number
-    datasets_file = os.path.join(
-        config["sample_database"], config["nanoAOD_version"], "datasets.json"
-    )
-    print(f"Loading sample database from {datasets_file}")
-    with open(datasets_file, "r") as file:
-        datasets = json.load(file)
-
     # define output path for the preselected samples
     output_path = os.path.join(
         config["output_path"], "preselection", config["era"], config["channel"]
     )
     func.check_path(path=output_path)
 
+    # Set up logger and retrieve logger instance for main routine
     func.setup_logger(
         log_file=output_path + "/preselection.log",
         log_name="preselection",
         log_level=logging.INFO,
         subcategories=config["processes"],
     )
+    log = logging.getLogger("preselection.main")
+
+    # Load general dataset info file for xsec and event number
+    datasets_file = os.path.join(
+        config["sample_database"], config["nanoAOD_version"], "datasets.json"
+    )
+    with open(datasets_file, "r") as file:
+        datasets = json.load(file)
+    log.log(f"Loading sample database from {datasets_file}")
 
     # get needed features for fake factor calculation
     output_features = config["output_features"]

From 0d3d2a47747163eca5afe59cfe40a7c7fa3abf08 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 10:39:25 +0100
Subject: [PATCH 10/23] Remove obsolete variable

---
 helper/functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/helper/functions.py b/helper/functions.py
index 4330f16..37bb77c 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -22,8 +22,7 @@
 from XRootD import client
 
 
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-TAU_FAKE_FACTORS_DIR = os.path.dirname(THIS_DIR)
+TAU_FAKE_FACTORS_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
 
 class CachingKeyHelper:
@@ -715,7 +714,7 @@ def rename_boosted_variables(rdf: Any, channel: str) -> Any:
 def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
     """
     Customizer function to define additional columns in the ntuples.
-     
+
     The `column_definitions` dictionary is usually provided with the preselection configuration
     file. The keys of the dictionary correspond to the columns to be created. The values are
     dictionaries which contain the information for the column information. The keys of these inner
@@ -756,6 +755,7 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
     return rdf
 
+
 def get_samples(config: Dict[str, Union[str, Dict, List]]) -> List[str]:
     """
     Function to get a list of all sample paths which will be used for the fake factor calculation.

From 459a98edf79932a729af6b433e40344ff031c382 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 11:21:26 +0100
Subject: [PATCH 11/23] Add documentation for column definitions entry in
 preselection step

---
 docs/preselection.md | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/docs/preselection.md b/docs/preselection.md
index bbe1478..0d53a51 100644
--- a/docs/preselection.md
+++ b/docs/preselection.md
@@ -8,6 +8,12 @@ The preselection config has the following parameters:
   ---|---|---
   `channel` | `string` | tau pair decay channels ("et", "mt", "tt")
   `processes` | `dict` | process parameters are explained below
+  `column_definitions` | `dict` | in this section, new columns can be defined
+  based on a given `ROOT` expression. <br>The keys of the dictionary correspond
+  to the name of the defined column. The values are dictionaries itself, with
+  the `expression` key defining the `ROOT` expression for defining the column and
+  the optional entry `exclude_processes` containing a list of processes for
+      which the column should not be added. An example is given below.
   `event_selection` | `dict` | with this parameter all selections that should be applied are defined. <br>This is basically a dictionary of cuts where the key is the name of a cut and the value is the cut itself as a string e.g. `had_tau_pt: "pt_2 > 30"`. The name of a cut is not really important, it is only used as an output information in the terminal. A cut can only use variables which are in the ntuples.
   `mc_weights` | `dict` | weight parameter are defined below
   `emb_weights` | `dict` | all weights that should be applied for embedded samples are defined. <br>Like for `event_selection` a weight can directly be specified and is then applied to all samples the same way e.g. `single_trigger: "trg_wgt_single_mu24ormu27"`
@@ -31,6 +37,25 @@ The `tau_gen_modes` have following modes:
   `L` | `string` | lepton misidentified as a tau
   `all` | `string` | if no split should be performed
 
+In `column_definitions`, new columns are defined from `ROOT` expressions. An example entry could look like this:
+
+```yaml
+column_definitions:
+    nbtag:
+        expression: n_bjets
+    btag_weight:
+        expression: id_wgt_bjet_pnet_shape
+        exclude_processes:
+        - data
+    jj_deltaR:
+        expression: ROOT::VecOps::DeltaR(jeta_1, jeta_2, jphi_1, jphi_2)
+```
+
+The key `expression` is required and can contain any valid `ROOT` expression.
+The entry `exclude_processes` is optional. This list can contain process names
+from the `processes` section of this configuration. By default, the new columns
+are defined for all processes.
+
 In `mc_weights` all weights that should be applied for simulated samples are defined. <br>
 There are two types of weights.
 
@@ -53,4 +78,4 @@ python preselection.py --config-file configs/PATH/CONFIG.yaml
 Further there are additional optional parameters: 
 
 1. `--nthreads=SOME_INTEGER` to define the number of threads for the multiprocessing pool to run the sample processing in parallel. Default value is 8 (this should normally cover running all of the samples in parallel).
-2.  `--ncores=SOME_INTEGER` to define the number of cores that should be used for each pool thread to speed up the ROOT dataframe calculation. Default value is 2.
\ No newline at end of file
+2.  `--ncores=SOME_INTEGER` to define the number of cores that should be used for each pool thread to speed up the ROOT dataframe calculation. Default value is 2.

From 80a40bc2c0fbad5b2acdb39e7d320af389b54d86 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 12:42:36 +0100
Subject: [PATCH 12/23] Clean formatting of column_definitions table entry and
 extend description of example.

---
 docs/preselection.md | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/docs/preselection.md b/docs/preselection.md
index 0d53a51..0cd8c87 100644
--- a/docs/preselection.md
+++ b/docs/preselection.md
@@ -8,12 +8,7 @@ The preselection config has the following parameters:
   ---|---|---
   `channel` | `string` | tau pair decay channels ("et", "mt", "tt")
   `processes` | `dict` | process parameters are explained below
-  `column_definitions` | `dict` | in this section, new columns can be defined
-  based on a given `ROOT` expression. <br>The keys of the dictionary correspond
-  to the name of the defined column. The values are dictionaries itself, with
-  the `expression` key defining the `ROOT` expression for defining the column and
-  the optional entry `exclude_processes` containing a list of processes for
-      which the column should not be added. An example is given below.
+  `column_definitions` | `dict` | in this section, new columns can be defined based on a given `ROOT` expression. <br>The keys of the dictionary correspond to the names of the defined columns. The values are dictionaries themselves, with the `expression` key giving the `ROOT` expression that defines the column and the optional entry `exclude_processes` containing a list of processes for which the column should not be added. An example is given below.
   `event_selection` | `dict` | with this parameter all selections that should be applied are defined. <br>This is basically a dictionary of cuts where the key is the name of a cut and the value is the cut itself as a string e.g. `had_tau_pt: "pt_2 > 30"`. The name of a cut is not really important, it is only used as an output information in the terminal. A cut can only use variables which are in the ntuples.
   `mc_weights` | `dict` | weight parameter are defined below
   `emb_weights` | `dict` | all weights that should be applied for embedded samples are defined. <br>Like for `event_selection` a weight can directly be specified and is then applied to all samples the same way e.g. `single_trigger: "trg_wgt_single_mu24ormu27"`
@@ -54,7 +49,8 @@ column_definitions:
 The key `expression` is required and can contain any valid `ROOT` expression.
 The entry `exclude_processes` is optional. This list can contain process names
 from the `processes` section of this configuration. By default, the new columns
-are defined for all processes.
+are defined for all processes. To write the new columns to the output file, you
+have to explicitly add the columns to the `output_features` list.
 
 In `mc_weights` all weights that should be applied for simulated samples are defined. <br>
 There are two types of weights.

From 10b2192be4120beae61dcd98cabf0e2270079989 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 13:22:14 +0100
Subject: [PATCH 13/23] Fix typo

---
 helper/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helper/functions.py b/helper/functions.py
index 37bb77c..9680e16 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -421,7 +421,7 @@ def load_config(config_file: str) -> Dict:
     #
     # The variables, for which defaults are set, are:
     #
-    # - 'sample_database`: Path to the sample database directory. Usuallly, this path is set to the
+    # - `sample_database`: Path to the sample database directory. Usually, this path is set to the
     #   `datasets` submodule of the `TauFakeFactors` module. Users can set a custom path, e.g.,
     #   to an external path to a working version of their sample database.
     config = {

From 898c3a438a259261b2605f963f3873f7a334fdf7 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 14:02:22 +0100
Subject: [PATCH 14/23] Use column_definitions to redefine columns in boosted
 NMSSM analysis

---
 .../nmssm_boosted/2018/preselection_et.yaml   | 64 ++++++++++++++++++-
 .../nmssm_boosted/2018/preselection_mt.yaml   | 60 ++++++++++++++++-
 .../nmssm_boosted/2018/preselection_tt.yaml   | 64 ++++++++++++++++++-
 3 files changed, 185 insertions(+), 3 deletions(-)

diff --git a/configs/nmssm_boosted/2018/preselection_et.yaml b/configs/nmssm_boosted/2018/preselection_et.yaml
index 32140ab..9ede5a6 100644
--- a/configs/nmssm_boosted/2018/preselection_et.yaml
+++ b/configs/nmssm_boosted/2018/preselection_et.yaml
@@ -69,6 +69,68 @@ processes:
             - "EGamma_Run2018C-UL2018"
             - "EGamma_Run2018D-UL2018"
 
+column_definitions:
+    njets:
+        expression: njets_boosted
+    nbtag:
+        expression: nbtag_boosted
+    metphi:
+        expression: metphi_boosted
+    met:
+        expression: met_boosted
+    pt_1:
+        expression: boosted_pt_1
+    q_1:
+        expression: boosted_q_1
+    pt_2:
+        expression: boosted_pt_2
+    q_2:
+        expression: boosted_q_2
+    mt_1:
+        expression: boosted_mt_1
+    iso_1:
+        expression: boosted_iso_1
+    mass_2:
+        expression: boosted_mass_2
+    tau_decaymode_2:
+        expression: boosted_tau_decaymode_2
+    deltaR_ditaupair:
+        expression: boosted_deltaR_ditaupair
+    m_vis:
+        expression: boosted_m_vis
+    fj_Xbb_pt:
+        expression: fj_Xbb_pt_boosted
+    fj_Xbb_eta:
+        expression: fj_Xbb_eta_boosted
+    fj_Xbb_particleNet_XbbvsQCD:
+        expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+    bpair_pt_1:
+        expression: bpair_pt_1_boosted
+    bpair_pt_2:
+        expression: bpair_pt_2_boosted
+    bpair_btag_value_2:
+        expression: bpair_btag_value_2_boosted
+    bpair_eta_2:
+        expression: bpair_eta_2_boosted
+    extraelec_veto:
+        expression: extraelec_veto_boosted
+    gen_match_1:
+        expression: boosted_gen_match_1
+        exclude_processes:
+        - data
+    gen_match_2:
+        expression: boosted_gen_match_2
+        exclude_processes:
+        - data
+    btag_weight:
+        expression: btag_weight_boosted
+        exclude_processes:
+        - data
+    pNet_Xbb_weight:
+        expression: pNet_Xbb_weight_boosted
+        exclude_processes:
+        - data
+
 event_selection:
     # lep_pt: "boosted_pt_1 > 120"
     had_tau_pt: "boosted_pt_2 > 40"
@@ -130,4 +192,4 @@ output_features:
     - "bpair_eta_2"
     - "met"
     - "mass_2"
-    - "tau_decaymode_2"
\ No newline at end of file
+    - "tau_decaymode_2"
diff --git a/configs/nmssm_boosted/2018/preselection_mt.yaml b/configs/nmssm_boosted/2018/preselection_mt.yaml
index 23163f9..9573c10 100644
--- a/configs/nmssm_boosted/2018/preselection_mt.yaml
+++ b/configs/nmssm_boosted/2018/preselection_mt.yaml
@@ -69,6 +69,64 @@ processes:
             - "SingleMuon_Run2018C-UL2018_GT36"
             - "SingleMuon_Run2018D-UL2018_GT36"
 
+column_definitions:
+    njets:
+        expression: njets_boosted
+    nbtag:
+        expression: nbtag_boosted
+    metphi:
+        expression: metphi_boosted
+    met:
+        expression: met_boosted
+    pt_1:
+        expression: boosted_pt_1
+    q_1:
+        expression: boosted_q_1
+    pt_2:
+        expression: boosted_pt_2
+    q_2:
+        expression: boosted_q_2
+    mt_1:
+        expression: boosted_mt_1
+    iso_1:
+        expression: boosted_iso_1
+    mass_2:
+        expression: boosted_mass_2
+    tau_decaymode_2:
+        expression: boosted_tau_decaymode_2
+    deltaR_ditaupair:
+        expression: boosted_deltaR_ditaupair
+    m_vis:
+        expression: boosted_m_vis
+    fj_Xbb_pt:
+        expression: fj_Xbb_pt_boosted
+    fj_Xbb_eta:
+        expression: fj_Xbb_eta_boosted
+    fj_Xbb_particleNet_XbbvsQCD:
+        expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+    bpair_pt_1:
+        expression: bpair_pt_1_boosted
+    bpair_pt_2:
+        expression: bpair_pt_2_boosted
+    bpair_btag_value_2:
+        expression: bpair_btag_value_2_boosted
+    bpair_eta_2:
+        expression: bpair_eta_2_boosted
+    extramuon_veto:
+        expression: extramuon_veto_boosted
+    gen_match_2:
+        expression: boosted_gen_match_2
+        exclude_processes:
+        - data
+    btag_weight:
+        expression: btag_weight_boosted
+        exclude_processes:
+        - data
+    pNet_Xbb_weight:
+        expression: pNet_Xbb_weight_boosted
+        exclude_processes:
+        - data
+
 event_selection:
     # lep_pt: "boosted_pt_1 > 55"
     had_tau_pt: "boosted_pt_2 > 40"
@@ -130,4 +188,4 @@ output_features:
     - "bpair_eta_2"
     - "met"
     - "mass_2"
-    - "tau_decaymode_2"
\ No newline at end of file
+    - "tau_decaymode_2"
diff --git a/configs/nmssm_boosted/2018/preselection_tt.yaml b/configs/nmssm_boosted/2018/preselection_tt.yaml
index 2322fe7..f1dcb76 100644
--- a/configs/nmssm_boosted/2018/preselection_tt.yaml
+++ b/configs/nmssm_boosted/2018/preselection_tt.yaml
@@ -69,6 +69,68 @@ processes:
             - "JetHT_Run2018C-UL2018"
             - "JetHT_Run2018D-UL2018"
 
+column_definitions:
+    njets:
+        expression: njets_boosted
+    nbtag:
+        expression: nbtag_boosted
+    metphi:
+        expression: metphi_boosted
+    met:
+        expression: met_boosted
+    pt_1:
+        expression: boosted_pt_1
+    q_1:
+        expression: boosted_q_1
+    pt_2:
+        expression: boosted_pt_2
+    q_2:
+        expression: boosted_q_2
+    mt_1:
+        expression: boosted_mt_1
+    iso_1:
+        expression: boosted_iso_1
+    mass_1:
+        expression: boosted_mass_1
+    mass_2:
+        expression: boosted_mass_2
+    tau_decaymode_1:
+        expression: boosted_tau_decaymode_1
+    tau_decaymode_2:
+        expression: boosted_tau_decaymode_2
+    deltaR_ditaupair:
+        expression: boosted_deltaR_ditaupair
+    m_vis:
+        expression: boosted_m_vis
+    fj_Xbb_pt:
+        expression: fj_Xbb_pt_boosted
+    fj_Xbb_eta:
+        expression: fj_Xbb_eta_boosted
+    fj_Xbb_particleNet_XbbvsQCD:
+        expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+    bpair_pt_1:
+        expression: bpair_pt_1_boosted
+    bpair_pt_2:
+        expression: bpair_pt_2_boosted
+    bpair_btag_value_2:
+        expression: bpair_btag_value_2_boosted
+    bpair_eta_2:
+        expression: bpair_eta_2_boosted
+    extramuon_veto:
+        expression: extramuon_veto_boosted
+    gen_match_2:
+        expression: boosted_gen_match_2
+        exclude_processes:
+        - data
+    btag_weight:
+        expression: btag_weight_boosted
+        exclude_processes:
+        - data
+    pNet_Xbb_weight:
+        expression: pNet_Xbb_weight_boosted
+        exclude_processes:
+        - data
+
 event_selection:
     # met: "(met_boosted > 120)"
     had_tau_pt: "(boosted_pt_1 > 40) && (boosted_pt_2 > 40)"
@@ -132,4 +194,4 @@ output_features:
     - "mass_1"
     - "tau_decaymode_1"
     - "mass_2"
-    - "tau_decaymode_2"
\ No newline at end of file
+    - "tau_decaymode_2"

From 351b1e088aca7e1532963f425f6a68c0ccaa24a6 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 14:09:37 +0100
Subject: [PATCH 15/23] Fix wrong logger call and synchronize setup of
 preselection_boosted with preselection script

---
 preselection.py         |  2 +-
 preselection_boosted.py | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/preselection.py b/preselection.py
index 5f1463f..2b60046 100644
--- a/preselection.py
+++ b/preselection.py
@@ -304,7 +304,7 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
     )
     with open(datasets_file, "r") as file:
         datasets = json.load(file)
-    log.log(f"Loading sample database from {datasets_file}")
+    log.info(f"Loading sample database from {datasets_file}")
 
     # get needed features for fake factor calculation
     output_features = config["output_features"]
diff --git a/preselection_boosted.py b/preselection_boosted.py
index 663a046..2052ee4 100644
--- a/preselection_boosted.py
+++ b/preselection_boosted.py
@@ -265,26 +265,28 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
     # loading of the chosen config file
     config = func.load_config(args.config_file)
 
-    # loading general dataset info file for xsec and event number
-    datasets_file = os.path.join(
-        config["sample_database"], config["nanoAOD_version"], "datasets.json"
-    )
-    print(f"Loading sample database from {datasets_file}")
-    with open(datasets_file, "r") as file:
-        datasets = json.load(file)
-
     # define output path for the preselected samples
     output_path = os.path.join(
         config["output_path"], "preselection", config["era"], config["channel"]
     )
     func.check_path(path=output_path)
 
+    # Set up logger and retrieve logger instance for main routine
     func.setup_logger(
         log_file=output_path + "/preselection.log",
         log_name="preselection",
         log_level=logging.INFO,
         subcategories=config["processes"],
     )
+    log = logging.getLogger("preselection.main")
+
+    # Load general dataset info file for xsec and event number
+    datasets_file = os.path.join(
+        config["sample_database"], config["nanoAOD_version"], "datasets.json"
+    )
+    with open(datasets_file, "r") as file:
+        datasets = json.load(file)
+    log.info(f"Loading sample database from {datasets_file}")
 
     # get needed features for fake factor calculation
     output_features = config["output_features"]

From 1e7ce8f1a4f740184dfd540ddaea7d5fb7ba8a18 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Mon, 12 Jan 2026 14:22:43 +0100
Subject: [PATCH 16/23] Move handling of column_definitions entry to the
 run_sample_preselection function

---
 preselection.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/preselection.py b/preselection.py
index 2b60046..878dc35 100644
--- a/preselection.py
+++ b/preselection.py
@@ -51,7 +51,7 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
     Return:
         A tuple with the tau gen. level mode and the name of the output file
     """
-    process, config, output_path, ncores, sample, tau_gen_mode, column_definitions = args
+    process, config, output_path, ncores, sample, tau_gen_mode = args
     log = logging.getLogger(f"preselection.{process}")
     ROOT.EnableImplicitMT(ncores)
 
@@ -80,7 +80,8 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
         log.info(f"WARNING: Sample {sample} is empty. Skipping...")
         return ()
 
-    # Declare column definitions on the RDataFrame
+    # get column definitions from config and declare definitions on the RDataFrame
+    column_definitions = config.get("column_definitions", {})
     if column_definitions:
         rdf = func.define_columns(rdf, column_definitions, process)
 
@@ -229,12 +230,9 @@ def run_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]], str, in
         f"Considered samples for process {process}: {config['processes'][process]['samples']}"
     )
 
-    # get renaming of columns
-    column_definitions = config.get("column_definitions", {})
-
     # going through all contributing samples for the process
     args_list = [
-        (process, config, output_path, ncores, sample, tau_gen_mode, column_definitions) for tau_gen_mode in config["processes"][process]["tau_gen_modes"] for sample in config["processes"][process]["samples"]
+        (process, config, output_path, ncores, sample, tau_gen_mode) for tau_gen_mode in config["processes"][process]["tau_gen_modes"] for sample in config["processes"][process]["samples"]
     ]
 
     results = func.optional_process_pool(

From dad202d3ba656869652bee35c8c3247c920a1b50 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Thu, 15 Jan 2026 14:17:08 +0100
Subject: [PATCH 17/23] Allow the user to specify a list processes with an
 exclusive list of process names, for which the column definition is performed

---
 helper/functions.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/helper/functions.py b/helper/functions.py
index 9680e16..33197df 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -722,8 +722,14 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
     - `expression`: The expression string which is used to define the new column.
 
+    - `processes` (_optional_): A list of process names for which the definition should be
+      skipped. For all processes, that are not part of the list, the column definition is not
+      performed. If this entry is set, `processes` cannot be part of `column_definitions`.
+
     - `exclude_processes` (_optional_): A list of process names for which the definition should be
-      skipped. If `process` is in this list, the definition is not applied.
+      skipped. If `process` is in this list, the column definition is not processed. If this entry
+      is set, `exclude_processes` cannot be part of `column_definitions`.
+
 
     Note that the new column names must not exist in the ntuples, otherwise an error is raised.
 
@@ -747,10 +753,20 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
     # Perform the define declarations on the RDataFrame object
     for new_column, define_dict in column_definitions.items():
-        expression = define_dict["expression"]
-        exclude_processes = define_dict.get("exclude_processes", [])
-        if process in exclude_processes:
+        # Check that processes and exclude_processes are not set at the same time
+        if "processes" in define_dict and "exclude_processes" in define_dict:
+            raise ValueError(
+                f"Both processes and exclude_processes have been specified for column {new_column}. You can only set one of them for the same entry."
+            )
+
+        # Check if the process should be skipped
+        if "processes" in define_dict and process not in define_dict["processes"]:
+            continue
+        if "exclude_processes" in define_dict and process in define_dict["processes"]:
             continue
+
+        # Get the ROOT expression for defining the new column
+        expression = define_dict["expression"]
         rdf = rdf.Define(new_column, expression)
 
     return rdf

From 42f76b84b1f2e7eb36174661d34d90436454a523 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Thu, 15 Jan 2026 14:18:00 +0100
Subject: [PATCH 18/23] Add flag that allows to use the Redefine function in
 column definitions

---
 helper/functions.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/helper/functions.py b/helper/functions.py
index 33197df..f16e7d7 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -730,8 +730,11 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
       skipped. If `process` is in this list, the column definition is not processed. If this entry
       is set, `exclude_processes` cannot be part of `column_definitions`.
 
+    - `allow_redefine` (_optional_): If this flag is set to `True`, the `Redefine` method is used
+      to overwrite the value of an already existing column with the same name. Default: `False`.
 
-    Note that the new column names must not exist in the ntuples, otherwise an error is raised.
+    Columns with the new names must not already exist in the ntuples, unless `allow_redefine` is
+    set to `True` for the respective column. Otherwise, a `ValueError` is raised.
 
     Args:
         rdf: root DataFrame
@@ -744,11 +747,11 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
     # Ensure that the new column names are not already present in the ntuple
     rdf_columns = set(rdf.GetColumnNames())
-    new_columns = set(column_definitions.keys())
+    new_columns = set(k for k, v in column_definitions.items() if not v.get("allow_redefine", False))
     intersection = rdf_columns.intersection(new_columns)
     if intersection:
         raise ValueError(
-            f"The following new column names already exist in the ntuple: {intersection}"
+            f"The following new column names already exist in the ntuple and allow_redefine is not set: {intersection}"
         )
 
     # Perform the define declarations on the RDataFrame object
@@ -767,7 +770,16 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
         # Get the ROOT expression for defining the new column
         expression = define_dict["expression"]
-        rdf = rdf.Define(new_column, expression)
+
+        # Use
+        # - `Redefine` if allow_redefine is `True` and the column is already present in the RDataFrame
+        # - `Define` in all other cases
+        rdf_define_call = (
+            rdf.Redefine
+            if new_column in rdf_columns and allow_redefine
+            else rdf.Define
+        )
+        rdf = rdf_define_call(new_column, expression)
 
     return rdf
 

From 38f9cbab4a0cd66dd640c165cb2e151c8fb4ace4 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Thu, 15 Jan 2026 14:23:29 +0100
Subject: [PATCH 19/23] Add allow_redefine flag to column definitions for the
 boosted NMSSM configs

---
 .../nmssm_boosted/2018/preselection_et.yaml   | 26 ++++++++++++++++++
 .../nmssm_boosted/2018/preselection_mt.yaml   | 25 +++++++++++++++++
 .../nmssm_boosted/2018/preselection_tt.yaml   | 27 +++++++++++++++++++
 3 files changed, 78 insertions(+)

diff --git a/configs/nmssm_boosted/2018/preselection_et.yaml b/configs/nmssm_boosted/2018/preselection_et.yaml
index 9ede5a6..c5aac8b 100644
--- a/configs/nmssm_boosted/2018/preselection_et.yaml
+++ b/configs/nmssm_boosted/2018/preselection_et.yaml
@@ -72,62 +72,88 @@ processes:
 column_definitions:
     njets:
         expression: njets_boosted
+        allow_redefine: True
     nbtag:
         expression: nbtag_boosted
+        allow_redefine: True
     metphi:
         expression: metphi_boosted
+        allow_redefine: True
     met:
         expression: met_boosted
+        allow_redefine: True
     pt_1:
         expression: boosted_pt_1
+        allow_redefine: True
     q_1:
         expression: boosted_q_1
+        allow_redefine: True
     pt_2:
         expression: boosted_pt_2
+        allow_redefine: True
     q_2:
         expression: boosted_q_2
+        allow_redefine: True
     mt_1:
         expression: boosted_mt_1
+        allow_redefine: True
     iso_1:
         expression: boosted_iso_1
+        allow_redefine: True
     mass_2:
         expression: boosted_mass_2
+        allow_redefine: True
     tau_decaymode_2:
         expression: boosted_tau_decaymode_2
+        allow_redefine: True
     deltaR_ditaupair:
         expression: boosted_deltaR_ditaupair
+        allow_redefine: True
     m_vis:
         expression: boosted_m_vis
+        allow_redefine: True
     fj_Xbb_pt:
         expression: fj_Xbb_pt_boosted
+        allow_redefine: True
     fj_Xbb_eta:
         expression: fj_Xbb_eta_boosted
+        allow_redefine: True
     fj_Xbb_particleNet_XbbvsQCD:
         expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+        allow_redefine: True
     bpair_pt_1:
         expression: bpair_pt_1_boosted
+        allow_redefine: True
     bpair_pt_2:
         expression: bpair_pt_2_boosted
+        allow_redefine: True
     bpair_btag_value_2:
         expression: bpair_btag_value_2_boosted
+        allow_redefine: True
     bpair_eta_2:
         expression: bpair_eta_2_boosted
+        allow_redefine: True
     extraelec_veto:
         expression: extraelec_veto_boosted
+        allow_redefine: True
     gen_match_1:
         expression: boosted_gen_match_1
+        allow_redefine: True
         exclude_processes:
         - data
     gen_match_2:
         expression: boosted_gen_match_2
+        allow_redefine: True
         exclude_processes:
         - data
     btag_weight:
         expression: btag_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
     pNet_Xbb_weight:
         expression: pNet_Xbb_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
 
diff --git a/configs/nmssm_boosted/2018/preselection_mt.yaml b/configs/nmssm_boosted/2018/preselection_mt.yaml
index 9573c10..ffd5cef 100644
--- a/configs/nmssm_boosted/2018/preselection_mt.yaml
+++ b/configs/nmssm_boosted/2018/preselection_mt.yaml
@@ -72,58 +72,83 @@ processes:
 column_definitions:
     njets:
         expression: njets_boosted
+        allow_redefine: True
     nbtag:
         expression: nbtag_boosted
+        allow_redefine: True
     metphi:
         expression: metphi_boosted
+        allow_redefine: True
     met:
         expression: met_boosted
+        allow_redefine: True
     pt_1:
         expression: boosted_pt_1
+        allow_redefine: True
     q_1:
         expression: boosted_q_1
+        allow_redefine: True
     pt_2:
         expression: boosted_pt_2
+        allow_redefine: True
     q_2:
         expression: boosted_q_2
+        allow_redefine: True
     mt_1:
         expression: boosted_mt_1
+        allow_redefine: True
     iso_1:
         expression: boosted_iso_1
+        allow_redefine: True
     mass_2:
         expression: boosted_mass_2
+        allow_redefine: True
     tau_decaymode_2:
         expression: boosted_tau_decaymode_2
+        allow_redefine: True
     deltaR_ditaupair:
         expression: boosted_deltaR_ditaupair
+        allow_redefine: True
     m_vis:
         expression: boosted_m_vis
+        allow_redefine: True
     fj_Xbb_pt:
         expression: fj_Xbb_pt_boosted
+        allow_redefine: True
     fj_Xbb_eta:
         expression: fj_Xbb_eta_boosted
+        allow_redefine: True
     fj_Xbb_particleNet_XbbvsQCD:
         expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+        allow_redefine: True
     bpair_pt_1:
         expression: bpair_pt_1_boosted
+        allow_redefine: True
     bpair_pt_2:
         expression: bpair_pt_2_boosted
+        allow_redefine: True
     bpair_btag_value_2:
         expression: bpair_btag_value_2_boosted
+        allow_redefine: True
     bpair_eta_2:
         expression: bpair_eta_2_boosted
+        allow_redefine: True
     extramuon_veto:
         expression: extramuon_veto_boosted
+        allow_redefine: True
     gen_match_2:
         expression: boosted_gen_match_2
+        allow_redefine: True
         exclude_processes:
         - data
     btag_weight:
         expression: btag_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
     pNet_Xbb_weight:
         expression: pNet_Xbb_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
 
diff --git a/configs/nmssm_boosted/2018/preselection_tt.yaml b/configs/nmssm_boosted/2018/preselection_tt.yaml
index f1dcb76..ad16246 100644
--- a/configs/nmssm_boosted/2018/preselection_tt.yaml
+++ b/configs/nmssm_boosted/2018/preselection_tt.yaml
@@ -72,62 +72,89 @@ processes:
 column_definitions:
     njets:
         expression: njets_boosted
+        allow_redefine: True
     nbtag:
         expression: nbtag_boosted
+        allow_redefine: True
     metphi:
         expression: metphi_boosted
+        allow_redefine: True
     met:
         expression: met_boosted
+        allow_redefine: True
     pt_1:
         expression: boosted_pt_1
+        allow_redefine: True
     q_1:
         expression: boosted_q_1
+        allow_redefine: True
     pt_2:
         expression: boosted_pt_2
+        allow_redefine: True
     q_2:
         expression: boosted_q_2
+        allow_redefine: True
     mt_1:
         expression: boosted_mt_1
+        allow_redefine: True
     iso_1:
         expression: boosted_iso_1
+        allow_redefine: True
     mass_1:
         expression: boosted_mass_1
+        allow_redefine: True
     mass_2:
         expression: boosted_mass_2
+        allow_redefine: True
     tau_decaymode_1:
         expression: boosted_tau_decaymode_1
+        allow_redefine: True
     tau_decaymode_2:
         expression: boosted_tau_decaymode_2
+        allow_redefine: True
     deltaR_ditaupair:
         expression: boosted_deltaR_ditaupair
+        allow_redefine: True
     m_vis:
         expression: boosted_m_vis
+        allow_redefine: True
     fj_Xbb_pt:
         expression: fj_Xbb_pt_boosted
+        allow_redefine: True
     fj_Xbb_eta:
         expression: fj_Xbb_eta_boosted
+        allow_redefine: True
     fj_Xbb_particleNet_XbbvsQCD:
         expression: fj_Xbb_particleNet_XbbvsQCD_boosted
+        allow_redefine: True
     bpair_pt_1:
         expression: bpair_pt_1_boosted
+        allow_redefine: True
     bpair_pt_2:
         expression: bpair_pt_2_boosted
+        allow_redefine: True
     bpair_btag_value_2:
         expression: bpair_btag_value_2_boosted
+        allow_redefine: True
     bpair_eta_2:
         expression: bpair_eta_2_boosted
+        allow_redefine: True
     extramuon_veto:
         expression: extramuon_veto_boosted
+        allow_redefine: True
     gen_match_2:
         expression: boosted_gen_match_2
+        allow_redefine: True
         exclude_processes:
         - data
     btag_weight:
         expression: btag_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
     pNet_Xbb_weight:
         expression: pNet_Xbb_weight_boosted
+        allow_redefine: True
         exclude_processes:
         - data
 

From a5c9d39eacae45af244ec37a89888c463c4c0a2a Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Thu, 15 Jan 2026 14:35:11 +0100
Subject: [PATCH 20/23] Update documentation of column_definitions section

---
 docs/preselection.md | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/docs/preselection.md b/docs/preselection.md
index 0cd8c87..e16ca7f 100644
--- a/docs/preselection.md
+++ b/docs/preselection.md
@@ -8,7 +8,7 @@ The preselection config has the following parameters:
   ---|---|---
   `channel` | `string` | tau pair decay channels ("et", "mt", "tt")
   `processes` | `dict` | process parameters are explained below
-  `column_definitions` | `dict` | in this section, new columns can be defined based on a given `ROOT` expression. <br>The keys of the dictionary correspond to the name of the defined column. The values are dictionaries itself, with the `expression` key defining the `ROOT` expression for defining the column and the optional entry `exclude_processes` containing a list of processes for which the column should not be added. An example is given below.
+  `column_definitions` | `dict` | in this section, new columns can be defined based on a given `ROOT` expression. <br>The keys of the dictionary correspond to the names of the defined columns. The values are dictionaries themselves, with the required `expression` key giving the `ROOT` expression that defines the column. The optional entries `processes` and `exclude_processes` restrict the definition to specific processes; the optional entry `allow_redefine` enables the use of the `ROOT.RDataFrame.Redefine` function for overwriting already existing columns. For a more detailed description, see below.
   `event_selection` | `dict` | with this parameter all selections that should be applied are defined. <br>This is basically a dictionary of cuts where the key is the name of a cut and the value is the cut itself as a string e.g. `had_tau_pt: "pt_2 > 30"`. The name of a cut is not really important, it is only used as an output information in the terminal. A cut can only use variables which are in the ntuples.
   `mc_weights` | `dict` | weight parameter are defined below
   `emb_weights` | `dict` | all weights that should be applied for embedded samples are defined. <br>Like for `event_selection` a weight can directly be specified and is then applied to all samples the same way e.g. `single_trigger: "trg_wgt_single_mu24ormu27"`
@@ -32,25 +32,41 @@ The `tau_gen_modes` have following modes:
   `L` | `string` | lepton misidentified as a tau
   `all` | `string` | if no split should be performed
 
-In `column_definitions`, new columns. An example entry could look like this:
+In `column_definitions`, new columns can be added to the output ntuples by
+using `ROOT` expressions. An example entry could look like this:
 
 ```yaml
 column_definitions:
     nbtag:
         expression: n_bjets
+        processes:
+        - ttbar
+        - DY
     btag_weight:
         expression: id_wgt_bjet_pnet_shape
         exclude_processes:
         - data
+        allow_redefine: True
     jj_deltaR:
         expression: ROOT::VecOps::DeltaR(jeta_1, jeta_2, jphi_1, jphi_2)
 ```
 
 The key `expression` is required and can contain any valid `ROOT` expression.
-The entry `exclude_processes` is optional. This list can contain process names
-from the `processes` section of this configuration. By default, the new columns
-are defined for all processes. To write the new columns to the output file, you
-have to explicitly add the columns to the `output_features` list.
+
+The entry `exclude_processes` is optional. Column definitions are performed for
+all processes except the ones given in this list. The entry `processes` is also
+optional. The column definition is performed only for processes in this list.
+The lists `processes` and `exclude_processes` can contain the names from the
+`processes` section of this configuration. By default, the new columns are
+defined for all processes. To write the new columns to the output file, you have
+to explicitly add the columns to the `output_features` list. Note that you can
+only set `processes` or `exclude_processes` for a column, but not both at the
+same time.
+
+If the key `allow_redefine` is set to `True`, the `ROOT.RDataFrame.Redefine`
+function is used if a column with the same name has been found in the
+`RDataFrame`. The values in this column are then overwritten by the expression
+given for the new column.
 
 In `mc_weights` all weights that should be applied for simulated samples are defined. <br>
 There are two types of weights.

From db29f330ff4e3a7705e439f973eb72fe259a5bec Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Thu, 15 Jan 2026 14:37:09 +0100
Subject: [PATCH 21/23] Fix wrong description in define_columns docstring

---
 helper/functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/helper/functions.py b/helper/functions.py
index f16e7d7..43f9b2e 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -722,8 +722,8 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
 
     - `expression`: The expression string which is used to define the new column.
 
-    - `processes` (_optional_): A list of process names for which the definition should be
-      skipped. For all processes, that are not part of the list, the column definition is not
+    - `processes` (_optional_): An exclusive list of process names for which the definition should be
+      performed. For all processes that are not part of the list, the column definition is not
       performed. If this entry is set, `processes` cannot be part of `column_definitions`.
 
     - `exclude_processes` (_optional_): A list of process names for which the definition should be

From 3da81572c7c79db8718e75bcc764b579d224f625 Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Fri, 16 Jan 2026 11:32:30 +0100
Subject: [PATCH 22/23] Remove obsolete column renaming function for boosted
 fake factors

---
 helper/functions.py     | 71 -----------------------------------------
 preselection_boosted.py |  5 ---
 2 files changed, 76 deletions(-)

diff --git a/helper/functions.py b/helper/functions.py
index 43f9b2e..95dc822 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -640,77 +640,6 @@ def get_output_name(
     return os.path.join(path, f"{process}{tau_gen_mode}.root")
 
 
-def rename_boosted_variables(rdf: Any, channel: str) -> Any:
-    """
-    Function to redefine variables to the boosted tau pair information. Redefining only variables
-    which are written out for the fake factor measurement. Due to the hardcoded naming and redifinitions
-    this function needs to be adjusted if something changes in the list of output variables.
-
-    Args:
-        rdf: root DataFrame
-        channel: Analysis channel of the tau analysis e.g. "et", "mt" or "tt"
-
-    Return:
-        root DataFrame with redefined variables
-    """
-    rdf = rdf.Redefine("njets", "njets_boosted")
-    rdf = rdf.Redefine("nbtag", "nbtag_boosted")
-    rdf = rdf.Redefine("metphi", "metphi_boosted")
-    rdf = rdf.Redefine("met", "met_boosted")
-    rdf = rdf.Redefine("pt_1", "boosted_pt_1")
-    rdf = rdf.Redefine("q_1", "boosted_q_1")
-    rdf = rdf.Redefine("pt_2", "boosted_pt_2")
-    rdf = rdf.Redefine("q_2", "boosted_q_2")
-    rdf = rdf.Redefine("mt_1", "boosted_mt_1")
-    rdf = rdf.Redefine("iso_1", "boosted_iso_1")
-    rdf = rdf.Redefine("mass_2", "boosted_mass_2")
-    rdf = rdf.Redefine("tau_decaymode_2", "boosted_tau_decaymode_2")
-    rdf = rdf.Redefine("deltaR_ditaupair", "boosted_deltaR_ditaupair")
-    rdf = rdf.Redefine("m_vis", "boosted_m_vis")
-    rdf = rdf.Redefine("fj_Xbb_pt", "fj_Xbb_pt_boosted")
-    rdf = rdf.Redefine("fj_Xbb_eta", "fj_Xbb_eta_boosted")
-    rdf = rdf.Redefine(
-        "fj_Xbb_particleNet_XbbvsQCD", "fj_Xbb_particleNet_XbbvsQCD_boosted"
-    )
-    rdf = rdf.Redefine("bpair_pt_1", "bpair_pt_1_boosted")
-    rdf = rdf.Redefine("bpair_pt_2", "bpair_pt_2_boosted")
-    rdf = rdf.Redefine("bpair_btag_value_2", "bpair_btag_value_2_boosted")
-    rdf = rdf.Redefine("bpair_eta_2", "bpair_eta_2_boosted")
-
-    if "boosted_gen_match_2" in rdf.GetColumnNames():
-        rdf = rdf.Redefine("gen_match_2", "boosted_gen_match_2")
-    else:
-        rdf = rdf.Define("boosted_gen_match_2", "-1.")
-        rdf = rdf.Redefine("gen_match_2", "boosted_gen_match_2")
-
-    if "btag_weight_boosted" in rdf.GetColumnNames():
-        rdf = rdf.Redefine("btag_weight", "btag_weight_boosted")
-    else:
-        rdf = rdf.Define("btag_weight_boosted", "1.")
-        rdf = rdf.Redefine("btag_weight", "btag_weight_boosted")
-
-    if "pNet_Xbb_weight_boosted" in rdf.GetColumnNames():
-        rdf = rdf.Redefine("pNet_Xbb_weight", "pNet_Xbb_weight_boosted")
-    else:
-        rdf = rdf.Define("pNet_Xbb_weight_boosted", "1.")
-        rdf = rdf.Redefine("pNet_Xbb_weight", "pNet_Xbb_weight_boosted")
-
-    if channel == "tt":
-        rdf = rdf.Redefine("mass_1", "boosted_mass_1")
-        rdf = rdf.Redefine("tau_decaymode_1", "boosted_tau_decaymode_1")
-        if "boosted_gen_match_1" in rdf.GetColumnNames():
-            rdf = rdf.Redefine("gen_match_1", "boosted_gen_match_1")
-        else:
-            rdf = rdf.Define("boosted_gen_match_1", "-1.")
-            rdf = rdf.Redefine("gen_match_1", "boosted_gen_match_1")
-
-    if channel == "et":
-        rdf = rdf.Redefine("extraelec_veto", "boosted_extraelec_veto")
-    if channel == "mt":
-        rdf = rdf.Redefine("extramuon_veto", "boosted_extramuon_veto")
-
-    return rdf
-
 def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
     """
     Customizer function to define additional columns in the ntuples.
diff --git a/preselection_boosted.py b/preselection_boosted.py
index 2052ee4..9a10045 100644
--- a/preselection_boosted.py
+++ b/preselection_boosted.py
@@ -164,11 +164,6 @@ def run_sample_preselection(args: Tuple[str, Dict[str, Union[Dict, List, str]],
     log.debug(out.getvalue())
     log.debug("-" * 50)
 
-    # WARNING: cross check this function is something changes in the list of output features
-    tmp_rdf = func.rename_boosted_variables(
-        rdf=tmp_rdf, channel=config["channel"]
-    )
-
     tmp_file_name = func.get_output_name(
         path=output_path, process=sample, tau_gen_mode=tau_gen_mode
     )

From b14ed9e1222e236474f397cda69f1701b8db2add Mon Sep 17 00:00:00 2001
From: moritzmolch <moritz.molch@kit.edu>
Date: Fri, 16 Jan 2026 13:50:05 +0100
Subject: [PATCH 23/23] Use correct process list for evaluating processes to
 exclude.

---
 helper/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helper/functions.py b/helper/functions.py
index 95dc822..baa8bd5 100644
--- a/helper/functions.py
+++ b/helper/functions.py
@@ -694,7 +694,7 @@ def define_columns(rdf: Any, column_definitions: dict, process: str) -> Any:
         # Check if the process should be skipped
         if "processes" in define_dict and process not in define_dict["processes"]:
             continue
-        if "exclude_processes" in define_dict and process in define_dict["processes"]:
+        if "exclude_processes" in define_dict and process in define_dict["exclude_processes"]:
             continue
 
         # Get the ROOT expression for defining the new column