From 8e78169a3f4a4dbd1230c8990fc1230d4b2ef1dd Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 3 Feb 2025 12:29:54 +0100 Subject: [PATCH 01/26] Resolved the conflicts: mostly left local version of the code --- columnflow/plotting/plot_functions_1d.py | 103 ++++++++++++++++++++--- columnflow/production/cms/mc_weight.py | 9 +- columnflow/production/cms/pileup.py | 4 + columnflow/production/normalization.py | 9 +- columnflow/selection/stats.py | 2 +- sandboxes/cmssw_columnar.sh | 6 +- sandboxes/cmssw_default.sh | 4 +- 7 files changed, 116 insertions(+), 21 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index f73ceac9c..2dc19901c 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -33,7 +33,7 @@ plt = maybe_import("matplotlib.pyplot") mplhep = maybe_import("mplhep") od = maybe_import("order") - +import warnings def plot_variable_per_process( hists: OrderedDict, @@ -50,24 +50,62 @@ def plot_variable_per_process( **kwargs, ) -> plt.Figure: """ - TODO. + Plots histograms for multiple processes, ordering them by the total number of events in ascending order + and assigning specific colors to each process based on a predefined color map. """ remove_residual_axis(hists, "shift") - variable_inst = variable_insts[0] - blinding_threshold = kwargs.get("blinding_threshold", None) + # Define the color maps + color_maps = { + "6": ["#5790fc", "#7a21dd", "#964a8b", "#9c9ca1", "#e42536", "#f89c20"], + "8": ["#1845fb", "#578dff", "#656364", "#86c8dd", "#adad7d", "#c849a9", "#c91f16", "#ff5e02"], + "10": ["#3f90da", "#717581", "#832db6", "#92dadd", "#94a4a2", "#a96b59", "#b9ac70", "#bd1f01", "#e76300", "#ffa90e"], + } - if blinding_threshold: - hists = blind_sensitive_bins(hists, config_inst, blinding_threshold) - hists = apply_variable_settings(hists, variable_insts, variable_settings) - hists = apply_process_settings(hists, process_settings) - hists = apply_density_to_hists(hists, density) + # Basic colors for more than 24 processes + basic_colors = ["#FF0000", "#0000FF", "#00FF00", "#FFFF00", "#FF00FF", "#00FFFF", "#800000", "#808000"] + + # Calculate the total number of events for each process + total_events = {key: sum(hist.values()) for key, hist in hists.items()} + + # Sort processes by total number of events in ascending order + # sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]])) + # Sort processes by total number of events in descending order + sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + + variable_inst = variable_insts[0] + sorted_hists = apply_variable_settings(sorted_hists, variable_insts, variable_settings) + sorted_hists = apply_process_settings(sorted_hists, process_settings) + sorted_hists = apply_density_to_hists(sorted_hists, density) plot_config = prepare_plot_config( - hists, + sorted_hists, shape_norm=shape_norm, hide_errors=hide_errors, ) + + + if 'data' not in plot_config: + + # Determine the appropriate color map based on the number of processes + num_processes = len(sorted_hists) + if num_processes <= 6: + colors = color_maps["6"][:num_processes] + elif num_processes == 7: + colors = color_maps["8"][:num_processes] + elif num_processes <= 10: + colors = color_maps["8"][:num_processes] if num_processes == 8 else color_maps["10"][:num_processes] + elif num_processes <= 18: + colors = color_maps["10"] + color_maps["8"][:num_processes - 10] + elif num_processes <= 24: + 
colors = color_maps["10"] + color_maps["8"] + color_maps["6"][:num_processes - 18] + else: + warnings.warn("You are about to plot more than 24 processes together, please reconsider... (Colors not in the approved palette will be assigned)") + colors = color_maps["10"] + color_maps["8"] + color_maps["6"] + colors += basic_colors[:num_processes - 24] + plot_config["mc_stack"]["kwargs"]["color"] = colors[:num_processes] + + default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, @@ -80,6 +118,51 @@ def plot_variable_per_process( return plot_all(plot_config, style_config, **kwargs) +# def plot_variable_per_process( +# hists: OrderedDict, +# config_inst: od.Config, +# category_inst: od.Category, +# variable_insts: list[od.Variable], +# style_config: dict | None = None, +# density: bool | None = False, +# shape_norm: bool | None = False, +# yscale: str | None = "", +# hide_errors: bool | None = None, +# process_settings: dict | None = None, +# variable_settings: dict | None = None, +# **kwargs, +# ) -> plt.Figure: +# """ +# TODO. +# """ +# remove_residual_axis(hists, "shift") + +# variable_inst = variable_insts[0] +# blinding_threshold = kwargs.get("blinding_threshold", None) + +# if blinding_threshold: +# hists = blind_sensitive_bins(hists, config_inst, blinding_threshold) +# hists = apply_variable_settings(hists, variable_insts, variable_settings) +# hists = apply_process_settings(hists, process_settings) +# hists = apply_density_to_hists(hists, density) + +# plot_config = prepare_plot_config( +# hists, +# shape_norm=shape_norm, +# hide_errors=hide_errors, +# ) + +# default_style_config = prepare_style_config( +# config_inst, category_inst, variable_inst, density, shape_norm, yscale, +# ) + +# style_config = law.util.merge_dicts(default_style_config, style_config, deep=True) +# if shape_norm: +# style_config["ax_cfg"]["ylabel"] = r"$\Delta N/N$" + +# return plot_all(plot_config, style_config, **kwargs) + + def plot_variable_variants( hists: OrderedDict, config_inst: od.Config, diff --git a/columnflow/production/cms/mc_weight.py b/columnflow/production/cms/mc_weight.py index 9994c5b5a..e56b60b6e 100644 --- a/columnflow/production/cms/mc_weight.py +++ b/columnflow/production/cms/mc_weight.py @@ -31,11 +31,14 @@ def mc_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: [1] https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookNanoAOD?rev=99#Weigths """ + # # determine the mc_weight + # mc_weight = events.genWeight + # if has_ak_column(events, "LHEWeight.originalXWGTUP") and ak.all(events.genWeight == 1.0): + # mc_weight = events.LHEWeight.originalXWGTUP # determine the mc_weight - mc_weight = events.genWeight + mc_weight = np.sign(events.genWeight) if has_ak_column(events, "LHEWeight.originalXWGTUP") and ak.all(events.genWeight == 1.0): - mc_weight = events.LHEWeight.originalXWGTUP - + mc_weight = np.sign(events.LHEWeight.originalXWGTUP) # store the column events = set_ak_column(events, "mc_weight", mc_weight, value_type=np.float32) diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py index 5e025c120..438d43889 100644 --- a/columnflow/production/cms/pileup.py +++ b/columnflow/production/cms/pileup.py @@ -54,6 +54,10 @@ def pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: # evaluate and store the produced column pu_weight = self.pileup_corrector.evaluate(*inputs) + ##################################################### + ### Keeps the pu_weight lower then 300 + 
pu_weight[pu_weight > 300] = 0 + ##################################################### events = set_ak_column(events, column_name, pu_weight, value_type=np.float32) return events diff --git a/columnflow/production/normalization.py b/columnflow/production/normalization.py index 9c2dd296f..66616ac7e 100644 --- a/columnflow/production/normalization.py +++ b/columnflow/production/normalization.py @@ -207,10 +207,8 @@ def normalization_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Arra f"process_id field contains id(s) {invalid_ids} for which no cross sections were " f"found; process ids with cross sections: {self.xs_process_ids}", ) - # read the weight per process (defined as lumi * xsec / sum_weights) from the lookup table process_weight = np.squeeze(np.asarray(self.process_weight_table[0, process_id].todense())) - # compute the weight and store it norm_weight = events.mc_weight * process_weight events = set_ak_column(events, self.weight_name, norm_weight, value_type=np.float32) @@ -352,13 +350,18 @@ def normalization_weights_setup( f"energy of {self.config_inst.campaign.ecm}", ) sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] + #quick fix that need to be fixed + ################################ + #n_evt_per_file = /self.dataset_inst.n_files + sum_weights = self.dataset_inst.n_events + ################################ xsec = process_inst.get_xsec(self.config_inst.campaign.ecm).nominal process_weight_table[0, process_inst.id] = lumi * xsec / sum_weights + self.process_weight_table = process_weight_table self.xs_process_ids = set(self.process_weight_table.rows[0]) - @normalization_weights.init def normalization_weights_init(self: Producer) -> None: """ diff --git a/columnflow/selection/stats.py b/columnflow/selection/stats.py index 5038a6a03..8141fb957 100644 --- a/columnflow/selection/stats.py +++ b/columnflow/selection/stats.py @@ -145,7 +145,7 @@ def increment_stats( "'num' or 'sum'", ) - # interpret obj based on the aoperation to be applied + # interpret obj based on the operation to be applied weights = None weight_mask = Ellipsis if isinstance(obj, (tuple, list)): diff --git a/sandboxes/cmssw_columnar.sh b/sandboxes/cmssw_columnar.sh index 350a954be..f626eaccd 100644 --- a/sandboxes/cmssw_columnar.sh +++ b/sandboxes/cmssw_columnar.sh @@ -10,8 +10,10 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH="el9_amd64_gcc11" - export CF_CMSSW_VERSION="CMSSW_13_0_19" + # export CF_SCRAM_ARCH="$( [ "${os_version}" = "8" ] && echo "el8" || echo "slc7" )_amd64_gcc10" + # export CF_CMSSW_VERSION="CMSSW_12_6_2" + export CF_SCRAM_ARCH=el9_amd64_gcc12 + export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed diff --git a/sandboxes/cmssw_default.sh b/sandboxes/cmssw_default.sh index d2e31eb15..cbfd928f8 100644 --- a/sandboxes/cmssw_default.sh +++ b/sandboxes/cmssw_default.sh @@ -10,8 +10,8 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH="el9_amd64_gcc11" - export CF_CMSSW_VERSION="CMSSW_13_0_19" + export CF_SCRAM_ARCH=el9_amd64_gcc12 + export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed From d4f96f7be432264d9fd62c12bf5653f636e2a992 Mon Sep 17 00:00:00 2001 
From: Jacopo Malvaso Date: Tue, 26 Nov 2024 11:44:33 +0100 Subject: [PATCH 02/26] Plotting script with CAT colors --- columnflow/plotting/plot_functions_1d.py | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 2dc19901c..716ec5421 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -33,7 +33,8 @@ plt = maybe_import("matplotlib.pyplot") mplhep = maybe_import("mplhep") od = maybe_import("order") -import warnings + +logger = law.logger.get_logger(__name__) def plot_variable_per_process( hists: OrderedDict, @@ -50,8 +51,10 @@ def plot_variable_per_process( **kwargs, ) -> plt.Figure: """ - Plots histograms for multiple processes, ordering them by the total number of events in ascending order - and assigning specific colors to each process based on a predefined color map. + Plots histograms for multiple processes, ordering them by a custom order: + the process with the highest number of events first, followed by the others, + and the process with the second highest number of events last. + Handles cases with only one or two processes. """ remove_residual_axis(hists, "shift") @@ -68,10 +71,25 @@ def plot_variable_per_process( # Calculate the total number of events for each process total_events = {key: sum(hist.values()) for key, hist in hists.items()} - # Sort processes by total number of events in ascending order - # sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]])) # Sort processes by total number of events in descending order - sorted_hists = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + + # Get keys of sorted processes + sorted_keys = list(sorted_hists_desc.keys()) + + # Handle cases with 1 or 2 processes + if len(sorted_keys) == 1: + # Only one process, no special reordering needed + custom_order = sorted_keys + elif len(sorted_keys) == 2: + # Two processes, highest first, then second highest + custom_order = sorted_keys + else: + # More than two processes, custom order: highest, rest, then second highest + custom_order = [sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] + + # Reorder histograms based on custom order + sorted_hists = OrderedDict((key, sorted_hists_desc[key]) for key in custom_order) variable_inst = variable_insts[0] sorted_hists = apply_variable_settings(sorted_hists, variable_insts, variable_settings) @@ -83,7 +101,6 @@ def plot_variable_per_process( shape_norm=shape_norm, hide_errors=hide_errors, ) - if 'data' not in plot_config: @@ -100,13 +117,11 @@ def plot_variable_per_process( elif num_processes <= 24: colors = color_maps["10"] + color_maps["8"] + color_maps["6"][:num_processes - 18] else: - warnings.warn("You are about to plot more than 24 processes together, please reconsider... (Colors not in the approved palette will be assigned)") + logger.warning("You are about to plot more than 24 processes together, please reconsider... 
(Colors not in the approved palette will be assigned)") colors = color_maps["10"] + color_maps["8"] + color_maps["6"] colors += basic_colors[:num_processes - 24] plot_config["mc_stack"]["kwargs"]["color"] = colors[:num_processes] - - default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, ) @@ -118,6 +133,7 @@ def plot_variable_per_process( return plot_all(plot_config, style_config, **kwargs) + # def plot_variable_per_process( # hists: OrderedDict, # config_inst: od.Config, From 0ece10d03f590c9496619e92073900d7db8825fd Mon Sep 17 00:00:00 2001 From: Jacopo Malvaso Date: Tue, 26 Nov 2024 11:55:09 +0100 Subject: [PATCH 03/26] Normalization, pileup, job submission and other small updates --- columnflow/calibration/cms/jets.py | 228 ++++++++++++++-------------- columnflow/calibration/cms/met.py | 29 ++-- columnflow/calibration/util.py | 27 +++- columnflow/columnar_util.py | 62 ++++++++ columnflow/production/cms/pileup.py | 1 + columnflow/selection/cms/jets.py | 4 +- columnflow/tasks/plotting.py | 5 + sandboxes/cmssw_columnar.sh | 4 +- sandboxes/cmssw_default.sh | 4 +- 9 files changed, 231 insertions(+), 133 deletions(-) diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py index bd910264b..32c7c816b 100644 --- a/columnflow/calibration/cms/jets.py +++ b/columnflow/calibration/cms/jets.py @@ -3,6 +3,7 @@ """ Jet energy corrections and jet resolution smearing. """ +from pprint import pprint import functools @@ -29,6 +30,8 @@ set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) +import difflib + def get_evaluators( correction_set: correctionlib.highlevel.CorrectionSet, names: list[str], @@ -45,25 +48,32 @@ def get_evaluators( :return: List of compounded corrections, see :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` """ - # raise nice error if keys not found available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) - missing_keys = set(names) - available_keys - if missing_keys: - raise RuntimeError("corrections not found:" + "".join( - f"\n - {name}" for name in names if name in missing_keys - ) + "\navailable:" + "".join( - f"\n - {name}" for name in sorted(available_keys) - )) - - # retrieve the evaluators + corrected_names = [] + + for name in names: + if name not in available_keys: + # Find the closest match using difflib + closest_matches = difflib.get_close_matches(name, available_keys, n=1) + if closest_matches: + closest_match = closest_matches[0] + print( + f"Correction '{name}' not found. 
Using closest match: '{closest_match}'", + ) + corrected_names.append(closest_match) + else: + raise RuntimeError(f"Correction '{name}' not found and no close match available.") + else: + corrected_names.append(name) + + # Retrieve the evaluators return [ correction_set.compound[name] if name in correction_set.compound else correction_set[name] - for name in names + for name in corrected_names ] - def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: """ Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` @@ -240,13 +250,14 @@ def get_jec_config_default(self: Calibrator) -> DotDict: raw_met_name="RawMET", # custom uncertainty sources, defaults to config when empty uncertainty_sources=None, - # toggle for propagation to MET + # toggle for propagation to PuppiMET propagate_met=True, - # function to determine the correction file - get_jec_file=get_jerc_file_default, - # function to determine the jec configuration dict + # # function to determine the correction file + get_jec_file=get_jec_file_default, + # # function to determine the jec configuration dict get_jec_config=get_jec_config_default, ) + def jec( self: Calibrator, events: ak.Array, @@ -256,7 +267,7 @@ def jec( ) -> ak.Array: """Performs the jet energy corrections (JECs) and uncertainty shifts using the :external+correctionlib:doc:`index`, optionally - propagating the changes to the MET. + propagating the changes to the PuppiMET. The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate (default: ``Jet``, i.e. AK4 jets). @@ -313,16 +324,18 @@ def jec( :param events: awkward array containing events to process :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values - to the missing transverse energy (MET) using + to the missing transverse energy (PuppiMET) using :py:func:`~columnflow.calibration.util.propagate_met` for events where ``met.pt > *min_pt_met_prop*``. :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet - values to the missing transverse energy (MET) using + values to the missing transverse energy (PuppiMET) using :py:func:`~columnflow.calibration.util.propagate_met` for events where ``met.eta > *min_eta_met_prop*``. 
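    A minimal sketch of what the propagation does (following
    :py:func:`~columnflow.calibration.util.propagate_met`): the change of the summed jet
    momentum is subtracted from the input MET vector, i.e. in px/py components

    .. code-block:: python

        # illustrative variable names only
        met_px_new = met_pt * np.cos(met_phi) - (jet_px_corrected - jet_px_before)
        met_py_new = met_pt * np.sin(met_phi) - (jet_py_corrected - jet_py_before)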
""" # noqa - # use local variable for convenience - jet_name = self.jet_name + + # calculate uncorrected pt, mass + events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) + events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) # calculate uncorrected pt, mass events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) @@ -340,6 +353,8 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): # apply all correctors sequentially, updating the pt each time full_correction = ak.ones_like(pt, dtype=np.float32) + + for corrector in self.evaluators[evaluator_key]: # determine correct inputs (change depending on corrector) inputs = [ @@ -348,6 +363,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): ] correction = ak_evaluate(corrector, *inputs) # update pt for subsequent correctors + #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes variable_map["JetPt"] = variable_map["JetPt"] * correction full_correction = full_correction * correction @@ -361,7 +377,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): ) # correct jets with only a subset of correction levels - # (for calculating TypeI MET correction) + # (for calculating TypeI PuppiMET correction) if self.propagate_met: # get correction factors jec_factors_subset_type1_met = correct_jets( @@ -378,7 +394,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - # store pt and phi of the full jet system for MET propagation, including a selection in raw info + # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) jetsum = events[jet_name][met_prop_mask].sum(axis=1) @@ -408,18 +424,20 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): jetsum = events[jet_name][met_prop_mask].sum(axis=1) jetsum_pt_all_levels = jetsum.pt jetsum_phi_all_levels = jetsum.phi - # propagate changes to MET, starting from jets corrected with subset of JEC levels + + # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels # (recommendation is to propagate only L2 corrections and onwards) met_pt, met_phi = propagate_met( jetsum_pt_subset_type1_met, jetsum_phi_subset_type1_met, jetsum_pt_all_levels, jetsum_phi_all_levels, - events[self.raw_met_name].pt, - events[self.raw_met_name].phi, + events.RawPuppiMET.pt, + events.RawPuppiMET.phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt", met_pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi", met_phi) + + events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) + events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) # variable naming conventions variable_map = { @@ -447,7 +465,7 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), ) - # propagate shifts to MET + # propagate shifts to PuppiMET if self.propagate_met: jet_pt_up = 
events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] @@ -467,10 +485,10 @@ def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): met_pt, met_phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jec_{name}_up", met_pt_up) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jec_{name}_down", met_pt_down) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jec_{name}_up", met_phi_up) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jec_{name}_down", met_phi_down) + events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) + events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) + events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) + events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) return events @@ -497,14 +515,14 @@ def jec_init(self: Calibrator) -> None: for junc_dir in ("up", "down") } - # add MET variables + # add PuppiMET variables if self.propagate_met: - self.uses.add(f"{self.raw_met_name}.{{pt,phi}}") - self.produces.add(f"{self.met_name}.{{pt,phi}}") + self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} + self.produces |= {"PuppiMET.pt", "PuppiMET.phi"} - # add shifted MET variables + # add shifted PuppiMET variables self.produces |= { - f"{self.met_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" + f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" for shifted_var in ("pt", "phi") for junc_name in sources for junc_dir in ("up", "down") @@ -544,27 +562,25 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert .. code-block:: python cfg.x.jec = DotDict.wrap({ - "Jet": { - # campaign name for this JEC correctiono - "campaign": f"Summer19UL{year2}{jerc_postfix}", - # version of the corrections - "version": "V7", - # Type of jets that the corrections should be applied on - "jet_type": "AK4PFchs", - # relevant levels in the derivation process of the JEC - "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], - # relevant levels in the derivation process of the Type 1 MET JEC - "levels_for_type1_met": ["L1FastJet"], - # names of the uncertainties to be applied - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ], - }, + # campaign name for this JEC correctiono + "campaign": f"Summer19UL{year2}{jerc_postfix}", + # version of the corrections + "version": "V7", + # Type of jets that the corrections should be applied on + "jet_type": "AK4PFchs", + # relevant levels in the derivation process of the JEC + "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], + # relevant levels in the derivation process of the Type 1 PuppiMET JEC + "levels_for_type1_met": ["L1FastJet"], + # names of the uncertainties to be applied + "uncertainty_sources": [ + "Total", + "CorrelationGroupMPFInSitu", + "CorrelationGroupIntercalibration", + "CorrelationGroupbJES", + "CorrelationGroupFlavor", + "CorrelationGroupUncorrelated", + ], }) :param reqs: Requirement dictionary for this @@ -572,10 +588,12 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert :param inputs: Additional inputs, currently not used :param reader_targets: TODO: add documentation """ - bundle = reqs["external_files"] + bundle = reqs["external_files"] + # import the 
correction sets from the external file import correctionlib + correction_set = correctionlib.CorrectionSet.from_string( self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), ) @@ -585,6 +603,7 @@ def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): if is_data: + jec_era = self.dataset_inst.get_aux("jec_era", None) # if no special JEC era is specified, infer based on 'era' if jec_era is None: @@ -601,8 +620,11 @@ def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): sources = self.uncertainty_sources if sources is None: sources = jec_cfg.uncertainty_sources - - jec_keys = make_jme_keys(jec_cfg.levels) + + if self.dataset_inst.is_data : + jec_keys = make_jme_keys(jec_cfg.levels_DATA) + else : + jec_keys = make_jme_keys(jec_cfg.levels_MC) jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys @@ -617,14 +639,8 @@ def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): # custom jec calibrator that only runs nominal correction jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) -# explicit calibrators for standard jet collections -jec_ak4 = jec.derive("jec_ak4", cls_dict={"jet_name": "Jet"}) -jec_ak8 = jec.derive("jec_ak8", cls_dict={"jet_name": "FatJet", "propagate_met": False}) -jec_ak4_nominal = jec_ak4.derive("jec_ak4", cls_dict={"uncertainty_sources": []}) -jec_ak8_nominal = jec_ak8.derive("jec_ak8", cls_dict={"uncertainty_sources": []}) - - -def get_jer_config_default(self: Calibrator) -> DotDict: +# define default functions for jec calibrator +def get_jer_file(self, external_files: DotDict) -> str: """ Load config relevant to the jet energy resolution (JER) smearing. 
@@ -679,15 +695,18 @@ def get_jer_config_default(self: Calibrator) -> DotDict: uses={ optional("Rho.fixedGridRhoFastjetAll"), optional("fixedGridRhoFastjetAll"), + "GenJet.pt", "GenJet.eta", "GenJet.phi", + "PuppiMET.pt", "PuppiMET.phi", attach_coffea_behavior, }, - # name of the jet collection to smear - jet_name="Jet", - # name of the associated gen jet collection - gen_jet_name="GenJet", - # name of the associated MET collection - met_name="MET", - # toggle for propagation to MET + produces={ + "Jet.pt", "Jet.mass", + "Jet.pt_unsmeared", "Jet.mass_unsmeared", + "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", + "PuppiMET.pt", "PuppiMET.phi", + "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", + }, + # toggle for propagation to PuppiMET propagate_met=True, # only run on mc mc_only=True, @@ -875,27 +894,27 @@ def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: if self.propagate_met: # save unsmeared quantities - events = set_ak_column_f32(events, f"{self.met_name}.pt_unsmeared", events[self.met_name].pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi_unsmeared", events[self.met_name].phi) + events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) + events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) # get pt and phi of all jets after correcting jetsum = events[jet_name].sum(axis=1) jetsum_pt_after = jetsum.pt jetsum_phi_after = jetsum.phi - # propagate changes to MET + # propagate changes to PuppiMET met_pt, met_phi = propagate_met( jetsum_pt_before, jetsum_phi_before, jetsum_pt_after, jetsum_phi_after, - events[self.met_name].pt, - events[self.met_name].phi, + events.PuppiMET.pt, + events.PuppiMET.phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt", met_pt) - events = set_ak_column_f32(events, f"{self.met_name}.phi", met_phi) + events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) + events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - # syst variations on top of corrected MET + # syst variations on top of corrected PuppiMET met_pt_up, met_phi_up = propagate_met( jetsum_pt_after, jetsum_phi_after, @@ -912,10 +931,10 @@ def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: met_pt, met_phi, ) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jer_up", met_pt_up) - events = set_ak_column_f32(events, f"{self.met_name}.pt_jer_down", met_pt_down) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jer_up", met_phi_up) - events = set_ak_column_f32(events, f"{self.met_name}.phi_jer_down", met_phi_down) + events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) + events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) + events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) + events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) return events @@ -926,22 +945,13 @@ def jer_init(self: Calibrator) -> None: lower_first = lambda s: s[0].lower() + s[1:] if s else s self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" - # register used jet columns - self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,{self.gen_jet_idx_column}}}") - - # register used gen jet columns - self.uses.add(f"{self.gen_jet_name}.{{pt,eta,phi}}") - - # register produced jet columns - self.produces.add(f"{self.jet_name}.{{pt,mass}}{{,_unsmeared,_jer_up,_jer_down}}") - - # register produced MET columns - if self.propagate_met: - # register used MET columns - 
self.uses.add(f"{self.met_name}.{{pt,phi}}") - - # register produced MET columns - self.produces.add(f"{self.met_name}.{{pt,phi}}{{,_jer_up,_jer_down,_unsmeared}}") + self.uses |= { + "PuppiMET.pt", "PuppiMET.phi", + } + self.produces |= { + "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", + "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", + } @jer.requires @@ -994,7 +1004,7 @@ def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: Insert correction_set = correctionlib.CorrectionSet.from_string( self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), ) - + # compute JER keys from config information jer_cfg = self.get_jer_config() jer_keys = { @@ -1032,11 +1042,7 @@ def deterministic_normal(loc, scale, seed): @calibrator( uses={jec, jer}, produces={jec, jer}, - # name of the jet collection to smear - jet_name="Jet", - # name of the associated gen jet collection (for JER smearing) - gen_jet_name="GenJet", - # toggle for propagation to MET + # toggle for propagation to PuppiMET propagate_met=None, # functions to determine configs and files get_jec_file=None, diff --git a/columnflow/calibration/cms/met.py b/columnflow/calibration/cms/met.py index 01b6ea9ef..aec3ca73b 100644 --- a/columnflow/calibration/cms/met.py +++ b/columnflow/calibration/cms/met.py @@ -1,7 +1,7 @@ # coding: utf-8 """ -MET corrections. +PuppiMET corrections. """ from columnflow.calibration import Calibrator, calibrator @@ -13,9 +13,8 @@ @calibrator( - uses={"run", "PV.npvs"}, - # name of the MET collection to calibrate - met_name="MET", + uses={"run", "PV.npvs", "PuppiMET.pt", "PuppiMET.phi"}, + produces={"PuppiMET.pt", "PuppiMET.phi"}, # function to determine the correction file get_met_file=(lambda self, external_files: external_files.met_phi_corr), # function to determine met correction config @@ -23,9 +22,9 @@ ) def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: """ - Performs the MET phi (type II) correction using the + Performs the PuppiMET phi (type II) correction using the :external+correctionlib:doc:`index` for events there the - uncorrected MET pt is below the beam energy (extracted from ``config_inst.campaign.ecm * 0.5``). + uncorrected PuppiMET pt is below the beam energy (extracted from ``config_inst.campaign.ecm * 0.5``). Requires an external file in the config under ``met_phi_corr``: .. 
code-block:: python @@ -54,16 +53,16 @@ def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: met = events[self.met_name] # copy the intial pt and phi values - corr_pt = np.array(met.pt, dtype=np.float32) - corr_phi = np.array(met.phi, dtype=np.float32) + corr_pt = np.array(events.PuppiMET.pt, dtype=np.float32) + corr_phi = np.array(events.PuppiMET.phi, dtype=np.float32) - # select only events where MET pt is below the expected beam energy - mask = met.pt < (0.5 * self.config_inst.campaign.ecm) + # select only events where PuppiMET pt is below the expected beam energy + mask = events.PuppiMET.pt < (0.5 * self.config_inst.campaign.ecm) # arguments for evaluation args = ( - met.pt[mask], - met.phi[mask], + events.PuppiMET.pt[mask], + events.PuppiMET.phi[mask], ak.values_astype(events.PV.npvs[mask], np.float32), ak.values_astype(events.run[mask], np.float32), ) @@ -73,8 +72,8 @@ def met_phi(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: corr_phi[mask] = self.met_phi_corrector.evaluate(*args) # save the corrected values - events = set_ak_column(events, f"{self.met_name}.pt", corr_pt, value_type=np.float32) - events = set_ak_column(events, f"{self.met_name}.phi", corr_phi, value_type=np.float32) + events = set_ak_column(events, "PuppiMET.pt", corr_pt, value_type=np.float32) + events = set_ak_column(events, "PuppiMET.phi", corr_phi, value_type=np.float32) return events @@ -110,7 +109,7 @@ def met_phi_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: di :param reader_targets: Additional targets, currently not used. """ bundle = reqs["external_files"] - + # create the pt and phi correctors import correctionlib correction_set = correctionlib.CorrectionSet.from_string( diff --git a/columnflow/calibration/util.py b/columnflow/calibration/util.py index ac20de9bb..af9955836 100644 --- a/columnflow/calibration/util.py +++ b/columnflow/calibration/util.py @@ -13,6 +13,9 @@ np = maybe_import("numpy") ak = maybe_import("awkward") +import law + +logger = law.logger.get_logger(__name__) # https://github.com/scikit-hep/awkward/issues/489\#issuecomment-711090923 def ak_random(*args, rand_func: Callable) -> ak.Array: @@ -91,7 +94,29 @@ def propagate_met( if jet_pt2.ndim > 1: jet_px2 = ak.sum(jet_px2, axis=1) jet_py2 = ak.sum(jet_py2, axis=1) - + + # RawPuppiMET sanity check + + crazy_PuppiMET_values_mask = met_pt1 > 14*10**3 + + crazy_PuppiMET_values = met_pt1[crazy_PuppiMET_values_mask] + + # Get the indices of the infinite values + crazy_PuppiMET_indices = np.where(crazy_PuppiMET_values_mask)[0] + + # Count the number of infinite values + crazy_PuppiMET_count = ak.sum(crazy_PuppiMET_values_mask) + + if crazy_PuppiMET_count > 0: + # Replace infinite values with 0 + met_pt1 = ak.where(~crazy_PuppiMET_values_mask, met_pt1, 1000) + + # Raise a warning about the replacement + logger.warning( + f"Warning: Found and replaced {crazy_PuppiMET_count} crazy value(s) {crazy_PuppiMET_values.tolist()} in 'RawPuppiMET.pt' with 1000.\n" + f"Indices in the chuck: {crazy_PuppiMET_indices.tolist()}\n" + f"We will get rid of these events in the selection step") + # propagate to met met_px2 = met_pt1 * np.cos(met_phi1) - (jet_px2 - jet_px1) met_py2 = met_pt1 * np.sin(met_phi1) - (jet_py2 - jet_py1) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 7651675f5..9057b3faf 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -1354,6 +1354,68 @@ def ak_copy(ak_array: ak.Array) -> ak.Array: return 
layout_ak_array(np.array(ak.flatten(ak_array)), ak_array) +def fill_hist( + h: hist.Hist, + data: ak.Array | np.array | dict[str, ak.Array | np.array], + *, + last_edge_inclusive: bool | None = None, + fill_kwargs: dict[str, Any] | None = None, +) -> None: + """ + Fills a histogram *h* with data from an awkward array, numpy array or nested dictionary *data*. + The data is assumed to be structured in the same way as the histogram axes. If + *last_edge_inclusive* is *True*, values that would land exactly on the upper-most bin edge of an + axis are shifted into the last bin. If it is *None*, the behavior is determined automatically + and depends on the variable axis type. In this case, shifting is applied to all continuous, + non-circular axes. + """ + if fill_kwargs is None: + fill_kwargs = {} + + # helper to decide whether the variable axis qualifies for shifting the last bin + def allows_shift(ax) -> bool: + return ax.traits.continuous and not ax.traits.circular + + # determine the axis names, figure out which which axes the last bin correction should be done + axis_names = [] + correct_last_bin_axes = [] + for ax in h.axes: + axis_names.append(ax.name) + # include values hitting last edge? + if not len(ax.widths) or not isinstance(ax, hist.axis.Variable): + continue + if (last_edge_inclusive is None and allows_shift(ax)) or last_edge_inclusive: + correct_last_bin_axes.append(ax) + + # check data + if not isinstance(data, dict): + if len(axis_names) != 1: + raise ValueError("got multi-dimensional hist but only one dimensional data") + data = {axis_names[0]: data} + else: + for name in axis_names: + if name not in data and name not in fill_kwargs: + raise ValueError(f"missing data for histogram axis '{name}'") + + # correct last bin values + for ax in correct_last_bin_axes: + right_egde_mask = ak.flatten(data[ax.name], axis=None) == ax.edges[-1] + if np.any(right_egde_mask): + data[ax.name] = ak.copy(data[ax.name]) + flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 + + # fill + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + + class RouteFilter(object): """ Shallow helper class that handles removal of routes in an awkward array that do not match those diff --git a/columnflow/production/cms/pileup.py b/columnflow/production/cms/pileup.py index 438d43889..346be3125 100644 --- a/columnflow/production/cms/pileup.py +++ b/columnflow/production/cms/pileup.py @@ -58,6 +58,7 @@ def pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: ### Keeps the pu_weight lower then 300 pu_weight[pu_weight > 300] = 0 ##################################################### + events = set_ak_column(events, column_name, pu_weight, value_type=np.float32) return events diff --git a/columnflow/selection/cms/jets.py b/columnflow/selection/cms/jets.py index 945ed1b1e..89be152c3 100644 --- a/columnflow/selection/cms/jets.py +++ b/columnflow/selection/cms/jets.py @@ -22,7 +22,7 @@ @selector( uses={ - "Jet.{pt,eta,phi,mass,jetId,chEmEF}", optional("Jet.puId"), + "Jet.{pt,eta,phi,mass,jetId,chEmEF}", "Muon.{pt,eta,phi,mass,isPFcand}", }, produces={"Jet.veto_map_mask"}, @@ -59,7 +59,7 @@ def jet_veto_map( # loose jet selection jet_mask = ( (jet.pt > 15) & - (jet.jetId >= 2) & # tight id + (jet.jetId >= 2) & # tight id (jet.chEmEF < 0.9) & 
ak.all(events.Jet.metric_table(muon) >= 0.2, axis=2) ) diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index b922684a8..6709d3fbb 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -213,6 +213,11 @@ def create_branch_map(self): def workflow_requires(self): reqs = super().workflow_requires() + + # no need to require merged histograms since each branch already requires them as a workflow + # if self.workflow == "local": + # reqs.pop("merged_hists", None) + return reqs def requires(self): diff --git a/sandboxes/cmssw_columnar.sh b/sandboxes/cmssw_columnar.sh index f626eaccd..5dd283d28 100644 --- a/sandboxes/cmssw_columnar.sh +++ b/sandboxes/cmssw_columnar.sh @@ -10,10 +10,10 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_SCRAM_ARCH="el9_amd64_gcc12" + export CF_CMSSW_VERSION="CMSSW_14_1_0_pre4" # export CF_SCRAM_ARCH="$( [ "${os_version}" = "8" ] && echo "el8" || echo "slc7" )_amd64_gcc10" # export CF_CMSSW_VERSION="CMSSW_12_6_2" - export CF_SCRAM_ARCH=el9_amd64_gcc12 - export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed diff --git a/sandboxes/cmssw_default.sh b/sandboxes/cmssw_default.sh index cbfd928f8..95dcaf592 100644 --- a/sandboxes/cmssw_default.sh +++ b/sandboxes/cmssw_default.sh @@ -10,8 +10,8 @@ action() { # set variables and source the generic CMSSW setup export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" - export CF_SCRAM_ARCH=el9_amd64_gcc12 - export CF_CMSSW_VERSION=CMSSW_14_1_0_pre4 + export CF_SCRAM_ARCH="el9_amd64_gcc12" + export CF_CMSSW_VERSION="CMSSW_14_1_0_pre4" export CF_CMSSW_ENV_NAME="$( basename "${this_file%.sh}" )" export CF_CMSSW_FLAG="1" # increment when content changed From c83375288715e459a3ce8663c77fd14ad1dfb19e Mon Sep 17 00:00:00 2001 From: Jacopo Malvaso Date: Tue, 3 Dec 2024 15:07:17 +0100 Subject: [PATCH 04/26] Modification to the legend position and columns --- columnflow/plotting/plot_all.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/plot_all.py b/columnflow/plotting/plot_all.py index 60207a301..a30db1e58 100644 --- a/columnflow/plotting/plot_all.py +++ b/columnflow/plotting/plot_all.py @@ -273,9 +273,15 @@ def plot_all( if not skip_legend: # resolve legend kwargs legend_kwargs = { - "ncols": 1, - "loc": "upper right", + "ncol": 2, + "loc": "center left", + "bbox_to_anchor": (0.35, 0.8), # Position the legend outside the plot + # Moves the legend to the right side of the plot. + # The first value (1) controls the horizontal position, + # and the second value (0.95) controls the vertical position. 
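+            # (with the default bbox_transform this tuple is in axes coordinates, so
+            # (0.35, 0.8) anchors the legend at 35% of the x range and 80% of the y range)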
+ "fontsize": 16, } + legend_kwargs.update(style_config.get("legend_cfg", {})) # retrieve the legend handles and their labels From 394805c0f3c5cb34d155d9eb54f42941f66786f6 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 10 Dec 2024 15:50:42 +0100 Subject: [PATCH 05/26] Fake factor estimation code: initial commit --- columnflow/tasks/data_driven_methods.py | 731 ++++++++++++++++++++++++ law.cfg | 1 + 2 files changed, 732 insertions(+) create mode 100644 columnflow/tasks/data_driven_methods.py diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py new file mode 100644 index 000000000..0f47638e6 --- /dev/null +++ b/columnflow/tasks/data_driven_methods.py @@ -0,0 +1,731 @@ +# # coding: utf-8 + +# """ +# Tasks to plot different types of histograms. +# """ + +# from collections import OrderedDict +# from abc import abstractmethod + +# import law +# import luigi + +# from columnflow.tasks.framework.base import Requirements, ShiftTask +# from columnflow.tasks.framework.mixins import ( +# CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, +# CategoriesMixin, ShiftSourcesMixin, +# ) +# from columnflow.tasks.framework.plotting import ( +# PlotBase, PlotBase2D, ProcessPlotSettingMixin, VariablePlotSettingMixin, +# ) +# from columnflow.tasks.framework.decorators import view_output_plots +# from columnflow.tasks.framework.remote import RemoteWorkflow +# from columnflow.tasks.histograms import MergeHistograms +# from columnflow.util import DotDict, dev_sandbox, dict_add_strict + + +# class DataDrivenEstimationBase( +# VariablePlotSettingMixin, +# ProcessPlotSettingMixin, +# CategoriesMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) +# """sandbox to use for this task. Defaults to *default_columnar_sandbox* from +# analysis config. 
+# """ + +# exclude_index = True + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) +# """Set upstream requirements, in this case :py:class:`~columnflow.tasks.histograms.MergeHistograms` +# """ + +# def store_parts(self): +# parts = super().store_parts() +# parts.insert_before("version", "plot", f"datasets_{self.datasets_repr}") +# return parts + +# def create_branch_map(self): +# return [ +# DotDict({"category": cat_name, "variable": var_name}) +# for cat_name in sorted(self.categories) +# for var_name in sorted(self.variables) +# ] + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# reqs["merged_hists"] = self.requires_from_branch() + +# return reqs + +# @abstractmethod +# def get_plot_shifts(self): +# return + +# @law.decorator.log +# @view_output_plots +# def run(self): +# import hist +# import numpy as np +# from cmsdb.processes.qcd import qcd + +# # get the shifts to extract and plot +# plot_shifts = law.util.make_list(self.get_plot_shifts()) + +# # prepare config objects +# variable_tuple = self.variable_tuples[self.branch_data.variable] +# variable_insts = [ +# self.config_inst.get_variable(var_name) +# for var_name in variable_tuple +# ] +# category_inst = self.config_inst.get_category(self.branch_data.category) +# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] +# process_insts = list(map(self.config_inst.get_process, self.processes)) +# sub_process_insts = { +# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] +# for proc in process_insts +# } + +# # histogram data per process +# hists = {} +# if 'ff_control_reg' in category_inst.name : +# with self.publish_step(f"estimating qcd for {self.branch_data.variable} in {category_inst.name}"): +# for dataset, inp in self.input().items(): +# dataset_inst = self.config_inst.get_dataset(dataset) +# h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") + +# # loop and extract one histogram per process +# for process_inst in process_insts: +# # skip when the dataset is already known to not contain any sub process +# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): +# continue +# # work on a copy +# h = h_in.copy() +# # axis selections +# h = h[{ +# "process": [ +# hist.loc(p.id) +# for p in sub_process_insts[process_inst] +# if p.id in h.axes["process"] +# ], +# "category": [ +# hist.loc(c.id) +# for c in leaf_category_insts +# if c.id in h.axes["category"] +# ], +# "shift": [ +# hist.loc(s.id) +# for s in plot_shifts +# if s.id in h.axes["shift"] +# ], +# }] + +# # axis reductions +# h = h[{"process": sum, "category": sum}] + +# # add the histogram +# if process_inst in hists: +# hists[process_inst] += h +# else: +# hists[process_inst] = h + +# # there should be hists to plot +# if not hists: +# raise Exception( +# "no histograms found to plot; possible reasons:\n" + +# " - requested variable requires columns that were missing during histogramming\n" + +# " - selected --processes did not match any value on the process axis of the input histogram", +# ) + +# # sort hists by process order +# hists = OrderedDict( +# (process_inst.copy_shallow(), hists[process_inst]) +# for process_inst in sorted(hists, key=process_insts.index) +# ) + +# qcd_hist = None +# qcd_hist_values = None +# for process_inst, h in hists.items(): +# hist_np , _ , _ = h.to_numpy(flow=True) +# if qcd_hist is None: +# qcd_hist = h.copy() +# qcd_hist_values = 
np.zeros_like(hist_np) +# if process_inst.is_data: qcd_hist_values += hist_np +# else: qcd_hist_values -= hist_np + +# #if the array contains negative values, set them to zero +# qcd_hist_values = np.where(qcd_hist_values > 0, qcd_hist_values, 0) +# qcd_hist.view(flow=True).value[:] = qcd_hist_values +# qcd_hist.view(flow=True).variance[:] = np.zeros_like(qcd_hist_values) +# qcd_hist +# #register a new datased at the hlist +# hists[qcd] = qcd_hist +# #save qcd estimation histogram and plots only for control region + +# self.output()["qcd_hists"][self.branch_data.variable].dump(qcd_hist, formatter="pickle") +# # call the plot function +# fig, _ = self.call_plot_func( +# self.plot_function, +# hists=hists, +# config_inst=self.config_inst, +# category_inst=category_inst.copy_shallow(), +# variable_insts=[var_inst.copy_shallow() for var_inst in variable_insts], +# **self.get_plot_parameters(), +# ) +# # save the plot +# for outp in self.output()["plots"]: +# outp.dump(fig, formatter="mpl") +# else: +# self.publish_step(f"Category: {category_inst.name} isn't used to estimate QCD, skipping this task.") + + +# class DataDrivenEstimationSingleShift( +# DataDrivenEstimationBase, +# ShiftTask, +# ): +# exclude_index = True + +# # upstream requirements +# reqs = Requirements( +# DataDrivenEstimationBase.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# def create_branch_map(self): +# return [ +# DotDict({"category": cat_name, "variable": var_name}) +# for var_name in sorted(self.variables) +# for cat_name in sorted(self.categories) +# ] + +# def requires(self): +# return { +# d: self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# branch=-1, +# _exclude={"branches"}, +# _prefer_cli={"variables"}, +# ) +# for d in self.datasets +# } + +# def output(self): +# b = self.branch_data +# return {"plots": [ +# self.target(name) +# for name in self.get_plot_names(f"plot__proc_{self.processes_repr}__cat_{b.category}__var_{b.variable}") +# ], +# "qcd_hists": law.SiblingFileCollection({ +# variable_name: self.target(f"qcd_histogram__{b.category}_{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# def get_plot_shifts(self): +# return [self.global_shift_inst] + + +# class DataDrivenEstimation( +# DataDrivenEstimationSingleShift, +# DataDrivenEstimationBase, +# ): +# plot_function = PlotBase.plot_function.copy( +# default="columnflow.plotting.plot_functions_1d.plot_variable_per_process", +# add_default_to_description=True, +# ) + + + + +# coding: utf-8 + +""" +Task to produce and merge histograms. 
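(Adapted here as a starting point for the fake factor estimation tasks, see
:py:class:`CreateFakeFactorHistograms` below.)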
+""" + +from __future__ import annotations + +import luigi +import law + +from columnflow.tasks.framework.base import Requirements, AnalysisTask, DatasetTask, wrapper_factory +from columnflow.tasks.framework.mixins import ( + CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, VariablesMixin, + ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, +) +from columnflow.tasks.framework.remote import RemoteWorkflow +from columnflow.tasks.framework.parameters import last_edge_inclusive_inst +from columnflow.tasks.reduction import ReducedEventsUser +from columnflow.tasks.production import ProduceColumns +from columnflow.tasks.ml import MLEvaluation +from columnflow.util import dev_sandbox + + +class CreateFakeFactorHistograms( + VariablesMixin, + WeightProducerMixin, + ProducersMixin, + ReducedEventsUser, + ChunkedIOMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + last_edge_inclusive = last_edge_inclusive_inst + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + ReducedEventsUser.reqs, + RemoteWorkflow.reqs, + ProduceColumns=ProduceColumns, + ) + + # strategy for handling missing source columns when adding aliases on event chunks + missing_column_alias_strategy = "original" + + # names of columns that contain category ids + # (might become a parameter at some point) + category_id_columns = {"category_ids"} + + # register sandbox and shifts found in the chosen weight producer to this task + register_weight_producer_sandbox = True + register_weight_producer_shifts = True + + @law.util.classproperty + def mandatory_columns(cls) -> set[str]: + return set(cls.category_id_columns) | {"process_id"} + + def workflow_requires(self): + reqs = super().workflow_requires() + + # require the full merge forest + reqs["events"] = self.reqs.ProvideReducedEvents.req(self) + + if not self.pilot: + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + + # add weight_producer dependent requirements + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + + return reqs + + def requires(self): + reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + + # add weight_producer dependent requirements + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + + return reqs + + workflow_condition = ReducedEventsUser.workflow_condition.copy() + + @workflow_condition.output + def output(self): + return {"hists": self.target(f"fake_factor__{self.branch}.pickle")} + + @law.decorator.log + @law.decorator.localize(input=True, output=False) + @law.decorator.safe_output + def run(self): + import hist + import numpy as np + import awkward as ak + from columnflow.columnar_util import ( + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + ) + + # prepare inputs + inputs = self.input() + + # declare output: dict of histograms + histograms = {} + + # run the weight_producer setup + producer_reqs = self.weight_producer_inst.run_requires() + reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + + # create a temp 
dir for saving intermediate files + tmp_dir = law.LocalDirectoryTarget(is_tmp=True) + tmp_dir.touch() + + # get shift dependent aliases + aliases = self.local_shift_inst.x("column_aliases", {}) + + # define columns that need to be read + read_columns = {Route("process_id")} + read_columns |= set(map(Route, self.category_id_columns)) + read_columns |= set(self.weight_producer_inst.used_columns) + read_columns |= set(map(Route, aliases.values())) + read_columns |= { + Route(the_var) for the_var in self.config_inst.x.fake_factor_method.vars.keys() + } + from IPython import embed; embed() + # empty float array to use when input files have no entries + empty_f32 = ak.Array(np.array([], dtype=np.float32)) + + # iterate over chunks of events and diffs + file_targets = [inputs["events"]["events"]] + if self.producer_insts: + file_targets.extend([inp["columns"] for inp in inputs["producers"]]) + + # prepare inputs for localization + with law.localize_file_targets( + [*file_targets, *reader_targets.values()], + mode="r", + ) as inps: + for (events, *columns), pos in self.iter_chunked_io( + [inp.path for inp in inps], + source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), + read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], + chunk_size=self.weight_producer_inst.get_min_chunk_size(), + ): + # optional check for overlapping inputs + if self.check_overlapping_inputs: + self.raise_if_overlapping([events] + list(columns)) + + # add additional columns + events = update_ak_array(events, *columns) + + # add aliases + events = add_ak_aliases( + events, + aliases, + remove_src=True, + missing_strategy=self.missing_column_alias_strategy, + ) + + # build the full event weight + if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): + events, weight = self.weight_producer_inst(events) + else: + weight = ak.Array(np.ones(len(events), dtype=np.float32)) + # define and fill histograms, taking into account multiple axes + + h = (hist.Hist.new + .IntCat([], name="category", growth=True) + .IntCat([], name="process", growth=True) + .IntCat([], name="shift", growth=True)) + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.vars.items(): + h = eval(f'h.{var_axis}') + + histograms['fake_factor'] = h.Weight() + + category_ids = ak.concatenate( + [Route(c).apply(events) for c in self.category_id_columns], + axis=-1, + ) + # broadcast arrays so that each event can be filled for all its categories + fill_data = { + "category": category_ids, + "process": events.process_id, + "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + "weight": weight, + } + # for variable_inst in self.config_inst.x.fake_factor_method.vars.: + # # prepare the expression + # expr = variable_inst.expression + # if isinstance(expr, str): + # route = Route(expr) + # def expr(events, *args, **kwargs): + # if len(events) == 0 and not has_ak_column(events, route): + # return empty_f32 + # return route.apply(events, null_value=variable_inst.null_value) + # fill_data[variable_inst.name] = expr(events) + from IPython import embed; embed() + # for var_key, var_names in self.variable_tuples.items(): + # variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + # # create the histogram if not present yet + # if var_key not in histograms: + # h = ( + # hist.Hist.new + # .IntCat([], name="category", growth=True) + # .IntCat([], name="process", growth=True) + # .IntCat([], name="shift", growth=True) + 
# ) + # # add variable axes + # for variable_inst in variable_insts: + # h = h.Var( + # variable_inst.bin_edges, + # name=variable_inst.name, + # label=variable_inst.get_full_x_title(), + # ) + # # enable weights and store it + # histograms[var_key] = h.Weight() + + # # merge category ids + # category_ids = ak.concatenate( + # [Route(c).apply(events) for c in self.category_id_columns], + # axis=-1, + # ) + + # broadcast arrays so that each event can be filled for all its categories + # fill_data = { + # "category": category_ids, + # "process": events.process_id, + # "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + # "weight": weight, + # } + # for variable_inst in variable_insts: + # # prepare the expression + # expr = variable_inst.expression + # if isinstance(expr, str): + # route = Route(expr) + # def expr(events, *args, **kwargs): + # if len(events) == 0 and not has_ak_column(events, route): + # return empty_f32 + # return route.apply(events, null_value=variable_inst.null_value) + # # apply it + # fill_data[variable_inst.name] = expr(events) + + # # fill it + # fill_hist( + # histograms[var_key], + # fill_data, + # last_edge_inclusive=self.last_edge_inclusive, + # ) + + # merge output files + self.output()["hists"].dump(histograms, formatter="pickle") + + +# overwrite class defaults +check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +CreateFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( + default=CreateFakeFactorHistograms.task_family in check_overlap_tasks, + add_default_to_description=True, +) + + +CreateFakeFactorHistogramsWrapper = wrapper_factory( + base_cls=AnalysisTask, + require_cls=CreateFakeFactorHistograms, + enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +) + + +# class MergeHistograms( +# VariablesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# DatasetTask, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# only_missing = luigi.BoolParameter( +# default=False, +# description="when True, identify missing variables first and only require histograms of " +# "missing ones; default: False", +# ) +# remove_previous = luigi.BoolParameter( +# default=False, +# significant=False, +# description="when True, remove particlar input histograms after merging; default: False", +# ) + +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# CreateHistograms=CreateHistograms, +# ) + +# @classmethod +# def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: +# _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} +# kwargs["_prefer_cli"] = _prefer_cli +# return super().req_params(inst, **kwargs) + +# def create_branch_map(self): +# # create a dummy branch map so that this task could be submitted as a job +# return {0: None} + +# def _get_variables(self): +# if self.is_workflow(): +# return self.as_branch()._get_variables() + +# variables = self.variables + +# # optional dynamic behavior: determine not yet created variables and require only those +# if self.only_missing: +# missing = self.output().count(existing=False, keys=True)[1] +# variables = sorted(missing, key=variables.index) + +# return variables + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# if not self.pilot: +# variables = 
self._get_variables() +# if variables: +# reqs["hists"] = self.reqs.CreateHistograms.req_different_branching( +# self, +# branch=-1, +# variables=tuple(variables), +# ) + +# return reqs + +# def requires(self): +# variables = self._get_variables() +# if not variables: +# return [] + +# return self.reqs.CreateHistograms.req_different_branching( +# self, +# branch=-1, +# variables=tuple(variables), +# workflow="local", +# ) + +# def output(self): +# return {"hists": law.SiblingFileCollection({ +# variable_name: self.target(f"hist__{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# @law.decorator.log +# def run(self): +# # preare inputs and outputs +# inputs = self.input()["collection"] +# outputs = self.output() + +# # load input histograms +# hists = [ +# inp["hists"].load(formatter="pickle") +# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) +# ] + +# # create a separate file per output variable +# variable_names = list(hists[0].keys()) +# for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): +# self.publish_message(f"merging histograms for '{variable_name}'") + +# variable_hists = [h[variable_name] for h in hists] +# merged = sum(variable_hists[1:], variable_hists[0].copy()) +# outputs["hists"][variable_name].dump(merged, formatter="pickle") + +# # optionally remove inputs +# if self.remove_previous: +# inputs.remove() + + +# MergeHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=MergeHistograms, +# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +# ) + + +# class MergeShiftedHistograms( +# VariablesMixin, +# ShiftSourcesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# DatasetTask, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # disable the shift parameter +# shift = None +# effective_shift = None +# allow_empty_shift = True + +# # allow only running on nominal +# allow_empty_shift_sources = True + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# def create_branch_map(self): +# # create a dummy branch map so that this task could as a job +# return {0: None} + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# # add nominal and both directions per shift source +# for shift in ["nominal"] + self.shifts: +# reqs[shift] = self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) + +# return reqs + +# def requires(self): +# return { +# shift: self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) +# for shift in ["nominal"] + self.shifts +# } + +# def store_parts(self): +# parts = super().store_parts() +# parts.insert_after("dataset", "shift_sources", f"shifts_{self.shift_sources_repr}") +# return parts + +# def output(self): +# return {"hists": law.SiblingFileCollection({ +# variable_name: self.target(f"shifted_hist__{variable_name}.pickle") +# for variable_name in self.variables +# })} + +# @law.decorator.log +# def run(self): +# # preare inputs and outputs +# inputs = self.input() +# outputs = self.output()["hists"].targets + +# for variable_name, outp in self.iter_progress(outputs.items(), len(outputs)): +# self.publish_message(f"merging histograms for '{variable_name}'") + +# # load hists +# variable_hists = [ +# 
coll["hists"].targets[variable_name].load(formatter="pickle") +# for coll in inputs.values() +# ] + +# # merge and write the output +# merged = sum(variable_hists[1:], variable_hists[0].copy()) +# outp.dump(merged, formatter="pickle") + + +# MergeShiftedHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=MergeShiftedHistograms, +# enable=["configs", "skip_configs", "datasets", "skip_datasets"], +# ) diff --git a/law.cfg b/law.cfg index 86b667a76..0d6ae338f 100644 --- a/law.cfg +++ b/law.cfg @@ -8,6 +8,7 @@ columnflow.tasks.reduction columnflow.tasks.production columnflow.tasks.ml columnflow.tasks.union +columnflow.tasks.data_driven_methods columnflow.tasks.histograms columnflow.tasks.plotting columnflow.tasks.yields From 2a8894ac53ac3bacef3fd2d2a1181ad45a9ace8c Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 20 Jan 2025 20:18:00 +0100 Subject: [PATCH 06/26] Fake factor method, work in progress --- columnflow/tasks/data_driven_methods.py | 689 ++++++------------------ 1 file changed, 166 insertions(+), 523 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 0f47638e6..28082ad77 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -1,254 +1,3 @@ -# # coding: utf-8 - -# """ -# Tasks to plot different types of histograms. -# """ - -# from collections import OrderedDict -# from abc import abstractmethod - -# import law -# import luigi - -# from columnflow.tasks.framework.base import Requirements, ShiftTask -# from columnflow.tasks.framework.mixins import ( -# CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, -# CategoriesMixin, ShiftSourcesMixin, -# ) -# from columnflow.tasks.framework.plotting import ( -# PlotBase, PlotBase2D, ProcessPlotSettingMixin, VariablePlotSettingMixin, -# ) -# from columnflow.tasks.framework.decorators import view_output_plots -# from columnflow.tasks.framework.remote import RemoteWorkflow -# from columnflow.tasks.histograms import MergeHistograms -# from columnflow.util import DotDict, dev_sandbox, dict_add_strict - - -# class DataDrivenEstimationBase( -# VariablePlotSettingMixin, -# ProcessPlotSettingMixin, -# CategoriesMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) -# """sandbox to use for this task. Defaults to *default_columnar_sandbox* from -# analysis config. 
-# """ - -# exclude_index = True - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# MergeHistograms=MergeHistograms, -# ) -# """Set upstream requirements, in this case :py:class:`~columnflow.tasks.histograms.MergeHistograms` -# """ - -# def store_parts(self): -# parts = super().store_parts() -# parts.insert_before("version", "plot", f"datasets_{self.datasets_repr}") -# return parts - -# def create_branch_map(self): -# return [ -# DotDict({"category": cat_name, "variable": var_name}) -# for cat_name in sorted(self.categories) -# for var_name in sorted(self.variables) -# ] - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# reqs["merged_hists"] = self.requires_from_branch() - -# return reqs - -# @abstractmethod -# def get_plot_shifts(self): -# return - -# @law.decorator.log -# @view_output_plots -# def run(self): -# import hist -# import numpy as np -# from cmsdb.processes.qcd import qcd - -# # get the shifts to extract and plot -# plot_shifts = law.util.make_list(self.get_plot_shifts()) - -# # prepare config objects -# variable_tuple = self.variable_tuples[self.branch_data.variable] -# variable_insts = [ -# self.config_inst.get_variable(var_name) -# for var_name in variable_tuple -# ] -# category_inst = self.config_inst.get_category(self.branch_data.category) -# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] -# process_insts = list(map(self.config_inst.get_process, self.processes)) -# sub_process_insts = { -# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] -# for proc in process_insts -# } - -# # histogram data per process -# hists = {} -# if 'ff_control_reg' in category_inst.name : -# with self.publish_step(f"estimating qcd for {self.branch_data.variable} in {category_inst.name}"): -# for dataset, inp in self.input().items(): -# dataset_inst = self.config_inst.get_dataset(dataset) -# h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") - -# # loop and extract one histogram per process -# for process_inst in process_insts: -# # skip when the dataset is already known to not contain any sub process -# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): -# continue -# # work on a copy -# h = h_in.copy() -# # axis selections -# h = h[{ -# "process": [ -# hist.loc(p.id) -# for p in sub_process_insts[process_inst] -# if p.id in h.axes["process"] -# ], -# "category": [ -# hist.loc(c.id) -# for c in leaf_category_insts -# if c.id in h.axes["category"] -# ], -# "shift": [ -# hist.loc(s.id) -# for s in plot_shifts -# if s.id in h.axes["shift"] -# ], -# }] - -# # axis reductions -# h = h[{"process": sum, "category": sum}] - -# # add the histogram -# if process_inst in hists: -# hists[process_inst] += h -# else: -# hists[process_inst] = h - -# # there should be hists to plot -# if not hists: -# raise Exception( -# "no histograms found to plot; possible reasons:\n" + -# " - requested variable requires columns that were missing during histogramming\n" + -# " - selected --processes did not match any value on the process axis of the input histogram", -# ) - -# # sort hists by process order -# hists = OrderedDict( -# (process_inst.copy_shallow(), hists[process_inst]) -# for process_inst in sorted(hists, key=process_insts.index) -# ) - -# qcd_hist = None -# qcd_hist_values = None -# for process_inst, h in hists.items(): -# hist_np , _ , _ = h.to_numpy(flow=True) -# if qcd_hist is None: -# qcd_hist = h.copy() -# qcd_hist_values = 
np.zeros_like(hist_np) -# if process_inst.is_data: qcd_hist_values += hist_np -# else: qcd_hist_values -= hist_np - -# #if the array contains negative values, set them to zero -# qcd_hist_values = np.where(qcd_hist_values > 0, qcd_hist_values, 0) -# qcd_hist.view(flow=True).value[:] = qcd_hist_values -# qcd_hist.view(flow=True).variance[:] = np.zeros_like(qcd_hist_values) -# qcd_hist -# #register a new datased at the hlist -# hists[qcd] = qcd_hist -# #save qcd estimation histogram and plots only for control region - -# self.output()["qcd_hists"][self.branch_data.variable].dump(qcd_hist, formatter="pickle") -# # call the plot function -# fig, _ = self.call_plot_func( -# self.plot_function, -# hists=hists, -# config_inst=self.config_inst, -# category_inst=category_inst.copy_shallow(), -# variable_insts=[var_inst.copy_shallow() for var_inst in variable_insts], -# **self.get_plot_parameters(), -# ) -# # save the plot -# for outp in self.output()["plots"]: -# outp.dump(fig, formatter="mpl") -# else: -# self.publish_step(f"Category: {category_inst.name} isn't used to estimate QCD, skipping this task.") - - -# class DataDrivenEstimationSingleShift( -# DataDrivenEstimationBase, -# ShiftTask, -# ): -# exclude_index = True - -# # upstream requirements -# reqs = Requirements( -# DataDrivenEstimationBase.reqs, -# MergeHistograms=MergeHistograms, -# ) - -# def create_branch_map(self): -# return [ -# DotDict({"category": cat_name, "variable": var_name}) -# for var_name in sorted(self.variables) -# for cat_name in sorted(self.categories) -# ] - -# def requires(self): -# return { -# d: self.reqs.MergeHistograms.req( -# self, -# dataset=d, -# branch=-1, -# _exclude={"branches"}, -# _prefer_cli={"variables"}, -# ) -# for d in self.datasets -# } - -# def output(self): -# b = self.branch_data -# return {"plots": [ -# self.target(name) -# for name in self.get_plot_names(f"plot__proc_{self.processes_repr}__cat_{b.category}__var_{b.variable}") -# ], -# "qcd_hists": law.SiblingFileCollection({ -# variable_name: self.target(f"qcd_histogram__{b.category}_{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# def get_plot_shifts(self): -# return [self.global_shift_inst] - - -# class DataDrivenEstimation( -# DataDrivenEstimationSingleShift, -# DataDrivenEstimationBase, -# ): -# plot_function = PlotBase.plot_function.copy( -# default="columnflow.plotting.plot_functions_1d.plot_variable_per_process", -# add_default_to_description=True, -# ) - - - - -# coding: utf-8 """ Task to produce and merge histograms. 
@@ -262,14 +11,16 @@ from columnflow.tasks.framework.base import Requirements, AnalysisTask, DatasetTask, wrapper_factory from columnflow.tasks.framework.mixins import ( CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, VariablesMixin, - ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, + ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, DatasetsProcessesMixin, CategoriesMixin ) +from columnflow.tasks.framework.plotting import ProcessPlotSettingMixin + from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.framework.parameters import last_edge_inclusive_inst from columnflow.tasks.reduction import ReducedEventsUser from columnflow.tasks.production import ProduceColumns from columnflow.tasks.ml import MLEvaluation -from columnflow.util import dev_sandbox +from columnflow.util import dev_sandbox, DotDict class CreateFakeFactorHistograms( @@ -355,7 +106,7 @@ def run(self): import numpy as np import awkward as ak from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, EMPTY_FLOAT ) # prepare inputs @@ -381,9 +132,8 @@ def run(self): read_columns |= set(self.weight_producer_inst.used_columns) read_columns |= set(map(Route, aliases.values())) read_columns |= { - Route(the_var) for the_var in self.config_inst.x.fake_factor_method.vars.keys() + Route(the_ax.var_route) for the_ax in self.config_inst.x.fake_factor_method.axes.values() } - from IPython import embed; embed() # empty float array to use when input files have no entries empty_f32 = ak.Array(np.array([], dtype=np.float32)) @@ -429,10 +179,10 @@ def run(self): .IntCat([], name="category", growth=True) .IntCat([], name="process", growth=True) .IntCat([], name="shift", growth=True)) - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.vars.items(): - h = eval(f'h.{var_axis}') + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') - histograms['fake_factor'] = h.Weight() + histograms['fake_factors'] = h.Weight() category_ids = ak.concatenate( [Route(c).apply(events) for c in self.category_id_columns], @@ -440,75 +190,24 @@ def run(self): ) # broadcast arrays so that each event can be filled for all its categories fill_data = { - "category": category_ids, - "process": events.process_id, - "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, + "category" : category_ids, + "process" : events.process_id, + "shift" : np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, "weight": weight, } - # for variable_inst in self.config_inst.x.fake_factor_method.vars.: - # # prepare the expression - # expr = variable_inst.expression - # if isinstance(expr, str): - # route = Route(expr) - # def expr(events, *args, **kwargs): - # if len(events) == 0 and not has_ak_column(events, route): - # return empty_f32 - # return route.apply(events, null_value=variable_inst.null_value) - # fill_data[variable_inst.name] = expr(events) - from IPython import embed; embed() - # for var_key, var_names in self.variable_tuples.items(): - # variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - # # create the histogram if not present yet - # if var_key not in histograms: - # h = ( - # hist.Hist.new - # .IntCat([], name="category", growth=True) - # .IntCat([], name="process", growth=True) - # .IntCat([], name="shift", growth=True) - # ) - # # add variable axes - # 
for variable_inst in variable_insts: - # h = h.Var( - # variable_inst.bin_edges, - # name=variable_inst.name, - # label=variable_inst.get_full_x_title(), - # ) - # # enable weights and store it - # histograms[var_key] = h.Weight() - - # # merge category ids - # category_ids = ak.concatenate( - # [Route(c).apply(events) for c in self.category_id_columns], - # axis=-1, - # ) - - # broadcast arrays so that each event can be filled for all its categories - # fill_data = { - # "category": category_ids, - # "process": events.process_id, - # "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, - # "weight": weight, - # } - # for variable_inst in variable_insts: - # # prepare the expression - # expr = variable_inst.expression - # if isinstance(expr, str): - # route = Route(expr) - # def expr(events, *args, **kwargs): - # if len(events) == 0 and not has_ak_column(events, route): - # return empty_f32 - # return route.apply(events, null_value=variable_inst.null_value) - # # apply it - # fill_data[variable_inst.name] = expr(events) - - # # fill it - # fill_hist( - # histograms[var_key], - # fill_data, - # last_edge_inclusive=self.last_edge_inclusive, - # ) - + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + route = Route(var_axis.var_route) + if len(events) == 0 and not has_ak_column(events, route): + values = empty_f32 + else: + values = ak.fill_none(ak.firsts(route.apply(events),axis=1), EMPTY_FLOAT) + if 'IntCategory' in var_axis.ax_str: values = ak.values_astype(values, np.int64) + fill_data[var_name] = values + # fill it + fill_hist( + histograms['fake_factors'], + fill_data, + ) # merge output files self.output()["hists"].dump(histograms, formatter="pickle") @@ -527,205 +226,149 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) +class MergeFakeFactors( + VariablesMixin, + DatasetsProcessesMixin, + CategoriesMixin, + WeightProducerMixin, + ProducersMixin, + SelectorStepsMixin, + CalibratorsMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) -# class MergeHistograms( -# VariablesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# DatasetTask, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# only_missing = luigi.BoolParameter( -# default=False, -# description="when True, identify missing variables first and only require histograms of " -# "missing ones; default: False", -# ) -# remove_previous = luigi.BoolParameter( -# default=False, -# significant=False, -# description="when True, remove particlar input histograms after merging; default: False", -# ) - -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# CreateHistograms=CreateHistograms, -# ) - -# @classmethod -# def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: -# _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} -# kwargs["_prefer_cli"] = _prefer_cli -# return super().req_params(inst, **kwargs) - -# def create_branch_map(self): -# # create a dummy branch map so that this task could be submitted as a job -# return {0: None} - -# def _get_variables(self): -# if self.is_workflow(): -# return self.as_branch()._get_variables() - -# variables = self.variables - -# # optional dynamic behavior: determine not yet created variables and require only those -# 
if self.only_missing: -# missing = self.output().count(existing=False, keys=True)[1] -# variables = sorted(missing, key=variables.index) - -# return variables - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# if not self.pilot: -# variables = self._get_variables() -# if variables: -# reqs["hists"] = self.reqs.CreateHistograms.req_different_branching( -# self, -# branch=-1, -# variables=tuple(variables), -# ) - -# return reqs - -# def requires(self): -# variables = self._get_variables() -# if not variables: -# return [] - -# return self.reqs.CreateHistograms.req_different_branching( -# self, -# branch=-1, -# variables=tuple(variables), -# workflow="local", -# ) - -# def output(self): -# return {"hists": law.SiblingFileCollection({ -# variable_name: self.target(f"hist__{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# @law.decorator.log -# def run(self): -# # preare inputs and outputs -# inputs = self.input()["collection"] -# outputs = self.output() - -# # load input histograms -# hists = [ -# inp["hists"].load(formatter="pickle") -# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) -# ] - -# # create a separate file per output variable -# variable_names = list(hists[0].keys()) -# for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): -# self.publish_message(f"merging histograms for '{variable_name}'") - -# variable_hists = [h[variable_name] for h in hists] -# merged = sum(variable_hists[1:], variable_hists[0].copy()) -# outputs["hists"][variable_name].dump(merged, formatter="pickle") - -# # optionally remove inputs -# if self.remove_previous: -# inputs.remove() - - -# MergeHistogramsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=MergeHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) + only_missing = luigi.BoolParameter( + default=False, + description="when True, identify missing variables first and only require histograms of " + "missing ones; default: False", + ) + remove_previous = luigi.BoolParameter( + default=False, + significant=False, + description="when True, remove particlar input histograms after merging; default: False", + ) + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + CreateFakeFactorHistograms=CreateFakeFactorHistograms, + ) + + def store_parts(self): + parts = super().store_parts() + parts.insert_before("version", "datasets" )#, f"datasets_{self.datasets_repr}") + return parts + + @classmethod + def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: + _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} + kwargs["_prefer_cli"] = _prefer_cli + return super().req_params(inst, **kwargs) + + def create_branch_map(self): + return [ + DotDict({"category": cat_name}) + for cat_name in sorted(self.categories) + ] + + def _get_variables(self): + if self.is_workflow(): + return self.as_branch()._get_variables() + + variables = self.variables + + # optional dynamic behavior: determine not yet created variables and require only those + if self.only_missing: + missing = self.output().count(existing=False, keys=True)[1] + variables = sorted(missing, key=variables.index) + + return variables + + def workflow_requires(self): + reqs = super().workflow_requires() + if not self.pilot: + variables = self._get_variables() + if variables: + reqs["ff_method"] = self.reqs.CreateFakeFactorHistograms.req_different_branching( + self, + 
branch=-1, + variables=tuple(variables), + ) + + return reqs + def requires(self): + return { + d: self.reqs.CreateFakeFactorHistograms.req( + self, + dataset=d, + branch=-1, + ) + for d in self.datasets + } + def output(self): + return {"hists": self.target(f"fake_factors.pickle")} + + @law.decorator.log + def run(self): + import hist + import numpy as np + import matplotlib.pyplot as plt + # preare inputs and outputs + inputs = self.input() + outputs = self.output() + merged_per_dataset = {} + projected_hists = [] + for (dataset_name, dataset) in inputs.items(): + files = dataset['collection'] + # load input histograms per dataset + hists = [ + inp['hists'].load(formatter="pickle")['fake_factors'] + for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) + ] + self.publish_message(f"merging Fake factor histograms for {dataset_name}") + the_hist = sum(hists[1:], hists[0].copy()) + merged_per_dataset[dataset_name] = the_hist + #Get axes names excluding 'process'. This is needed to merge hists for different processes + ax_names = [ax_name for ax_name in the_hist.axes.name if ax_name != 'process'] + #Remove 'process' axis by projecting hist on the remaining axes + projected_hists.append(the_hist.project(*ax_names)) + merged_hist = sum(projected_hists[1:], projected_hists[0].copy()) + + cat_SR = self.config_inst.get_category(self.branch_data.category) + cat_DR_den = self.config_inst.get_category(cat_SR.x.DR_den) + cat_DR_num = self.config_inst.get_category(cat_SR.x.DR_num) + + def get_hist (h, category): + return h[{"category": hist.loc(category.id)}] + + h_DR_num = get_hist(merged_hist,cat_DR_num).values() + h_DR_den = get_hist(merged_hist,cat_DR_den).values() + + ff_values = np.where((h_DR_num > 0) & (h_DR_den > 0), + h_DR_num / np.maximum(h_DR_den, 1), + 0.0, + ) + + #For the control: make 2d hists and plot them: + hist2d = merged_hist.project('tau_pt','tau_dm_pnet') + ff_hist = hist.Hist(*hist2d.axes, data=ff_values[0]) + fig, ax = plt.subplots(figsize=(12, 8)) + ff_hist.plot2d(ax=ax) + plt.savefig('fake_factors.pdf') + from IPython import embed; embed() + #outputs["hists"][variable_name].dump(merged, formatter="pickle")F -# class MergeShiftedHistograms( -# VariablesMixin, -# ShiftSourcesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# SelectorStepsMixin, -# CalibratorsMixin, -# DatasetTask, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # disable the shift parameter -# shift = None -# effective_shift = None -# allow_empty_shift = True - -# # allow only running on nominal -# allow_empty_shift_sources = True - -# # upstream requirements -# reqs = Requirements( -# RemoteWorkflow.reqs, -# MergeHistograms=MergeHistograms, -# ) - -# def create_branch_map(self): -# # create a dummy branch map so that this task could as a job -# return {0: None} - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# # add nominal and both directions per shift source -# for shift in ["nominal"] + self.shifts: -# reqs[shift] = self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) - -# return reqs - -# def requires(self): -# return { -# shift: self.reqs.MergeHistograms.req(self, shift=shift, _prefer_cli={"variables"}) -# for shift in ["nominal"] + self.shifts -# } - -# def store_parts(self): -# parts = super().store_parts() -# parts.insert_after("dataset", "shift_sources", f"shifts_{self.shift_sources_repr}") -# return parts - -# def 
output(self): -# return {"hists": law.SiblingFileCollection({ -# variable_name: self.target(f"shifted_hist__{variable_name}.pickle") -# for variable_name in self.variables -# })} - -# @law.decorator.log -# def run(self): -# # preare inputs and outputs -# inputs = self.input() -# outputs = self.output()["hists"].targets - -# for variable_name, outp in self.iter_progress(outputs.items(), len(outputs)): -# self.publish_message(f"merging histograms for '{variable_name}'") - -# # load hists -# variable_hists = [ -# coll["hists"].targets[variable_name].load(formatter="pickle") -# for coll in inputs.values() -# ] - -# # merge and write the output -# merged = sum(variable_hists[1:], variable_hists[0].copy()) -# outp.dump(merged, formatter="pickle") - - -# MergeShiftedHistogramsWrapper = wrapper_factory( + # optionally remove inputs + if self.remove_previous: + inputs.remove() + + +# MergeFakeFactorsWrapper = wrapper_factory( # base_cls=AnalysisTask, -# require_cls=MergeShiftedHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets"], +# require_cls=MergeFakeFactors, +# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], # ) + From c4b62497b644855168f591164babb14251cbaea2 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 20 Jan 2025 20:20:18 +0100 Subject: [PATCH 07/26] Some plotting aestetics --- columnflow/plotting/plot_functions_1d.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 716ec5421..0c60ff0fb 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -72,7 +72,8 @@ def plot_variable_per_process( total_events = {key: sum(hist.values()) for key, hist in hists.items()} # Sort processes by total number of events in descending order - sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + #sorted_hists_desc = OrderedDict(sorted(hists.items(), key=lambda item: total_events[item[0]], reverse=True)) + sorted_hists_desc = OrderedDict(hists.items()) # Get keys of sorted processes sorted_keys = list(sorted_hists_desc.keys()) @@ -86,7 +87,7 @@ def plot_variable_per_process( custom_order = sorted_keys else: # More than two processes, custom order: highest, rest, then second highest - custom_order = [sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] + custom_order = sorted_keys #[sorted_keys[0]] + sorted_keys[2:] + [sorted_keys[1]] # Reorder histograms based on custom order sorted_hists = OrderedDict((key, sorted_hists_desc[key]) for key in custom_order) From 21ab46bcba50d53045d9ebfd831a66c69d4306be Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 28 Jan 2025 10:45:28 +0100 Subject: [PATCH 08/26] Developed a task to calculate fake factors for WJ and QCD --- columnflow/tasks/data_driven_methods.py | 172 ++++++++++++++++-------- 1 file changed, 119 insertions(+), 53 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 28082ad77..d12da8645 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -23,7 +23,7 @@ from columnflow.util import dev_sandbox, DotDict -class CreateFakeFactorHistograms( +class PrepareFakeFactorHistograms( VariablesMixin, WeightProducerMixin, ProducersMixin, @@ -177,8 +177,7 @@ def run(self): h = (hist.Hist.new .IntCat([], name="category", growth=True) - .IntCat([], name="process", 
growth=True) - .IntCat([], name="shift", growth=True)) + .IntCat([], name="process", growth=True)) for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -189,11 +188,11 @@ def run(self): axis=-1, ) # broadcast arrays so that each event can be filled for all its categories + fill_data = { "category" : category_ids, "process" : events.process_id, - "shift" : np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, - "weight": weight, + "weight" : weight, } for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): route = Route(var_axis.var_route) @@ -214,19 +213,19 @@ def run(self): # overwrite class defaults check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -CreateFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( - default=CreateFakeFactorHistograms.task_family in check_overlap_tasks, +PrepareFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( + default=PrepareFakeFactorHistograms.task_family in check_overlap_tasks, add_default_to_description=True, ) -CreateFakeFactorHistogramsWrapper = wrapper_factory( +PrepareFakeFactorHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, - require_cls=CreateFakeFactorHistograms, + require_cls=PrepareFakeFactorHistograms, enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) -class MergeFakeFactors( +class ComputeFakeFactors( VariablesMixin, DatasetsProcessesMixin, CategoriesMixin, @@ -253,12 +252,12 @@ class MergeFakeFactors( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - CreateFakeFactorHistograms=CreateFakeFactorHistograms, + PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, ) def store_parts(self): parts = super().store_parts() - parts.insert_before("version", "datasets" )#, f"datasets_{self.datasets_repr}") + parts.insert_before("version", "datasets", f"datasets_{self.datasets_repr}") return parts @classmethod @@ -291,7 +290,7 @@ def workflow_requires(self): if not self.pilot: variables = self._get_variables() if variables: - reqs["ff_method"] = self.reqs.CreateFakeFactorHistograms.req_different_branching( + reqs["ff_method"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( self, branch=-1, variables=tuple(variables), @@ -301,7 +300,7 @@ def workflow_requires(self): def requires(self): return { - d: self.reqs.CreateFakeFactorHistograms.req( + d: self.reqs.PrepareFakeFactorHistograms.req( self, dataset=d, branch=-1, @@ -309,66 +308,133 @@ def requires(self): for d in self.datasets } def output(self): - return {"hists": self.target(f"fake_factors.pickle")} + return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, + "plots": {syst: self.target(f"fake_factor_syst_{syst}.png") for syst in ['nominal', 'up', 'down']},} @law.decorator.log def run(self): import hist import numpy as np import matplotlib.pyplot as plt + import correctionlib.convert as cl_convert # preare inputs and outputs inputs = self.input() outputs = self.output() merged_per_dataset = {} projected_hists = [] + hists_by_dataset = [] for (dataset_name, dataset) in inputs.items(): files = dataset['collection'] # load input histograms per dataset - hists = [ + hists_per_ds = [ inp['hists'].load(formatter="pickle")['fake_factors'] for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) ] 
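            # the per-file 'fake_factors' histograms of each dataset are summed into a single
            # histogram per dataset; below, these are split by process id into data and
            # (non-signal) MC sums, and the fake factor per determination region is computed
            # as (data - MC) in the numerator region over (data - MC) in the denominator region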
self.publish_message(f"merging Fake factor histograms for {dataset_name}") - the_hist = sum(hists[1:], hists[0].copy()) - merged_per_dataset[dataset_name] = the_hist - #Get axes names excluding 'process'. This is needed to merge hists for different processes - ax_names = [ax_name for ax_name in the_hist.axes.name if ax_name != 'process'] - #Remove 'process' axis by projecting hist on the remaining axes - projected_hists.append(the_hist.project(*ax_names)) - merged_hist = sum(projected_hists[1:], projected_hists[0].copy()) + ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) + hists_by_dataset.append(ds_single_hist) + + hists_by_proc = {} + for proc_name in self.config_inst.processes.names(): + proc = self.config_inst.processes.get(proc_name) + self.publish_message(f"merging Fake factor histograms for process: {proc.name}") + for the_hist in hists_by_dataset: + + if proc.id in the_hist.axes["process"]: + h = the_hist.copy() + h = h[{"process": hist.loc(proc.id)}] + # add the histogram + if proc in hists_by_proc: + hists_by_proc[proc] += h + else: + hists_by_proc[proc] = h - cat_SR = self.config_inst.get_category(self.branch_data.category) - cat_DR_den = self.config_inst.get_category(cat_SR.x.DR_den) - cat_DR_num = self.config_inst.get_category(cat_SR.x.DR_num) + mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")] + data_hists = [h for p, h in hists_by_proc.items() if p.is_data] - def get_hist (h, category): - return h[{"category": hist.loc(category.id)}] + mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + data_hists = sum(data_hists[1:], data_hists[0].copy()) - h_DR_num = get_hist(merged_hist,cat_DR_num).values() - h_DR_den = get_hist(merged_hist,cat_DR_den).values() + dr_names = ['dr_num_wj','dr_den_wj','dr_num_qcd','dr_den_qcd'] + + def get_hist(h, category): + return h[{"category": hist.loc(category.id)}] - ff_values = np.where((h_DR_num > 0) & (h_DR_den > 0), - h_DR_num / np.maximum(h_DR_den, 1), - 0.0, - ) - #For the control: make 2d hists and plot them: - hist2d = merged_hist.project('tau_pt','tau_dm_pnet') - ff_hist = hist.Hist(*hist2d.axes, data=ff_values[0]) - fig, ax = plt.subplots(figsize=(12, 8)) - ff_hist.plot2d(ax=ax) - plt.savefig('fake_factors.pdf') - from IPython import embed; embed() - #outputs["hists"][variable_name].dump(merged, formatter="pickle")F - - # optionally remove inputs - if self.remove_previous: - inputs.remove() - - -# MergeFakeFactorsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=MergeFakeFactors, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) + #Create two dictionaries that contain histograms for different determination regions + data_h_cat ={} + mc_h_cat = {} + for dr_name in dr_names: + cat = self.config_inst.get_category(self.branch_data.category.replace('sr',dr_name)) + data_h_cat[dr_name] = get_hist(data_hists, cat) + mc_h_cat[dr_name] = get_hist(mc_hists, cat) + + + def get_ff_corr(self, h_data, h_mc, num_cat, den_cat, name='ff_hist', label='ff_hist'): + num = h_data[num_cat].values() - h_mc[num_cat].values() + den = h_data[den_cat].values() - h_mc[den_cat].values() + ff_val = np.where((num > 0) & (den > 0), + num / np.maximum(den, 1), + 1) + def rel_err(x): + return x.variances()/np.maximum(x.values()**2, 1) + ff_err2 = np.where((num > 0) & (den > 0), + np.sqrt(rel_err(h_data[num_cat]) + + + rel_err(h_mc[den_cat]) + + + rel_err(h_data[num_cat]) + + + rel_err(h_mc[den_cat])) * ff_val**2, + 0.5* np.ones_like(ff_val)) + h = 
hist.Hist.new + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') + h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') + ff_hist= h.Weight() + ff_hist.view().value[...,0] = ff_val + ff_hist.view().value[...,1] = ff_val + np.sqrt(ff_err2) + ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) + ff_hist.name = name + ff_hist.label = label + ff_corr = cl_convert.from_histogram(ff_hist) #temporary correction without systematic axis + ff_corr.data.flow = "clamp" + return ff_corr, ff_hist + + import rich + + wj_corr, wj_h = get_ff_corr(self, + data_h_cat, + mc_h_cat, + num_cat = 'dr_num_wj', + den_cat = 'dr_den_wj', + name='ff_wjets', + label='Fake factor W+jets') + + qcd_corr, qcd_h = get_ff_corr(self, + data_h_cat, + mc_h_cat, + num_cat = 'dr_num_qcd', + den_cat = 'dr_den_qcd', + name='ff_qcd', + label='Fake factor QCD') + + for h_name in ['wj', 'qcd']: + the_hist = eval(f'{h_name}_h') + + for syst in ['nominal','up','down']: + fig, ax = plt.subplots(figsize=(12, 8)) + the_hist[...,syst].plot2d(ax=ax) + self.output()['plots'][syst].dump(fig, formatter="mpl") + + + self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") + self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") + + + + + + + + + + From b9d4b3f1a7d150a4a5bbdb169a7eefce267f635b Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Tue, 28 Jan 2025 14:05:57 +0100 Subject: [PATCH 09/26] Fake factor method: work in progress --- columnflow/tasks/data_driven_methods.py | 70 ++++++++++++------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index d12da8645..931f507df 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -309,7 +309,9 @@ def requires(self): } def output(self): return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, - "plots": {syst: self.target(f"fake_factor_syst_{syst}.png") for syst in ['nominal', 'up', 'down']},} + "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") + for syst in ['nominal', 'up', 'down'] + for ff_type in ['qcd','wj']},} @law.decorator.log def run(self): @@ -333,7 +335,7 @@ def run(self): self.publish_message(f"merging Fake factor histograms for {dataset_name}") ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) hists_by_dataset.append(ds_single_hist) - + #Create a dict of histograms indexed by the process hists_by_proc = {} for proc_name in self.config_inst.processes.names(): proc = self.config_inst.processes.get(proc_name) @@ -349,40 +351,39 @@ def run(self): else: hists_by_proc[proc] = h + #Divide histograms to data and bkg mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")] data_hists = [h for p, h in hists_by_proc.items() if p.is_data] + #Merge histograms to get a joint data and mc histogram mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) data_hists = sum(data_hists[1:], data_hists[0].copy()) - dr_names = ['dr_num_wj','dr_den_wj','dr_num_qcd','dr_den_qcd'] - - def get_hist(h, category): - return h[{"category": hist.loc(category.id)}] - - - #Create two dictionaries that contain histograms for different determination regions - data_h_cat ={} - mc_h_cat = {} - for dr_name in dr_names: - cat = 
self.config_inst.get_category(self.branch_data.category.replace('sr',dr_name)) - data_h_cat[dr_name] = get_hist(data_hists, cat) - mc_h_cat[dr_name] = get_hist(mc_hists, cat) + #Function that performs the calculation of th + def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): - - def get_ff_corr(self, h_data, h_mc, num_cat, den_cat, name='ff_hist', label='ff_hist'): - num = h_data[num_cat].values() - h_mc[num_cat].values() - den = h_data[den_cat].values() - h_mc[den_cat].values() + def get_dr_hist(self, h, det_reg): + cat = self.config_inst.get_category(self.branch_data.category.replace('sr',det_reg)) + return h[{"category": hist.loc(cat.id)}] + + data_num = get_dr_hist(self, h_data, num_reg) + data_den = get_dr_hist(self, h_data, den_reg) + mc_num = get_dr_hist(self, h_mc, num_reg) + mc_den = get_dr_hist(self, h_mc, den_reg) + + num = data_num.values() - mc_num.values() + den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), 1) def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) + ff_err2 = np.where((num > 0) & (den > 0), - np.sqrt(rel_err(h_data[num_cat]) + - + rel_err(h_mc[den_cat]) + - + rel_err(h_data[num_cat]) + - + rel_err(h_mc[den_cat])) * ff_val**2, + np.sqrt(rel_err(data_num) + + + rel_err(data_den) + + + rel_err(mc_num) + + + rel_err(mc_den)) * ff_val**2, 0.5* np.ones_like(ff_val)) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): @@ -394,25 +395,23 @@ def rel_err(x): ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) ff_hist.name = name ff_hist.label = label - ff_corr = cl_convert.from_histogram(ff_hist) #temporary correction without systematic axis + ff_corr = cl_convert.from_histogram(ff_hist) ff_corr.data.flow = "clamp" return ff_corr, ff_hist - import rich - wj_corr, wj_h = get_ff_corr(self, - data_h_cat, - mc_h_cat, - num_cat = 'dr_num_wj', - den_cat = 'dr_den_wj', + data_hists, + mc_hists, + num_reg = 'dr_num_wj', + den_reg = 'dr_den_wj', name='ff_wjets', label='Fake factor W+jets') qcd_corr, qcd_h = get_ff_corr(self, - data_h_cat, - mc_h_cat, - num_cat = 'dr_num_qcd', - den_cat = 'dr_den_qcd', + data_hists, + mc_hists, + num_reg = 'dr_num_qcd', + den_reg = 'dr_den_qcd', name='ff_qcd', label='Fake factor QCD') @@ -422,9 +421,8 @@ def rel_err(x): for syst in ['nominal','up','down']: fig, ax = plt.subplots(figsize=(12, 8)) the_hist[...,syst].plot2d(ax=ax) - self.output()['plots'][syst].dump(fig, formatter="mpl") + self.output()['plots']['_'.join((h_name,syst))].dump(fig, formatter="mpl") - self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") From c76efd3ca63b4ea3a209319402d884afba940bb5 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 3 Feb 2025 12:24:08 +0100 Subject: [PATCH 10/26] FF method work in progress. 
Trying to slove problems of framework freezing while executing Compute fake factors task --- columnflow/columnar_util.py | 18 +- columnflow/tasks/data_driven_methods.py | 254 ++++++++++++++++++++---- 2 files changed, 224 insertions(+), 48 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 9057b3faf..a7b3c5ebe 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -1405,15 +1405,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - if 'event' in data.keys(): - arrays = {} - for ax_name in axis_names: - if ax_name in data.keys(): - arrays[ax_name] = data[ax_name] - h.fill(**fill_kwargs, **arrays) - else: - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + # if 'event' in data.keys(): + # arrays = {} + # for ax_name in axis_names: + # if ax_name in data.keys(): + # arrays[ax_name] = data[ax_name] + # h.fill(**fill_kwargs, **arrays) + # else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) class RouteFilter(object): diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 931f507df..7f58ad75a 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -174,9 +174,9 @@ def run(self): else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) # define and fill histograms, taking into account multiple axes - + categories = self.config_inst.categories.ids() h = (hist.Hist.new - .IntCat([], name="category", growth=True) + .IntCat(categories , name="category", growth=True) .IntCat([], name="process", growth=True)) for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -231,10 +231,6 @@ class ComputeFakeFactors( CategoriesMixin, WeightProducerMixin, ProducersMixin, - SelectorStepsMixin, - CalibratorsMixin, - law.LocalWorkflow, - RemoteWorkflow, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -266,25 +262,6 @@ def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: kwargs["_prefer_cli"] = _prefer_cli return super().req_params(inst, **kwargs) - def create_branch_map(self): - return [ - DotDict({"category": cat_name}) - for cat_name in sorted(self.categories) - ] - - def _get_variables(self): - if self.is_workflow(): - return self.as_branch()._get_variables() - - variables = self.variables - - # optional dynamic behavior: determine not yet created variables and require only those - if self.only_missing: - missing = self.output().count(existing=False, keys=True)[1] - variables = sorted(missing, key=variables.index) - - return variables - def workflow_requires(self): reqs = super().workflow_requires() if not self.pilot: @@ -356,14 +333,15 @@ def run(self): data_hists = [h for p, h in hists_by_proc.items() if p.is_data] #Merge histograms to get a joint data and mc histogram - mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) - data_hists = sum(data_hists[1:], data_hists[0].copy()) + if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy()) #Function that performs the calculation of th def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): - def get_dr_hist(self, h, det_reg): - cat = 
self.config_inst.get_category(self.branch_data.category.replace('sr',det_reg)) + cat_name = self.categories[0] + from IPython import embed; embed() + cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] data_num = get_dr_hist(self, h_data, num_reg) @@ -425,14 +403,212 @@ def rel_err(x): self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") - - - - - - - - - - + + + +class CreateDataDrivenHistograms( + VariablesMixin, + WeightProducerMixin, + ProducersMixin, + ReducedEventsUser, + ChunkedIOMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + ReducedEventsUser.reqs, + RemoteWorkflow.reqs, + ComputeFakeFactors=ComputeFakeFactors, + ProduceColumns=ProduceColumns, + ) + + def requires(self): + reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + from IPython import embed; embed() + if self.producer_insts: + reqs["producers"] = [ + self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) + for producer_inst in self.producer_insts + if producer_inst.produced_columns + ] + reqs['ff_json'] = self.reqs.ComputeFakeFactors.req(self) + reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + return reqs + + def output(self): + return {"hists": self.target(f"histograms__vars_{self.variables_repr}__{self.branch}.pickle")} + + @law.decorator.log + @law.decorator.localize(input=True, output=False) + @law.decorator.safe_output + def run(self): + import hist + import numpy as np + import awkward as ak + from columnflow.columnar_util import ( + Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, + ) + + # prepare inputs + inputs = self.input() + from IPython import embed; embed() + # declare output: dict of histograms + histograms = {} + +# # run the weight_producer setup +# producer_reqs = self.weight_producer_inst.run_requires() +# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + +# # create a temp dir for saving intermediate files +# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) +# tmp_dir.touch() + +# # get shift dependent aliases +# aliases = self.local_shift_inst.x("column_aliases", {}) + +# # define columns that need to be read +# read_columns = {Route("process_id")} +# read_columns |= set(map(Route, self.category_id_columns)) +# read_columns |= set(self.weight_producer_inst.used_columns) +# read_columns |= set(map(Route, aliases.values())) +# read_columns |= { +# Route(inp) +# for variable_inst in ( +# self.config_inst.get_variable(var_name) +# for var_name in law.util.flatten(self.variable_tuples.values()) +# ) +# for inp in (( +# {variable_inst.expression} +# if isinstance(variable_inst.expression, str) +# # for variable_inst with custom expressions, read columns declared via aux key +# else set(variable_inst.x("inputs", [])) +# ) | ( +# # for variable_inst with selection, read columns declared via aux key +# set(variable_inst.x("inputs", [])) +# if variable_inst.selection != "1" +# else set() +# )) +# } + +# # empty float array to use when input files have no entries +# empty_f32 = ak.Array(np.array([], dtype=np.float32)) + +# # iterate over chunks of events and diffs +# file_targets = [inputs["events"]["events"]] +# if self.producer_insts: +# 
file_targets.extend([inp["columns"] for inp in inputs["producers"]]) +# # if self.ml_model_insts: +# # file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) + +# # prepare inputs for localization +# with law.localize_file_targets( +# [*file_targets, *reader_targets.values()], +# mode="r", +# ) as inps: +# for (events, *columns), pos in self.iter_chunked_io( +# [inp.abspath for inp in inps], +# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), +# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], +# chunk_size=self.weight_producer_inst.get_min_chunk_size(), +# ): +# # optional check for overlapping inputs +# if self.check_overlapping_inputs: +# self.raise_if_overlapping([events] + list(columns)) + +# # add additional columns +# events = update_ak_array(events, *columns) + +# # add aliases +# events = add_ak_aliases( +# events, +# aliases, +# remove_src=True, +# missing_strategy=self.missing_column_alias_strategy, +# ) + +# # build the full event weight without fake factors +# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): +# events, weight = self.weight_producer_inst(events) +# else: +# weight = ak.Array(np.ones(len(events), dtype=np.float32)) + +# # define and fill histograms, taking into account multiple axes +# for var_key, var_names in self.variable_tuples.items(): +# # get variable instances +# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + +# # create the histogram if not present yet +# if var_key not in histograms: +# for reg_key in ['ar_wj','ar_wj','ar_yields']: +# h = ( +# hist.Hist.new +# .IntCat([], name="process", growth=True) +# .IntCat([], name="shift", growth=True) +# ) +# # add variable axes +# for variable_inst in variable_insts: +# h = h.Var( +# variable_inst.bin_edges, +# name='_'.join((variable_inst.name, reg_key)) +# label=variable_inst.get_full_x_title(), +# ) +# # enable weights and store it +# histograms[var_key] = h.Weight() + +# # merge category ids +# category_ids = ak.concatenate( +# [Route(c).apply(events) for c in self.category_id_columns], +# axis=-1, +# ) + +# # broadcast arrays so that each event can be filled for all its categories +# fill_data = { +# "category": category_ids, +# "process": events.process_id, +# "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, +# "weight": weight, +# } +# for variable_inst in variable_insts: +# # prepare the expression +# expr = variable_inst.expression +# if isinstance(expr, str): +# route = Route(expr) +# def expr(events, *args, **kwargs): +# if len(events) == 0 and not has_ak_column(events, route): +# return empty_f32 +# return route.apply(events, null_value=variable_inst.null_value) +# # apply it +# fill_data[variable_inst.name] = expr(masked_events) + +# # fill it +# fill_hist( +# histograms[var_key], +# fill_data, +# last_edge_inclusive=self.last_edge_inclusive, +# ) + +# # merge output files +# self.output()["hists"].dump(histograms, formatter="pickle") + + +# # overwrite class defaults +# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( +# default=CreateHistograms.task_family in check_overlap_tasks, +# add_default_to_description=True, +# ) + + +# CreateHistogramsWrapper = wrapper_factory( +# base_cls=AnalysisTask, +# require_cls=CreateHistograms, +# enable=["configs", "skip_configs", 
"datasets", "skip_datasets", "shifts", "skip_shifts"], +# ) + + From 9bf8095d918a09fa58bec89c2c6289e1ee91c978 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 12 Feb 2025 11:23:08 +0100 Subject: [PATCH 11/26] Fake Factor method: minimal working version --- columnflow/tasks/data_driven_methods.py | 282 ++++++---------- columnflow/tasks/histograms.py | 423 ++++++++++++++++++++---- columnflow/tasks/plotting.py | 67 ++-- 3 files changed, 492 insertions(+), 280 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 7f58ad75a..4a7367521 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -24,8 +24,8 @@ class PrepareFakeFactorHistograms( - VariablesMixin, WeightProducerMixin, + MLModelsMixin, ProducersMixin, ReducedEventsUser, ChunkedIOMixin, @@ -226,7 +226,6 @@ def run(self): ) class ComputeFakeFactors( - VariablesMixin, DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, @@ -285,16 +284,21 @@ def requires(self): for d in self.datasets } def output(self): - return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']}, + return {"ff_json": self.target(f"fake_factors.json"), "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") for syst in ['nominal', 'up', 'down'] - for ff_type in ['qcd','wj']},} + for ff_type in ['qcd','wj']}, + "plots1d": {'_'.join((ff_type,str(dm))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}.png") + for ff_type in ['qcd','wj'] + for dm in [0,1,2,10,11]}} @law.decorator.log def run(self): import hist import numpy as np + from scipy.optimize import curve_fit import matplotlib.pyplot as plt + import correctionlib import correctionlib.convert as cl_convert # preare inputs and outputs inputs = self.input() @@ -309,14 +313,12 @@ def run(self): inp['hists'].load(formatter="pickle")['fake_factors'] for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50)) ] - self.publish_message(f"merging Fake factor histograms for {dataset_name}") ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy()) hists_by_dataset.append(ds_single_hist) #Create a dict of histograms indexed by the process hists_by_proc = {} for proc_name in self.config_inst.processes.names(): proc = self.config_inst.processes.get(proc_name) - self.publish_message(f"merging Fake factor histograms for process: {proc.name}") for the_hist in hists_by_dataset: if proc.id in the_hist.axes["process"]: @@ -334,13 +336,14 @@ def run(self): #Merge histograms to get a joint data and mc histogram if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy()) + else: mc_hists = mc_hists[0].copy() if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy()) + else: data_hists = data_hists[0].copy() #Function that performs the calculation of th def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'): def get_dr_hist(self, h, det_reg): cat_name = self.categories[0] - from IPython import embed; embed() cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] @@ -353,31 +356,54 @@ def get_dr_hist(self, h, det_reg): den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), - 1) + -1) def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) - ff_err2 = np.where((num > 0) & (den > 0), - np.sqrt(rel_err(data_num) + - + 
rel_err(data_den) + - + rel_err(mc_num) + - + rel_err(mc_den)) * ff_val**2, - 0.5* np.ones_like(ff_val)) + ff_err2 = np.abs(1./den) * (data_num.variances()**0.5 + mc_num.variances()**0.5) + np.abs(num)/(den**2) * (data_den.variances()**0.5 + mc_den.variances()**0.5) + + def fitf(x, a, b): + return a + b * x + #make interpolation of the ff values + ipt_range = ff_val.shape[0] + x = data_num.axes[0].centers + + ff_fit = np.zeros((*np.shape(ff_val),3)) + for idm in range(ff_val.shape[1]): + mask = ff_val[:,idm] > 0 + y = ff_val[mask,idm] + y_err = ff_err2[mask,idm] + x_masked = x[mask] + popt, pcov = curve_fit(fitf, + x_masked, + y, + sigma=y_err, + absolute_sigma=True) + ff_fit[:,idm,0] = fitf(x, *popt) + ff_fit[:,idm,1] = fitf(x, *popt + np.sqrt(np.diag(pcov))) + ff_fit[:,idm,2] = fitf(x, *popt - np.sqrt(np.diag(pcov))) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') - ff_hist= h.Weight() - ff_hist.view().value[...,0] = ff_val - ff_hist.view().value[...,1] = ff_val + np.sqrt(ff_err2) - ff_hist.view().value[...,2] = np.maximum(ff_val - np.sqrt(ff_err2),0) - ff_hist.name = name - ff_hist.label = label - ff_corr = cl_convert.from_histogram(ff_hist) - ff_corr.data.flow = "clamp" - return ff_corr, ff_hist + ff_fitted = h.Weight() + + ff_fitted.view().value = ff_fit + ff_fitted.name = name + ff_fitted.label = label + + ff_raw = ff_fitted.copy().reset() + ff_raw.view().value[...,0] = ff_val + ff_raw.view().variance[...,0] = ff_err2 + ff_raw.name = name + '_raw' + ff_raw.label = label + '_raw' + + + + + return ff_raw, ff_fitted - wj_corr, wj_h = get_ff_corr(self, + wj_raw, wj_fitted = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_wj', @@ -385,7 +411,7 @@ def rel_err(x): name='ff_wjets', label='Fake factor W+jets') - qcd_corr, qcd_h = get_ff_corr(self, + qcd_raw, qcd_fitted = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_qcd', @@ -393,16 +419,52 @@ def rel_err(x): name='ff_qcd', label='Fake factor QCD') + corr_list = [] + for h in [wj_raw, wj_fitted, qcd_raw, qcd_fitted]: + corr = cl_convert.from_histogram(h) + corr.data.flow = "clamp" + corr.version = 2 + corr_list.append(corr) + cset = correctionlib.schemav2.CorrectionSet( + schema_version=2, + description="Fake factors", + corrections=corr_list + ) + self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") for h_name in ['wj', 'qcd']: - the_hist = eval(f'{h_name}_h') + h_raw = eval(f'{h_name}_raw') + h_fitted = eval(f'{h_name}_fitted') - for syst in ['nominal','up','down']: - fig, ax = plt.subplots(figsize=(12, 8)) - the_hist[...,syst].plot2d(ax=ax) - self.output()['plots']['_'.join((h_name,syst))].dump(fig, formatter="mpl") + fig, ax = plt.subplots(figsize=(12, 8)) + h_raw[...,'nominal'].plot2d(ax=ax) + self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") + + dm_axis = h_raw.axes['tau_dm_pnet'] + for dm in dm_axis: + h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), + 'syst': hist.loc('nominal')}] + + hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - self.output()['ff_json']['wj'].dump(wj_corr.json(exclude_unset=True), formatter="json") - self.output()['ff_json']['qcd'].dump(qcd_corr.json(exclude_unset=True), formatter="json") + fig, ax = plt.subplots(figsize=(8, 6)) + mask = h1d.counts() > 0 + x = h1d.axes[0].centers[mask] + y = h1d.counts()[mask] + xerr = 
(np.diff(h1d.axes[0]).flatten()/2.)[mask], + yerr = np.sqrt(h1d.variances()).flatten()[mask], + ax.errorbar(x, y, xerr = xerr, yerr = yerr, + label=f"PNet decay mode = {dm}", + marker='o', + fmt='o', + line=None, color='#2478B7', capsize=4) + ax.plot(hfit.axes[0].centers, + hfit[:,0].counts(), + color='#FF867B') + ax.fill_between(hfit.axes[0].centers, hfit[:,2].counts(), hfit[:,1].counts(), color='#83d55f', alpha=0.5) + ax.set_ylabel('Fake Factor') + ax.set_xlabel('Tau pT [GeV]') + ax.set_title(f'Jet Fake Factors (Tau PNet Decay Mode {(dm)}') + self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") @@ -457,158 +519,4 @@ def run(self): inputs = self.input() from IPython import embed; embed() # declare output: dict of histograms - histograms = {} - -# # run the weight_producer setup -# producer_reqs = self.weight_producer_inst.run_requires() -# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) - -# # create a temp dir for saving intermediate files -# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) -# tmp_dir.touch() - -# # get shift dependent aliases -# aliases = self.local_shift_inst.x("column_aliases", {}) - -# # define columns that need to be read -# read_columns = {Route("process_id")} -# read_columns |= set(map(Route, self.category_id_columns)) -# read_columns |= set(self.weight_producer_inst.used_columns) -# read_columns |= set(map(Route, aliases.values())) -# read_columns |= { -# Route(inp) -# for variable_inst in ( -# self.config_inst.get_variable(var_name) -# for var_name in law.util.flatten(self.variable_tuples.values()) -# ) -# for inp in (( -# {variable_inst.expression} -# if isinstance(variable_inst.expression, str) -# # for variable_inst with custom expressions, read columns declared via aux key -# else set(variable_inst.x("inputs", [])) -# ) | ( -# # for variable_inst with selection, read columns declared via aux key -# set(variable_inst.x("inputs", [])) -# if variable_inst.selection != "1" -# else set() -# )) -# } - -# # empty float array to use when input files have no entries -# empty_f32 = ak.Array(np.array([], dtype=np.float32)) - -# # iterate over chunks of events and diffs -# file_targets = [inputs["events"]["events"]] -# if self.producer_insts: -# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) -# # if self.ml_model_insts: -# # file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) - -# # prepare inputs for localization -# with law.localize_file_targets( -# [*file_targets, *reader_targets.values()], -# mode="r", -# ) as inps: -# for (events, *columns), pos in self.iter_chunked_io( -# [inp.abspath for inp in inps], -# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), -# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], -# chunk_size=self.weight_producer_inst.get_min_chunk_size(), -# ): -# # optional check for overlapping inputs -# if self.check_overlapping_inputs: -# self.raise_if_overlapping([events] + list(columns)) - -# # add additional columns -# events = update_ak_array(events, *columns) - -# # add aliases -# events = add_ak_aliases( -# events, -# aliases, -# remove_src=True, -# missing_strategy=self.missing_column_alias_strategy, -# ) - -# # build the full event weight without fake factors -# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): -# events, weight = self.weight_producer_inst(events) -# else: -# weight = ak.Array(np.ones(len(events), 
dtype=np.float32)) - -# # define and fill histograms, taking into account multiple axes -# for var_key, var_names in self.variable_tuples.items(): -# # get variable instances -# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - -# # create the histogram if not present yet -# if var_key not in histograms: -# for reg_key in ['ar_wj','ar_wj','ar_yields']: -# h = ( -# hist.Hist.new -# .IntCat([], name="process", growth=True) -# .IntCat([], name="shift", growth=True) -# ) -# # add variable axes -# for variable_inst in variable_insts: -# h = h.Var( -# variable_inst.bin_edges, -# name='_'.join((variable_inst.name, reg_key)) -# label=variable_inst.get_full_x_title(), -# ) -# # enable weights and store it -# histograms[var_key] = h.Weight() - -# # merge category ids -# category_ids = ak.concatenate( -# [Route(c).apply(events) for c in self.category_id_columns], -# axis=-1, -# ) - -# # broadcast arrays so that each event can be filled for all its categories -# fill_data = { -# "category": category_ids, -# "process": events.process_id, -# "shift": np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id, -# "weight": weight, -# } -# for variable_inst in variable_insts: -# # prepare the expression -# expr = variable_inst.expression -# if isinstance(expr, str): -# route = Route(expr) -# def expr(events, *args, **kwargs): -# if len(events) == 0 and not has_ak_column(events, route): -# return empty_f32 -# return route.apply(events, null_value=variable_inst.null_value) -# # apply it -# fill_data[variable_inst.name] = expr(masked_events) - -# # fill it -# fill_hist( -# histograms[var_key], -# fill_data, -# last_edge_inclusive=self.last_edge_inclusive, -# ) - -# # merge output files -# self.output()["hists"].dump(histograms, formatter="pickle") - - -# # overwrite class defaults -# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( -# default=CreateHistograms.task_family in check_overlap_tasks, -# add_default_to_description=True, -# ) - - -# CreateHistogramsWrapper = wrapper_factory( -# base_cls=AnalysisTask, -# require_cls=CreateHistograms, -# enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], -# ) - - - + histograms = {} \ No newline at end of file diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index 070e9c49d..bfc316e9e 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -22,7 +22,6 @@ from columnflow.util import dev_sandbox from columnflow.hist_util import create_hist_from_variables - class CreateHistograms( VariablesMixin, WeightProducerMixin, @@ -143,6 +142,7 @@ def run(self): read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) + read_columns |= set(map(Route, [n +'*' for n in self.config_inst.x.fake_factor_method.columns])) read_columns |= set(map(Route, aliases.values())) read_columns |= { Route(inp) @@ -201,72 +201,85 @@ def run(self): # attach coffea behavior aiding functional variable expressions events = attach_coffea_behavior(events) - + # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) + categories = 
self.config_inst.categories.names() + sig_regs = [the_cat for the_cat in categories if 'sr' in the_cat] # define and fill histograms, taking into account multiple axes - for var_key, var_names in self.variable_tuples.items(): - # get variable instances - variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - - if var_key not in histograms: - # create the histogram in the first chunk - histograms[var_key] = create_hist_from_variables( - *variable_insts, - int_cat_axes=("category", "process", "shift"), - ) - - # mask events and weights when selection expressions are found - masked_events = events - masked_weights = weight - for variable_inst in variable_insts: - sel = variable_inst.selection - if sel == "1": - continue - if not callable(sel): - raise ValueError( - f"invalid selection '{sel}', for now only callables are supported", + for sig_reg in sig_regs: + #iterate over the regions needed for calculation of the ff_method + for region in ["sr", "ar_wj", "ar_qcd", "ar_yields"]: + #by accessing the list of categories we check if the category with this name exists + cat = self.config_inst.get_category(sig_reg.replace('sr',region)) + if cat.name not in histograms.keys(): histograms[cat.name] = {} + for var_key, var_names in self.variable_tuples.items(): + # get variable instances + variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + + if var_key not in histograms[cat.name].keys(): + # create the histogram in the first chunk + histograms[cat.name][var_key] = create_hist_from_variables( + *variable_insts, + int_cat_axes=("category", "process", "shift"), + ) + # mask events and weights when selection expressions are found + masked_events = events + + if region == 'ar_wj': + masked_weights = weight * events.ff_weight_wj_nominal + elif region == 'ar_qcd': + masked_weights = weight * events.ff_weight_qcd_nominal + else: + masked_weights = weight + for variable_inst in variable_insts: + sel = variable_inst.selection + if sel == "1": + continue + if not callable(sel): + raise ValueError( + f"invalid selection '{sel}', for now only callables are supported", + ) + mask = sel(masked_events) + #select only one category per histogram + masked_events = masked_events[mask] + masked_weights = masked_weights[mask] + + # merge category ids + category_ids = ak.concatenate( + [Route(c).apply(masked_events) for c in self.category_id_columns], + axis=-1, ) - mask = sel(masked_events) - masked_events = masked_events[mask] - masked_weights = masked_weights[mask] - - # merge category ids - category_ids = ak.concatenate( - [Route(c).apply(masked_events) for c in self.category_id_columns], - axis=-1, - ) - - # broadcast arrays so that each event can be filled for all its categories - fill_data = { - "category": category_ids, - "process": masked_events.process_id, - "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, - "weight": masked_weights, - } - for variable_inst in variable_insts: - # prepare the expression - expr = variable_inst.expression - if isinstance(expr, str): - route = Route(expr) - def expr(events, *args, **kwargs): - if len(events) == 0 and not has_ak_column(events, route): - return empty_f32 - return route.apply(events, null_value=variable_inst.null_value) - # apply it - fill_data[variable_inst.name] = expr(masked_events) - - # fill it - fill_hist( - histograms[var_key], - fill_data, - last_edge_inclusive=self.last_edge_inclusive, - ) + # broadcast arrays so that each event can be filled for all its 
categories + fill_data = { + "category": category_ids, + "process": masked_events.process_id, + "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, + "weight": masked_weights, + } + for variable_inst in variable_insts: + # prepare the expression + expr = variable_inst.expression + if isinstance(expr, str): + route = Route(expr) + def expr(events, *args, **kwargs): + if len(events) == 0 and not has_ak_column(events, route): + return empty_f32 + return route.apply(events, null_value=variable_inst.null_value) + # apply it + fill_data[variable_inst.name] = expr(masked_events) + # fill it + + fill_hist( + histograms[cat.name][var_key], + fill_data, + last_edge_inclusive=self.last_edge_inclusive, + ) # merge output files self.output()["hists"].dump(histograms, formatter="pickle") @@ -278,6 +291,261 @@ def expr(events, *args, **kwargs): add_default_to_description=True, ) +# class CreateHistograms( +# VariablesMixin, +# WeightProducerMixin, +# MLModelsMixin, +# ProducersMixin, +# ReducedEventsUser, +# ChunkedIOMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# last_edge_inclusive = last_edge_inclusive_inst + +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# # upstream requirements +# reqs = Requirements( +# ReducedEventsUser.reqs, +# RemoteWorkflow.reqs, +# ProduceColumns=ProduceColumns, +# MLEvaluation=MLEvaluation, +# ) + +# # strategy for handling missing source columns when adding aliases on event chunks +# missing_column_alias_strategy = "original" + +# # names of columns that contain category ids +# # (might become a parameter at some point) +# category_id_columns = {"category_ids"} + +# # register sandbox and shifts found in the chosen weight producer to this task +# register_weight_producer_sandbox = True +# register_weight_producer_shifts = True + +# @law.util.classproperty +# def mandatory_columns(cls) -> set[str]: +# return set(cls.category_id_columns) | {"process_id"} + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# # require the full merge forest +# reqs["events"] = self.reqs.ProvideReducedEvents.req(self) + +# if not self.pilot: +# if self.producer_insts: +# reqs["producers"] = [ +# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) +# for producer_inst in self.producer_insts +# if producer_inst.produced_columns +# ] +# if self.ml_model_insts: +# reqs["ml"] = [ +# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) +# for ml_model_inst in self.ml_model_insts +# ] + +# # add weight_producer dependent requirements +# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + +# return reqs + +# def requires(self): +# reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} + +# if self.producer_insts: +# reqs["producers"] = [ +# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) +# for producer_inst in self.producer_insts +# if producer_inst.produced_columns +# ] +# if self.ml_model_insts: +# reqs["ml"] = [ +# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) +# for ml_model_inst in self.ml_model_insts +# ] + +# # add weight_producer dependent requirements +# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) + +# return reqs + +# workflow_condition = ReducedEventsUser.workflow_condition.copy() + +# @workflow_condition.output +# def output(self): +# return {"hists": 
self.target(f"hist__vars_{self.variables_repr}__{self.branch}.pickle")} + +# @law.decorator.notify +# @law.decorator.log +# @law.decorator.localize(input=True, output=False) +# @law.decorator.safe_output +# def run(self): +# import numpy as np +# import awkward as ak +# from columnflow.columnar_util import ( +# Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, +# ) +# from columnflow.hist_util import fill_hist + +# # prepare inputs +# inputs = self.input() + +# # declare output: dict of histograms +# histograms = {} + +# # run the weight_producer setup +# producer_reqs = self.weight_producer_inst.run_requires() +# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) + +# # create a temp dir for saving intermediate files +# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) +# tmp_dir.touch() + +# # get shift dependent aliases +# aliases = self.local_shift_inst.x("column_aliases", {}) + +# # define columns that need to be read +# read_columns = {Route("process_id")} +# read_columns |= set(map(Route, self.category_id_columns)) +# read_columns |= set(self.weight_producer_inst.used_columns) +# read_columns |= set(map(Route, aliases.values())) +# read_columns |= { +# Route(inp) +# for variable_inst in ( +# self.config_inst.get_variable(var_name) +# for var_name in law.util.flatten(self.variable_tuples.values()) +# ) +# for inp in (( +# {variable_inst.expression} +# if isinstance(variable_inst.expression, str) +# # for variable_inst with custom expressions, read columns declared via aux key +# else set(variable_inst.x("inputs", [])) +# ) | ( +# # for variable_inst with selection, read columns declared via aux key +# set(variable_inst.x("inputs", [])) +# if variable_inst.selection != "1" +# else set() +# )) +# } + +# # empty float array to use when input files have no entries +# empty_f32 = ak.Array(np.array([], dtype=np.float32)) + +# # iterate over chunks of events and diffs +# file_targets = [inputs["events"]["events"]] +# if self.producer_insts: +# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) +# if self.ml_model_insts: +# file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) + +# # prepare inputs for localization +# with law.localize_file_targets( +# [*file_targets, *reader_targets.values()], +# mode="r", +# ) as inps: +# for (events, *columns), pos in self.iter_chunked_io( +# [inp.abspath for inp in inps], +# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), +# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], +# chunk_size=self.weight_producer_inst.get_min_chunk_size(), +# ): +# # optional check for overlapping inputs +# if self.check_overlapping_inputs: +# self.raise_if_overlapping([events] + list(columns)) + +# # add additional columns +# events = update_ak_array(events, *columns) + +# # add aliases +# events = add_ak_aliases( +# events, +# aliases, +# remove_src=True, +# missing_strategy=self.missing_column_alias_strategy, +# ) + +# # attach coffea behavior aiding functional variable expressions +# events = attach_coffea_behavior(events) + +# # build the full event weight +# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): +# events, weight = self.weight_producer_inst(events) +# else: +# weight = ak.Array(np.ones(len(events), dtype=np.float32)) + +# # define and fill histograms, taking into account multiple axes +# for var_key, var_names in self.variable_tuples.items(): +# # 
get variable instances +# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] + +# if var_key not in histograms: +# # create the histogram in the first chunk +# histograms[var_key] = create_hist_from_variables( +# *variable_insts, +# int_cat_axes=("category", "process", "shift"), +# ) + +# # mask events and weights when selection expressions are found +# masked_events = events +# masked_weights = weight +# for variable_inst in variable_insts: +# sel = variable_inst.selection +# if sel == "1": +# continue +# if not callable(sel): +# raise ValueError( +# f"invalid selection '{sel}', for now only callables are supported", +# ) +# mask = sel(masked_events) +# masked_events = masked_events[mask] +# masked_weights = masked_weights[mask] + +# # merge category ids +# category_ids = ak.concatenate( +# [Route(c).apply(masked_events) for c in self.category_id_columns], +# axis=-1, +# ) + +# # broadcast arrays so that each event can be filled for all its categories +# fill_data = { +# "category": category_ids, +# "process": masked_events.process_id, +# "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, +# "weight": masked_weights, +# } +# for variable_inst in variable_insts: +# # prepare the expression +# expr = variable_inst.expression +# if isinstance(expr, str): +# route = Route(expr) +# def expr(events, *args, **kwargs): +# if len(events) == 0 and not has_ak_column(events, route): +# return empty_f32 +# return route.apply(events, null_value=variable_inst.null_value) +# # apply it +# fill_data[variable_inst.name] = expr(masked_events) + +# # fill it +# fill_hist( +# histograms[var_key], +# fill_data, +# last_edge_inclusive=self.last_edge_inclusive, +# ) + +# # merge output files +# self.output()["hists"].dump(histograms, formatter="pickle") + + +# # overwrite class defaults +# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) +# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( +# default=CreateHistograms.task_family in check_overlap_tasks, +# add_default_to_description=True, +# ) + CreateHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -384,18 +652,45 @@ def run(self): for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) ] + cats = list(hists[0].keys()) + variable_names = list(hists[0][cats[0]].keys()) + get_hists = lambda hists, cat, var : [h[cat][var] for h in hists] # create a separate file per output variable - variable_names = list(hists[0].keys()) for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): - self.publish_message(f"merging histograms for '{variable_name}'") - - variable_hists = [h[variable_name] for h in hists] - merged = sum(variable_hists[1:], variable_hists[0].copy()) - outputs["hists"][variable_name].dump(merged, formatter="pickle") + merged_hists = {} + for the_cat in cats: + self.publish_message(f"merging histograms for {variable_name}, category: {the_cat}") + variable_hists = get_hists(hists, the_cat, variable_name) + merged_hists[the_cat] = sum(variable_hists[1:], variable_hists[0].copy()) + outputs["hists"][variable_name].dump(merged_hists, formatter="pickle") # optionally remove inputs if self.remove_previous: inputs.remove() + + # def run(self): + # # preare inputs and outputs + # inputs = self.input()["collection"] + # outputs = self.output() + + # # load input histograms + # hists = [ + # inp["hists"].load(formatter="pickle") + 
# for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) + # ] + + # # create a separate file per output variable + # variable_names = list(hists[0].keys()) + # for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): + # self.publish_message(f"merging histograms for '{variable_name}'") + + # variable_hists = [h[variable_name] for h in hists] + # merged = sum(variable_hists[1:], variable_hists[0].copy()) + # outputs["hists"][variable_name].dump(merged, formatter="pickle") + + # # optionally remove inputs + # if self.remove_previous: + # inputs.remove() MergeHistogramsWrapper = wrapper_factory( diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index 6709d3fbb..d15a18cec 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -111,43 +111,52 @@ def run(self): for dataset, inp in self.input().items(): dataset_inst = self.config_inst.get_dataset(dataset) h_in = inp["collection"][0]["hists"].targets[self.branch_data.variable].load(formatter="pickle") - + # loop and extract one histogram per process - for process_inst in process_insts: - # skip when the dataset is already known to not contain any sub process - if not any( - dataset_inst.has_process(sub_process_inst.name) - for sub_process_inst in sub_process_insts[process_inst] - ): - continue - - # select processes and reduce axis - h = h_in.copy() - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts[process_inst] - if p.id in h.axes["process"] - ], - }] - h = h[{"process": sum}] - - # add the histogram - if process_inst in hists: - hists[process_inst] += h - else: - hists[process_inst] = h - + for region in h_in.keys(): + if region not in hists: hists[region] = {} + for process_inst in process_insts: + # skip when the dataset is already known to not contain any sub process + if not any( + dataset_inst.has_process(sub_process_inst.name) + for sub_process_inst in sub_process_insts[process_inst] + ): + continue + + # select processes and reduce axis + h = h_in[region].copy() + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts[process_inst] + if p.id in h.axes["process"] + ], + }] + h = h[{"process": sum}] + + # add the histogram + if process_inst in hists[region]: + hists[region][process_inst] += h + else: + hists[region][process_inst] = h + + # there should be hists to plot + if not hists: raise Exception( "no histograms found to plot; possible reasons:\n" " - requested variable requires columns that were missing during histogramming\n" " - selected --processes did not match any value on the process axis of the input histogram", ) - - # update histograms using custom hooks - hists = self.invoke_hist_hooks(hists) + + if 'sr' in category_inst.name: + hists = self.invoke_hist_hooks(hists) + else: + if category_inst.name in hists.keys(): + hists = hists[category_inst.name] + else: + hists[list(hists.keys())[0]] # add new processes to the end of the list for process_inst in hists: From e2e2ef4838647bc5988cacc44115d0d057378f87 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 12 Feb 2025 12:45:36 +0100 Subject: [PATCH 12/26] Minor code updates --- columnflow/columnar_util.py | 20 +++++++++++--------- columnflow/hist_util.py | 11 +++++++++-- columnflow/production/normalization.py | 8 +++++--- columnflow/tasks/yields.py | 2 +- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index a7b3c5ebe..171ab3661 100644 --- 
a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -14,6 +14,7 @@ import math import time import enum + import inspect import threading import multiprocessing @@ -40,6 +41,7 @@ maybe_import("coffea.nanoevents.methods.base") maybe_import("coffea.nanoevents.methods.nanoaod") pq = maybe_import("pyarrow.parquet") +hist = maybe_import("hist") # loggers @@ -1405,15 +1407,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - # if 'event' in data.keys(): - # arrays = {} - # for ax_name in axis_names: - # if ax_name in data.keys(): - # arrays[ax_name] = data[ax_name] - # h.fill(**fill_kwargs, **arrays) - # else: - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) class RouteFilter(object): diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py index 3c2b60ca6..92a9ed42a 100644 --- a/columnflow/hist_util.py +++ b/columnflow/hist_util.py @@ -72,8 +72,15 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill - arrays = ak.flatten(ak.cartesian(data)) - h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) + if 'event' in data.keys(): + arrays = {} + for ax_name in axis_names: + if ax_name in data.keys(): + arrays[ax_name] = data[ax_name] + h.fill(**fill_kwargs, **arrays) + else: + arrays = ak.flatten(ak.cartesian(data)) + h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields}) def add_hist_axis(histogram: hist.Hist, variable_inst: od.Variable) -> hist.Hist: diff --git a/columnflow/production/normalization.py b/columnflow/production/normalization.py index 66616ac7e..2144a52be 100644 --- a/columnflow/production/normalization.py +++ b/columnflow/production/normalization.py @@ -339,7 +339,8 @@ def normalization_weights_setup( # fill the process weight table for proc_id, br in branching_ratios.items(): - sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(proc_id)] + #sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(proc_id)] + sum_weights = self.dataset_inst.n_events process_weight_table[0, proc_id] = lumi * inclusive_xsec * br / sum_weights else: # fill the process weight table with per-process cross sections @@ -349,10 +350,9 @@ def normalization_weights_setup( f"no cross section registered for process {process_inst} for center-of-mass " f"energy of {self.config_inst.campaign.ecm}", ) - sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] + #sum_weights = merged_selection_stats["sum_mc_weight_per_process"][str(process_inst.id)] #quick fix that need to be fixed ################################ - #n_evt_per_file = /self.dataset_inst.n_files sum_weights = self.dataset_inst.n_events ################################ xsec = process_inst.get_xsec(self.config_inst.campaign.ecm).nominal @@ -401,3 +401,5 @@ def normalization_weights_init(self: Producer) -> None: "get_xsecs_from_inclusive_dataset": False, }, ) + + diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index e7d26ca57..9de6a31cc 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -143,7 +143,7 @@ def 
run(self): dataset_inst = self.config_inst.get_dataset(dataset) # load the histogram of the variable named "event" - h_in = inp["hists"]["event"].load(formatter="pickle") + input_hists = inp["hists"]["event"].load(formatter="pickle") # loop and extract one histogram per process for process_inst in process_insts: From b3cc09f3ecf8f418ee82464c4465654bb516c506 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Fri, 14 Feb 2025 11:14:36 +0100 Subject: [PATCH 13/26] Fake_factor method update --- columnflow/tasks/data_driven_methods.py | 187 +++++++++++++++++------- 1 file changed, 138 insertions(+), 49 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 4a7367521..b8228b361 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -225,11 +225,23 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) +class dict_creator(): + def init_dict(self, ax_list): + if not ax_list: + return -1. + else: + ax = ax_list[0] + updated_ax = ax_list[1:] + get_ax_dict = lambda ax, ax_list, func : {ax.bin(i): func(ax_list) for i in range(ax.size)} + return get_ax_dict(ax,updated_ax, self.init_dict) + + class ComputeFakeFactors( DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, ProducersMixin, + dict_creator, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -279,7 +291,7 @@ def requires(self): d: self.reqs.PrepareFakeFactorHistograms.req( self, dataset=d, - branch=-1, + branch=-1 ) for d in self.datasets } @@ -297,9 +309,15 @@ def run(self): import hist import numpy as np from scipy.optimize import curve_fit + from scipy.special import erf import matplotlib.pyplot as plt - import correctionlib - import correctionlib.convert as cl_convert + import correctionlib.schemav2 as cs + plt.figure(dpi=200) + plt.rcParams.update({ + "text.usetex": True, + "font.family": "monospace", + "font.monospace": 'Computer Modern Typewriter' + }) # preare inputs and outputs inputs = self.input() outputs = self.output() @@ -346,6 +364,8 @@ def get_dr_hist(self, h, det_reg): cat_name = self.categories[0] cat = self.config_inst.get_category(cat_name.replace('sr',det_reg)) return h[{"category": hist.loc(cat.id)}] + + get_id = lambda ax, key: [i in enumerate(ax.keys)] data_num = get_dr_hist(self, h_data, num_reg) data_den = get_dr_hist(self, h_data, den_reg) @@ -360,50 +380,72 @@ def get_dr_hist(self, h, det_reg): def rel_err(x): return x.variances()/np.maximum(x.values()**2, 1) - ff_err2 = np.abs(1./den) * (data_num.variances()**0.5 + mc_num.variances()**0.5) + np.abs(num)/(den**2) * (data_den.variances()**0.5 + mc_den.variances()**0.5) - - def fitf(x, a, b): - return a + b * x - #make interpolation of the ff values - ipt_range = ff_val.shape[0] - x = data_num.axes[0].centers + ff_err = ff_val * ((data_num.variances() + mc_num.variances())**0.5 / np.abs(num) + (data_den.variances() + mc_den.variances())**0.5 / np.abs(den)) + - ff_fit = np.zeros((*np.shape(ff_val),3)) - for idm in range(ff_val.shape[1]): - mask = ff_val[:,idm] > 0 - y = ff_val[mask,idm] - y_err = ff_err2[mask,idm] - x_masked = x[mask] - popt, pcov = curve_fit(fitf, - x_masked, - y, - sigma=y_err, - absolute_sigma=True) - ff_fit[:,idm,0] = fitf(x, *popt) - ff_fit[:,idm,1] = fitf(x, *popt + np.sqrt(np.diag(pcov))) - ff_fit[:,idm,2] = fitf(x, *popt - np.sqrt(np.diag(pcov))) h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h 
= eval(f'h.{var_axis.ax_str}') h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') - ff_fitted = h.Weight() - - ff_fitted.view().value = ff_fit - ff_fitted.name = name - ff_fitted.label = label - - ff_raw = ff_fitted.copy().reset() + ff_raw = h.Weight() ff_raw.view().value[...,0] = ff_val - ff_raw.view().variance[...,0] = ff_err2 + ff_raw.view().variance[...,0] = ff_err**2 ff_raw.name = name + '_raw' ff_raw.label = label + '_raw' + #Make an approximation of tau pt dependance + formula_str = 'p0 + p1*x+p2*x*x' + def fitf(x, p0, p1, p2): + return eval(formula_str) + def jac(x): + from numpy import array + out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) + return out + + def eval_formula(formula_str, popt): + for i,p in enumerate(popt): + formula_str = formula_str.replace(f'p{i}',str(popt[i])) + return formula_str + + ff_fitted = ff_raw.copy().reset() + ff_fitted.name = name + ff_fitted.label = label + fitres = {} - + axes = list(ff_raw.axes[1:2]) + fitres = {} + dc = dict_creator() + for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str']: + fitres[the_field]= dc.init_dict(axes) - return ff_raw, ff_fitted + dm_axis = ff_raw.axes['tau_dm_pnet'] + for dm in dm_axis: + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), + 'syst': hist.loc('nominal')}] + mask = h1d.values() > 0 + y = h1d.values()[mask] + y_err = (h1d.variances()[mask])**0.5 + x = h1d.axes[0].centers[mask] + popt, pcov = curve_fit(fitf,x,y, + sigma=y_err, + absolute_sigma=True, + ) + fitres['chi2'][dm] = sum(((y - fitf(x, *popt))/y_err)**2) + fitres['ndf'][dm] = len(y) - len(popt) + fitres['popt'][dm] = popt + fitres['pcov'][dm] = pcov + + fitres['fitf_str'][dm] = eval_formula(formula_str,popt) + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 + ff_fitted.view().value[:, + ff_fitted.axes[1].index(dm), + ff_fitted.axes[2].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) + fitres['name'] = name + fitres['jac'] = jac + fitres['fitf'] = fitf + return ff_raw, ff_fitted, fitres - wj_raw, wj_fitted = get_ff_corr(self, + wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_wj', @@ -411,7 +453,7 @@ def fitf(x, a, b): name='ff_wjets', label='Fake factor W+jets') - qcd_raw, qcd_fitted = get_ff_corr(self, + qcd_raw, qcd_fitted, qcd_fitres = get_ff_corr(self, data_hists, mc_hists, num_reg = 'dr_num_qcd', @@ -420,17 +462,46 @@ def fitf(x, a, b): label='Fake factor QCD') corr_list = [] - for h in [wj_raw, wj_fitted, qcd_raw, qcd_fitted]: - corr = cl_convert.from_histogram(h) - corr.data.flow = "clamp" - corr.version = 2 - corr_list.append(corr) - cset = correctionlib.schemav2.CorrectionSet( + for fitres in [wj_fitres, qcd_fitres]: + formula_str = fitres['fitf_str'] + dm_bins = [] + for (dm, the_formula) in formula_str.items(): + x_max = 100 + last_val = fitres['fitf'](x_max,* fitres['popt'][dm]) + + dm_bins.append(cs.CategoryItem( + key=dm, + value=cs.Formula( + nodetype="formula", + variables=["tau_pt"], + parser="TFormula", + expression=f'({the_formula})/(1. + exp(10.*(x-{x_max}))) + ({last_val})/(1. 
+ exp(-10.*(x-{x_max})))', + ))) + corr_list.append(cs.Correction( + name=fitres['name'], + description=f"fake factor correcton for {fitres['name'].split('_')[1]}", + version=2, + inputs=[ + cs.Variable(name="tau_pt", type="real",description="pt of tau"), + cs.Variable(name="tau_dm_pnet", type="int", description="PNet decay mode of tau"), + ], + output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), + data=cs.Category( + nodetype="category", + input="tau_dm_pnet", + content=dm_bins,) + )) + + cset = cs.CorrectionSet( schema_version=2, description="Fake factors", corrections=corr_list ) self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") + + + + #Plot fake factors: for h_name in ['wj', 'qcd']: h_raw = eval(f'{h_name}_raw') h_fitted = eval(f'{h_name}_fitted') @@ -438,14 +509,12 @@ def fitf(x, a, b): fig, ax = plt.subplots(figsize=(12, 8)) h_raw[...,'nominal'].plot2d(ax=ax) self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") - + fitres = wj_fitres if h_name == 'wj' else qcd_fitres dm_axis = h_raw.axes['tau_dm_pnet'] for dm in dm_axis: h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), 'syst': hist.loc('nominal')}] - hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - fig, ax = plt.subplots(figsize=(8, 6)) mask = h1d.counts() > 0 x = h1d.axes[0].centers[mask] @@ -457,13 +526,33 @@ def fitf(x, a, b): marker='o', fmt='o', line=None, color='#2478B7', capsize=4) - ax.plot(hfit.axes[0].centers, - hfit[:,0].counts(), + x_fine = np.linspace(x[0],x[-1],num=100) + popt = fitres['popt'][dm] + pcov = fitres['pcov'][dm] + jac = fitres['jac'] + def err(x,jac,pcov): + from numpy import sqrt,einsum + return sqrt(einsum('ij,ij',jac(x),pcov)) + + import functools + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + + y_fitf = fitres['fitf'](x_fine,*popt) + y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y + y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y + + ax.plot(x_fine, + y_fitf, color='#FF867B') - ax.fill_between(hfit.axes[0].centers, hfit[:,2].counts(), hfit[:,1].counts(), color='#83d55f', alpha=0.5) + ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) ax.set_ylabel('Fake Factor') ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors (Tau PNet Decay Mode {(dm)}') + ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm],2)}}}{{{fitres['ndf'][dm]}}}$", + (0.8, 0.9), + xycoords='axes fraction', + fontsize=20) + self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") From 766350c32d2308c3cb8e9820519129e271b98533 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Thu, 27 Feb 2025 09:45:00 +0100 Subject: [PATCH 14/26] Update on the fake factor method and plotting, jets.py form columnflow didn't work, so I commented them out --- columnflow/calibration/cms/jets.py | 2188 +++++++++++----------- columnflow/plotting/plot_functions_1d.py | 2 +- columnflow/tasks/data_driven_methods.py | 8 +- columnflow/tasks/framework/mixins.py | 4 +- columnflow/tasks/histograms.py | 345 +--- columnflow/tasks/plotting.py | 19 +- 6 files changed, 1155 insertions(+), 1411 deletions(-) diff --git a/columnflow/calibration/cms/jets.py b/columnflow/calibration/cms/jets.py index 32c7c816b..20e600fa3 100644 --- a/columnflow/calibration/cms/jets.py +++ b/columnflow/calibration/cms/jets.py @@ -1,1091 +1,1109 @@ -# coding: utf-8 - -""" -Jet energy corrections and jet resolution smearing. 
-""" -from pprint import pprint - -import functools - -import law - -from columnflow.types import Any -from columnflow.calibration import Calibrator, calibrator -from columnflow.calibration.util import ak_random, propagate_met -from columnflow.production.util import attach_coffea_behavior -from columnflow.util import maybe_import, InsertableDict, DotDict -from columnflow.columnar_util import set_ak_column, layout_ak_array, optional_column as optional - -np = maybe_import("numpy") -ak = maybe_import("awkward") -correctionlib = maybe_import("correctionlib") - -logger = law.logger.get_logger(__name__) - - -# -# helper functions -# - -set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) - - -import difflib - -def get_evaluators( - correction_set: correctionlib.highlevel.CorrectionSet, - names: list[str], -) -> list[Any]: - """ - Helper function to get a list of correction evaluators from a - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` object given - a list of *names*. The *names* can refer to either simple or compound - corrections. - - :param correction_set: evaluator provided by :external+correctionlib:doc:`index` - :param names: List of names of corrections to be applied - :raises RuntimeError: If a requested correction in *names* is not available - :return: List of compounded corrections, see - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - """ - available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) - corrected_names = [] - - for name in names: - if name not in available_keys: - # Find the closest match using difflib - closest_matches = difflib.get_close_matches(name, available_keys, n=1) - if closest_matches: - closest_match = closest_matches[0] - print( - f"Correction '{name}' not found. Using closest match: '{closest_match}'", - ) - corrected_names.append(closest_match) - else: - raise RuntimeError(f"Correction '{name}' not found and no close match available.") - else: - corrected_names.append(name) - - # Retrieve the evaluators - return [ - correction_set.compound[name] - if name in correction_set.compound - else correction_set[name] - for name in corrected_names - ] - -def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: - """ - Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` - using one or more :external+ak:py:class:`awkward arrays ` as inputs. 
- - :param evaluator: Evaluator instance - :raises ValueError: If no :external+ak:py:class:`awkward arrays ` are provided - :return: The correction factor derived from the input arrays - """ - # fail if no arguments - if not args: - raise ValueError("Expected at least one argument.") - - # collect arguments that are awkward arrays - ak_args = [ - arg for arg in args if isinstance(arg, ak.Array) - ] - - # broadcast akward arrays together and flatten - if ak_args: - bc_args = ak.broadcast_arrays(*ak_args) - flat_args = ( - np.asarray(ak.flatten(bc_arg, axis=None)) - for bc_arg in bc_args - ) - output_layout_array = bc_args[0] - else: - flat_args = iter(()) - output_layout_array = None - - # multiplex flattened and non-awkward inputs - all_flat_args = [ - next(flat_args) if isinstance(arg, ak.Array) else arg - for arg in args - ] - - # apply evaluator to flattened/multiplexed inputs - result = evaluator.evaluate(*all_flat_args) - - # apply broadcasted layout to result - if output_layout_array is not None: - result = layout_ak_array(result, output_layout_array) - - return result - - -# -# jet energy corrections -# - -# define default functions for jec calibrator -def get_jerc_file_default(self: Calibrator, external_files: DotDict) -> str: - """ - Function to obtain external correction files for JEC and/or JER. - - By default, this function extracts the location of the jec correction - files from the current config instance *config_inst*. The key of the - external file depends on the jet collection. For ``Jet`` (AK4 jets), this - resolves to ``jet_jerc``, and for ``FatJet`` it is resolved to - ``fat_jet_jerc``. - - .. code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - :param external_files: Dictionary containing the information about the file location - :return: path or url to correction file(s) - """ # noqa - - # get config - try_attrs = ("get_jec_config", "get_jer_config") - jerc_config = None - for try_attr in try_attrs: - try: - jerc_config = getattr(self, try_attr)() - except AttributeError: - continue - else: - break - - # fail if not found - if jerc_config is None: - raise ValueError( - "could not retrieve jer/jec config, none of the following methods " - f"were found: {try_attrs}", - ) - - # first check config for user-supplied `external_file_key` - ext_file_key = jerc_config.get("external_file_key", None) - if ext_file_key is not None: - return external_files[ext_file_key] - - # if not found, try to resolve from jet collection name and fail if not standard NanoAOD - if self.jet_name not in get_jerc_file_default.map_jet_name_file_key: - available_keys = ", ".join(sorted(get_jerc_file_default.map_jet_name_file_key)) - raise ValueError( - f"could not determine external file key for jet collection '{self.jet_name}', " - f"name is not one of standard NanoAOD jet collections: {available_keys}", - ) - - # return external file - ext_file_key = get_jerc_file_default.map_jet_name_file_key[self.jet_name] - return external_files[ext_file_key] - - -# default external file keys for known jet collections -get_jerc_file_default.map_jet_name_file_key = { - "Jet": "jet_jerc", - "FatJet": "fat_jet_jerc", -} - - -def get_jec_config_default(self: Calibrator) -> DotDict: - """ - Load config relevant to the jet energy corrections (JEC). 
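# Hedged illustration of the auxiliary config entries the helpers above read,
# assuming "cfg" is the analysis' order.Config instance; all paths and the
# "my_jerc" key are placeholders. A per-collection "external_file_key"
# overrides the default jet_jerc / fat_jet_jerc file lookup.
from columnflow.util import DotDict

cfg.x.external_files = DotDict.wrap({
    "jet_jerc": "/path/to/jsonpog-integration/POG/JME/2017_UL/jet_jerc.json.gz",  # placeholder
    "my_jerc": "/path/to/custom_corrections.json.gz",                             # placeholder
})
cfg.x.jec = {
    "Jet": {
        "campaign": "Summer19UL17",  # placeholder values
        "version": "V5",
        "jet_type": "AK4PFchs",
        "levels": ["L1L2L3Res"],
        # optional: point this collection at a non-default external file
        "external_file_key": "my_jerc",
    },
}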
- - By default, this is extracted from the current *config_inst*, - assuming the JEC configurations are stored under the 'jec' - aux key. Separate configurations should be specified for each - jet collection, using the collection name as a key. For example, - the configuration for the default jet collection ``Jet`` will - be retrieved from the following config entry: - - .. code-block:: python - - self.config_inst.x.jec.Jet - - Used in :py:meth:`~.jec.setup_func`. - - :return: Dictionary containing configuration for jet energy calibration - """ - jec_cfg = self.config_inst.x.jec - - # check for old-style config - if self.jet_name not in jec_cfg: - # if jet collection is `Jet`, issue deprecation warning - if self.jet_name == "Jet": - logger.warning_once( - f"{id(self)}_depr_jec_config", - "config aux 'jec' does not contain key for input jet " - f"collection '{self.jet_name}'. This may be due to " - "an outdated config. Continuing under the assumption that " - "the entire 'jec' entry refers to this jet collection. " - "This assumption will be removed in future versions of " - "columnflow, so please adapt the config according to the " - "documentation to remove this warning and ensure future " - "compatibility of the code.", - ) - return jec_cfg - - # otherwise raise exception - raise ValueError( - "config aux 'jec' does not contain key for input jet " - f"collection '{self.jet_name}'.", - ) - - return jec_cfg[self.jet_name] - - -@calibrator( - uses={ - optional("fixedGridRhoFastjetAll"), - optional("Rho.fixedGridRhoFastjetAll"), - attach_coffea_behavior, - }, - # name of the jet collection to calibrate - jet_name="Jet", - # name of the associated MET collection - met_name="MET", - # name of the associated Raw MET collection - raw_met_name="RawMET", - # custom uncertainty sources, defaults to config when empty - uncertainty_sources=None, - # toggle for propagation to PuppiMET - propagate_met=True, - # # function to determine the correction file - get_jec_file=get_jec_file_default, - # # function to determine the jec configuration dict - get_jec_config=get_jec_config_default, -) - -def jec( - self: Calibrator, - events: ak.Array, - min_pt_met_prop: float = 15.0, - max_eta_met_prop: float = 5.2, - **kwargs, -) -> ak.Array: - """Performs the jet energy corrections (JECs) and uncertainty shifts using the - :external+correctionlib:doc:`index`, optionally - propagating the changes to the PuppiMET. - - The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate - (default: ``Jet``, i.e. AK4 jets). - - Requires an external file in the config pointing to the JSON files containing the JECs. - The file key can be specified via an optional ``external_file_key`` in the ``jec`` config entry. - If not given, the file key will be determined automatically based on the jet collection name: - ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files - can be specified as: - - .. 
code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - For more file-grained control, the *get_jec_file* can be adapted in a subclass in case it is stored - differently in the external files - - The JEC configuration should be an auxiliary entry in the config, specifying the correction - details under "jec". Separate configs should be given for each jet collection to calibrate, - using the jet collection name as a subkey. An example of a valid configuration for correction - AK4 jets with JEC is: - - .. code-block:: python - - cfg.x.jec = { - "Jet": { - "campaign": "Summer19UL17", - "version": "V5", - "jet_type": "AK4PFchs", - "levels": ["L1L2L3Res"], # or individual correction levels - "levels_for_type1_met": ["L1FastJet"], - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ] - }, - } - - *get_jec_config* can be adapted in a subclass in case it is stored differently in the config. - - If running on data, the datasets must have an auxiliary field *jec_era* defined, e.g. "RunF", - or an auxiliary field *era*, e.g. "F". - - This instance of :py:class:`~columnflow.calibration.Calibrator` is - initialized with the following parameters by default: - - :param events: awkward array containing events to process - - :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values - to the missing transverse energy (PuppiMET) using - :py:func:`~columnflow.calibration.util.propagate_met` for events where - ``met.pt > *min_pt_met_prop*``. - :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet - values to the missing transverse energy (PuppiMET) using - :py:func:`~columnflow.calibration.util.propagate_met` for events where - ``met.eta > *min_eta_met_prop*``. - """ # noqa +# # coding: utf-8 + +# """ +# Jet energy corrections and jet resolution smearing. +# """ +# from pprint import pprint + +# import functools + +# import law + +# from columnflow.types import Any +# from columnflow.calibration import Calibrator, calibrator +# from columnflow.calibration.util import ak_random, propagate_met +# from columnflow.production.util import attach_coffea_behavior +# from columnflow.util import maybe_import, InsertableDict, DotDict +# from columnflow.columnar_util import set_ak_column, layout_ak_array, optional_column as optional + +# np = maybe_import("numpy") +# ak = maybe_import("awkward") +# correctionlib = maybe_import("correctionlib") + +# logger = law.logger.get_logger(__name__) + + +# # +# # helper functions +# # + +# set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) + + +# import difflib + +# def get_evaluators( +# correction_set: correctionlib.highlevel.CorrectionSet, +# names: list[str], +# ) -> list[Any]: +# """ +# Helper function to get a list of correction evaluators from a +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` object given +# a list of *names*. The *names* can refer to either simple or compound +# corrections. 
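# A hedged, standalone sketch of the simple-vs-compound lookup described above,
# with a difflib-based hint when the requested name is unknown. The file name
# is a placeholder; the patched helper in this series instead substitutes the
# closest match and continues.
import difflib
import correctionlib

cset = correctionlib.CorrectionSet.from_file("jet_jerc.json.gz")  # placeholder path

def lookup(name: str):
    available = list(cset.keys()) + list(cset.compound.keys())
    if name not in available:
        close = difflib.get_close_matches(name, available, n=1)
        hint = f", closest match: '{close[0]}'" if close else ""
        raise RuntimeError(f"correction '{name}' not found{hint}")
    # compound corrections (e.g. chained JEC levels) live in a separate mapping
    return cset.compound[name] if name in cset.compound else cset[name]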
+ +# :param correction_set: evaluator provided by :external+correctionlib:doc:`index` +# :param names: List of names of corrections to be applied +# :raises RuntimeError: If a requested correction in *names* is not available +# :return: List of compounded corrections, see +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# """ +# available_keys = set(correction_set.keys()).union(correction_set.compound.keys()) +# corrected_names = [] + +# for name in names: +# if name not in available_keys: +# # Find the closest match using difflib +# closest_matches = difflib.get_close_matches(name, available_keys, n=1) +# if closest_matches: +# closest_match = closest_matches[0] +# print( +# f"Correction '{name}' not found. Using closest match: '{closest_match}'", +# ) +# corrected_names.append(closest_match) +# else: +# raise RuntimeError(f"Correction '{name}' not found and no close match available.") +# else: +# corrected_names.append(name) + +# # Retrieve the evaluators +# return [ +# correction_set.compound[name] +# if name in correction_set.compound +# else correction_set[name] +# for name in corrected_names +# ] + +# def ak_evaluate(evaluator: correctionlib.highlevel.Correction, *args) -> float: +# """ +# Evaluate a :external+correctionlib:py:class:`correctionlib.highlevel.Correction` +# using one or more :external+ak:py:class:`awkward arrays ` as inputs. + +# :param evaluator: Evaluator instance +# :raises ValueError: If no :external+ak:py:class:`awkward arrays ` are provided +# :return: The correction factor derived from the input arrays +# """ +# # fail if no arguments +# if not args: +# raise ValueError("Expected at least one argument.") + +# # collect arguments that are awkward arrays +# ak_args = [ +# arg for arg in args if isinstance(arg, ak.Array) +# ] + +# # broadcast akward arrays together and flatten +# if ak_args: +# bc_args = ak.broadcast_arrays(*ak_args) +# flat_args = ( +# np.asarray(ak.flatten(bc_arg, axis=None)) +# for bc_arg in bc_args +# ) +# output_layout_array = bc_args[0] +# else: +# flat_args = iter(()) +# output_layout_array = None + +# # multiplex flattened and non-awkward inputs +# all_flat_args = [ +# next(flat_args) if isinstance(arg, ak.Array) else arg +# for arg in args +# ] + +# # apply evaluator to flattened/multiplexed inputs +# result = evaluator.evaluate(*all_flat_args) + +# # apply broadcasted layout to result +# if output_layout_array is not None: +# result = layout_ak_array(result, output_layout_array) + +# return result + + +# # +# # jet energy corrections +# # +# def get_jec_file_default(self, external_files: DotDict) -> str: +# """ +# Function to obtain external jec files. + +# By default, this function extracts the location of the jec correction +# files from the current config instance *config_inst*: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# }) + +# :param external_files: Dictionary containing the information about the file location +# :return: path or url to correction file(s) +# """ # noqa +# return external_files.jet_jerc + + +# # define default functions for jec calibrator +# def get_jerc_file_default(self: Calibrator, external_files: DotDict) -> str: +# """ +# Function to obtain external correction files for JEC and/or JER. + +# By default, this function extracts the location of the jec correction +# files from the current config instance *config_inst*. 
The key of the +# external file depends on the jet collection. For ``Jet`` (AK4 jets), this +# resolves to ``jet_jerc``, and for ``FatJet`` it is resolved to +# ``fat_jet_jerc``. + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# :param external_files: Dictionary containing the information about the file location +# :return: path or url to correction file(s) +# """ # noqa + +# # get config +# try_attrs = ("get_jec_config", "get_jer_config") +# jerc_config = None +# for try_attr in try_attrs: +# try: +# jerc_config = getattr(self, try_attr)() +# except AttributeError: +# continue +# else: +# break + +# # fail if not found +# if jerc_config is None: +# raise ValueError( +# "could not retrieve jer/jec config, none of the following methods " +# f"were found: {try_attrs}", +# ) + +# # first check config for user-supplied `external_file_key` +# ext_file_key = jerc_config.get("external_file_key", None) +# if ext_file_key is not None: +# return external_files[ext_file_key] + +# # if not found, try to resolve from jet collection name and fail if not standard NanoAOD +# if self.jet_name not in get_jerc_file_default.map_jet_name_file_key: +# available_keys = ", ".join(sorted(get_jerc_file_default.map_jet_name_file_key)) +# raise ValueError( +# f"could not determine external file key for jet collection '{self.jet_name}', " +# f"name is not one of standard NanoAOD jet collections: {available_keys}", +# ) + +# # return external file +# ext_file_key = get_jerc_file_default.map_jet_name_file_key[self.jet_name] +# return external_files[ext_file_key] + + +# # default external file keys for known jet collections +# get_jerc_file_default.map_jet_name_file_key = { +# "Jet": "jet_jerc", +# "FatJet": "fat_jet_jerc", +# } + + +# def get_jec_config_default(self: Calibrator) -> DotDict: +# """ +# Load config relevant to the jet energy corrections (JEC). + +# By default, this is extracted from the current *config_inst*, +# assuming the JEC configurations are stored under the 'jec' +# aux key. Separate configurations should be specified for each +# jet collection, using the collection name as a key. For example, +# the configuration for the default jet collection ``Jet`` will +# be retrieved from the following config entry: + +# .. code-block:: python + +# self.config_inst.x.jec.Jet + +# Used in :py:meth:`~.jec.setup_func`. + +# :return: Dictionary containing configuration for jet energy calibration +# """ +# jec_cfg = self.config_inst.x.jec + +# # check for old-style config +# if self.jet_name not in jec_cfg: +# # if jet collection is `Jet`, issue deprecation warning +# if self.jet_name == "Jet": +# logger.warning_once( +# f"{id(self)}_depr_jec_config", +# "config aux 'jec' does not contain key for input jet " +# f"collection '{self.jet_name}'. This may be due to " +# "an outdated config. Continuing under the assumption that " +# "the entire 'jec' entry refers to this jet collection. 
" +# "This assumption will be removed in future versions of " +# "columnflow, so please adapt the config according to the " +# "documentation to remove this warning and ensure future " +# "compatibility of the code.", +# ) +# return jec_cfg + +# # otherwise raise exception +# raise ValueError( +# "config aux 'jec' does not contain key for input jet " +# f"collection '{self.jet_name}'.", +# ) + +# return jec_cfg[self.jet_name] + + +# @calibrator( +# uses={ +# optional("fixedGridRhoFastjetAll"), +# optional("Rho.fixedGridRhoFastjetAll"), +# attach_coffea_behavior, +# }, +# # name of the jet collection to calibrate +# jet_name="Jet", +# # name of the associated MET collection +# met_name="MET", +# # name of the associated Raw MET collection +# raw_met_name="RawMET", +# # custom uncertainty sources, defaults to config when empty +# uncertainty_sources=None, +# # toggle for propagation to PuppiMET +# propagate_met=True, +# # # function to determine the correction file +# get_jec_file=get_jec_file_default, +# # # function to determine the jec configuration dict +# get_jec_config=get_jec_config_default, +# ) + +# def jec( +# self: Calibrator, +# events: ak.Array, +# min_pt_met_prop: float = 15.0, +# max_eta_met_prop: float = 5.2, +# **kwargs, +# ) -> ak.Array: +# """Performs the jet energy corrections (JECs) and uncertainty shifts using the +# :external+correctionlib:doc:`index`, optionally +# propagating the changes to the PuppiMET. + +# The *jet_name* should be set to the name of the NanoAOD jet collection to calibrate +# (default: ``Jet``, i.e. AK4 jets). + +# Requires an external file in the config pointing to the JSON files containing the JECs. +# The file key can be specified via an optional ``external_file_key`` in the ``jec`` config entry. +# If not given, the file key will be determined automatically based on the jet collection name: +# ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files +# can be specified as: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# For more file-grained control, the *get_jec_file* can be adapted in a subclass in case it is stored +# differently in the external files + +# The JEC configuration should be an auxiliary entry in the config, specifying the correction +# details under "jec". Separate configs should be given for each jet collection to calibrate, +# using the jet collection name as a subkey. An example of a valid configuration for correction +# AK4 jets with JEC is: + +# .. code-block:: python + +# cfg.x.jec = { +# "Jet": { +# "campaign": "Summer19UL17", +# "version": "V5", +# "jet_type": "AK4PFchs", +# "levels": ["L1L2L3Res"], # or individual correction levels +# "levels_for_type1_met": ["L1FastJet"], +# "uncertainty_sources": [ +# "Total", +# "CorrelationGroupMPFInSitu", +# "CorrelationGroupIntercalibration", +# "CorrelationGroupbJES", +# "CorrelationGroupFlavor", +# "CorrelationGroupUncorrelated", +# ] +# }, +# } + +# *get_jec_config* can be adapted in a subclass in case it is stored differently in the config. + +# If running on data, the datasets must have an auxiliary field *jec_era* defined, e.g. "RunF", +# or an auxiliary field *era*, e.g. "F". 
+ +# This instance of :py:class:`~columnflow.calibration.Calibrator` is +# initialized with the following parameters by default: + +# :param events: awkward array containing events to process + +# :param min_pt_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet values +# to the missing transverse energy (PuppiMET) using +# :py:func:`~columnflow.calibration.util.propagate_met` for events where +# ``met.pt > *min_pt_met_prop*``. +# :param max_eta_met_prop: If *propagate_met* variable is ``True`` propagate the updated jet +# values to the missing transverse energy (PuppiMET) using +# :py:func:`~columnflow.calibration.util.propagate_met` for events where +# ``met.eta > *min_eta_met_prop*``. +# """ # noqa - # calculate uncorrected pt, mass - events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) - events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) - - # calculate uncorrected pt, mass - events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) - events = set_ak_column_f32(events, f"{jet_name}.mass_raw", events[jet_name].mass * (1 - events[jet_name].rawFactor)) - - def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): - # variable naming convention - variable_map = { - "JetA": area, - "JetEta": eta, - "JetPt": pt, - "JetPhi": phi, - "Rho": ak.values_astype(rho, np.float32), - } - - # apply all correctors sequentially, updating the pt each time - full_correction = ak.ones_like(pt, dtype=np.float32) +# # calculate uncorrected pt, mass +# events = set_ak_column_f32(events, "Jet.pt_raw", events.Jet.pt * (1 - events.Jet.rawFactor)) +# events = set_ak_column_f32(events, "Jet.mass_raw", events.Jet.mass * (1 - events.Jet.rawFactor)) + +# # calculate uncorrected pt, mass +# events = set_ak_column_f32(events, f"{jet_name}.pt_raw", events[jet_name].pt * (1 - events[jet_name].rawFactor)) +# events = set_ak_column_f32(events, f"{jet_name}.mass_raw", events[jet_name].mass * (1 - events[jet_name].rawFactor)) + +# def correct_jets(*, pt, eta, phi, area, rho, evaluator_key="jec"): +# # variable naming convention +# variable_map = { +# "JetA": area, +# "JetEta": eta, +# "JetPt": pt, +# "JetPhi": phi, +# "Rho": ak.values_astype(rho, np.float32), +# } + +# # apply all correctors sequentially, updating the pt each time +# full_correction = ak.ones_like(pt, dtype=np.float32) - for corrector in self.evaluators[evaluator_key]: - # determine correct inputs (change depending on corrector) - inputs = [ - variable_map[inp.name] - for inp in corrector.inputs - ] - correction = ak_evaluate(corrector, *inputs) - # update pt for subsequent correctors - #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes - variable_map["JetPt"] = variable_map["JetPt"] * correction - full_correction = full_correction * correction - - return full_correction - - # obtain rho, which might be located at different routes, depending on the nano version - rho = ( - events.fixedGridRhoFastjetAll - if "fixedGridRhoFastjetAll" in events.fields - else events.Rho.fixedGridRhoFastjetAll - ) - - # correct jets with only a subset of correction levels - # (for calculating TypeI PuppiMET correction) - if self.propagate_met: - # get correction factors - jec_factors_subset_type1_met = correct_jets( - pt=events[jet_name].pt_raw, - eta=events[jet_name].eta, - phi=events[jet_name].phi, - area=events[jet_name].area, - rho=rho, - evaluator_key="jec_subset_type1_met", - 
) - - # temporarily apply the new factors with only subset of corrections - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors_subset_type1_met) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info - # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts - met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) - jetsum = events[jet_name][met_prop_mask].sum(axis=1) - jetsum_pt_subset_type1_met = jetsum.pt - jetsum_phi_subset_type1_met = jetsum.phi - - # factors for full jet correction with all levels - jec_factors = correct_jets( - pt=events[jet_name].pt_raw, - eta=events[jet_name].eta, - phi=events[jet_name].phi, - area=events[jet_name].area, - rho=rho, - evaluator_key="jec", - ) - - # apply full jet correction - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors) - rawFactor = ak.nan_to_num(1 - events[jet_name].pt_raw / events[jet_name].pt, nan=0.0) - events = set_ak_column_f32(events, f"{jet_name}.rawFactor", rawFactor) - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # nominal met propagation - if self.propagate_met: - # get pt and phi of all jets after correcting - jetsum = events[jet_name][met_prop_mask].sum(axis=1) - jetsum_pt_all_levels = jetsum.pt - jetsum_phi_all_levels = jetsum.phi - - # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels - # (recommendation is to propagate only L2 corrections and onwards) - met_pt, met_phi = propagate_met( - jetsum_pt_subset_type1_met, - jetsum_phi_subset_type1_met, - jetsum_pt_all_levels, - jetsum_phi_all_levels, - events.RawPuppiMET.pt, - events.RawPuppiMET.phi, - ) - - events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) - events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - - # variable naming conventions - variable_map = { - "JetEta": events[jet_name].eta, - "JetPt": events[jet_name].pt_raw, - } - - # jet energy uncertainty components - for name, evaluator in self.evaluators["junc"].items(): - # get uncertainty - inputs = [variable_map[inp.name] for inp in evaluator.inputs] - jec_uncertainty = ak_evaluate(evaluator, *inputs) - - # apply jet uncertainty shifts - events = set_ak_column_f32( - events, f"{jet_name}.pt_jec_{name}_up", events[jet_name].pt * (1.0 + jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.pt_jec_{name}_down", events[jet_name].pt * (1.0 - jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.mass_jec_{name}_up", events[jet_name].mass * (1.0 + jec_uncertainty), - ) - events = set_ak_column_f32( - events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), - ) - - # propagate shifts to PuppiMET - if self.propagate_met: - jet_pt_up = events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] - jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] - met_pt_up, met_phi_up = propagate_met( - jetsum_pt_all_levels, - jetsum_phi_all_levels, - jet_pt_up, - events[jet_name][met_prop_mask].phi, - met_pt, - met_phi, - ) - met_pt_down, 
met_phi_down = propagate_met( - jetsum_pt_all_levels, - jetsum_phi_all_levels, - jet_pt_down, - events[jet_name][met_prop_mask].phi, - met_pt, - met_phi, - ) - events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) - events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) - events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) - events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) - - return events - - -@jec.init -def jec_init(self: Calibrator) -> None: - jec_cfg = self.get_jec_config() - - sources = self.uncertainty_sources - if sources is None: - sources = jec_cfg.uncertainty_sources - - # register used jet columns - self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,area,rawFactor}}") - - # register produced jet columns - self.produces.add(f"{self.jet_name}.{{pt,mass,rawFactor}}") - - # add shifted jet variables - self.produces |= { - f"{self.jet_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" - for shifted_var in ("pt", "mass") - for junc_name in sources - for junc_dir in ("up", "down") - } - - # add PuppiMET variables - if self.propagate_met: - self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} - self.produces |= {"PuppiMET.pt", "PuppiMET.phi"} - - # add shifted PuppiMET variables - self.produces |= { - f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" - for shifted_var in ("pt", "phi") - for junc_name in sources - for junc_dir in ("up", "down") - } - - -@jec.requires -def jec_requires(self: Calibrator, reqs: dict) -> None: - if "external_files" in reqs: - return - - from columnflow.tasks.external import BundleExternalFiles - reqs["external_files"] = BundleExternalFiles.req(self.task) - - -@jec.setup -def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: - """ - Load the correct jec files using the :py:func:`from_string` method of the - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - function and apply the corrections as needed. - - The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - instance are extracted with the :py:meth:`~.jec.get_jec_file`. - - Uses the member function :py:meth:`~.jec.get_jec_config` to construct the - required keys, which are based on the following information about the JEC: - - - levels - - campaign - - version - - jet_type - - A corresponding example snippet wihtin the *config_inst* could like something - like this: - - .. 
code-block:: python - - cfg.x.jec = DotDict.wrap({ - # campaign name for this JEC correctiono - "campaign": f"Summer19UL{year2}{jerc_postfix}", - # version of the corrections - "version": "V7", - # Type of jets that the corrections should be applied on - "jet_type": "AK4PFchs", - # relevant levels in the derivation process of the JEC - "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], - # relevant levels in the derivation process of the Type 1 PuppiMET JEC - "levels_for_type1_met": ["L1FastJet"], - # names of the uncertainties to be applied - "uncertainty_sources": [ - "Total", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ], - }) - - :param reqs: Requirement dictionary for this - :py:class:`~columnflow.calibration.Calibrator` instance - :param inputs: Additional inputs, currently not used - :param reader_targets: TODO: add documentation - """ - - bundle = reqs["external_files"] +# for corrector in self.evaluators[evaluator_key]: +# # determine correct inputs (change depending on corrector) +# inputs = [ +# variable_map[inp.name] +# for inp in corrector.inputs +# ] +# correction = ak_evaluate(corrector, *inputs) +# # update pt for subsequent correctors +# #pprint(corrector.__dict__) # If `corrector` is a custom object with attributes +# variable_map["JetPt"] = variable_map["JetPt"] * correction +# full_correction = full_correction * correction + +# return full_correction + +# # obtain rho, which might be located at different routes, depending on the nano version +# rho = ( +# events.fixedGridRhoFastjetAll +# if "fixedGridRhoFastjetAll" in events.fields +# else events.Rho.fixedGridRhoFastjetAll +# ) + +# # correct jets with only a subset of correction levels +# # (for calculating TypeI PuppiMET correction) +# if self.propagate_met: +# # get correction factors +# jec_factors_subset_type1_met = correct_jets( +# pt=events[jet_name].pt_raw, +# eta=events[jet_name].eta, +# phi=events[jet_name].phi, +# area=events[jet_name].area, +# rho=rho, +# evaluator_key="jec_subset_type1_met", +# ) + +# # temporarily apply the new factors with only subset of corrections +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors_subset_type1_met) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors_subset_type1_met) +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # store pt and phi of the full jet system for PuppiMET propagation, including a selection in raw info +# # see https://twiki.cern.ch/twiki/bin/view/CMS/JECAnalysesRecommendations?rev=19#Minimum_jet_selection_cuts +# met_prop_mask = (events[jet_name].pt_raw > min_pt_met_prop) & (abs(events[jet_name].eta) < max_eta_met_prop) +# jetsum = events[jet_name][met_prop_mask].sum(axis=1) +# jetsum_pt_subset_type1_met = jetsum.pt +# jetsum_phi_subset_type1_met = jetsum.phi + +# # factors for full jet correction with all levels +# jec_factors = correct_jets( +# pt=events[jet_name].pt_raw, +# eta=events[jet_name].eta, +# phi=events[jet_name].phi, +# area=events[jet_name].area, +# rho=rho, +# evaluator_key="jec", +# ) + +# # apply full jet correction +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt_raw * jec_factors) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass_raw * jec_factors) +# rawFactor = ak.nan_to_num(1 - events[jet_name].pt_raw / 
events[jet_name].pt, nan=0.0) +# events = set_ak_column_f32(events, f"{jet_name}.rawFactor", rawFactor) +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # nominal met propagation +# if self.propagate_met: +# # get pt and phi of all jets after correcting +# jetsum = events[jet_name][met_prop_mask].sum(axis=1) +# jetsum_pt_all_levels = jetsum.pt +# jetsum_phi_all_levels = jetsum.phi + +# # propagate changes to PuppiMET, starting from jets corrected with subset of JEC levels +# # (recommendation is to propagate only L2 corrections and onwards) +# met_pt, met_phi = propagate_met( +# jetsum_pt_subset_type1_met, +# jetsum_phi_subset_type1_met, +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# events.RawPuppiMET.pt, +# events.RawPuppiMET.phi, +# ) + +# events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) +# events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) + +# # variable naming conventions +# variable_map = { +# "JetEta": events[jet_name].eta, +# "JetPt": events[jet_name].pt_raw, +# } + +# # jet energy uncertainty components +# for name, evaluator in self.evaluators["junc"].items(): +# # get uncertainty +# inputs = [variable_map[inp.name] for inp in evaluator.inputs] +# jec_uncertainty = ak_evaluate(evaluator, *inputs) + +# # apply jet uncertainty shifts +# events = set_ak_column_f32( +# events, f"{jet_name}.pt_jec_{name}_up", events[jet_name].pt * (1.0 + jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.pt_jec_{name}_down", events[jet_name].pt * (1.0 - jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.mass_jec_{name}_up", events[jet_name].mass * (1.0 + jec_uncertainty), +# ) +# events = set_ak_column_f32( +# events, f"{jet_name}.mass_jec_{name}_down", events[jet_name].mass * (1.0 - jec_uncertainty), +# ) + +# # propagate shifts to PuppiMET +# if self.propagate_met: +# jet_pt_up = events[jet_name][met_prop_mask][f"pt_jec_{name}_up"] +# jet_pt_down = events[jet_name][met_prop_mask][f"pt_jec_{name}_down"] +# met_pt_up, met_phi_up = propagate_met( +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# jet_pt_up, +# events[jet_name][met_prop_mask].phi, +# met_pt, +# met_phi, +# ) +# met_pt_down, met_phi_down = propagate_met( +# jetsum_pt_all_levels, +# jetsum_phi_all_levels, +# jet_pt_down, +# events[jet_name][met_prop_mask].phi, +# met_pt, +# met_phi, +# ) +# events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_up", met_pt_up) +# events = set_ak_column_f32(events, f"PuppiMET.pt_jec_{name}_down", met_pt_down) +# events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_up", met_phi_up) +# events = set_ak_column_f32(events, f"PuppiMET.phi_jec_{name}_down", met_phi_down) + +# return events + + +# @jec.init +# def jec_init(self: Calibrator) -> None: +# jec_cfg = self.get_jec_config() + +# sources = self.uncertainty_sources +# if sources is None: +# sources = jec_cfg.uncertainty_sources + +# # register used jet columns +# self.uses.add(f"{self.jet_name}.{{pt,eta,phi,mass,area,rawFactor}}") + +# # register produced jet columns +# self.produces.add(f"{self.jet_name}.{{pt,mass,rawFactor}}") + +# # add shifted jet variables +# self.produces |= { +# f"{self.jet_name}.{shifted_var}_jec_{junc_name}_{junc_dir}" +# for shifted_var in ("pt", "mass") +# for junc_name in sources +# for junc_dir in ("up", "down") +# } + +# # add PuppiMET variables +# if self.propagate_met: +# self.uses |= {"RawPuppiMET.pt", "RawPuppiMET.phi","PuppiMET.pt", "PuppiMET.phi"} +# self.produces |= {"PuppiMET.pt", 
"PuppiMET.phi"} + +# # add shifted PuppiMET variables +# self.produces |= { +# f"PuppiMET.{shifted_var}_jec_{junc_name}_{junc_dir}" +# for shifted_var in ("pt", "phi") +# for junc_name in sources +# for junc_dir in ("up", "down") +# } + + +# @jec.requires +# def jec_requires(self: Calibrator, reqs: dict) -> None: +# if "external_files" in reqs: +# return + +# from columnflow.tasks.external import BundleExternalFiles +# reqs["external_files"] = BundleExternalFiles.req(self.task) + + +# @jec.setup +# def jec_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: +# """ +# Load the correct jec files using the :py:func:`from_string` method of the +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# function and apply the corrections as needed. + +# The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# instance are extracted with the :py:meth:`~.jec.get_jec_file`. + +# Uses the member function :py:meth:`~.jec.get_jec_config` to construct the +# required keys, which are based on the following information about the JEC: + +# - levels +# - campaign +# - version +# - jet_type + +# A corresponding example snippet wihtin the *config_inst* could like something +# like this: + +# .. code-block:: python + +# cfg.x.jec = DotDict.wrap({ +# # campaign name for this JEC correctiono +# "campaign": f"Summer19UL{year2}{jerc_postfix}", +# # version of the corrections +# "version": "V7", +# # Type of jets that the corrections should be applied on +# "jet_type": "AK4PFchs", +# # relevant levels in the derivation process of the JEC +# "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], +# # relevant levels in the derivation process of the Type 1 PuppiMET JEC +# "levels_for_type1_met": ["L1FastJet"], +# # names of the uncertainties to be applied +# "uncertainty_sources": [ +# "Total", +# "CorrelationGroupMPFInSitu", +# "CorrelationGroupIntercalibration", +# "CorrelationGroupbJES", +# "CorrelationGroupFlavor", +# "CorrelationGroupUncorrelated", +# ], +# }) + +# :param reqs: Requirement dictionary for this +# :py:class:`~columnflow.calibration.Calibrator` instance +# :param inputs: Additional inputs, currently not used +# :param reader_targets: TODO: add documentation +# """ + +# bundle = reqs["external_files"] - # import the correction sets from the external file - import correctionlib - - correction_set = correctionlib.CorrectionSet.from_string( - self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), - ) - - # compute JEC keys from config information - jec_cfg = self.get_jec_config() - - def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): - if is_data: - - jec_era = self.dataset_inst.get_aux("jec_era", None) - # if no special JEC era is specified, infer based on 'era' - if jec_era is None: - jec_era = "Run" + self.dataset_inst.get_aux("era") - - return [ - f"{jec.campaign}_{jec_era}_{jec.version}_DATA_{name}_{jec.jet_type}" - if is_data else - f"{jec.campaign}_{jec.version}_MC_{name}_{jec.jet_type}" - for name in names - ] - - # take sources from constructor or config - sources = self.uncertainty_sources - if sources is None: - sources = jec_cfg.uncertainty_sources +# # import the correction sets from the external file +# import correctionlib + +# correction_set = correctionlib.CorrectionSet.from_string( +# self.get_jec_file(bundle.files).load(formatter="gzip").decode("utf-8"), +# ) + +# # compute JEC keys from config information +# jec_cfg = 
self.get_jec_config() + +# def make_jme_keys(names, jec=jec_cfg, is_data=self.dataset_inst.is_data): +# if is_data: + +# jec_era = self.dataset_inst.get_aux("jec_era", None) +# # if no special JEC era is specified, infer based on 'era' +# if jec_era is None: +# jec_era = "Run" + self.dataset_inst.get_aux("era") + +# return [ +# f"{jec.campaign}_{jec_era}_{jec.version}_DATA_{name}_{jec.jet_type}" +# if is_data else +# f"{jec.campaign}_{jec.version}_MC_{name}_{jec.jet_type}" +# for name in names +# ] + +# # take sources from constructor or config +# sources = self.uncertainty_sources +# if sources is None: +# sources = jec_cfg.uncertainty_sources - if self.dataset_inst.is_data : - jec_keys = make_jme_keys(jec_cfg.levels_DATA) - else : - jec_keys = make_jme_keys(jec_cfg.levels_MC) - jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) - junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys - - # store the evaluators - self.evaluators = { - "jec": get_evaluators(correction_set, jec_keys), - "jec_subset_type1_met": get_evaluators(correction_set, jec_keys_subset_type1_met), - "junc": dict(zip(sources, get_evaluators(correction_set, junc_keys))), - } - - -# custom jec calibrator that only runs nominal correction -jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) - -# define default functions for jec calibrator -def get_jer_file(self, external_files: DotDict) -> str: - """ - Load config relevant to the jet energy resolution (JER) smearing. - - By default, this is extracted from the current *config_inst*, - assuming the JER configurations are stored under the 'jer' - aux key. Separate configurations should be specified for each - jet collection, using the collection name as a key. For example, - the configuration for the default jet collection ``Jet`` will - be retrieved from the following config entry: - - .. code-block:: python - - self.config_inst.x.jer.Jet - - Used in :py:meth:`~.jer.setup_func`. - - :return: Dictionary containing configuration for JER smearing - """ - jer_cfg = self.config_inst.x.jer - - # check for old-style config - if self.jet_name not in jer_cfg: - # if jet collection is `Jet`, issue deprecation warning - if self.jet_name == "Jet": - logger.warning_once( - f"{id(self)}_depr_jer_config", - "config aux 'jer' does not contain key for input jet " - f"collection '{self.jet_name}'. This may be due to " - "an outdated config. Continuing under the assumption that " - "the entire 'jer' entry refers to this jet collection. 
" - "This assumption will be removed in future versions of " - "columnflow, so please adapt the config according to the " - "documentation to remove this warning and ensure future " - "compatibility of the code.", - ) - return jer_cfg - - # otherwise raise exception - raise ValueError( - "config aux 'jer' does not contain key for input jet " - f"collection '{self.jet_name}'.", - ) - - return jer_cfg[self.jet_name] - - -# -# jet energy resolution smearing -# - -@calibrator( - uses={ - optional("Rho.fixedGridRhoFastjetAll"), - optional("fixedGridRhoFastjetAll"), - "GenJet.pt", "GenJet.eta", "GenJet.phi", - "PuppiMET.pt", "PuppiMET.phi", - attach_coffea_behavior, - }, - produces={ - "Jet.pt", "Jet.mass", - "Jet.pt_unsmeared", "Jet.mass_unsmeared", - "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", - "PuppiMET.pt", "PuppiMET.phi", - "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", - }, - # toggle for propagation to PuppiMET - propagate_met=True, - # only run on mc - mc_only=True, - # use deterministic seeds for random smearing and - # take the "index"-th random number per seed when not -1 - deterministic_seed_index=-1, - # function to determine the correction file - get_jer_file=get_jerc_file_default, - # function to determine the jer configuration dict - get_jer_config=get_jer_config_default, -) -def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: - """ - Applies the jet energy resolution smearing in MC and calculates the associated uncertainty - shifts using the :external+correctionlib:doc:`index`, following the recommendations given in - https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetResolution. - - The *jet_name* and *gen_jet_name* should be set to the name of the NanoAOD jet and gen jet - collections to use as an input for JER smearing (default: ``Jet`` and ``GenJet``, respectively, - i.e. AK4 jets). - - Requires an external file in the config pointing to the JSON files containing the JER information. - The file key can be specified via an optional ``external_file_key`` in the ``jer`` config entry. - If not given, the file key will be determined automatically based on the jet collection name: - ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files - can be specified as: - - .. code-block:: python - - cfg.x.external_files = DotDict.wrap({ - "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", - "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", - }) - - For more fine-grained control, the *get_jer_file* can be adapted in a subclass in case it is stored - differently in the external files. - - The JER smearing configuration should be an auxiliary entry in the config, specifying the input - JER to use under "jer". Separate configs should be given for each jet collection to smear, using - the jet collection name as a subkey. An example of a valid configuration for smearing - AK4 jets with JER is: - - .. code-block:: python - - cfg.x.jer = { - "Jet": { - "campaign": "Summer19UL17", - "version": "JRV2", - "jet_type": "AK4PFchs", - }, - } - - *get_jer_config* can be adapted in a subclass in case it is stored differently in the config. - - Throws an error if running on data. 
- - :param events: awkward array containing events to process - """ # noqa - # use local variables for convenience - jet_name = self.jet_name - gen_jet_name = self.gen_jet_name - - # fail when running on data - if self.dataset_inst.is_data: - raise ValueError("attempt to apply jet energy resolution smearing in data") - - # save the unsmeared properties in case they are needed later - events = set_ak_column_f32(events, f"{jet_name}.pt_unsmeared", events[jet_name].pt) - events = set_ak_column_f32(events, f"{jet_name}.mass_unsmeared", events[jet_name].mass) - - # obtain rho, which might be located at different routes, depending on the nano version - rho = ( - events.fixedGridRhoFastjetAll - if "fixedGridRhoFastjetAll" in events.fields else - events.Rho.fixedGridRhoFastjetAll - ) - - # variable naming convention - variable_map = { - "JetEta": events[jet_name].eta, - "JetPt": events[jet_name].pt, - "Rho": rho, - } - - # pt resolution - inputs = [variable_map[inp.name] for inp in self.evaluators["jer"].inputs] - jer = ak_evaluate(self.evaluators["jer"], *inputs) - - # JER scale factors and systematic variations - jersf = {} - for syst in ("nom", "up", "down"): - variable_map_syst = dict(variable_map, systematic=syst) - inputs = [variable_map_syst[inp.name] for inp in self.evaluators["sf"].inputs] - jersf[syst] = ak_evaluate(self.evaluators["sf"], *inputs) - - # array with all JER scale factor variations as an additional axis - # (note: axis needs to be regular for broadcasting to work correctly) - jersf = ak.concatenate( - [jersf[syst][..., None] for syst in ("nom", "up", "down")], - axis=-1, - ) - - # -- stochastic smearing - # normally distributed random numbers according to JER - jer_random_normal = ( - ak_random(0, jer, events[jet_name].deterministic_seed, rand_func=self.deterministic_normal) - if self.deterministic_seed_index >= 0 - else ak_random(0, jer, rand_func=np.random.Generator( - np.random.SFC64(events.event.to_list())).normal, - ) - ) - - # scale random numbers according to JER SF - jersf2_m1 = jersf ** 2 - 1 - add_smear = np.sqrt(ak.where(jersf2_m1 < 0, 0, jersf2_m1)) - - # broadcast over JER SF variations - jer_random_normal, jersf_z = ak.broadcast_arrays(jer_random_normal, add_smear) - - # compute smearing factors (stochastic method) - smear_factors_stochastic = 1.0 + jer_random_normal * add_smear - - # -- scaling method (using gen match) - - # mask negative gen jet indices (= no gen match) - gen_jet_idx = events[jet_name][self.gen_jet_idx_column] - valid_gen_jet_idxs = ak.mask(gen_jet_idx, gen_jet_idx >= 0) - - # pad list of gen jets to prevent index error on match lookup - max_gen_jet_idx = ak.max(valid_gen_jet_idxs) - padded_gen_jets = ak.pad_none( - events[gen_jet_name], - 0 if max_gen_jet_idx is None else (max_gen_jet_idx + 1), - ) - - # gen jets that match the reconstructed jets - matched_gen_jets = padded_gen_jets[valid_gen_jet_idxs] - - # compute the relative (reco - gen) pt difference - pt_relative_diff = (events[jet_name].pt - matched_gen_jets.pt) / events[jet_name].pt - - # test if matched gen jets are within 3 * resolution - is_matched_pt = np.abs(pt_relative_diff) < 3 * jer - is_matched_pt = ak.fill_none(is_matched_pt, False) # masked values = no gen match - - # (no check for Delta-R matching criterion; we assume this was done during - # nanoAOD production to get the `genJetIdx`) - - # broadcast over JER SF variations - pt_relative_diff, jersf = ak.broadcast_arrays(pt_relative_diff, jersf) - - # compute smearing factors (scaling method) - smear_factors_scaling = 1.0 
+ (jersf - 1.0) * pt_relative_diff - - # -- hybrid smearing: take smear factors from scaling if there was a match, - # otherwise take the stochastic ones - smear_factors = ak.where( - is_matched_pt[:, :, None], - smear_factors_scaling, - smear_factors_stochastic, - ) - - # ensure array is not nullable (avoid ambiguity on Arrow/Parquet conversion) - smear_factors = ak.fill_none(smear_factors, 0.0) - - # store pt and phi of the full jet system - if self.propagate_met: - jetsum = events[jet_name].sum(axis=1) - jetsum_pt_before = jetsum.pt - jetsum_phi_before = jetsum.phi - - # apply the smearing factors to the pt and mass - # (note: apply variations first since they refer to the original pt) - events = set_ak_column_f32(events, f"{jet_name}.pt_jer_up", events[jet_name].pt * smear_factors[:, :, 1]) - events = set_ak_column_f32(events, f"{jet_name}.mass_jer_up", events[jet_name].mass * smear_factors[:, :, 1]) - events = set_ak_column_f32(events, f"{jet_name}.pt_jer_down", events[jet_name].pt * smear_factors[:, :, 2]) - events = set_ak_column_f32(events, f"{jet_name}.mass_jer_down", events[jet_name].mass * smear_factors[:, :, 2]) - events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt * smear_factors[:, :, 0]) - events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass * smear_factors[:, :, 0]) - - # recover coffea behavior - events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) - - # met propagation - if self.propagate_met: - - # save unsmeared quantities - events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) - events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) - - # get pt and phi of all jets after correcting - jetsum = events[jet_name].sum(axis=1) - jetsum_pt_after = jetsum.pt - jetsum_phi_after = jetsum.phi - - # propagate changes to PuppiMET - met_pt, met_phi = propagate_met( - jetsum_pt_before, - jetsum_phi_before, - jetsum_pt_after, - jetsum_phi_after, - events.PuppiMET.pt, - events.PuppiMET.phi, - ) - events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) - events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) - - # syst variations on top of corrected PuppiMET - met_pt_up, met_phi_up = propagate_met( - jetsum_pt_after, - jetsum_phi_after, - events[jet_name].pt_jer_up, - events[jet_name].phi, - met_pt, - met_phi, - ) - met_pt_down, met_phi_down = propagate_met( - jetsum_pt_after, - jetsum_phi_after, - events[jet_name].pt_jer_down, - events[jet_name].phi, - met_pt, - met_phi, - ) - events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) - events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) - events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) - events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) - - return events - - -@jer.init -def jer_init(self: Calibrator) -> None: - # determine gen-level jet index column - lower_first = lambda s: s[0].lower() + s[1:] if s else s - self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" - - self.uses |= { - "PuppiMET.pt", "PuppiMET.phi", - } - self.produces |= { - "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", - "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", - } - - -@jer.requires -def jer_requires(self: Calibrator, reqs: dict) -> None: - if "external_files" in reqs: - return - - from columnflow.tasks.external import BundleExternalFiles - reqs["external_files"] = 
BundleExternalFiles.req(self.task) - - -@jer.setup -def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: - """ - Load the correct jer files using the :py:func:`from_string` method of the - :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` function and apply the - corrections as needed. - - The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` - instance are extracted with the :py:meth:`~.jer.get_jer_file`. - - Uses the member function :py:meth:`~.jer.get_jer_config` to construct the required keys, which - are based on the following information about the JER: - - - campaign - - version - - jet_type - - A corresponding example snippet within the *config_inst* could like something like this: - - .. code-block:: python - - cfg.x.jer = DotDict.wrap({ - "Jet": { - "campaign": f"Summer19UL{year2}{jerc_postfix}", - "version": "JRV3", - "jet_type": "AK4PFchs", - }, - }) - - :param reqs: Requirement dictionary for this :py:class:`~columnflow.calibration.Calibrator` - instance. - :param inputs: Additional inputs, currently not used. - :param reader_targets: TODO: add documentation. - """ - bundle = reqs["external_files"] - - # import the correction sets from the external file - import correctionlib - correction_set = correctionlib.CorrectionSet.from_string( - self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), - ) +# if self.dataset_inst.is_data : +# jec_keys = make_jme_keys(jec_cfg.levels_DATA) +# else : +# jec_keys = make_jme_keys(jec_cfg.levels_MC) +# jec_keys_subset_type1_met = make_jme_keys(jec_cfg.levels_for_type1_met) +# junc_keys = make_jme_keys(sources, is_data=False) # uncertainties only stored as MC keys + +# # store the evaluators +# self.evaluators = { +# "jec": get_evaluators(correction_set, jec_keys), +# "jec_subset_type1_met": get_evaluators(correction_set, jec_keys_subset_type1_met), +# "junc": dict(zip(sources, get_evaluators(correction_set, junc_keys))), +# } + + +# # custom jec calibrator that only runs nominal correction +# jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) + +# # define default functions for jec calibrator +# def get_jer_file(self, external_files: DotDict) -> str: +# """ +# Load config relevant to the jet energy resolution (JER) smearing. + +# By default, this is extracted from the current *config_inst*, +# assuming the JER configurations are stored under the 'jer' +# aux key. Separate configurations should be specified for each +# jet collection, using the collection name as a key. For example, +# the configuration for the default jet collection ``Jet`` will +# be retrieved from the following config entry: + +# .. code-block:: python + +# self.config_inst.x.jer.Jet + +# Used in :py:meth:`~.jer.setup_func`. + +# :return: Dictionary containing configuration for JER smearing +# """ +# jer_cfg = self.config_inst.x.jer + +# # check for old-style config +# if self.jet_name not in jer_cfg: +# # if jet collection is `Jet`, issue deprecation warning +# if self.jet_name == "Jet": +# logger.warning_once( +# f"{id(self)}_depr_jer_config", +# "config aux 'jer' does not contain key for input jet " +# f"collection '{self.jet_name}'. This may be due to " +# "an outdated config. Continuing under the assumption that " +# "the entire 'jer' entry refers to this jet collection. 
" +# "This assumption will be removed in future versions of " +# "columnflow, so please adapt the config according to the " +# "documentation to remove this warning and ensure future " +# "compatibility of the code.", +# ) +# return jer_cfg + +# # otherwise raise exception +# raise ValueError( +# "config aux 'jer' does not contain key for input jet " +# f"collection '{self.jet_name}'.", +# ) + +# return jer_cfg[self.jet_name] + + +# # +# # jet energy resolution smearing +# # + +# @calibrator( +# uses={ +# optional("Rho.fixedGridRhoFastjetAll"), +# optional("fixedGridRhoFastjetAll"), +# "GenJet.pt", "GenJet.eta", "GenJet.phi", +# "PuppiMET.pt", "PuppiMET.phi", +# attach_coffea_behavior, +# }, +# produces={ +# "Jet.pt", "Jet.mass", +# "Jet.pt_unsmeared", "Jet.mass_unsmeared", +# "Jet.pt_jer_up", "Jet.pt_jer_down", "Jet.mass_jer_up", "Jet.mass_jer_down", +# "PuppiMET.pt", "PuppiMET.phi", +# "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", "PuppiMET.phi_jer_down", +# }, +# # toggle for propagation to PuppiMET +# propagate_met=True, +# # only run on mc +# mc_only=True, +# # use deterministic seeds for random smearing and +# # take the "index"-th random number per seed when not -1 +# deterministic_seed_index=-1, +# # function to determine the correction file +# get_jer_file=get_jerc_file_default, +# # function to determine the jer configuration dict +# get_jer_config=get_jer_config_default, +# ) +# def jer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: +# """ +# Applies the jet energy resolution smearing in MC and calculates the associated uncertainty +# shifts using the :external+correctionlib:doc:`index`, following the recommendations given in +# https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetResolution. + +# The *jet_name* and *gen_jet_name* should be set to the name of the NanoAOD jet and gen jet +# collections to use as an input for JER smearing (default: ``Jet`` and ``GenJet``, respectively, +# i.e. AK4 jets). + +# Requires an external file in the config pointing to the JSON files containing the JER information. +# The file key can be specified via an optional ``external_file_key`` in the ``jer`` config entry. +# If not given, the file key will be determined automatically based on the jet collection name: +# ``jet_jerc`` for ``Jet`` (AK4 jets), ``fat_jet_jerc`` for``FatJet`` (AK8 jets). A full set of JSON files +# can be specified as: + +# .. code-block:: python + +# cfg.x.external_files = DotDict.wrap({ +# "jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/jet_jerc.json.gz", +# "fat_jet_jerc": "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c/POG/JME/2017_UL/fatJet_jerc.json.gz", +# }) + +# For more fine-grained control, the *get_jer_file* can be adapted in a subclass in case it is stored +# differently in the external files. + +# The JER smearing configuration should be an auxiliary entry in the config, specifying the input +# JER to use under "jer". Separate configs should be given for each jet collection to smear, using +# the jet collection name as a subkey. An example of a valid configuration for smearing +# AK4 jets with JER is: + +# .. code-block:: python + +# cfg.x.jer = { +# "Jet": { +# "campaign": "Summer19UL17", +# "version": "JRV2", +# "jet_type": "AK4PFchs", +# }, +# } + +# *get_jer_config* can be adapted in a subclass in case it is stored differently in the config. + +# Throws an error if running on data. 
+ +# :param events: awkward array containing events to process +# """ # noqa +# # use local variables for convenience +# jet_name = self.jet_name +# gen_jet_name = self.gen_jet_name + +# # fail when running on data +# if self.dataset_inst.is_data: +# raise ValueError("attempt to apply jet energy resolution smearing in data") + +# # save the unsmeared properties in case they are needed later +# events = set_ak_column_f32(events, f"{jet_name}.pt_unsmeared", events[jet_name].pt) +# events = set_ak_column_f32(events, f"{jet_name}.mass_unsmeared", events[jet_name].mass) + +# # obtain rho, which might be located at different routes, depending on the nano version +# rho = ( +# events.fixedGridRhoFastjetAll +# if "fixedGridRhoFastjetAll" in events.fields else +# events.Rho.fixedGridRhoFastjetAll +# ) + +# # variable naming convention +# variable_map = { +# "JetEta": events[jet_name].eta, +# "JetPt": events[jet_name].pt, +# "Rho": rho, +# } + +# # pt resolution +# inputs = [variable_map[inp.name] for inp in self.evaluators["jer"].inputs] +# jer = ak_evaluate(self.evaluators["jer"], *inputs) + +# # JER scale factors and systematic variations +# jersf = {} +# for syst in ("nom", "up", "down"): +# variable_map_syst = dict(variable_map, systematic=syst) +# inputs = [variable_map_syst[inp.name] for inp in self.evaluators["sf"].inputs] +# jersf[syst] = ak_evaluate(self.evaluators["sf"], *inputs) + +# # array with all JER scale factor variations as an additional axis +# # (note: axis needs to be regular for broadcasting to work correctly) +# jersf = ak.concatenate( +# [jersf[syst][..., None] for syst in ("nom", "up", "down")], +# axis=-1, +# ) + +# # -- stochastic smearing +# # normally distributed random numbers according to JER +# jer_random_normal = ( +# ak_random(0, jer, events[jet_name].deterministic_seed, rand_func=self.deterministic_normal) +# if self.deterministic_seed_index >= 0 +# else ak_random(0, jer, rand_func=np.random.Generator( +# np.random.SFC64(events.event.to_list())).normal, +# ) +# ) + +# # scale random numbers according to JER SF +# jersf2_m1 = jersf ** 2 - 1 +# add_smear = np.sqrt(ak.where(jersf2_m1 < 0, 0, jersf2_m1)) + +# # broadcast over JER SF variations +# jer_random_normal, jersf_z = ak.broadcast_arrays(jer_random_normal, add_smear) + +# # compute smearing factors (stochastic method) +# smear_factors_stochastic = 1.0 + jer_random_normal * add_smear + +# # -- scaling method (using gen match) + +# # mask negative gen jet indices (= no gen match) +# gen_jet_idx = events[jet_name][self.gen_jet_idx_column] +# valid_gen_jet_idxs = ak.mask(gen_jet_idx, gen_jet_idx >= 0) + +# # pad list of gen jets to prevent index error on match lookup +# max_gen_jet_idx = ak.max(valid_gen_jet_idxs) +# padded_gen_jets = ak.pad_none( +# events[gen_jet_name], +# 0 if max_gen_jet_idx is None else (max_gen_jet_idx + 1), +# ) + +# # gen jets that match the reconstructed jets +# matched_gen_jets = padded_gen_jets[valid_gen_jet_idxs] + +# # compute the relative (reco - gen) pt difference +# pt_relative_diff = (events[jet_name].pt - matched_gen_jets.pt) / events[jet_name].pt + +# # test if matched gen jets are within 3 * resolution +# is_matched_pt = np.abs(pt_relative_diff) < 3 * jer +# is_matched_pt = ak.fill_none(is_matched_pt, False) # masked values = no gen match + +# # (no check for Delta-R matching criterion; we assume this was done during +# # nanoAOD production to get the `genJetIdx`) + +# # broadcast over JER SF variations +# pt_relative_diff, jersf = ak.broadcast_arrays(pt_relative_diff, jersf) + 
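+# # (for reference: the scaling method below applies s = 1 + (SF - 1) * (pt_reco - pt_gen) / pt_reco,
+# # while the stochastic method above used s = 1 + N(0, sigma_JER) * sqrt(max(SF**2 - 1, 0)))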
+# # compute smearing factors (scaling method) +# smear_factors_scaling = 1.0 + (jersf - 1.0) * pt_relative_diff + +# # -- hybrid smearing: take smear factors from scaling if there was a match, +# # otherwise take the stochastic ones +# smear_factors = ak.where( +# is_matched_pt[:, :, None], +# smear_factors_scaling, +# smear_factors_stochastic, +# ) + +# # ensure array is not nullable (avoid ambiguity on Arrow/Parquet conversion) +# smear_factors = ak.fill_none(smear_factors, 0.0) + +# # store pt and phi of the full jet system +# if self.propagate_met: +# jetsum = events[jet_name].sum(axis=1) +# jetsum_pt_before = jetsum.pt +# jetsum_phi_before = jetsum.phi + +# # apply the smearing factors to the pt and mass +# # (note: apply variations first since they refer to the original pt) +# events = set_ak_column_f32(events, f"{jet_name}.pt_jer_up", events[jet_name].pt * smear_factors[:, :, 1]) +# events = set_ak_column_f32(events, f"{jet_name}.mass_jer_up", events[jet_name].mass * smear_factors[:, :, 1]) +# events = set_ak_column_f32(events, f"{jet_name}.pt_jer_down", events[jet_name].pt * smear_factors[:, :, 2]) +# events = set_ak_column_f32(events, f"{jet_name}.mass_jer_down", events[jet_name].mass * smear_factors[:, :, 2]) +# events = set_ak_column_f32(events, f"{jet_name}.pt", events[jet_name].pt * smear_factors[:, :, 0]) +# events = set_ak_column_f32(events, f"{jet_name}.mass", events[jet_name].mass * smear_factors[:, :, 0]) + +# # recover coffea behavior +# events = self[attach_coffea_behavior](events, collections=[jet_name], **kwargs) + +# # met propagation +# if self.propagate_met: + +# # save unsmeared quantities +# events = set_ak_column_f32(events, "PuppiMET.pt_unsmeared", events.PuppiMET.pt) +# events = set_ak_column_f32(events, "PuppiMET.phi_unsmeared", events.PuppiMET.phi) + +# # get pt and phi of all jets after correcting +# jetsum = events[jet_name].sum(axis=1) +# jetsum_pt_after = jetsum.pt +# jetsum_phi_after = jetsum.phi + +# # propagate changes to PuppiMET +# met_pt, met_phi = propagate_met( +# jetsum_pt_before, +# jetsum_phi_before, +# jetsum_pt_after, +# jetsum_phi_after, +# events.PuppiMET.pt, +# events.PuppiMET.phi, +# ) +# events = set_ak_column_f32(events, "PuppiMET.pt", met_pt) +# events = set_ak_column_f32(events, "PuppiMET.phi", met_phi) + +# # syst variations on top of corrected PuppiMET +# met_pt_up, met_phi_up = propagate_met( +# jetsum_pt_after, +# jetsum_phi_after, +# events[jet_name].pt_jer_up, +# events[jet_name].phi, +# met_pt, +# met_phi, +# ) +# met_pt_down, met_phi_down = propagate_met( +# jetsum_pt_after, +# jetsum_phi_after, +# events[jet_name].pt_jer_down, +# events[jet_name].phi, +# met_pt, +# met_phi, +# ) +# events = set_ak_column_f32(events, "PuppiMET.pt_jer_up", met_pt_up) +# events = set_ak_column_f32(events, "PuppiMET.pt_jer_down", met_pt_down) +# events = set_ak_column_f32(events, "PuppiMET.phi_jer_up", met_phi_up) +# events = set_ak_column_f32(events, "PuppiMET.phi_jer_down", met_phi_down) + +# return events + + +# @jer.init +# def jer_init(self: Calibrator) -> None: +# # determine gen-level jet index column +# lower_first = lambda s: s[0].lower() + s[1:] if s else s +# self.gen_jet_idx_column = lower_first(self.gen_jet_name) + "Idx" + +# self.uses |= { +# "PuppiMET.pt", "PuppiMET.phi", +# } +# self.produces |= { +# "PuppiMET.pt", "PuppiMET.phi", "PuppiMET.pt_jer_up", "PuppiMET.pt_jer_down", "PuppiMET.phi_jer_up", +# "PuppiMET.phi_jer_down", "PuppiMET.pt_unsmeared", "PuppiMET.phi_unsmeared", +# } + + +# @jer.requires +# def jer_requires(self: 
Calibrator, reqs: dict) -> None: +# if "external_files" in reqs: +# return + +# from columnflow.tasks.external import BundleExternalFiles +# reqs["external_files"] = BundleExternalFiles.req(self.task) + + +# @jer.setup +# def jer_setup(self: Calibrator, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: +# """ +# Load the correct jer files using the :py:func:`from_string` method of the +# :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` function and apply the +# corrections as needed. + +# The source files for the :external+correctionlib:py:class:`correctionlib.highlevel.CorrectionSet` +# instance are extracted with the :py:meth:`~.jer.get_jer_file`. + +# Uses the member function :py:meth:`~.jer.get_jer_config` to construct the required keys, which +# are based on the following information about the JER: + +# - campaign +# - version +# - jet_type + +# A corresponding example snippet within the *config_inst* could like something like this: + +# .. code-block:: python + +# cfg.x.jer = DotDict.wrap({ +# "Jet": { +# "campaign": f"Summer19UL{year2}{jerc_postfix}", +# "version": "JRV3", +# "jet_type": "AK4PFchs", +# }, +# }) + +# :param reqs: Requirement dictionary for this :py:class:`~columnflow.calibration.Calibrator` +# instance. +# :param inputs: Additional inputs, currently not used. +# :param reader_targets: TODO: add documentation. +# """ +# bundle = reqs["external_files"] + +# # import the correction sets from the external file +# import correctionlib +# correction_set = correctionlib.CorrectionSet.from_string( +# self.get_jer_file(bundle.files).load(formatter="gzip").decode("utf-8"), +# ) - # compute JER keys from config information - jer_cfg = self.get_jer_config() - jer_keys = { - "jer": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_PtResolution_{jer_cfg.jet_type}", - "sf": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_ScaleFactor_{jer_cfg.jet_type}", - } - - # store the evaluators - self.evaluators = { - name: get_evaluators(correction_set, [key])[0] - for name, key in jer_keys.items() - } - - # use deterministic seeds for random smearing if requested - if self.deterministic_seed_index >= 0: - idx = self.deterministic_seed_index - bit_generator = np.random.SFC64 - def deterministic_normal(loc, scale, seed): - return np.asarray([ - np.random.Generator(bit_generator(_seed)).normal(_loc, _scale, size=idx + 1)[-1] - for _loc, _scale, _seed in zip(loc, scale, seed) - ]) - self.deterministic_normal = deterministic_normal - - -# explicit calibrators for standard jet collections -jer_ak4 = jer.derive("jer_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) -jer_ak8 = jer.derive("jer_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8", "propagate_met": False}) - - -# -# single calibrator for doing both JEC and JER smearing -# - -@calibrator( - uses={jec, jer}, - produces={jec, jer}, - # toggle for propagation to PuppiMET - propagate_met=None, - # functions to determine configs and files - get_jec_file=None, - get_jec_config=None, - get_jer_file=None, - get_jer_config=None, -) -def jets(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: - """ - Instance of :py:class:`~columnflow.calibration.Calibrator` that does all relevant calibrations - for jets, i.e. JEC and JER. For more information, see :py:func:`~.jec` and :py:func:`~.jer`. 
- - :param events: awkward array containing events to process - """ - # apply jet energy corrections - events = self[jec](events, **kwargs) - - # apply jer smearing on MC only - if self.dataset_inst.is_mc: - events = self[jer](events, **kwargs) - - return events - - -@jets.init -def jets_init(self: Calibrator) -> None: - # forward argument to the producers - self.deps_kwargs[jec]["jet_name"] = self.jet_name - self.deps_kwargs[jer]["jet_name"] = self.jet_name - self.deps_kwargs[jer]["gen_jet_name"] = self.gen_jet_name - if self.propagate_met is not None: - self.deps_kwargs[jec]["propagate_met"] = self.propagate_met - self.deps_kwargs[jer]["propagate_met"] = self.propagate_met - if self.get_jec_file is not None: - self.deps_kwargs[jec]["get_jec_file"] = self.get_jec_file - if self.get_jec_config is not None: - self.deps_kwargs[jec]["get_jec_config"] = self.get_jec_config - if self.get_jer_file is not None: - self.deps_kwargs[jer]["get_jer_file"] = self.get_jer_file - if self.get_jer_config is not None: - self.deps_kwargs[jer]["get_jer_config"] = self.get_jer_config - - -# explicit calibrators for standard jet collections -jets_ak4 = jets.derive("jets_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) -jets_ak8 = jets.derive("jets_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8"}) +# # compute JER keys from config information +# jer_cfg = self.get_jer_config() +# jer_keys = { +# "jer": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_PtResolution_{jer_cfg.jet_type}", +# "sf": f"{jer_cfg.campaign}_{jer_cfg.version}_MC_ScaleFactor_{jer_cfg.jet_type}", +# } + +# # store the evaluators +# self.evaluators = { +# name: get_evaluators(correction_set, [key])[0] +# for name, key in jer_keys.items() +# } + +# # use deterministic seeds for random smearing if requested +# if self.deterministic_seed_index >= 0: +# idx = self.deterministic_seed_index +# bit_generator = np.random.SFC64 +# def deterministic_normal(loc, scale, seed): +# return np.asarray([ +# np.random.Generator(bit_generator(_seed)).normal(_loc, _scale, size=idx + 1)[-1] +# for _loc, _scale, _seed in zip(loc, scale, seed) +# ]) +# self.deterministic_normal = deterministic_normal + + +# # explicit calibrators for standard jet collections +# jer_ak4 = jer.derive("jer_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) +# jer_ak8 = jer.derive("jer_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8", "propagate_met": False}) + + +# # +# # single calibrator for doing both JEC and JER smearing +# # + +# @calibrator( +# uses={jec, jer}, +# produces={jec, jer}, +# # toggle for propagation to PuppiMET +# propagate_met=None, +# # functions to determine configs and files +# get_jec_file=None, +# get_jec_config=None, +# get_jer_file=None, +# get_jer_config=None, +# ) +# def jets(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: +# """ +# Instance of :py:class:`~columnflow.calibration.Calibrator` that does all relevant calibrations +# for jets, i.e. JEC and JER. For more information, see :py:func:`~.jec` and :py:func:`~.jer`. 
+ +# :param events: awkward array containing events to process +# """ +# # apply jet energy corrections +# events = self[jec](events, **kwargs) + +# # apply jer smearing on MC only +# if self.dataset_inst.is_mc: +# events = self[jer](events, **kwargs) + +# return events + + +# @jets.init +# def jets_init(self: Calibrator) -> None: +# # forward argument to the producers +# self.deps_kwargs[jec]["jet_name"] = self.jet_name +# self.deps_kwargs[jer]["jet_name"] = self.jet_name +# self.deps_kwargs[jer]["gen_jet_name"] = self.gen_jet_name +# if self.propagate_met is not None: +# self.deps_kwargs[jec]["propagate_met"] = self.propagate_met +# self.deps_kwargs[jer]["propagate_met"] = self.propagate_met +# if self.get_jec_file is not None: +# self.deps_kwargs[jec]["get_jec_file"] = self.get_jec_file +# if self.get_jec_config is not None: +# self.deps_kwargs[jec]["get_jec_config"] = self.get_jec_config +# if self.get_jer_file is not None: +# self.deps_kwargs[jer]["get_jer_file"] = self.get_jer_file +# if self.get_jer_config is not None: +# self.deps_kwargs[jer]["get_jer_config"] = self.get_jer_config + + +# # explicit calibrators for standard jet collections +# jets_ak4 = jets.derive("jets_ak4", cls_dict={"jet_name": "Jet", "gen_jet_name": "GenJet"}) +# jets_ak8 = jets.derive("jets_ak8", cls_dict={"jet_name": "FatJet", "gen_jet_name": "GenJetAK8"}) diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index 0c60ff0fb..ceeccb986 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -316,7 +316,7 @@ def plot_shifted_variable( default_style_config = prepare_style_config( config_inst, category_inst, variable_inst, density, shape_norm, yscale, ) - default_style_config["rax_cfg"]["ylim"] = (0.25, 1.75) + default_style_config["rax_cfg"]["ylim"] = (0.75, 1.25) default_style_config["rax_cfg"]["ylabel"] = "Ratio" if legend_title: default_style_config["legend_cfg"]["title"] = legend_title diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index b8228b361..e7bc90763 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -425,12 +425,14 @@ def eval_formula(formula_str, popt): mask = h1d.values() > 0 y = h1d.values()[mask] y_err = (h1d.variances()[mask])**0.5 - x = h1d.axes[0].centers[mask] - popt, pcov = curve_fit(fitf,x,y, + x = h1d.axes[0].centers + x_masked = x[mask] + + popt, pcov = curve_fit(fitf,x_masked,y, sigma=y_err, absolute_sigma=True, ) - fitres['chi2'][dm] = sum(((y - fitf(x, *popt))/y_err)**2) + fitres['chi2'][dm] = sum(((y - fitf(x_masked, *popt))/y_err)**2) fitres['ndf'][dm] = len(y) - len(popt) fitres['popt'][dm] = popt fitres['pcov'][dm] = pcov diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py index 0de908b80..2bc75c005 100644 --- a/columnflow/tasks/framework/mixins.py +++ b/columnflow/tasks/framework/mixins.py @@ -2447,7 +2447,7 @@ class HistHookMixin(ConfigTask): "default: empty", ) - def invoke_hist_hooks(self, hists: dict) -> dict: + def invoke_hist_hooks(self, hists: dict, category_inst: od.Category) -> dict: """ Invoke hooks to update histograms before plotting. 
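        With this change, each configured hook additionally receives the category
        instance that is currently being processed, i.e. hook callables are invoked as
        ``func(task, hists, category_inst)`` and must return the (possibly updated)
        histograms. A minimal sketch of such a hook; the hook name and the aux key it
        checks are hypothetical and only meant for illustration:

        .. code-block:: python

            def example_hook(task, hists, category_inst):
                # leave categories without fake-factor aux information untouched
                if "ff_regs" not in category_inst.aux:
                    return hists
                # ... derive or modify histograms for this category here ...
                return hists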
""" @@ -2470,7 +2470,7 @@ def invoke_hist_hooks(self, hists: dict) -> dict: # invoke it self.publish_message(f"invoking hist hook '{hook}'") - hists = func(self, hists) + hists = func(self, hists, category_inst) return hists diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index bfc316e9e..f1d9c7e61 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -209,13 +209,19 @@ def run(self): weight = ak.Array(np.ones(len(events), dtype=np.float32)) categories = self.config_inst.categories.names() - sig_regs = [the_cat for the_cat in categories if 'sr' in the_cat] + sr_names = [the_cat for the_cat in categories if 'sr' in the_cat] # define and fill histograms, taking into account multiple axes - for sig_reg in sig_regs: + for sr_name in sr_names: #iterate over the regions needed for calculation of the ff_method - for region in ["sr", "ar_wj", "ar_qcd", "ar_yields"]: + the_sr = self.config_inst.get_category(sr_name) + regions = [sr_name] + if the_sr.aux: + for the_key in the_sr.aux.keys(): + if (the_key == 'abcd_regs') or (the_key == 'ff_regs'): + regions += list(the_sr.aux[the_key].values()) + for region in regions: #by accessing the list of categories we check if the category with this name exists - cat = self.config_inst.get_category(sig_reg.replace('sr',region)) + cat = self.config_inst.get_category(region) if cat.name not in histograms.keys(): histograms[cat.name] = {} for var_key, var_names in self.variable_tuples.items(): # get variable instances @@ -225,39 +231,38 @@ def run(self): # create the histogram in the first chunk histograms[cat.name][var_key] = create_hist_from_variables( *variable_insts, - int_cat_axes=("category", "process", "shift"), + int_cat_axes=("process", "shift"), ) # mask events and weights when selection expressions are found masked_events = events - if region == 'ar_wj': + if 'ar_wj' in region: masked_weights = weight * events.ff_weight_wj_nominal - elif region == 'ar_qcd': + elif 'ar_qcd' in region: masked_weights = weight * events.ff_weight_qcd_nominal else: masked_weights = weight - for variable_inst in variable_insts: - sel = variable_inst.selection - if sel == "1": - continue - if not callable(sel): - raise ValueError( - f"invalid selection '{sel}', for now only callables are supported", - ) - mask = sel(masked_events) - #select only one category per histogram - masked_events = masked_events[mask] - masked_weights = masked_weights[mask] - - # merge category ids + + # for variable_inst in variable_insts: + # sel = variable_inst.selection + # if sel == "1": + # continue + # if not callable(sel): + # raise ValueError( + # f"invalid selection '{sel}', for now only callables are supported", + # ) + # mask = sel(masked_events) + # #select only one category per histogram + # merge category ids category_ids = ak.concatenate( [Route(c).apply(masked_events) for c in self.category_id_columns], axis=-1, ) - + mask = ak.any(category_ids == cat.id, axis = 1) + masked_events = masked_events[mask] + masked_weights = masked_weights[mask] # broadcast arrays so that each event can be filled for all its categories fill_data = { - "category": category_ids, "process": masked_events.process_id, "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, "weight": masked_weights, @@ -274,7 +279,6 @@ def expr(events, *args, **kwargs): # apply it fill_data[variable_inst.name] = expr(masked_events) # fill it - fill_hist( histograms[cat.name][var_key], fill_data, @@ -291,261 +295,6 @@ def expr(events, *args, 
**kwargs): add_default_to_description=True, ) -# class CreateHistograms( -# VariablesMixin, -# WeightProducerMixin, -# MLModelsMixin, -# ProducersMixin, -# ReducedEventsUser, -# ChunkedIOMixin, -# law.LocalWorkflow, -# RemoteWorkflow, -# ): -# last_edge_inclusive = last_edge_inclusive_inst - -# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - -# # upstream requirements -# reqs = Requirements( -# ReducedEventsUser.reqs, -# RemoteWorkflow.reqs, -# ProduceColumns=ProduceColumns, -# MLEvaluation=MLEvaluation, -# ) - -# # strategy for handling missing source columns when adding aliases on event chunks -# missing_column_alias_strategy = "original" - -# # names of columns that contain category ids -# # (might become a parameter at some point) -# category_id_columns = {"category_ids"} - -# # register sandbox and shifts found in the chosen weight producer to this task -# register_weight_producer_sandbox = True -# register_weight_producer_shifts = True - -# @law.util.classproperty -# def mandatory_columns(cls) -> set[str]: -# return set(cls.category_id_columns) | {"process_id"} - -# def workflow_requires(self): -# reqs = super().workflow_requires() - -# # require the full merge forest -# reqs["events"] = self.reqs.ProvideReducedEvents.req(self) - -# if not self.pilot: -# if self.producer_insts: -# reqs["producers"] = [ -# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) -# for producer_inst in self.producer_insts -# if producer_inst.produced_columns -# ] -# if self.ml_model_insts: -# reqs["ml"] = [ -# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) -# for ml_model_inst in self.ml_model_insts -# ] - -# # add weight_producer dependent requirements -# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - -# return reqs - -# def requires(self): -# reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} - -# if self.producer_insts: -# reqs["producers"] = [ -# self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) -# for producer_inst in self.producer_insts -# if producer_inst.produced_columns -# ] -# if self.ml_model_insts: -# reqs["ml"] = [ -# self.reqs.MLEvaluation.req(self, ml_model=ml_model_inst.cls_name) -# for ml_model_inst in self.ml_model_insts -# ] - -# # add weight_producer dependent requirements -# reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - -# return reqs - -# workflow_condition = ReducedEventsUser.workflow_condition.copy() - -# @workflow_condition.output -# def output(self): -# return {"hists": self.target(f"hist__vars_{self.variables_repr}__{self.branch}.pickle")} - -# @law.decorator.notify -# @law.decorator.log -# @law.decorator.localize(input=True, output=False) -# @law.decorator.safe_output -# def run(self): -# import numpy as np -# import awkward as ak -# from columnflow.columnar_util import ( -# Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, -# ) -# from columnflow.hist_util import fill_hist - -# # prepare inputs -# inputs = self.input() - -# # declare output: dict of histograms -# histograms = {} - -# # run the weight_producer setup -# producer_reqs = self.weight_producer_inst.run_requires() -# reader_targets = self.weight_producer_inst.run_setup(producer_reqs, luigi.task.getpaths(producer_reqs)) - -# # create a temp dir for saving intermediate files -# tmp_dir = law.LocalDirectoryTarget(is_tmp=True) -# tmp_dir.touch() - -# # get shift dependent 
aliases -# aliases = self.local_shift_inst.x("column_aliases", {}) - -# # define columns that need to be read -# read_columns = {Route("process_id")} -# read_columns |= set(map(Route, self.category_id_columns)) -# read_columns |= set(self.weight_producer_inst.used_columns) -# read_columns |= set(map(Route, aliases.values())) -# read_columns |= { -# Route(inp) -# for variable_inst in ( -# self.config_inst.get_variable(var_name) -# for var_name in law.util.flatten(self.variable_tuples.values()) -# ) -# for inp in (( -# {variable_inst.expression} -# if isinstance(variable_inst.expression, str) -# # for variable_inst with custom expressions, read columns declared via aux key -# else set(variable_inst.x("inputs", [])) -# ) | ( -# # for variable_inst with selection, read columns declared via aux key -# set(variable_inst.x("inputs", [])) -# if variable_inst.selection != "1" -# else set() -# )) -# } - -# # empty float array to use when input files have no entries -# empty_f32 = ak.Array(np.array([], dtype=np.float32)) - -# # iterate over chunks of events and diffs -# file_targets = [inputs["events"]["events"]] -# if self.producer_insts: -# file_targets.extend([inp["columns"] for inp in inputs["producers"]]) -# if self.ml_model_insts: -# file_targets.extend([inp["mlcolumns"] for inp in inputs["ml"]]) - -# # prepare inputs for localization -# with law.localize_file_targets( -# [*file_targets, *reader_targets.values()], -# mode="r", -# ) as inps: -# for (events, *columns), pos in self.iter_chunked_io( -# [inp.abspath for inp in inps], -# source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), -# read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], -# chunk_size=self.weight_producer_inst.get_min_chunk_size(), -# ): -# # optional check for overlapping inputs -# if self.check_overlapping_inputs: -# self.raise_if_overlapping([events] + list(columns)) - -# # add additional columns -# events = update_ak_array(events, *columns) - -# # add aliases -# events = add_ak_aliases( -# events, -# aliases, -# remove_src=True, -# missing_strategy=self.missing_column_alias_strategy, -# ) - -# # attach coffea behavior aiding functional variable expressions -# events = attach_coffea_behavior(events) - -# # build the full event weight -# if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): -# events, weight = self.weight_producer_inst(events) -# else: -# weight = ak.Array(np.ones(len(events), dtype=np.float32)) - -# # define and fill histograms, taking into account multiple axes -# for var_key, var_names in self.variable_tuples.items(): -# # get variable instances -# variable_insts = [self.config_inst.get_variable(var_name) for var_name in var_names] - -# if var_key not in histograms: -# # create the histogram in the first chunk -# histograms[var_key] = create_hist_from_variables( -# *variable_insts, -# int_cat_axes=("category", "process", "shift"), -# ) - -# # mask events and weights when selection expressions are found -# masked_events = events -# masked_weights = weight -# for variable_inst in variable_insts: -# sel = variable_inst.selection -# if sel == "1": -# continue -# if not callable(sel): -# raise ValueError( -# f"invalid selection '{sel}', for now only callables are supported", -# ) -# mask = sel(masked_events) -# masked_events = masked_events[mask] -# masked_weights = masked_weights[mask] - -# # merge category ids -# category_ids = ak.concatenate( -# [Route(c).apply(masked_events) for c in self.category_id_columns], -# 
axis=-1, -# ) - -# # broadcast arrays so that each event can be filled for all its categories -# fill_data = { -# "category": category_ids, -# "process": masked_events.process_id, -# "shift": np.ones(len(masked_events), dtype=np.int32) * self.global_shift_inst.id, -# "weight": masked_weights, -# } -# for variable_inst in variable_insts: -# # prepare the expression -# expr = variable_inst.expression -# if isinstance(expr, str): -# route = Route(expr) -# def expr(events, *args, **kwargs): -# if len(events) == 0 and not has_ak_column(events, route): -# return empty_f32 -# return route.apply(events, null_value=variable_inst.null_value) -# # apply it -# fill_data[variable_inst.name] = expr(masked_events) - -# # fill it -# fill_hist( -# histograms[var_key], -# fill_data, -# last_edge_inclusive=self.last_edge_inclusive, -# ) - -# # merge output files -# self.output()["hists"].dump(histograms, formatter="pickle") - - -# # overwrite class defaults -# check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True) -# CreateHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy( -# default=CreateHistograms.task_family in check_overlap_tasks, -# add_default_to_description=True, -# ) - CreateHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -651,7 +400,6 @@ def run(self): inp["hists"].load(formatter="pickle") for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) ] - cats = list(hists[0].keys()) variable_names = list(hists[0][cats[0]].keys()) get_hists = lambda hists, cat, var : [h[cat][var] for h in hists] @@ -663,35 +411,9 @@ def run(self): variable_hists = get_hists(hists, the_cat, variable_name) merged_hists[the_cat] = sum(variable_hists[1:], variable_hists[0].copy()) outputs["hists"][variable_name].dump(merged_hists, formatter="pickle") - # optionally remove inputs if self.remove_previous: inputs.remove() - - # def run(self): - # # preare inputs and outputs - # inputs = self.input()["collection"] - # outputs = self.output() - - # # load input histograms - # hists = [ - # inp["hists"].load(formatter="pickle") - # for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) - # ] - - # # create a separate file per output variable - # variable_names = list(hists[0].keys()) - # for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): - # self.publish_message(f"merging histograms for '{variable_name}'") - - # variable_hists = [h[variable_name] for h in hists] - # merged = sum(variable_hists[1:], variable_hists[0].copy()) - # outputs["hists"][variable_name].dump(merged, formatter="pickle") - - # # optionally remove inputs - # if self.remove_previous: - # inputs.remove() - MergeHistogramsWrapper = wrapper_factory( base_cls=AnalysisTask, @@ -769,13 +491,18 @@ def run(self): self.publish_message(f"merging histograms for '{variable_name}'") # load hists + + variable_hists = [ coll["hists"].targets[variable_name].load(formatter="pickle") for coll in inputs.values() ] - - # merge and write the output - merged = sum(variable_hists[1:], variable_hists[0].copy()) + merged = {} + get_hists = lambda hists, cat : [h[cat] for h in hists] + for the_cat in variable_hists[0].keys(): + single_cat_hists = get_hists(variable_hists, the_cat) + merged[the_cat] = sum(single_cat_hists[1:], single_cat_hists[0].copy()) + outp.dump(merged, formatter="pickle") diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index d15a18cec..8ac757ed0 
100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -149,14 +149,18 @@ def run(self): " - requested variable requires columns that were missing during histogramming\n" " - selected --processes did not match any value on the process axis of the input histogram", ) - - if 'sr' in category_inst.name: - hists = self.invoke_hist_hooks(hists) + if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + if self.hist_hooks: + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] else: if category_inst.name in hists.keys(): hists = hists[category_inst.name] else: - hists[list(hists.keys())[0]] + raise Exception( + f"no histograms found to plot for {category_inst.name}" + ) # add new processes to the end of the list for process_inst in hists: @@ -169,11 +173,6 @@ def run(self): h = hists[process_inst] # selections h = h[{ - "category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ], "shift": [ hist.loc(s.id) for s in plot_shifts @@ -181,11 +180,9 @@ def run(self): ], }] # reductions - h = h[{"category": sum}] # store _hists[process_inst] = h hists = _hists - # call the plot function fig, _ = self.call_plot_func( self.plot_function, From 46c51ee600f42af344121a1dca061686de3bdc97 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 24 Mar 2025 16:28:08 +0100 Subject: [PATCH 15/26] Updated fake factor method: fixed bugs with chunked io --- columnflow/tasks/data_driven_methods.py | 666 +++++++++++++++--------- columnflow/tasks/plotting.py | 5 +- 2 files changed, 415 insertions(+), 256 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index e7bc90763..759152614 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -24,6 +24,7 @@ class PrepareFakeFactorHistograms( + CategoriesMixin, WeightProducerMixin, MLModelsMixin, ProducersMixin, @@ -58,6 +59,10 @@ class PrepareFakeFactorHistograms( def mandatory_columns(cls) -> set[str]: return set(cls.category_id_columns) | {"process_id"} + # def create_branch_map(self): + # # create a dummy branch map so that this task could be submitted as a job + # return {0: None} + def workflow_requires(self): reqs = super().workflow_requires() @@ -96,8 +101,8 @@ def requires(self): @workflow_condition.output def output(self): - return {"hists": self.target(f"fake_factor__{self.branch}.pickle")} - + return {"hists": self.target(f"ff_hist_{self.branch}.pickle")} + @law.decorator.notify @law.decorator.log @law.decorator.localize(input=True, output=False) @law.decorator.safe_output @@ -106,9 +111,9 @@ def run(self): import numpy as np import awkward as ak from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, EMPTY_FLOAT + Route, update_ak_array, add_ak_aliases, has_ak_column, attach_coffea_behavior, EMPTY_FLOAT ) - + from columnflow.hist_util import fill_hist # prepare inputs inputs = self.input() @@ -125,15 +130,14 @@ def run(self): # get shift dependent aliases aliases = self.local_shift_inst.x("column_aliases", {}) - + ff_variables = [var.var_route for var in self.config_inst.x.fake_factor_method.axes.values()] # define columns that need to be read + read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) read_columns |= 
set(map(Route, aliases.values())) - read_columns |= { - Route(the_ax.var_route) for the_ax in self.config_inst.x.fake_factor_method.axes.values() - } + read_columns |= set(map(Route, ff_variables)) # empty float array to use when input files have no entries empty_f32 = ak.Array(np.array([], dtype=np.float32)) @@ -141,14 +145,15 @@ def run(self): file_targets = [inputs["events"]["events"]] if self.producer_insts: file_targets.extend([inp["columns"] for inp in inputs["producers"]]) - + # prepare inputs for localization with law.localize_file_targets( [*file_targets, *reader_targets.values()], mode="r", ) as inps: + for (events, *columns), pos in self.iter_chunked_io( - [inp.path for inp in inps], + [inp.abspath for inp in inps], source_type=len(file_targets) * ["awkward_parquet"] + [None] * len(reader_targets), read_columns=(len(file_targets) + len(reader_targets)) * [read_columns], chunk_size=self.weight_producer_inst.get_min_chunk_size(), @@ -156,10 +161,8 @@ def run(self): # optional check for overlapping inputs if self.check_overlapping_inputs: self.raise_if_overlapping([events] + list(columns)) - # add additional columns events = update_ak_array(events, *columns) - # add aliases events = add_ak_aliases( events, @@ -168,47 +171,77 @@ def run(self): missing_strategy=self.missing_column_alias_strategy, ) + # attach coffea behavior aiding functional variable expressions + events = attach_coffea_behavior(events) # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) else: weight = ak.Array(np.ones(len(events), dtype=np.float32)) # define and fill histograms, taking into account multiple axes - categories = self.config_inst.categories.ids() - h = (hist.Hist.new - .IntCat(categories , name="category", growth=True) - .IntCat([], name="process", growth=True)) - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): - h = eval(f'h.{var_axis.ax_str}') - - histograms['fake_factors'] = h.Weight() - category_ids = ak.concatenate( [Route(c).apply(events) for c in self.category_id_columns], - axis=-1, - ) - # broadcast arrays so that each event can be filled for all its categories - - fill_data = { - "category" : category_ids, - "process" : events.process_id, - "weight" : weight, - } - for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): - route = Route(var_axis.var_route) - if len(events) == 0 and not has_ak_column(events, route): - values = empty_f32 + axis=-1,) + sr_names = self.categories + for sr_name in sr_names: + the_sr = self.config_inst.get_category(sr_name) + regions = [sr_name] + if the_sr.aux: + for the_key in the_sr.aux.keys(): + if (the_key == 'abcd_regs') or (the_key == 'ff_regs'): + regions += list(the_sr.aux[the_key].values()) else: - values = ak.fill_none(ak.firsts(route.apply(events),axis=1), EMPTY_FLOAT) - if 'IntCategory' in var_axis.ax_str: values = ak.values_astype(values, np.int64) - fill_data[var_name] = values - # fill it - fill_hist( - histograms['fake_factors'], - fill_data, - ) + raise KeyError(f"Application and determination regions are not found for {the_sr}. 
\n Check aux field of the category map!") + + for region in regions: + #by accessing the list of categories we check if the category with this name exists + cat = self.config_inst.get_category(region) + + # get variable instances + mask = ak.any(category_ids == cat.id, axis = 1) + masked_events = events[mask] + masked_weight = weight[mask] + + h = (hist.Hist.new.IntCat([], name="process", growth=True)) + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + h = eval(f'h.{var_axis.ax_str}') + + h = h.Weight() + # broadcast arrays so that each event can be filled for all its categories + + fill_data = { + "process": masked_events.process_id, + "weight" : masked_weight, + } + for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): + route = Route(var_axis.var_route) + if len(masked_events) == 0 and not has_ak_column(masked_events, route): + values = empty_f32 + else: + values = route.apply(masked_events) + if values.ndim != 1: values = ak.firsts(values,axis=1) + values = ak.fill_none(values, EMPTY_FLOAT) + + if var_name == 'n_jets': values = ak.where (values > 2, + 2 * ak.ones_like(values), + values) + + if 'Int' in var_axis.ax_str: values = ak.values_astype(values, np.int64) + fill_data[var_name] = values + # fill it + fill_hist( + h, + fill_data, + ) + if cat.name not in histograms.keys(): + histograms[cat.name] = h + else: + histograms[cat.name] +=h + # merge output files self.output()["hists"].dump(histograms, formatter="pickle") + + # overwrite class defaults @@ -225,6 +258,121 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) + +class MergeFakeFactorHistograms( + #VariablesMixin, + #WeightProducerMixin, + #MLModelsMixin, + #ProducersMixin, + #SelectorStepsMixin, + #CalibratorsMixin, + DatasetTask, + law.LocalWorkflow, + RemoteWorkflow, +): + only_missing = luigi.BoolParameter( + default=False, + description="when True, identify missing variables first and only require histograms of " + "missing ones; default: False", + ) + remove_previous = luigi.BoolParameter( + default=False, + significant=False, + description="when True, remove particlar input histograms after merging; default: False", + ) + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, + ) + + @classmethod + def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: + _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} + kwargs["_prefer_cli"] = _prefer_cli + return super().req_params(inst, **kwargs) + + def create_branch_map(self): + # create a dummy branch map so that this task could be submitted as a job + return {0: None} + + # def _get_variables(self): + # if self.is_workflow(): + # return self.as_branch()._get_variables() + + # variables = self.variables + + # # optional dynamic behavior: determine not yet created variables and require only those + # if self.only_missing: + # missing = self.output().count(existing=False, keys=True)[1] + # variables = sorted(missing, key=variables.index) + + # return variables + + def workflow_requires(self): + reqs = super().workflow_requires() + + if not self.pilot: + #variables = self._get_variables() + #if variables: + reqs["hists"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( + self, + branch=-1, + #variables=tuple(variables), + ) + + return reqs + + def requires(self): 
+ #variables = self._get_variables() + #if not variables: + # return [] + + return self.reqs.PrepareFakeFactorHistograms.req_different_branching( + self, + branch=-1, + #variables=tuple(variables), + workflow="local", + ) + + def output(self): + return {"hists": self.target(f"merged_ff_hist.pickle")} + + @law.decorator.notify + @law.decorator.log + def run(self): + # preare inputs and outputs + inputs = self.input()["collection"] + outputs = self.output() + + # load input histograms + hists = [ + inp["hists"].load(formatter="pickle") + for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) + ] + cats = list(hists[0].keys()) + get_hists = lambda hists, cat : [h[cat] for h in hists] + # create a separate file per output variable + merged_hists = {} + self.publish_message(f"merging {len(hists)} histograms for {self.dataset}") + for the_cat in cats: + h = get_hists(hists, the_cat) + merged_hists[the_cat] = sum(h[1:], h[0].copy()) + outputs["hists"].dump(merged_hists, formatter="pickle") + # optionally remove inputs + if self.remove_previous: + inputs.remove() + +MergeFakeFactorHistogramsWrapper = wrapper_factory( + base_cls=AnalysisTask, + require_cls=MergeFakeFactorHistograms, + enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +) + + class dict_creator(): def init_dict(self, ax_list): if not ax_list: @@ -259,7 +407,7 @@ class ComputeFakeFactors( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - PrepareFakeFactorHistograms=PrepareFakeFactorHistograms, + MergeFakeFactorHistograms=MergeFakeFactorHistograms, ) def store_parts(self): @@ -272,37 +420,43 @@ def req_params(cls, inst: AnalysisTask, **kwargs) -> dict: _prefer_cli = law.util.make_set(kwargs.get("_prefer_cli", [])) | {"variables"} kwargs["_prefer_cli"] = _prefer_cli return super().req_params(inst, **kwargs) - - def workflow_requires(self): - reqs = super().workflow_requires() - if not self.pilot: - variables = self._get_variables() - if variables: - reqs["ff_method"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching( - self, - branch=-1, - variables=tuple(variables), - ) + + def create_branch_map(self): + # create a dummy branch map so that this task could be submitted as a job + return {0: None} return reqs - def requires(self): return { - d: self.reqs.PrepareFakeFactorHistograms.req( + d: self.reqs.MergeFakeFactorHistograms.req_different_branching( self, + branch=-1, dataset=d, - branch=-1 + workflow="local", ) for d in self.datasets } + def output(self): - return {"ff_json": self.target(f"fake_factors.json"), - "plots": {'_'.join((ff_type, syst)): self.target(f"fake_factor_{ff_type}_{syst}.png") + year = self.config_inst.campaign.aux['year'] + tag = self.config_inst.campaign.aux['tag'] + channel = self.config_inst.channels.get_first().name + return {"ff_json": self.target('_'.join(('fake_factors', + channel, + str(year), + tag)) + '.json'), + "plots": {'_'.join((ff_type, + syst, + f'n_jets_{str(nj)}')): self.target(f"fake_factor_{ff_type}_{syst}_njets_{str(nj)}.png") for syst in ['nominal', 'up', 'down'] - for ff_type in ['qcd','wj']}, - "plots1d": {'_'.join((ff_type,str(dm))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}.png") for ff_type in ['qcd','wj'] - for dm in [0,1,2,10,11]}} + for nj in [0,1,2]}, + "plots1d": {'_'.join((ff_type, + str(dm), + str(nj))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}_njets_{str(nj)}.png") + for ff_type in ['qcd','wj'] + for dm in [0,1,2,10,11] + for nj in [0,1,2]}} 
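# A minimal sketch of how the fake-factor JSON written by this task could be read back
# downstream with correctionlib. The file name and the example values are hypothetical;
# the correction names ("ff_qcd", "ff_wjets") and the input order
# (tau_pt, tau_dm_pnet, n_jets) follow the CorrectionSet that is built in run() below.
import correctionlib

cset = correctionlib.CorrectionSet.from_file("fake_factors_etau_2022_preEE.json")
# QCD fake factor for a tau with pt = 35 GeV, PNet decay mode 1 and 0 jets
ff_qcd = cset["ff_qcd"].evaluate(35.0, 1, 0)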
     @law.decorator.log
     def run(self):
@@ -321,57 +475,59 @@ def run(self):
 
         # prepare inputs and outputs
         inputs = self.input()
         outputs = self.output()
-        merged_per_dataset = {}
-        projected_hists = []
+        hists_by_dataset = []
+        merged_hists = {}
         for (dataset_name, dataset) in inputs.items():
-            files = dataset['collection']
+            files = dataset['collection'][0]
+
             # load input histograms per dataset
-            hists_per_ds = [
-                inp['hists'].load(formatter="pickle")['fake_factors']
-                for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50))
-            ]
-            ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy())
-            hists_by_dataset.append(ds_single_hist)
-        #Create a dict of histograms indexed by the process
-        hists_by_proc = {}
-        for proc_name in self.config_inst.processes.names():
-            proc = self.config_inst.processes.get(proc_name)
-            for the_hist in hists_by_dataset:
-
-                if proc.id in the_hist.axes["process"]:
-                    h = the_hist.copy()
-                    h = h[{"process": hist.loc(proc.id)}]
-                    # add the histogram
-                    if proc in hists_by_proc:
-                        hists_by_proc[proc] += h
+            input_chunked_hists = []
+            input_chunked_hists = [f.load(formatter='pickle') for f in files.values()]
+
+            for hists in input_chunked_hists:
+                for the_cat, the_hist in hists.items():
+                    if the_cat not in merged_hists.keys():
+                        merged_hists[the_cat] = []
+                        merged_hists[the_cat].append(the_hist)
                     else:
-                        hists_by_proc[proc] = h
-
-        #Divide histograms to data and bkg
-        mc_hists = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")]
-        data_hists = [h for p, h in hists_by_proc.items() if p.is_data]
-
-        #Merge histograms to get a joint data and mc histogram
-        if len(mc_hists) > 1: mc_hists = sum(mc_hists[1:], mc_hists[0].copy())
-        else: mc_hists = mc_hists[0].copy()
-        if len(data_hists) > 1: data_hists = sum(data_hists[1:], data_hists[0].copy())
-        else: data_hists = data_hists[0].copy()
+                        merged_hists[the_cat].append(the_hist)
 
-        #Function that performs the calculation of th
-        def get_ff_corr(self, h_data, h_mc, num_reg = 'dr_num_wj', den_reg = 'dr_den_wj', name='ff_hist', label='ff_hist'):
-            def get_dr_hist(self, h, det_reg):
-                cat_name = self.categories[0]
-                cat = self.config_inst.get_category(cat_name.replace('sr',det_reg))
-                return h[{"category": hist.loc(cat.id)}]
-
-            get_id = lambda ax, key: [i in enumerate(ax.keys)]
-
-            data_num = get_dr_hist(self, h_data, num_reg)
-            data_den = get_dr_hist(self, h_data, den_reg)
-            mc_num = get_dr_hist(self, h_mc, num_reg)
-            mc_den = get_dr_hist(self, h_mc, den_reg)
+        # merge histograms
+        mc_hists = {}
+        data_hists = {}
+        # divide between data and mc
+        for the_cat, h_list in merged_hists.items():
+            for the_hist in h_list:
+                for proc_name in self.config_inst.processes.names():
+                    proc = self.config_inst.processes.get(proc_name)
+                    if proc.id in the_hist.axes["process"]:
+                        h = the_hist.copy()
+                        h = h[{"process": hist.loc(proc.id)}]
+                        if proc.is_mc and not proc.has_tag("signal"):
+                            if the_cat in mc_hists: mc_hists[the_cat] += h
+                            else: mc_hists[the_cat] = h
+                        if proc.is_data:
+                            if the_cat in data_hists: data_hists[the_cat] += h
+                            else: data_hists[the_cat] = h
+        # Function that performs the calculation of the fake factors
+        def get_ff_corr(self, h_data, h_mc, dr_num, dr_den, name='ff_hist', label='ff_hist'):
+            def get_single_cat(self, h, reg_name):
+                cat_name = self.config_inst.get_category(self.categories[0]).aux['ff_regs'][reg_name]
+                return h[cat_name]
+            data_num = get_single_cat(self, h_data, dr_num)
+            data_den = get_single_cat(self, h_data, dr_den)
+            mc_num = get_single_cat(self, h_mc, dr_num)
+            mc_den = 
get_single_cat(self, h_mc, dr_den) + print(name) + for nj in [0,1,2]: + for dm in [0,1,2,10,11]: + print(f'DM {dm} Nj {nj}') + print(f"data_num: {data_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"data_den: {data_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"mc_num: {mc_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") + print(f"mc_den: {mc_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") num = data_num.values() - mc_num.values() den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), @@ -386,6 +542,7 @@ def rel_err(x): h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') + axes = list(h.axes[1:]) h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor') ff_raw = h.Weight() ff_raw.view().value[...,0] = ff_val @@ -395,90 +552,116 @@ def rel_err(x): #Make an approximation of tau pt dependance formula_str = 'p0 + p1*x+p2*x*x' + #formula_str = 'p0 + p1*x' def fitf(x, p0, p1, p2): return eval(formula_str) + def jac(x): from numpy import array out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) + #out = array([[ 1., x],[x, x**2]]) return out def eval_formula(formula_str, popt): for i,p in enumerate(popt): - formula_str = formula_str.replace(f'p{i}',str(popt[i])) + par = round(popt[i],6) + formula_str = formula_str.replace(f'p{i}',str(par)) return formula_str ff_fitted = ff_raw.copy().reset() ff_fitted.name = name ff_fitted.label = label - fitres = {} - axes = list(ff_raw.axes[1:2]) + fitres = {} dc = dict_creator() - for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str']: + for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str','x_max']: fitres[the_field]= dc.init_dict(axes) dm_axis = ff_raw.axes['tau_dm_pnet'] - for dm in dm_axis: - h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), - 'syst': hist.loc('nominal')}] - mask = h1d.values() > 0 - y = h1d.values()[mask] - y_err = (h1d.variances()[mask])**0.5 - x = h1d.axes[0].centers - x_masked = x[mask] + n_jets_axis = ff_raw.axes['n_jets'] + for nj in n_jets_axis: + for dm in dm_axis: + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + mask = h1d.values() > 0 + x = h1d.axes[0].centers + if np.sum(mask) < 3: + #if np.sum(mask) < 2: + y = np.zeros_like(x) + y_err = np.ones_like(x) + x_masked = x + else: + y = h1d.values()[mask] + y_err = (h1d.variances()[mask])**0.5 + x_masked = x[mask] + popt, pcov = curve_fit(fitf, + x_masked, + y, + sigma=y_err, + absolute_sigma=True, + ) + fitres['chi2'][dm][nj] = sum(((y - fitf(x_masked, *popt))/y_err)**2) + fitres['ndf'][dm][nj] = len(y) - len(popt) + fitres['popt'][dm][nj] = popt + fitres['pcov'][dm][nj] = pcov + fitres['x_max'][dm][nj] = np.max(x_masked) - popt, pcov = curve_fit(fitf,x_masked,y, - sigma=y_err, - absolute_sigma=True, - ) - fitres['chi2'][dm] = sum(((y - fitf(x_masked, *popt))/y_err)**2) - fitres['ndf'][dm] = len(y) - len(popt) - fitres['popt'][dm] = popt - fitres['pcov'][dm] = pcov - - fitres['fitf_str'][dm] = eval_formula(formula_str,popt) - for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 - ff_fitted.view().value[:, - ff_fitted.axes[1].index(dm), - ff_fitted.axes[2].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) - fitres['name'] = name - fitres['jac'] = jac - fitres['fitf'] = fitf + 
fitres['fitf_str'][dm][nj] = eval_formula(formula_str,popt) + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 + ff_fitted.view().value[:, + ff_fitted.axes[1].index(dm), + ff_fitted.axes[2].index(nj), + ff_fitted.axes[3].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) + fitres['name'] = name + fitres['jac'] = jac + fitres['fitf'] = fitf return ff_raw, ff_fitted, fitres wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, data_hists, mc_hists, - num_reg = 'dr_num_wj', - den_reg = 'dr_den_wj', + dr_num = 'dr_num_wj', + dr_den = 'dr_den_wj', name='ff_wjets', label='Fake factor W+jets') qcd_raw, qcd_fitted, qcd_fitres = get_ff_corr(self, data_hists, mc_hists, - num_reg = 'dr_num_qcd', - den_reg = 'dr_den_qcd', + dr_num = 'dr_num_qcd', + dr_den = 'dr_den_qcd', name='ff_qcd', label='Fake factor QCD') corr_list = [] + + corr_list = [] for fitres in [wj_fitres, qcd_fitres]: formula_str = fitres['fitf_str'] - dm_bins = [] - for (dm, the_formula) in formula_str.items(): - x_max = 100 - last_val = fitres['fitf'](x_max,* fitres['popt'][dm]) - - dm_bins.append(cs.CategoryItem( - key=dm, - value=cs.Formula( - nodetype="formula", - variables=["tau_pt"], - parser="TFormula", - expression=f'({the_formula})/(1. + exp(10.*(x-{x_max}))) + ({last_val})/(1. + exp(-10.*(x-{x_max})))', - ))) + dm_cats = [] + for dm in formula_str.keys(): + formula_str_njet_binned = formula_str[dm] + single_dm = [] + for nj, the_formula in formula_str_njet_binned.items(): + x_max = fitres['x_max'][dm][nj] + fx_max = np.maximum(fitres['fitf'](x_max,* fitres['popt'][dm][nj]),0) + single_dm.append(cs.CategoryItem( + key=nj, + value=cs.Formula( + nodetype="formula", + variables=["tau_pt"], + parser="TFormula", + expression=f'({the_formula})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', + ))) + dm_cats.append(cs.CategoryItem( + key=dm, + value=cs.Category( + nodetype="category", + input="n_jets", + content=single_dm, + ))) corr_list.append(cs.Correction( name=fitres['name'], description=f"fake factor correcton for {fitres['name'].split('_')[1]}", @@ -486,14 +669,16 @@ def eval_formula(formula_str, popt): inputs=[ cs.Variable(name="tau_pt", type="real",description="pt of tau"), cs.Variable(name="tau_dm_pnet", type="int", description="PNet decay mode of tau"), + cs.Variable(name="n_jets", type="int", description="Number of jets with pt > 20 GeV and eta < 4.7"), ], output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), data=cs.Category( nodetype="category", input="tau_dm_pnet", - content=dm_bins,) + content=dm_cats, + ) )) - + cset = cs.CorrectionSet( schema_version=2, description="Fake factors", @@ -508,106 +693,79 @@ def eval_formula(formula_str, popt): h_raw = eval(f'{h_name}_raw') h_fitted = eval(f'{h_name}_fitted') - fig, ax = plt.subplots(figsize=(12, 8)) - h_raw[...,'nominal'].plot2d(ax=ax) - self.output()['plots']['_'.join((h_name,'nominal'))].dump(fig, formatter="mpl") + fitres = wj_fitres if h_name == 'wj' else qcd_fitres dm_axis = h_raw.axes['tau_dm_pnet'] - for dm in dm_axis: - h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), - 'syst': hist.loc('nominal')}] - hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm)}] - fig, ax = plt.subplots(figsize=(8, 6)) - mask = h1d.counts() > 0 - x = h1d.axes[0].centers[mask] - y = h1d.counts()[mask] - xerr = (np.diff(h1d.axes[0]).flatten()/2.)[mask], - yerr = np.sqrt(h1d.variances()).flatten()[mask], - ax.errorbar(x, y, xerr = xerr, yerr = yerr, - label=f"PNet decay mode = {dm}", - marker='o', - 
fmt='o', - line=None, color='#2478B7', capsize=4) - x_fine = np.linspace(x[0],x[-1],num=100) - popt = fitres['popt'][dm] - pcov = fitres['pcov'][dm] - jac = fitres['jac'] - def err(x,jac,pcov): - from numpy import sqrt,einsum - return sqrt(einsum('ij,ij',jac(x),pcov)) - - import functools - err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + nj_axis = h_raw.axes['n_jets'] + for nj in nj_axis: + print(f"Plotting 2d map for n jets = {nj}") + fig, ax = plt.subplots(figsize=(12, 8)) - y_fitf = fitres['fitf'](x_fine,*popt) - y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y - y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y - - ax.plot(x_fine, - y_fitf, - color='#FF867B') - ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) - ax.set_ylabel('Fake Factor') - ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') - ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm],2)}}}{{{fitres['ndf'][dm]}}}$", - (0.8, 0.9), - xycoords='axes fraction', - fontsize=20) + single2d_h = h_raw[{'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + pcm = ax.pcolormesh(*np.meshgrid(*single2d_h.axes.edges), single2d_h.view().value.T, cmap="viridis", vmin=0, vmax=0.5) + ax.set_yticks(dm_axis.centers, labels=list(map(dm_axis.bin, range(dm_axis.size)))) + plt.colorbar(pcm, ax=ax) + plt.xlabel(single2d_h.axes.label[0]) + plt.ylabel(single2d_h.axes.label[1]) + plt.title(single2d_h.label) + + self.output()['plots']['_'.join((h_name,'nominal',f'n_jets_{str(nj)}'))].dump(fig, formatter="mpl") + for dm in dm_axis: + print(f"Plotting 1d plot for n jets = {nj}, dm = {dm}") + h1d = h_raw[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj), + 'syst': hist.loc('nominal')}] + hfit = h_fitted[{'tau_dm_pnet': hist.loc(dm), + 'n_jets': hist.loc(nj),}] + fig, ax = plt.subplots(figsize=(8, 6)) + mask = h1d.counts() > 0 + if np.sum(mask) > 0: + x = h1d.axes[0].centers[mask] + y = h1d.counts()[mask] + xerr = (np.diff(h1d.axes[0]).flatten()/2.)[mask], + yerr = np.sqrt(h1d.variances()).flatten()[mask], + else: + x = h1d.axes[0].centers + y = np.zeros_like(x) + xerr = (np.diff(h1d.axes[0]).flatten()/2.) 
+ yerr = np.ones_like(y), + + ax.errorbar(x, y, xerr = xerr, yerr = yerr, + label=f"PNet decay mode = {dm}", + marker='o', + fmt='o', + line=None, color='#2478B7', capsize=4) + x_fine = np.linspace(x[0],x[-1],num=100) + popt = fitres['popt'][dm][nj] + pcov = fitres['pcov'][dm][nj] + jac = fitres['jac'] + def err(x,jac,pcov): + from numpy import sqrt,einsum + return sqrt(einsum('ij,ij',jac(x),pcov)) + + import functools + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + + y_fitf = fitres['fitf'](x_fine,*popt) + y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y + y_fitf_down = fitres['fitf'](x_fine,*(popt)) - err_y - self.output()['plots1d']['_'.join((h_name,str(dm)))].dump(fig, formatter="mpl") - - - -class CreateDataDrivenHistograms( - VariablesMixin, - WeightProducerMixin, - ProducersMixin, - ReducedEventsUser, - ChunkedIOMixin, - law.LocalWorkflow, - RemoteWorkflow, -): - - sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - - # upstream requirements - reqs = Requirements( - ReducedEventsUser.reqs, - RemoteWorkflow.reqs, - ComputeFakeFactors=ComputeFakeFactors, - ProduceColumns=ProduceColumns, - ) - - def requires(self): - reqs = {"events": self.reqs.ProvideReducedEvents.req(self)} - from IPython import embed; embed() - if self.producer_insts: - reqs["producers"] = [ - self.reqs.ProduceColumns.req(self, producer=producer_inst.cls_name) - for producer_inst in self.producer_insts - if producer_inst.produced_columns - ] - reqs['ff_json'] = self.reqs.ComputeFakeFactors.req(self) - reqs["weight_producer"] = law.util.make_unique(law.util.flatten(self.weight_producer_inst.run_requires())) - return reqs - - def output(self): - return {"hists": self.target(f"histograms__vars_{self.variables_repr}__{self.branch}.pickle")} - - @law.decorator.log - @law.decorator.localize(input=True, output=False) - @law.decorator.safe_output - def run(self): - import hist - import numpy as np - import awkward as ak - from columnflow.columnar_util import ( - Route, update_ak_array, add_ak_aliases, has_ak_column, fill_hist, - ) - - # prepare inputs - inputs = self.input() - from IPython import embed; embed() - # declare output: dict of histograms - histograms = {} \ No newline at end of file + ax.plot(x_fine, + y_fitf, + color='#FF867B') + ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) + ax.set_ylabel('Fake Factor') + ax.set_xlabel('Tau pT [GeV]') + ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm][nj],2)}}}{{{fitres['ndf'][dm][nj]}}}$", + (0.8, 0.9), + xycoords='axes fraction', + fontsize=20) + print(str(fitres['fitf_str'][dm][nj])) + ax.annotate('y=' + str(fitres['fitf_str'][dm][nj]), + (0.1, 0.9), + xycoords='axes fraction', + fontsize=12) + + self.output()['plots1d']['_'.join((h_name,str(dm),str(nj)))].dump(fig, formatter="mpl") \ No newline at end of file diff --git a/columnflow/tasks/plotting.py b/columnflow/tasks/plotting.py index 8ac757ed0..71cbc5f27 100644 --- a/columnflow/tasks/plotting.py +++ b/columnflow/tasks/plotting.py @@ -155,13 +155,14 @@ def run(self): else: hists = hists[category_inst.name] else: - if category_inst.name in hists.keys(): + if 'dr' in category_inst.name: + hists = self.invoke_hist_hooks(hists,category_inst) + elif category_inst.name in hists.keys(): hists = hists[category_inst.name] else: raise Exception( f"no histograms found to plot for {category_inst.name}" ) - # add new processes to the end of the list for 
process_inst in hists:
             if process_inst not in process_insts:

From 02429bdf98f2442bf19e320aa8146ce416e7701d Mon Sep 17 00:00:00 2001
From: zakharov-binp
Date: Mon, 24 Mar 2025 16:37:58 +0100
Subject: [PATCH 16/26] Fixed the long-standing issue with flattening of the arrays at the stage of Creating the histograms

---
 columnflow/hist_util.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py
index 92a9ed42a..7929bbbd7 100644
--- a/columnflow/hist_util.py
+++ b/columnflow/hist_util.py
@@ -72,15 +72,12 @@ def allows_shift(ax) -> bool:
         flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5
 
     # fill
-    if 'event' in data.keys():
-        arrays = {}
-        for ax_name in axis_names:
-            if ax_name in data.keys():
-                arrays[ax_name] = data[ax_name]
-        h.fill(**fill_kwargs, **arrays)
-    else:
-        arrays = ak.flatten(ak.cartesian(data))
-        h.fill(**fill_kwargs, **{field: arrays[field] for field in arrays.fields})
+    flat_data = {}
+    for key, arr in data.items():
+        if arr.ndim != 1: flat_data[key] = ak.flatten(arr)
+        else: flat_data[key] = arr
+    h.fill(**fill_kwargs, **flat_data)
+
 
 
 def add_hist_axis(histogram: hist.Hist, variable_inst: od.Variable) -> hist.Hist:

From fdb30eb69082f4e10c2f180fee50d5de57bef999 Mon Sep 17 00:00:00 2001
From: zakharov-binp
Date: Mon, 24 Mar 2025 16:42:21 +0100
Subject: [PATCH 17/26] Updated framework tasks according to a new approach of storing different categories

---
 columnflow/tasks/framework/mixins.py |   2 +-
 columnflow/tasks/histograms.py       |  44 ++-
 columnflow/tasks/yields.py           | 399 ++++++++++++++++++++-------
 law.cfg                              |   2 +-
 4 files changed, 323 insertions(+), 124 deletions(-)

diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py
index 2bc75c005..35549393b 100644
--- a/columnflow/tasks/framework/mixins.py
+++ b/columnflow/tasks/framework/mixins.py
@@ -2452,7 +2452,7 @@ def invoke_hist_hooks(self, hists: dict, category_inst: od.Category) -> dict:
         Invoke hooks to update histograms before plotting.
""" if not self.hist_hooks: - return hists + return hists[category_inst.name] for hook in self.hist_hooks: if hook in (None, "", law.NO_STR): diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index f1d9c7e61..d7603112c 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -57,7 +57,7 @@ class CreateHistograms( @law.util.classproperty def mandatory_columns(cls) -> set[str]: - return set(cls.category_id_columns) | {"process_id"} + return set(cls.category_id_columns) | {"process_id", "ff_weight*"} def workflow_requires(self): reqs = super().workflow_requires() @@ -142,7 +142,9 @@ def run(self): read_columns = {Route("process_id")} read_columns |= set(map(Route, self.category_id_columns)) read_columns |= set(self.weight_producer_inst.used_columns) - read_columns |= set(map(Route, [n +'*' for n in self.config_inst.x.fake_factor_method.columns])) + read_columns |= set(map(Route, ['_'.join((the_name,the_shift)) + for the_name in self.config_inst.x.fake_factor_method.columns + for the_shift in self.config_inst.x.fake_factor_method.shifts])) read_columns |= set(map(Route, aliases.values())) read_columns |= { Route(inp) @@ -201,7 +203,6 @@ def run(self): # attach coffea behavior aiding functional variable expressions events = attach_coffea_behavior(events) - # build the full event weight if hasattr(self.weight_producer_inst, "skip_func") and not self.weight_producer_inst.skip_func(): events, weight = self.weight_producer_inst(events) @@ -236,24 +237,18 @@ def run(self): # mask events and weights when selection expressions are found masked_events = events - if 'ar_wj' in region: - masked_weights = weight * events.ff_weight_wj_nominal - elif 'ar_qcd' in region: - masked_weights = weight * events.ff_weight_qcd_nominal + if 'apply_ff' in cat.aux.keys(): + if cat.aux['apply_ff'] == 'wj': + self.publish_message(f"applying FF weights: ff_weight_wj_nominal, category: {cat.name}") + masked_weights = weight * events.ff_weight_wj_nominal + elif cat.aux['apply_ff'] == 'qcd': + self.publish_message(f"applying FF weights: ff_weight_qcd_nominal, category: {cat.name}") + masked_weights = weight * events.ff_weight_qcd_nominal + else: + masked_weights = weight else: masked_weights = weight - - # for variable_inst in variable_insts: - # sel = variable_inst.selection - # if sel == "1": - # continue - # if not callable(sel): - # raise ValueError( - # f"invalid selection '{sel}', for now only callables are supported", - # ) - # mask = sel(masked_events) - # #select only one category per histogram - # merge category ids + category_ids = ak.concatenate( [Route(c).apply(masked_events) for c in self.category_id_columns], axis=-1, @@ -272,12 +267,15 @@ def run(self): expr = variable_inst.expression if isinstance(expr, str): route = Route(expr) - def expr(events, *args, **kwargs): - if len(events) == 0 and not has_ak_column(events, route): + def expr(masked_events, *args, **kwargs): + if len(masked_events) == 0 and not has_ak_column(masked_events, route): return empty_f32 - return route.apply(events, null_value=variable_inst.null_value) + return route.apply(masked_events, null_value=variable_inst.null_value) # apply it - fill_data[variable_inst.name] = expr(masked_events) + if variable_inst.name == "event": + fill_data[variable_inst.name] = np.sign(masked_events.event) + else: + fill_data[variable_inst.name] = expr(masked_events) # fill it fill_hist( histograms[cat.name][var_key], diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index 
9de6a31cc..3abab6e15 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -21,6 +21,245 @@ from columnflow.util import dev_sandbox, try_int +# class CreateYieldTable( +# DatasetsProcessesMixin, +# CategoriesMixin, +# WeightProducerMixin, +# ProducersMixin, +# SelectorStepsMixin, +# CalibratorsMixin, +# law.LocalWorkflow, +# RemoteWorkflow, +# ): +# sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + +# table_format = luigi.Parameter( +# default="fancy_grid", +# significant=False, +# description="format of the yield table; accepts all formats of the tabulate package; " +# "default: fancy_grid", +# ) +# number_format = luigi.Parameter( +# default="pdg", +# significant=False, +# description="rounding format of each number in the yield table; accepts all formats " +# "understood by scinum.Number.str(), e.g. 'pdg', 'publication', '%.1f' or an integer " +# "(number of signficant digits); default: pdg", +# ) +# skip_uncertainties = luigi.BoolParameter( +# default=False, +# significant=False, +# description="when True, uncertainties are not displayed in the table; default: False", +# ) +# normalize_yields = luigi.ChoiceParameter( +# choices=(law.NO_STR, "per_process", "per_category", "all"), +# default=law.NO_STR, +# significant=False, +# description="string parameter to define the normalization of the yields; " +# "choices: '', per_process, per_category, all; empty default", +# ) +# output_suffix = luigi.Parameter( +# default=law.NO_STR, +# description="Adds a suffix to the output name of the yields table; empty default", +# ) + +# # upstream requirements +# reqs = Requirements( +# RemoteWorkflow.reqs, +# MergeHistograms=MergeHistograms, +# ) + +# # dummy branch map +# def create_branch_map(self): +# return [0] + +# def requires(self): +# return { +# d: self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# variables=("event",), +# _prefer_cli={"variables"}, +# ) +# for d in self.datasets +# } + +# def workflow_requires(self): +# reqs = super().workflow_requires() + +# reqs["merged_hists"] = [ +# self.reqs.MergeHistograms.req( +# self, +# dataset=d, +# variables=("event",), +# _exclude={"branches"}, +# ) +# for d in self.datasets +# ] + +# return reqs + +# @classmethod +# def resolve_param_values(cls, params): +# params = super().resolve_param_values(params) + +# if "number_format" in params and try_int(params["number_format"]): +# # convert 'number_format' in integer if possible +# params["number_format"] = int(params["number_format"]) + +# return params + +# def output(self): +# suffix = "" +# if self.output_suffix and self.output_suffix != law.NO_STR: +# suffix = f"__{self.output_suffix}" + +# return { +# "table": self.target(f"table__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.txt"), +# "yields": self.target(f"yields__proc_{self.processes_repr}__cat_{self.categories_repr}{suffix}.json"), +# } + +# @law.decorator.notify +# @law.decorator.log +# def run(self): +# import hist +# from tabulate import tabulate + +# inputs = self.input() +# outputs = self.output() + +# category_insts = list(map(self.config_inst.get_category, self.categories)) +# process_insts = list(map(self.config_inst.get_process, self.processes)) +# sub_process_insts = { +# proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] +# for proc in process_insts +# } + +# # histogram data per process +# hists = {} + +# with self.publish_step(f"Creating yields for processes {self.processes}, categories {self.categories}"): +# for dataset, inp in 
inputs.items(): +# dataset_inst = self.config_inst.get_dataset(dataset) + +# # load the histogram of the variable named "event" +# input_hists = inp["hists"]["event"].load(formatter="pickle") + +# # loop and extract one histogram per process +# for process_inst in process_insts: +# # skip when the dataset is already known to not contain any sub process +# if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): +# continue + +# # work on a copy +# h = h_in.copy() + +# # axis selections +# h = h[{ +# "process": [ +# hist.loc(p.id) +# for p in sub_process_insts[process_inst] +# if p.id in h.axes["process"] +# ], +# }] + +# # axis reductions +# h = h[{"process": sum, "shift": sum, "event": sum}] + +# # add the histogram +# if process_inst in hists: +# hists[process_inst] += h +# else: +# hists[process_inst] = h + +# # there should be hists to plot +# if not hists: +# raise Exception("no histograms found to plot") + +# # sort hists by process order +# hists = OrderedDict( +# (process_inst, hists[process_inst]) +# for process_inst in sorted(hists, key=process_insts.index) +# ) + +# yields, processes = defaultdict(list), [] + +# # read out yields per category and per process +# for process_inst, h in hists.items(): +# processes.append(process_inst) + +# for category_inst in category_insts: +# leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] + +# h_cat = h[{"category": [ +# hist.loc(c.id) +# for c in leaf_category_insts +# if c.id in h.axes["category"] +# ]}] +# h_cat = h_cat[{"category": sum}] + +# value = Number(h_cat.value) +# if not self.skip_uncertainties: +# # set a unique uncertainty name for correct propagation below +# value.set_uncertainty( +# f"mcstat_{process_inst.name}_{category_inst.name}", +# math.sqrt(h_cat.variance), +# ) +# yields[category_inst].append(value) + +# # obtain normalizaton factors +# norm_factors = 1 +# if self.normalize_yields == "all": +# norm_factors = sum( +# sum(category_yields) +# for category_yields in yields.values() +# ) +# elif self.normalize_yields == "per_process": +# norm_factors = [ +# sum(yields[category][i] for category in yields.keys()) +# for i in range(len(yields[category_insts[0]])) +# ] +# elif self.normalize_yields == "per_category": +# norm_factors = { +# category: sum(category_yields) +# for category, category_yields in yields.items() +# } + +# # initialize dicts +# yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]}) +# raw_yields = defaultdict(dict, {}) + +# # apply normalization and format +# for category, category_yields in yields.items(): +# for i, value in enumerate(category_yields): +# # get correct norm factor per category and process +# if self.normalize_yields == "per_process": +# norm_factor = norm_factors[i] +# elif self.normalize_yields == "per_category": +# norm_factor = norm_factors[category] +# else: +# norm_factor = norm_factors + +# raw_yield = (value / norm_factor).nominal +# raw_yields[category.name][processes[i].name] = raw_yield + +# # format yields into strings +# yield_str = (value / norm_factor).str( +# combine_uncs="all", +# format=self.number_format, +# style="latex" if "latex" in self.table_format else "plain", +# ) +# if "latex" in self.table_format: +# yield_str = f"${yield_str}$" +# yields_str[category.label].append(yield_str) + +# # create, print and save the yield table +# yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format) +# self.publish_message(yield_table) + +# outputs["table"].dump(yield_table, 
formatter="text") +# outputs["yields"].dump(raw_yields, formatter="json") + class CreateYieldTable( DatasetsProcessesMixin, CategoriesMixin, @@ -136,123 +375,85 @@ def run(self): } # histogram data per process - hists = {} - + merged_hists = {} with self.publish_step(f"Creating yields for processes {self.processes}, categories {self.categories}"): for dataset, inp in inputs.items(): dataset_inst = self.config_inst.get_dataset(dataset) # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - - # loop and extract one histogram per process - for process_inst in process_insts: - # skip when the dataset is already known to not contain any sub process - if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): - continue - - # work on a copy - h = h_in.copy() - - # axis selections - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts[process_inst] - if p.id in h.axes["process"] - ], - }] - - # axis reductions - h = h[{"process": sum, "shift": sum, "event": sum}] - - # add the histogram - if process_inst in hists: - hists[process_inst] += h + + + for the_cat, the_hist in input_hists.items(): + if the_cat not in merged_hists.keys(): + merged_hists[the_cat] = [] else: - hists[process_inst] = h - + merged_hists[the_cat].append(the_hist) + #merge histograms + merged_hists_ = {the_cat: sum(h[1:],h[0].copy()) for the_cat, h in merged_hists.items()} + hists_per_proc = {} + for the_cat, the_hist in merged_hists_.items(): + hists_per_proc[the_cat] = {} + for proc in process_insts: + leaf_procs = proc.get_leaf_processes() + if len(leaf_procs) == 0 : leaf_procs = [proc] + for leaf_proc in leaf_procs: + if leaf_proc.id in the_hist.axes["process"]: + h = the_hist.copy() + h = h[{"process": hist.loc(leaf_proc.id)}] + + if proc in hists_per_proc[the_cat]: + hists_per_proc[the_cat][proc] +=h + else: + hists_per_proc[the_cat][proc] = h + # there should be hists to plot - if not hists: + if not hists_per_proc: raise Exception("no histograms found to plot") - # sort hists by process order - hists = OrderedDict( - (process_inst, hists[process_inst]) - for process_inst in sorted(hists, key=process_insts.index) + hists = {} + for the_cat in hists_per_proc.keys(): + single_cat_hists = hists_per_proc[the_cat] + hists[the_cat] = OrderedDict( + (process_inst, single_cat_hists[process_inst]) + for process_inst in sorted(single_cat_hists, key=process_insts.index) ) - - yields, processes = defaultdict(list), [] - - # read out yields per category and per process - for process_inst, h in hists.items(): - processes.append(process_inst) - - for category_inst in category_insts: - leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] - - h_cat = h[{"category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ]}] - h_cat = h_cat[{"category": sum}] - - value = Number(h_cat.value) - if not self.skip_uncertainties: + #Calculate yields + yields = {} + for the_cat in hists.keys(): + tmp = {} + for the_proc in hists[the_cat].keys(): + val = Number(hists[the_cat][the_proc].sum().value) + + if not self.skip_uncertainties and not the_proc.is_data: # set a unique uncertainty name for correct propagation below - value.set_uncertainty( - f"mcstat_{process_inst.name}_{category_inst.name}", - math.sqrt(h_cat.variance), + val.set_uncertainty( + f"mcstat_{the_proc.name}_{the_cat}", + math.sqrt(hists[the_cat][the_proc].sum().variance), ) - yields[category_inst].append(value) - - # obtain normalizaton 
factors - norm_factors = 1 - if self.normalize_yields == "all": - norm_factors = sum( - sum(category_yields) - for category_yields in yields.values() - ) - elif self.normalize_yields == "per_process": - norm_factors = [ - sum(yields[category][i] for category in yields.keys()) - for i in range(len(yields[category_insts[0]])) - ] - elif self.normalize_yields == "per_category": - norm_factors = { - category: sum(category_yields) - for category, category_yields in yields.items() - } - + tmp[the_proc]=val + yields[the_cat] = OrderedDict(tmp) # initialize dicts - yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]}) + yields_str = defaultdict(list, {"Process" : [proc.label for proc in process_insts]}) raw_yields = defaultdict(dict, {}) - # apply normalization and format - for category, category_yields in yields.items(): - for i, value in enumerate(category_yields): - # get correct norm factor per category and process - if self.normalize_yields == "per_process": - norm_factor = norm_factors[i] - elif self.normalize_yields == "per_category": - norm_factor = norm_factors[category] - else: - norm_factor = norm_factors - - raw_yield = (value / norm_factor).nominal - raw_yields[category.name][processes[i].name] = raw_yield - - # format yields into strings - yield_str = (value / norm_factor).str( - combine_uncs="all", - format=self.number_format, - style="latex" if "latex" in self.table_format else "plain", - ) + for cat in yields.keys(): + yields_per_cat = yields[cat] + for proc in process_insts: + if proc in yields_per_cat: + raw_yield = yields_per_cat[proc].nominal + yield_str = (yields_per_cat[proc]).str( + combine_uncs="all", + format=self.number_format, + style="latex" if "latex" in self.table_format else "plain", + ) + else: + raw_yield = Number(-1).nominal + yield_str = str(-1) + raw_yields[cat][proc.name] = raw_yield if "latex" in self.table_format: yield_str = f"${yield_str}$" - yields_str[category.label].append(yield_str) - + yields_str[cat].append(yield_str) # create, print and save the yield table yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format) self.publish_message(yield_table) diff --git a/law.cfg b/law.cfg index 0d6ae338f..5d01d5d05 100644 --- a/law.cfg +++ b/law.cfg @@ -60,7 +60,7 @@ slurm_flavor: $CF_SLURM_FLAVOR slurm_partition: $CF_SLURM_PARTITION # ChunkedIOHandler defaults -chunked_io_chunk_size: 100000 +chunked_io_chunk_size: 50000 chunked_io_pool_size: 2 chunked_io_debug: False From 000ee8b1e14bd945413a4b63ecea985d35f66a47 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 7 Apr 2025 14:00:31 +0200 Subject: [PATCH 18/26] Updated code for the Fake Factor calculation --- columnflow/tasks/data_driven_methods.py | 220 ++++++++++++++---------- 1 file changed, 126 insertions(+), 94 deletions(-) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 759152614..7c4b3f375 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -372,24 +372,11 @@ def run(self): enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) - -class dict_creator(): - def init_dict(self, ax_list): - if not ax_list: - return -1. 
- else: - ax = ax_list[0] - updated_ax = ax_list[1:] - get_ax_dict = lambda ax, ax_list, func : {ax.bin(i): func(ax_list) for i in range(ax.size)} - return get_ax_dict(ax,updated_ax, self.init_dict) - - class ComputeFakeFactors( DatasetsProcessesMixin, CategoriesMixin, WeightProducerMixin, ProducersMixin, - dict_creator, ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) @@ -456,7 +443,12 @@ def output(self): str(nj))): self.target(f"fake_factor_{ff_type}_PNet_dm_{str(dm)}_njets_{str(nj)}.png") for ff_type in ['qcd','wj'] for dm in [0,1,2,10,11] - for nj in [0,1,2]}} + for nj in [0,1,2]}, + "fitres": self.target('_'.join(('fitres', + channel, + str(year), + tag)) + '.json'), + } @law.decorator.log def run(self): @@ -466,6 +458,7 @@ def run(self): from scipy.special import erf import matplotlib.pyplot as plt import correctionlib.schemav2 as cs + from numpy import exp plt.figure(dpi=200) plt.rcParams.update({ "text.usetex": True, @@ -510,6 +503,15 @@ def run(self): if proc.is_data: if the_cat in data_hists: data_hists[the_cat] += h else: data_hists[the_cat] = h + + def eval_formula(formula_str, popt,make_rounding=False): + for i,p in enumerate(popt): + if make_rounding: + formula_str = formula_str.replace(f'p{i}', '{:.3e}'.format(p)) + else: + formula_str = formula_str.replace(f'p{i}',str(p)) + return formula_str + #Function that performs the calculation of t def get_ff_corr(self, h_data, h_mc, dr_num, dr_den, name='ff_hist', label='ff_hist'): @@ -529,6 +531,7 @@ def get_single_cat(self, h, reg_name): print(f"mc_num: {mc_num[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") print(f"mc_den: {mc_den[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj)}].values()}") num = data_num.values() - mc_num.values() + den = data_den.values() - mc_den.values() ff_val = np.where((num > 0) & (den > 0), num / np.maximum(den, 1), @@ -538,7 +541,7 @@ def rel_err(x): ff_err = ff_val * ((data_num.variances() + mc_num.variances())**0.5 / np.abs(num) + (data_den.variances() + mc_den.variances())**0.5 / np.abs(den)) - + ff_err[ff_val < 0] = 1 h = hist.Hist.new for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): h = eval(f'h.{var_axis.ax_str}') @@ -550,45 +553,54 @@ def rel_err(x): ff_raw.name = name + '_raw' ff_raw.label = label + '_raw' - #Make an approximation of tau pt dependance - formula_str = 'p0 + p1*x+p2*x*x' - #formula_str = 'p0 + p1*x' - def fitf(x, p0, p1, p2): - return eval(formula_str) - - def jac(x): - from numpy import array - out = array([[ 1, x, x**2],[x, x**2, x**3],[x**2, x**3, x**4]]) - #out = array([[ 1., x],[x, x**2]]) - return out - - def eval_formula(formula_str, popt): - for i,p in enumerate(popt): - par = round(popt[i],6) - formula_str = formula_str.replace(f'p{i}',str(par)) - return formula_str + def get_fitf(dm): + if dm==0: + formula_str = 'p0+p1*x+p2*x*x' + def fitf(x,p0,p1,p2): + return eval(formula_str) + else: + formula_str = 'p0+p1*exp(-p2*x)' + def fitf(x,p0,p1,p2): + from numpy import exp + return eval(formula_str) + return fitf, formula_str + + def get_jac(dm): + if dm==0: + def jac(x,p): + from numpy import array + return array([ 1., x, x**2]) + else: + def jac(x,p): + from numpy import array,exp,outer + ders=array([ 1., + exp(-p[2]*x), + -1*p[1]*x*exp(-p[2]*x)]) + return ders + return jac ff_fitted = ff_raw.copy().reset() ff_fitted.name = name ff_fitted.label = label - fitres = {} - dc = dict_creator() - for the_field in ['chi2','ndf','popt', 'pcov', 'fitf_str','x_max']: - fitres[the_field]= 
dc.init_dict(axes) - dm_axis = ff_raw.axes['tau_dm_pnet'] n_jets_axis = ff_raw.axes['n_jets'] + for nj in n_jets_axis: + if nj not in fitres.keys(): fitres[nj] = {} for dm in dm_axis: + if dm not in fitres[nj].keys(): fitres[nj][dm] = {} + + + + h1d = ff_raw[{'tau_dm_pnet': hist.loc(dm), 'n_jets': hist.loc(nj), 'syst': hist.loc('nominal')}] mask = h1d.values() > 0 x = h1d.axes[0].centers - if np.sum(mask) < 3: - #if np.sum(mask) < 2: + if np.sum(mask) < 2: y = np.zeros_like(x) y_err = np.ones_like(x) x_masked = x @@ -596,27 +608,37 @@ def eval_formula(formula_str, popt): y = h1d.values()[mask] y_err = (h1d.variances()[mask])**0.5 x_masked = x[mask] - popt, pcov = curve_fit(fitf, + + fitf, formula_str = get_fitf(dm) + if dm==0: + the_bounds = ([-10,-5,-1],[10,5,1]) + else: + the_bounds = ([-0.5, -1, 0],[0.5,1,0.1]) + popt, pcov, infodict, mesg, ier = curve_fit(fitf, x_masked, y, sigma=y_err, + bounds=the_bounds, absolute_sigma=True, + full_output=True ) - fitres['chi2'][dm][nj] = sum(((y - fitf(x_masked, *popt))/y_err)**2) - fitres['ndf'][dm][nj] = len(y) - len(popt) - fitres['popt'][dm][nj] = popt - fitres['pcov'][dm][nj] = pcov - fitres['x_max'][dm][nj] = np.max(x_masked) - - fitres['fitf_str'][dm][nj] = eval_formula(formula_str,popt) + fitres[nj][dm]['chi2'] = sum((infodict['fvec'])**2) + fitres[nj][dm]['ndf'] = len(y) - len(popt) + fitres[nj][dm]['popt'] = popt + fitres[nj][dm]['pcov'] = pcov + fitres[nj][dm]['x_max'] = np.max(x_masked) + + fitres[nj][dm]['jac'] = get_jac(dm) + fitres[nj][dm]['name'] = name + fitres[nj][dm]['fitf'] = fitf + fitres[nj][dm]['fitf_str'] = formula_str + for c, shift_name in enumerate(['down', 'nominal', 'up']): # if down then c=-1, if up c=+1, nominal => c=0 ff_fitted.view().value[:, ff_fitted.axes[1].index(dm), ff_fitted.axes[2].index(nj), ff_fitted.axes[3].index(shift_name)] = fitf(x, *popt + (c-1) * np.sqrt(np.diag(pcov))) - fitres['name'] = name - fitres['jac'] = jac - fitres['fitf'] = fitf + return ff_raw, ff_fitted, fitres wj_raw, wj_fitted, wj_fitres = get_ff_corr(self, @@ -635,36 +657,36 @@ def eval_formula(formula_str, popt): name='ff_qcd', label='Fake factor QCD') - corr_list = [] - + corr_list = [] - for fitres in [wj_fitres, qcd_fitres]: - formula_str = fitres['fitf_str'] - dm_cats = [] - for dm in formula_str.keys(): - formula_str_njet_binned = formula_str[dm] - single_dm = [] - for nj, the_formula in formula_str_njet_binned.items(): - x_max = fitres['x_max'][dm][nj] - fx_max = np.maximum(fitres['fitf'](x_max,* fitres['popt'][dm][nj]),0) - single_dm.append(cs.CategoryItem( - key=nj, + for fitres_per_proc in [wj_fitres, qcd_fitres]: + nj_categories = [] + for nj, fitres_per_nj in fitres_per_proc.items(): + single_nj = [] + for dm, fitres in fitres_per_nj.items(): + x_max = fitres['x_max'] + fitf = fitres['fitf'] + popt = fitres['popt'] + fitf_str = eval_formula(fitres['fitf_str'], popt) + fx_max = np.maximum(fitf(x_max,*popt),0) + single_nj.append(cs.CategoryItem( + key=dm, value=cs.Formula( nodetype="formula", variables=["tau_pt"], parser="TFormula", - expression=f'({the_formula})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', + expression=f'({fitf_str})*((x-{x_max})<0) + ({fx_max})*((x-{x_max})>=0)', ))) - dm_cats.append(cs.CategoryItem( - key=dm, + nj_categories.append(cs.CategoryItem( + key=nj, value=cs.Category( nodetype="category", - input="n_jets", - content=single_dm, + input="tau_dm_pnet", + content=single_nj, ))) corr_list.append(cs.Correction( - name=fitres['name'], - description=f"fake factor correcton for 
{fitres['name'].split('_')[1]}", + name=fitres_per_proc[0][0]['name'], + description=f"fake factor correcton for {fitres_per_proc[0][0]['name'].split('_')[1]}", version=2, inputs=[ cs.Variable(name="tau_pt", type="real",description="pt of tau"), @@ -674,11 +696,10 @@ def eval_formula(formula_str, popt): output=cs.Variable(name="weight", type="real", description="Multiplicative event weight"), data=cs.Category( nodetype="category", - input="tau_dm_pnet", - content=dm_cats, + input="n_jets", + content=nj_categories, ) )) - cset = cs.CorrectionSet( schema_version=2, description="Fake factors", @@ -686,17 +707,25 @@ def eval_formula(formula_str, popt): ) self.output()['ff_json'].dump(cset.json(exclude_unset=True), formatter="json") - + chi2_string = 'type nj dm chi2 ndf,' + for fitres_per_proc in [wj_fitres, qcd_fitres]: + for dm, fitres_per_dm in fitres_per_proc.items(): + for nj, fitres in fitres_per_dm.items(): + chi2_string += ' '.join((fitres['name'], + str(nj), + str(dm), + str(fitres['chi2']), + str(fitres['ndf']))) + chi2_string += ',' + self.output()['fitres'].dump(chi2_string, formatter="json") #Plot fake factors: for h_name in ['wj', 'qcd']: - h_raw = eval(f'{h_name}_raw') - h_fitted = eval(f'{h_name}_fitted') - - - fitres = wj_fitres if h_name == 'wj' else qcd_fitres - dm_axis = h_raw.axes['tau_dm_pnet'] - nj_axis = h_raw.axes['n_jets'] + h_raw = eval(f'{h_name}_raw') + h_fitted = eval(f'{h_name}_fitted') + fitres_dict = eval(f'{h_name}_fitres') + dm_axis = h_raw.axes['tau_dm_pnet'] + nj_axis = h_raw.axes['n_jets'] for nj in nj_axis: print(f"Plotting 2d map for n jets = {nj}") fig, ax = plt.subplots(figsize=(12, 8)) @@ -736,16 +765,17 @@ def eval_formula(formula_str, popt): marker='o', fmt='o', line=None, color='#2478B7', capsize=4) - x_fine = np.linspace(x[0],x[-1],num=100) - popt = fitres['popt'][dm][nj] - pcov = fitres['pcov'][dm][nj] + x_fine = np.linspace(x[0],x[-1],num=30) + fitres = fitres_dict[nj][dm] + popt = fitres['popt'] + pcov = fitres['pcov'] jac = fitres['jac'] - def err(x,jac,pcov): - from numpy import sqrt,einsum - return sqrt(einsum('ij,ij',jac(x),pcov)) + def err(x,jac,pcov,popt): + from numpy import sqrt,einsum,abs + return sqrt(abs(einsum('i,ij,j',jac(x,popt).T,pcov,jac(x,popt)))) import functools - err_y = list(map(functools.partial(err, jac=jac,pcov=pcov), x_fine)) + err_y = list(map(functools.partial(err, jac=jac,pcov=pcov,popt=popt), x_fine)) y_fitf = fitres['fitf'](x_fine,*popt) y_fitf_up = fitres['fitf'](x_fine,*popt) + err_y @@ -757,14 +787,16 @@ def err(x,jac,pcov): ax.fill_between(x_fine, y_fitf_up, y_fitf_down, color='#83d55f', alpha=0.5) ax.set_ylabel('Fake Factor') ax.set_xlabel('Tau pT [GeV]') - ax.set_title(f'Jet Fake Factors :Tau PNet Decay Mode {(dm)}') - ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'][dm][nj],2)}}}{{{fitres['ndf'][dm][nj]}}}$", - (0.8, 0.9), + ax.set_title(f'Jet Fake Factors : Tau PNet Decay Mode {dm}, Njets {nj}') + ax.annotate(rf"$\frac{{\chi^2}}{{ndf}} = \frac{{{np.round(fitres['chi2'],2)}}}{{{fitres['ndf']}}}$", + (0.8, 0.75), xycoords='axes fraction', fontsize=20) - print(str(fitres['fitf_str'][dm][nj])) - ax.annotate('y=' + str(fitres['fitf_str'][dm][nj]), - (0.1, 0.9), + + formula_str = eval_formula(fitres['fitf_str'],popt, make_rounding=True) + + ax.annotate('y=' + formula_str, + (0.01, 0.95), xycoords='axes fraction', fontsize=12) From 13d104d714ab704a70433f90811955d0aa740dbe Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Fri, 25 Apr 2025 13:30:50 +0200 Subject: [PATCH 19/26] Small update 
in the histogram filling process and boundaries of the fake factor fit --- columnflow/hist_util.py | 17 ++++++++++++++++- columnflow/tasks/data_driven_methods.py | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/columnflow/hist_util.py b/columnflow/hist_util.py index 7929bbbd7..ff44709d8 100644 --- a/columnflow/hist_util.py +++ b/columnflow/hist_util.py @@ -72,9 +72,24 @@ def allows_shift(ax) -> bool: flat_np_view(data[ax.name])[right_egde_mask] -= ax.widths[-1] * 1e-5 # fill + flat_data = {} + arr_shape = None for key, arr in data.items(): - if arr.ndim != 1: flat_data[key] = ak.flatten(arr) + if arr.ndim > 1: + logger.warning( + f"Found axis {key} that is not 1-dimensional: trying to broadcast all other axes:" + ) + arr_shape = ak.local_index(arr) + + for key, arr in data.items(): + if arr_shape is not None: + if arr.ndim == 1: + _, br_arr = ak.broadcast_arrays(arr_shape, arr) + flat_data[key] = ak.flatten(br_arr) + else: + flat_data[key] = ak.flatten(arr) + else: flat_data[key] = arr h.fill(**fill_kwargs, **flat_data) diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py index 7c4b3f375..80162dacb 100644 --- a/columnflow/tasks/data_driven_methods.py +++ b/columnflow/tasks/data_driven_methods.py @@ -613,7 +613,7 @@ def jac(x,p): if dm==0: the_bounds = ([-10,-5,-1],[10,5,1]) else: - the_bounds = ([-0.5, -1, 0],[0.5,1,0.1]) + the_bounds = ([-0.5, -3, 0],[0.5,3,0.1]) popt, pcov, infodict, mesg, ier = curve_fit(fitf, x_masked, y, From 44033d33a106b33639533613faed29d44d34b100 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Mon, 28 Apr 2025 14:45:59 +0200 Subject: [PATCH 20/26] Bug fix: while creating cutflow histogram from a set of files, histograms from the first file were missing --- columnflow/tasks/yields.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index 3abab6e15..ecf73c85f 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -383,14 +383,20 @@ def run(self): # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - for the_cat, the_hist in input_hists.items(): if the_cat not in merged_hists.keys(): merged_hists[the_cat] = [] + merged_hists[the_cat].append(the_hist) else: merged_hists[the_cat].append(the_hist) #merge histograms - merged_hists_ = {the_cat: sum(h[1:],h[0].copy()) for the_cat, h in merged_hists.items()} + + merged_hists_ = {} + for the_cat, h in merged_hists.items(): + if len(h) > 1: merged_hists_[the_cat] = sum(h[1:],h[0].copy()) + else: + merged_hists_[the_cat] = h[0].copy() + hists_per_proc = {} for the_cat, the_hist in merged_hists_.items(): hists_per_proc[the_cat] = {} @@ -403,7 +409,7 @@ def run(self): h = h[{"process": hist.loc(leaf_proc.id)}] if proc in hists_per_proc[the_cat]: - hists_per_proc[the_cat][proc] +=h + hists_per_proc[the_cat][proc] += h else: hists_per_proc[the_cat][proc] = h From 2a7e90403777995d7ed123b510974b0c07a91535 Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Mon, 19 May 2025 17:19:05 +0200 Subject: [PATCH 21/26] first version of datacards --- bin/cf_sandbox_file_hash | 2 +- columnflow/tasks/cms/inference.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bin/cf_sandbox_file_hash b/bin/cf_sandbox_file_hash index 18f846c35..bf3ae5387 100755 --- a/bin/cf_sandbox_file_hash +++ b/bin/cf_sandbox_file_hash @@ -11,6 +11,6 @@ action() { setopt globdots fi - python "${this_dir}/$( 
basename "${this_file}" ).py" "$@" + python3 "${this_dir}/$( basename "${this_file}" ).py" "$@" } action "$@" diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 9386a47f6..06d7a051d 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -211,24 +211,19 @@ def run(self): continue # open the histogram and work on a copy - h = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - + h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() + h = h_dict[cat_obj.name].copy() # axis selections h = h[{ "process": [ hist.loc(p.id) for p in sub_process_insts if p.id in h.axes["process"] - ], - "category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ], + ] }] # axis reductions - h = h[{"process": sum, "category": sum}] + h = h[{"process": sum}] # add the histogram for this dataset if h_proc is None: From 3eed85b9a8e82a149f23bf474ab56a29095fd15b Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Tue, 20 May 2025 15:37:38 +0200 Subject: [PATCH 22/26] working version with abcd method --- columnflow/tasks/cms/inference.py | 118 +++++++++++++++--------------- 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 06d7a051d..b5a391fbb 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -10,7 +10,7 @@ from columnflow.tasks.framework.base import Requirements, AnalysisTask, wrapper_factory from columnflow.tasks.framework.mixins import ( - CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, + CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, HistHookMixin ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.histograms import MergeHistograms, MergeShiftedHistograms @@ -19,6 +19,7 @@ class CreateDatacards( + HistHookMixin, InferenceModelMixin, MLModelsMixin, ProducersMixin, @@ -183,82 +184,83 @@ def run(self): category_inst = self.config_inst.get_category(cat_obj.config_category) variable_inst = self.config_inst.get_variable(cat_obj.config_variable) leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] - + # histogram data per process hists = OrderedDict() - + process_insts = [] + #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - else: + elif proc_obj_name != "qcd" and proc_obj_name != "wj": proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) process_inst = self.config_inst.get_process(proc_obj.config_process) + else: + continue sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] - + process_insts.append(process_inst) h_proc = None for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) - - # skip when the dataset is already known to not contain any sub process - if not any(map(dataset_inst.has_process, sub_process_insts)): - self.logger.warning( - f"dataset '{dataset}' does not contain process '{process_inst.name}' " - "or any of its subprocesses which indicates a misconfiguration in the " - f"inference model '{self.inference_model}'", - ) - continue - - # open the histogram and 
work on a copy h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - h = h_dict[cat_obj.name].copy() - # axis selections - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts - if p.id in h.axes["process"] - ] - }] - - # axis reductions - h = h[{"process": sum}] - - # add the histogram for this dataset - if h_proc is None: - h_proc = h - else: - h_proc += h - - # there must be a histogram - if h_proc is None: - raise Exception(f"no histograms found for process '{process_inst.name}'") - - # create the nominal hist - hists[proc_obj_name] = OrderedDict() - nominal_shift_inst = self.config_inst.get_shift("nominal") - hists[proc_obj_name]["nominal"] = h_proc[ - {"shift": hist.loc(nominal_shift_inst.id)} - ] - - # per shift - if proc_obj: - for param_obj in proc_obj.parameters: - # skip the parameter when varied hists are not needed - if not self.inference_model_inst.require_shapes_for_parameter(param_obj): + + for region in h_dict.keys(): + if region not in hists: hists[region] = {} + # skip when the dataset is already known to not contain any sub process + if not any(map(dataset_inst.has_process, sub_process_insts)): + self.logger.warning( + f"dataset '{dataset}' does not contain process '{process_inst.name}' " + "or any of its subprocesses which indicates a misconfiguration in the " + f"inference model '{self.inference_model}'", + ) continue - # store the varied hists - hists[proc_obj_name][param_obj.name] = {} - for d in ["up", "down"]: - shift_inst = self.config_inst.get_shift(f"{param_obj.config_shift_source}_{d}") - hists[proc_obj_name][param_obj.name][d] = h_proc[ - {"shift": hist.loc(shift_inst.id)} + # open the histogram and work on a copy + h = h_dict[region] + # axis selections + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts + if p.id in h.axes["process"] ] + }] + + # axis reductions + h = h[{"process": sum}] + if process_inst in hists[region]: + hists[region][process_inst] += h + else: + hists[region][process_inst] = h + + # there must be a histogra + if hists[region][process_inst] is None: + raise Exception(f"no histograms found for process '{process_inst.name}'") + + + + if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + if self.hist_hooks: + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] + for process_inst in hists: + if process_inst not in process_insts: + process_insts.append(process_inst) + else: # get the histogram for the pro + hists = hists[category_inst.name] + datacard_hists = OrderedDict() + for process_inst in process_insts: + # get the histogram for the process + datacard_hists[process_inst.name] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + # add the histogram to the datacard + datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] # forward objects to the datacard writer outputs = self.output() - writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: hists}) + writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: datacard_hists}) with outputs["card"].localize("w") as tmp_card, outputs["shapes"].localize("w") as tmp_shapes: writer.write(tmp_card.abspath, tmp_shapes.abspath, shapes_path_ref=outputs["shapes"].basename) From fb06da518e874d5b706e1ff8d790f04437c6bcf9 Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Wed, 21 May 2025 
10:16:30 +0200 Subject: [PATCH 23/26] working version for the ff method --- columnflow/tasks/cms/inference.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index b5a391fbb..d091f11bb 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -194,14 +194,12 @@ def run(self): if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif proc_obj_name != "qcd" and proc_obj_name != "wj": + elif proc_obj_name != "qcd" and proc_obj_name != "jet_fakes": proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) process_inst = self.config_inst.get_process(proc_obj.config_process) else: continue sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] - process_insts.append(process_inst) - h_proc = None for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() @@ -238,24 +236,16 @@ def run(self): if hists[region][process_inst] is None: raise Exception(f"no histograms found for process '{process_inst.name}'") - - - if category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions - if self.hist_hooks: - hists = self.invoke_hist_hooks(hists,category_inst) - else: - hists = hists[category_inst.name] - for process_inst in hists: - if process_inst not in process_insts: - process_insts.append(process_inst) - else: # get the histogram for the pro - hists = hists[category_inst.name] + if self.hist_hooks and category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions + hists = self.invoke_hist_hooks(hists,category_inst) + else: + hists = hists[category_inst.name] + # prepare the hists to be used in the datacard writer datacard_hists = OrderedDict() - for process_inst in process_insts: + for process_inst in hists.keys(): # get the histogram for the process datacard_hists[process_inst.name] = OrderedDict() nominal_shift_inst = self.config_inst.get_shift("nominal") - # add the histogram to the datacard datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] # forward objects to the datacard writer From 4e94516c0990e34ae65ab4dc8f8a80c544a3389c Mon Sep 17 00:00:00 2001 From: Aliya Nigamova Date: Wed, 21 May 2025 11:17:19 +0200 Subject: [PATCH 24/26] adding data_driven flag --- columnflow/inference/__init__.py | 3 +++ columnflow/tasks/cms/inference.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/columnflow/inference/__init__.py b/columnflow/inference/__init__.py index 7926a9f78..d60fd87c4 100644 --- a/columnflow/inference/__init__.py +++ b/columnflow/inference/__init__.py @@ -325,6 +325,7 @@ def process_spec( name: str, config_process: str | None = None, is_signal: bool = False, + data_driven: bool = False, config_mc_datasets: Sequence[str] | None = None, scale: float | int = 1.0, ) -> DotDict: @@ -333,6 +334,7 @@ def process_spec( - *name*: The name of the process in the model. - *is_signal*: A boolean flag deciding whether this process describes signal. + - *data_driven*: A boolean flag deciding whether this process is data driven. - *config_process*: The name of the source process in the config to use. 
- *config_mc_datasets*: List of names or patterns of MC datasets in the config to use. - *scale*: A float value to scale the process, defaulting to 1.0. @@ -340,6 +342,7 @@ def process_spec( return DotDict([ ("name", str(name)), ("is_signal", bool(is_signal)), + ("data_driven", bool(data_driven)), ("config_process", str(config_process) if config_process else None), ("config_mc_datasets", list(map(str, config_mc_datasets or []))), ("scale", float(scale)), diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index d091f11bb..949b88cff 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -191,11 +191,11 @@ def run(self): #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): + proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif proc_obj_name != "qcd" and proc_obj_name != "jet_fakes": - proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) + elif not proc_obj.data_driven : # data driven processes will be added later with invoke_hist_hooks process_inst = self.config_inst.get_process(proc_obj.config_process) else: continue From 0099c46a810a069dc58405c6a70ffc50e48492d5 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Wed, 16 Jul 2025 16:12:06 +0200 Subject: [PATCH 25/26] Changes to make datacard production working and some cosmetics --- columnflow/columnar_util.py | 13 +++++++ columnflow/plotting/plot_functions_1d.py | 43 ++++++++++++++++++++---- columnflow/tasks/cms/inference.py | 41 ++++++++++++---------- columnflow/tasks/yields.py | 5 +-- sandboxes/dev.txt | 4 ++- 5 files changed, 79 insertions(+), 27 deletions(-) diff --git a/columnflow/columnar_util.py b/columnflow/columnar_util.py index 171ab3661..24be344f2 100644 --- a/columnflow/columnar_util.py +++ b/columnflow/columnar_util.py @@ -2465,6 +2465,19 @@ def setup(cls, func: Callable[[dict], None]) -> None: """ cls.setup_func = func + @classmethod + def teardown(cls, func: Callable[[dict], None]) -> None: + """ + Decorator to wrap a function *func* that should be registered as :py:meth:`teardown_func` + which is used to perform a custom teardown of objects at the end of processing. The function + should accept one argument: + + - *task*, the invoking task instance. + + The decorator does not return the wrapped function. 
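A hypothetical usage sketch, mirroring how the existing setup decorator is typically attached (the decorated producer name and the bound first argument are assumptions, not taken from this patch):

    @my_producer.teardown
    def my_producer_teardown(self, task) -> None:
        # release resources acquired during setup, e.g. close files or drop caches
        self.cached_corrections = None
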
+ """ + cls.teardown_func = func + def __init__( self, *args, diff --git a/columnflow/plotting/plot_functions_1d.py b/columnflow/plotting/plot_functions_1d.py index ceeccb986..839418c67 100644 --- a/columnflow/plotting/plot_functions_1d.py +++ b/columnflow/plotting/plot_functions_1d.py @@ -271,20 +271,36 @@ def plot_shifted_variable( plot_config = {} colors = { "nominal": "black", - "up": "red", - "down": "blue", + "up": "blue", + "down": "red", } + shift_names = { + "nominal": "max mixing", + "ts_up": "CP-odd", + "ts_down": "CP-even", + } + + hist_up = None + hist_down = None + hist_up_err = None + hist_down_err = None for i, shift_id in enumerate(h_sum.axes["shift"]): shift_inst = config_inst.get_shift(shift_id) - + h = h_sum[{"shift": hist.loc(shift_id)}] + if "up" in shift_inst.label: + hist_up = h.values() + hist_up_err = h.variances() + elif "down" in shift_inst.label: + hist_down = h.values() + hist_down_err = h.variances() # assuming `nominal` always has shift id 0 ratio_norm = h_sum[{"shift": hist.loc(0)}].values() diff = sum(h.values()) / sum(ratio_norm) - 1 - label = shift_inst.label + label = shift_names[shift_inst.label] if not shift_inst.is_nominal: - label += " ({0:+.2f}%)".format(diff * 100) + pass #label += " ({0:+.2f}%)".format(diff * 100) plot_config[shift_inst.name] = plot_cfg = { "method": "draw_hist", @@ -302,8 +318,18 @@ def plot_shifted_variable( if hide_errors: for key in ("kwargs", "ratio_kwargs"): if key in plot_cfg: - plot_cfg[key]["yerr"] = None - + plot_cfg[key]["yerr"] = False + h_sum = (hist_up + hist_down) + mask = (h_sum > 0) + asym_hist = np.where(mask, + np.abs(hist_up - hist_down)/h_sum, + 0) + herr_num = np.sqrt(hist_up_err + hist_down_err) + herr_den = np.sqrt(hist_up_err + hist_down_err) + dA = np.average(np.sqrt( (herr_num/h_sum)**2 + (herr_den*np.abs(hist_up - hist_down)/h_sum/h_sum)**2)) + + A = np.average(asym_hist) + # legend title setting if not legend_title and len(hists) == 1: # use process label as default if 1 process @@ -318,6 +344,9 @@ def plot_shifted_variable( ) default_style_config["rax_cfg"]["ylim"] = (0.75, 1.25) default_style_config["rax_cfg"]["ylabel"] = "Ratio" + + default_style_config["annotate_cfg"]["text"] = f'A={A:1.3f}$\pm${dA:1.3f}' + default_style_config["annotate_cfg"]["fontsize"] = 22 if legend_title: default_style_config["legend_cfg"]["title"] = legend_title diff --git a/columnflow/tasks/cms/inference.py b/columnflow/tasks/cms/inference.py index 949b88cff..24abc829d 100644 --- a/columnflow/tasks/cms/inference.py +++ b/columnflow/tasks/cms/inference.py @@ -92,6 +92,7 @@ def workflow_requires(self): for cat_obj in self.branch_map.values(): for proc_obj in cat_obj.processes: + if proc_obj.data_driven: continue for dataset in self.get_mc_datasets(proc_obj): # add all required variables and shifts per dataset mc_dataset_params[dataset]["variables"].add(cat_obj.config_variable) @@ -100,10 +101,8 @@ def workflow_requires(self): for param_obj in proc_obj.parameters if self.inference_model_inst.require_shapes_for_parameter(param_obj) ) - for dataset in self.get_data_datasets(cat_obj): data_dataset_params[dataset]["variables"].add(cat_obj.config_variable) - # set workflow requirements per mc dataset reqs["merged_hists"] = set( self.reqs.MergeShiftedHistograms.req_different_branching( @@ -129,6 +128,7 @@ def workflow_requires(self): def requires(self): cat_obj = self.branch_data + processes = [proc_obj for proc_obj in cat_obj.processes if not proc_obj.data_driven] reqs = { proc_obj.name: { dataset: 
self.reqs.MergeShiftedHistograms.req_different_branching( @@ -143,9 +143,9 @@ def requires(self): branch=-1, workflow="local", ) - for dataset in self.get_mc_datasets(proc_obj) + for dataset in self.get_mc_datasets(proc_obj) } - for proc_obj in cat_obj.processes + for proc_obj in processes } if cat_obj.config_data_datasets: reqs["data"] = { @@ -191,19 +191,20 @@ def run(self): #prepare histogram objects with self.publish_step(f"extracting {variable_inst.name} in {category_inst.name} ..."): for proc_obj_name, inp in inputs.items(): - proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) if proc_obj_name == "data": proc_obj = None process_inst = self.config_inst.get_process("data") - elif not proc_obj.data_driven : # data driven processes will be added later with invoke_hist_hooks - process_inst = self.config_inst.get_process(proc_obj.config_process) - else: - continue + else: + proc_obj = self.inference_model_inst.get_process(proc_obj_name, category=cat_obj.name) + if not proc_obj.data_driven: # data driven processes will be added later with invoke_hist_hooks + process_inst = self.config_inst.get_process(proc_obj.config_process) + else: + pass + sub_process_insts = [sub for sub, _, _ in process_inst.walk_processes(include_self=True)] for dataset, _inp in inp.items(): dataset_inst = self.config_inst.get_dataset(dataset) h_dict = _inp["collection"][0]["hists"][variable_inst.name].load(formatter="pickle").copy() - for region in h_dict.keys(): if region not in hists: hists[region] = {} # skip when the dataset is already known to not contain any sub process @@ -235,19 +236,25 @@ def run(self): # there must be a histogra if hists[region][process_inst] is None: raise Exception(f"no histograms found for process '{process_inst.name}'") - if self.hist_hooks and category_inst.aux: #Assume that aux exists only for signal regions since it contains the information about application and determination regions hists = self.invoke_hist_hooks(hists,category_inst) else: hists = hists[category_inst.name] # prepare the hists to be used in the datacard writer datacard_hists = OrderedDict() - for process_inst in hists.keys(): - # get the histogram for the process - datacard_hists[process_inst.name] = OrderedDict() - nominal_shift_inst = self.config_inst.get_shift("nominal") - datacard_hists[process_inst.name]["nominal"] = hists[process_inst][{"shift": hist.loc(nominal_shift_inst.id)}] - + for combine_proc, proc_name in self.inference_model_inst.proc_map.items(): + process_inst = [the_proc for the_proc in hists.keys() if the_proc.name == proc_name] + if len(process_inst) and not (hists[process_inst[0]].empty()): + # get the histogram for the process + datacard_hists[combine_proc] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + datacard_hists[combine_proc]["nominal"] = hists[process_inst[0]][{"shift": hist.loc(nominal_shift_inst.id)}] + # add data: + data_proc = [the_proc for the_proc in hists.keys() if the_proc.name == 'data'] + datacard_hists['data'] = OrderedDict() + nominal_shift_inst = self.config_inst.get_shift("nominal") + datacard_hists['data']["nominal"] = hists[data_proc[0]][{"shift": hist.loc(nominal_shift_inst.id)}] + # forward objects to the datacard writer outputs = self.output() writer = DatacardWriter(self.inference_model_inst, {cat_obj.name: datacard_hists}) diff --git a/columnflow/tasks/yields.py b/columnflow/tasks/yields.py index ecf73c85f..01ba92079 100644 --- a/columnflow/tasks/yields.py +++ b/columnflow/tasks/yields.py @@ -367,7 
+367,7 @@ def run(self): inputs = self.input() outputs = self.output() - category_insts = list(map(self.config_inst.get_category, self.categories)) + category_insts = list(self.categories) process_insts = list(map(self.config_inst.get_process, self.processes)) sub_process_insts = { proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] @@ -383,7 +383,8 @@ def run(self): # load the histogram of the variable named "event" input_hists = inp["hists"]["event"].load(formatter="pickle") - for the_cat, the_hist in input_hists.items(): + for the_cat in category_insts: + the_hist = input_hists[the_cat] if the_cat not in merged_hists.keys(): merged_hists[the_cat] = [] merged_hists[the_cat].append(the_hist) diff --git a/sandboxes/dev.txt b/sandboxes/dev.txt index cc5455448..3be3f1914 100644 --- a/sandboxes/dev.txt +++ b/sandboxes/dev.txt @@ -1,4 +1,4 @@ -# version 10 +# version 11 # last version to support python 3.9 ipython~=8.18.1 @@ -10,3 +10,5 @@ flake8-quotes~=3.4.0 pipdeptree~=2.23.4 pymarkdownlnt~=0.9.25 uniplot~=0.15.1 +xgboost~=2.1.4 +scikit-learn From 518a10262e2031004394e2ead1048b2df200e7f1 Mon Sep 17 00:00:00 2001 From: zakharov-binp Date: Thu, 31 Jul 2025 13:39:36 +0200 Subject: [PATCH 26/26] returned dev.txt to the original state --- sandboxes/dev.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sandboxes/dev.txt b/sandboxes/dev.txt index 3be3f1914..cc5455448 100644 --- a/sandboxes/dev.txt +++ b/sandboxes/dev.txt @@ -1,4 +1,4 @@ -# version 11 +# version 10 # last version to support python 3.9 ipython~=8.18.1 @@ -10,5 +10,3 @@ flake8-quotes~=3.4.0 pipdeptree~=2.23.4 pymarkdownlnt~=0.9.25 uniplot~=0.15.1 -xgboost~=2.1.4 -scikit-learn
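
As a closing illustration, the fake-factor payload written by ComputeFakeFactors into the 'ff_json' target could be consumed downstream with correctionlib roughly as sketched below; the file name, the correction key and the argument order are assumptions (the order must follow the inputs declared in the correction, which are only partially visible above):

    import correctionlib

    # load the correction set dumped by the task (path is a placeholder)
    cset = correctionlib.CorrectionSet.from_file("fake_factors.json")

    # evaluate one fake-factor weight; arguments are assumed to follow the
    # declared inputs, e.g. tau_pt, tau_dm_pnet, n_jets
    ff_weight = cset["ff_qcd"].evaluate(45.0, 1, 0)
    print(ff_weight)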